<a href="https://colab.research.google.com/github/Amrit0726/sales-forecasting-using-Catboost-And-Upgini-ML-Project-/blob/main/Sales_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sales Forecasting using CatBoost and Feature Enrichment

# This notebook documents a learning-oriented machine learning project on
# time-series sales forecasting. The goal is to build a baseline regression
# model and explore how feature enrichment can improve prediction accuracy.


In [None]:
%pip install -Uq upgini catboost
# NOTE:
# An Upgini API key is required for feature enrichment.
# Due to API limits, enrichment is demonstrated on 1,000-row subsets
# of the train and test splits.


In [None]:
from os.path import exists
import pandas as pd

# Prefer a local copy of the archive; otherwise read it straight from the
# remote URL.
local_archive = "train.csv.zip"
if exists(local_archive):
    df_path = local_archive
else:
    df_path = "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"

# Keep a fixed-seed random sample of 19,000 rows for the rest of the notebook.
df = pd.read_csv(df_path).sample(n=19_000, random_state=0)
df.head()


In [None]:
# Data preprocessing: cast the store/item identifiers to strings, parse the
# date column into datetimes, then order rows chronologically and replace the
# index with a fresh 0..n-1 range.
df = (
    df.assign(
        store=df["store"].astype(str),
        item=df["item"].astype(str),
        date=pd.to_datetime(df["date"]),
    )
    .sort_values("date")
    .reset_index(drop=True)
)
df.head()

In [None]:
# Time-based split: rows dated before the cutoff form the training set,
# rows on or after it form the test set.
split_date = "2017-01-01"
train = df.loc[df["date"] < split_date]
test = df.loc[df["date"] >= split_date]

In [None]:
# Separate the "sales" target column from the remaining feature columns
# for both splits.
target_col = "sales"
train_features, train_target = train.drop(columns=[target_col]), train[target_col]
test_features, test_target = test.drop(columns=[target_col]), test[target_col]

In [None]:
# Feature enrichment: fit an Upgini FeaturesEnricher that uses the "date"
# column as its search key, with time-series cross-validation and the test
# split passed as the evaluation set.
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

date_search_keys = {"date": SearchKey.DATE}
enricher = FeaturesEnricher(search_keys=date_search_keys, cv=CVType.time_series)
enricher.fit(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
)

In [None]:
# Baseline model: CatBoost regressor (silent, no files written, fixed seed).
# The enricher is asked to score this estimator with MAPE, using the training
# split and the test split as the evaluation set.
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

model = CatBoostRegressor(
    verbose=False,
    allow_writing_files=False,
    random_state=0,
)
eval_pairs = [(test_features, test_target)]
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=eval_pairs,
    estimator=model,
    scoring="mean_absolute_percentage_error",
)



In [None]:
# Down-sample both splits before calling enricher.transform.
# The subset size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the frame length
# (sampling is without replacement by default).
SUBSET_ROWS = 1000
SUBSET_SEED = 42

train_1k = train_features.sample(
    n=min(SUBSET_ROWS, len(train_features)), random_state=SUBSET_SEED
)
# Pick the targets by index label so they match the sampled feature rows.
target_1k = train_target.loc[train_1k.index]

test_1k = test_features.sample(
    n=min(SUBSET_ROWS, len(test_features)), random_state=SUBSET_SEED
)
target_test_1k = test_target.loc[test_1k.index]

# keep_input=True presumably keeps the original input columns next to the
# enriched ones — confirm against the Upgini documentation.
# NOTE(review): later cells pair these frames with target_1k / target_test_1k
# by position — confirm that transform preserves row order.
enriched_train_1k = enricher.transform(train_1k, keep_input=True)
enriched_test_1k = enricher.transform(test_1k, keep_input=True)


In [None]:
# Baseline on the full training split: fit on the un-enriched features and
# score the test split with SMAPE.
# eval_metric is already imported in the cell that creates `model`, so it is
# not re-imported here. Predictions and labels are passed directly instead of
# going through generic y_true / y_pred globals that other cells rebind.
model.fit(train_features, train_target)
pred = model.predict(test_features)
smape_full = eval_metric(test_target.values, pred, "SMAPE")
smape_full

In [None]:
# Baseline on the sampled subset: refit the same model on the un-enriched
# subset features and score the sampled test rows with SMAPE.
model.fit(train_1k, target_1k)
baseline_pred_1k = model.predict(test_1k)
baseline_smape_1k = eval_metric(target_test_1k.values, baseline_pred_1k, "SMAPE")
baseline_smape_1k

In [None]:
# Enriched run on the sampled subset: refit the same model on the enriched
# features and score the enriched test rows with SMAPE.
# NOTE(review): targets are matched to the enriched frames by position —
# confirm that enricher.transform keeps the input row order.
model.fit(enriched_train_1k, target_1k)
enriched_pred_1k = model.predict(enriched_test_1k)
enriched_smape_1k = eval_metric(target_test_1k.values, enriched_pred_1k, "SMAPE")
enriched_smape_1k


In [None]:
def _metric_value(metric):
    """Return a plain number from an eval_metric result.

    catboost.utils.eval_metric is documented to return a list of metric
    values; for a single metric that is a one-element list. Putting such
    lists into a DataFrame yields an object column of lists rather than a
    numeric column. Values that are already scalar are passed through
    unchanged.
    """
    if isinstance(metric, (list, tuple)):
        return metric[0]
    return metric


# Summary table: one row per experiment, with SMAPE stored as a number so the
# column can be sorted, formatted, or compared.
final_results = pd.DataFrame({
    "Model": [
        "Baseline (Full dataset)",
        "Baseline (1,000 rows)",
        "Enriched (1,000 rows)",
    ],
    "SMAPE": [
        _metric_value(smape_full),
        _metric_value(baseline_smape_1k),
        _metric_value(enriched_smape_1k),
    ],
})
final_results