In [None]:
# Sales Forecasting using CatBoost and Feature Enrichment

#This notebook documents my learning-based machine learning project on
#time-series sales forecasting. The goal is to build a baseline regression
#model and explore how feature enrichment can improve prediction accuracy.


In [1]:
%pip install -Uq upgini catboost
# NOTE:
# Upgini API key is required for feature enrichment.
# Due to API limits, enrichment was demonstrated on a 1000-row subset.


In [2]:
from os.path import exists
import pandas as pd

df_path = "train.csv.zip" if exists("train.csv.zip") else \
          "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"

df = pd.read_csv(df_path)
df = df.sample(n=19_000, random_state=0)
df.head()


Unnamed: 0,date,store,item,sales
335813,2017-07-14,4,19,56
630838,2015-05-19,6,35,45
365685,2014-05-01,1,21,48
322781,2016-11-06,7,18,85
151590,2013-02-02,4,9,46


In [3]:
#data preprocessing
df["store"]=df["store"].astype(str)
df["item"]=df["item"].astype(str)
df["date"]=pd.to_datetime(df["date"])
df.sort_values("date",inplace=True)
df.reset_index(inplace=True,drop=True)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [4]:
#splitting for training
train=df[df["date"]<"2017-01-01"]
test=df[df["date"]>="2017-01-01"]

In [5]:
train_features=train.drop(columns=["sales"])
train_target=train["sales"]
test_features=test.drop(columns=["sales"])
test_target=test["sales"]

In [6]:
#making new features for data set(feature enricher)
from upgini import FeaturesEnricher,SearchKey
from upgini.metadata import CVType
enricher=FeaturesEnricher(search_keys={
    "date":SearchKey.DATE,
},cv=CVType.time_series)
enricher.fit(train_features,train_target,eval_set=[(test_features,test_target)])

<IPython.core.display.Javascript object>

Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history

Detected task type: ModelTaskType.REGRESSION. Reason: date search key is present, treating as regression
You can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly




<IPython.core.display.Javascript object>

Column name,Status,Errors
target,All valid,-
date,All valid,-




Running search request, search_id=12ddb395-a15d-4c95-bbef-1c549fa68ce4
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com



Feature name,SHAP value,Coverage %,Value preview,Provider,Source,Updates
f_economic_date_cbpol_umap_6_aa0352de,12.2863,100.0,"1.0587, 6.9596, 1.0408",Upgini,World economic indicators,Daily
f_autofe_roll_2d_norm_mean_b3210883b2,7.9131,85.6719,"0.438, -0.8822, 4.5948","Training dataset,Upgini","AutoFE: features from Training dataset,Markets data",Daily
f_events_date_week_cos3_7525fe31,3.9598,100.0,"1.0, -0.2225, 0.6235",Upgini,Calendar data,Daily
f_autofe_lag_7d_44ccb1e13c,2.6343,99.1952,"0.3253, -0.263, -0.3496","Training dataset,Upgini","AutoFE: features from Training dataset,Calendar data",Daily
f_events_date_year_cos1_9014a856,2.4643,100.0,"0.3253, -0.263, -0.3496",Upgini,Calendar data,Daily
f_financial_date_crude_oil_7d_to_1y_c3e0ad17,1.7845,100.0,"1.0001, 1.0769, 1.0154",Upgini,Markets data,Daily
f_economic_date_cbpol_umap_4_c5ce4e90,1.1706,100.0,"2.2306, 6.981, 4.0524",Upgini,World economic indicators,Daily
f_financial_date_natural_gas_7d_to_7d_1y_shift_a5c3c07f,0.9663,100.0,"1.0525, 1.2218, 1.0102",Upgini,Markets data,Daily
f_events_date_year_sin2_59955ffd,0.5206,100.0,"-0.6153, 0.3847, -0.1628",Upgini,Calendar data,Daily
f_financial_date_vix_7d_to_1y_634c77eb,0.5122,100.0,"0.8019, 1.2501, 0.8325",Upgini,Markets data,Daily


Provider,Source,All features SHAP,Number of relevant features
Upgini,World economic indicators,13.4569,2
"Training dataset,Upgini","AutoFE: features from Training dataset,Markets data",7.9131,1
Upgini,Calendar data,7.7652,5
Upgini,Markets data,3.263,3
"Training dataset,Upgini","AutoFE: features from Training dataset,Calendar data",2.9032,2


Sources,Feature name,Feature 1,Function
"Training dataset,Markets data",f_autofe_roll_2d_norm_mean_b3210883b2,f_financial_date_usd_eur_gap_c8eb8d4a,roll_2d_norm_mean
"Training dataset,Calendar data",f_autofe_lag_7d_44ccb1e13c,f_events_date_year_cos1_9014a856,lag_7d
"Training dataset,Calendar data",f_autofe_lag_7d_a2d9d6b54d,f_events_date_year_sin2_59955ffd,lag_7d


We detected 48 outliers in your sample.
Examples of outliers with maximum value of target:
33    205
17    196
12    187
Name: target, dtype: int64
Outliers will be excluded during the metrics calculation.
Calculating accuracy uplift after enrichment...
y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,
which makes metrics between the train and eval_set incomparable.


Dataset type,Rows,Mean target,Baseline MAPE,Enriched MAPE,"Uplift, abs","Uplift, %"
Train,9418,53.3352,0.324 Â± 0.109,0.286 Â± 0.083,0.038,11.8%
Eval 1,3764,58.5994,0.278 Â± 0.009,0.266 Â± 0.034,0.011,4.1%


In [7]:
#Baseline Model:CatBoost Regressor
from catboost import CatBoostRegressor
from catboost.utils import eval_metric
model=CatBoostRegressor(verbose=False,allow_writing_files=False,random_state=0)
enricher.calculate_metrics(train_features,
                           train_target,
                           eval_set=[(test_features,test_target)],
                           estimator=model,
                           scoring="mean_absolute_percentage_error")



Calculating accuracy uplift after enrichment...
-y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,
which makes metrics between the train and eval_set incomparable.


Unnamed: 0,Dataset type,Rows,Mean target,Baseline MAPE,Enriched MAPE,"Uplift, abs","Uplift, %"
0,Train,9418,53.3352,0.288 Â± 0.096,0.217 Â± 0.095,0.071,24.6%
1,Eval 1,3764,58.5994,0.247 Â± 0.008,0.195 Â± 0.029,0.052,21.1%


In [8]:
train_1k = train_features.sample(n=1000, random_state=42)
target_1k = train_target.loc[train_1k.index]

test_1k=test_features.sample(n=1000, random_state=42)
target_test_1k=test_target.loc[test_1k.index]
enriched_train_1k=enricher.transform(
    train_1k,
    keep_input=True
)

enriched_test_1k = enricher.transform(
    test_1k,
    keep_input=True
)


Unregistered-user limit: 284 rows remaining; you requested 1000.


Button(description='Get an API KEY', layout=Layout(width='auto'), style=ButtonStyle(), tooltip='Register', _doâ€¦

Unregistered-user limit: 284 rows remaining; you requested 1000.


Button(description='Get an API KEY', layout=Layout(width='auto'), style=ButtonStyle(), tooltip='Register', _doâ€¦

In [9]:
model.fit(train_features, train_target)
pred=model.predict(test_features)
from catboost.utils import eval_metric
y_true = test_target.values
y_pred = pred
smape_full=eval_metric(y_true, y_pred, "SMAPE")
smape_full

[37.65141857448004]

In [10]:
model.fit(train_1k, target_1k)
y_true=target_test_1k.values
y_pred=model.predict(test_1k)
baseline_smape_1k=eval_metric(y_true, y_pred, "SMAPE")
baseline_smape_1k

[43.08742426245925]

In [11]:
model.fit(enriched_train_1k, target_1k)

y_true=target_test_1k.values
y_pred=model.predict(enriched_test_1k)
enriched_smape_1k = eval_metric(y_true, y_pred, "SMAPE")
enriched_smape_1k


CatBoostError: X must not be None

In [None]:
final_results=pd.DataFrame({
    "Model":[
        "Baseline (Full dataset)",
        "Baseline (1,000 rows)",
        "Enriched (1,000 rows)"
    ],
    "SMAPE":[
        smape_full,
        baseline_smape_1k,
        enriched_smape_1k
    ]
})
final_results