In [13]:
from os.path import exists
import pandas as pd 
import numpy as np

In [14]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

from catboost import CatBoostRegressor
from catboost.utils import eval_metric

In [15]:
# Prefer a local copy of the archive when one sits next to the notebook;
# otherwise read it straight from the upgini GitHub repository.
if exists('train.csv.zip'):
    url = 'train.csv.zip'
else:
    url = 'https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip'

df = pd.read_csv(url)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [16]:
# Build the working frame in a single chain:
#   1. draw a reproducible 19,000-row sample (fixed random_state),
#   2. cast the store/item identifiers to strings,
#   3. parse the date column into datetimes,
#   4. order rows chronologically and renumber the index from 0.
df = (
    df.sample(n=19000, random_state=0)
    .astype({'store': str, 'item': str})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [17]:
# Chronological hold-out: rows dated before 2017-01-01 form the training set,
# rows dated on or after 2017-01-01 form the test set.
train = df.loc[df['date'] < '2017-01-01']
test = df.loc[df['date'] >= '2017-01-01']

In [18]:
# Separate the 'sales' target from the remaining columns for each split.
train_target = train['sales']
train_features = train.drop('sales', axis='columns')

test_target = test['sales']
test_features = test.drop('sales', axis='columns')

In [19]:
# Configure the enricher with 'date' as its only search key and the
# time_series CV type, then fit it on the training split while passing the
# hold-out split as an evaluation set.
enricher = FeaturesEnricher(
    search_keys={'date': SearchKey.DATE},
    cv=CVType.time_series,
)
enricher.fit(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
)

Detected task type: ModelTaskType.REGRESSION


Column name,Status,Description
target,All valid,All values in this column are good to go
date,All valid,All values in this column are good to go


Running search request with search_id=cc6a34ed-fcea-4608-aff4-ed5ab60d4ecd
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
28 relevant feature(s) found with the search keys: ['date'].[0m


Unnamed: 0,feature_name,shap_value,coverage %,type
0,item,0.487726,100.0,CHARACTER
1,store,0.172106,100.0,CHARACTER
2,f_weather_pca_0_94efd18d,0.056047,100.0,NUMERIC
3,f_week_sin1_a71d22f6,0.044632,100.0,NUMERIC
4,f_week_cos1_d3d56d7f,0.029552,100.0,NUMERIC
5,f_weather_umap_48_66a91289,0.025132,100.0,NUMERIC
6,f_weather_umap_24_409427e4,0.019315,100.0,NUMERIC
7,f_weather_umap_33_b9760f68,0.014638,100.0,NUMERIC
8,f_year_cos1_cd165f8c,0.012112,100.0,NUMERIC
9,f_dow_jones_89547e1d,0.007461,100.0,NUMERIC


In [20]:
# Silent, seeded CatBoost regressor that writes no files to disk; it is handed
# to the enricher as the estimator used for the metric calculation.
model = CatBoostRegressor(
    verbose=False,
    allow_writing_files=False,
    random_state=0,
)

# Ask the enricher for mean_absolute_percentage_error on the training split
# and on the hold-out evaluation set.
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
    estimator=model,
    scoring='mean_absolute_percentage_error',
)

Calculating metrics...
Done


Unnamed: 0,match_rate,baseline mean_absolute_percentage_error,enriched mean_absolute_percentage_error,uplift
,,,,
train,100.0,0.255844,0.16662,0.089224
eval 1,100.0,0.243877,0.13113,0.112746


In [21]:
# Run both splits through the fitted enricher with keep_input=True.
# The generator is consumed left to right, so the training split is
# transformed first and the test split second.
enriched_train_features, enriched_test_features = (
    enricher.transform(features, keep_input=True)
    for features in (train_features, test_features)
)
enriched_train_features.head()

90.39637% of the rows are fully duplicated


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=0b0e718f-ea48-4169-afc0-bae27a213fac
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done
90.36176% of the rows are fully duplicated


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=ebe317b4-cb9c-4299-948b-5177ac63126b
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Unnamed: 0,date,f_cbpol_pca_3_2e94b9bf,f_cbpol_umap_1_34dc2149,f_cbpol_umap_6_f175da9a,f_cpi_pca_5_db7798a3,f_dow_jones_7d_to_7d_1y_shift_9628c89b,f_dow_jones_89547e1d,f_finance_umap_3_424d51ca,f_italy_game_cnt_9cfcfe65,f_mlending_approve_score_d4c33397,...,f_weather_umap_34_39fc3e94,f_weather_umap_35_436c04a5,f_weather_umap_43_4e9820c4,f_weather_umap_45_b348f420,f_weather_umap_48_66a91289,f_week_cos1_d3d56d7f,f_week_sin1_a71d22f6,f_year_cos1_cd165f8c,item,store
0,2013-01-01,-0.323471,4.815701,1.367325,-8.943169,1.065267,13104.139648,7.647812,0,0.338412,...,5.664261,4.76773,5.079482,4.923654,4.540985,0.62349,0.781831,0.98522,5,7
1,2013-01-01,-0.323471,4.815701,1.367325,-8.943169,1.065267,13104.139648,7.647812,0,0.338412,...,5.664261,4.76773,5.079482,4.923654,4.540985,0.62349,0.781831,0.98522,9,4
2,2013-01-01,-0.323471,4.815701,1.367325,-8.943169,1.065267,13104.139648,7.647812,0,0.338412,...,5.664261,4.76773,5.079482,4.923654,4.540985,0.62349,0.781831,0.98522,33,1
3,2013-01-01,-0.323471,4.815701,1.367325,-8.943169,1.065267,13104.139648,7.647812,0,0.338412,...,5.664261,4.76773,5.079482,4.923654,4.540985,0.62349,0.781831,0.98522,41,3
4,2013-01-01,-0.323471,4.815701,1.367325,-8.943169,1.065267,13104.139648,7.647812,0,0.338412,...,5.664261,4.76773,5.079482,4.923654,4.540985,0.62349,0.781831,0.98522,24,5


In [22]:
# Baseline: fit on the original (non-enriched) features and score the
# hold-out predictions with SMAPE.
# NOTE(review): 'store' and 'item' were cast to str earlier but are not
# declared as cat_features here — confirm this is the intended baseline.
preds = model.fit(train_features, train_target).predict(test_features)
eval_metric(label=test_target.values, approx=preds, metric='SMAPE')

[37.65141857448004]

In [23]:
# Refit the same estimator on the enriched feature set and score its
# predictions for the enriched hold-out rows with SMAPE.
enriched_preds = model.fit(enriched_train_features, train_target).predict(enriched_test_features)
eval_metric(label=test_target.values, approx=enriched_preds, metric='SMAPE')

[14.504497540797917]