In [1]:
import pandas as pd
import warnings
# Install a global "ignore" filter so no Python warnings are shown in the cells below.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used later.
warnings.filterwarnings('ignore')

In [2]:
# Read the sales table from the CSV next to the notebook and preview its first rows.
sales_csv = 'sales.csv'
df = pd.read_csv(sales_csv)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [3]:
# Size of the loaded frame as a (rows, columns) tuple.
df.shape

(913000, 4)

In [5]:
# Print the per-column non-null counts and dtypes plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    913000 non-null  object
 1   store   913000 non-null  int64 
 2   item    913000 non-null  int64 
 3   sales   913000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 27.9+ MB


In [6]:
# Work on a reproducible 19000-row sample, treat the store/item ids as strings,
# parse the dates, and order the rows chronologically with a fresh 0..n-1 index.
df = (
    df.sample(n=19000, random_state=0)  # Taking a sample of 19000
    .astype({'store': str, 'item': str})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values("date")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [7]:
# Chronological hold-out: rows dated before the split date form the training set,
# rows on or after it form the test set.
split_date = '2017-01-01'
is_before_split = df['date'] < split_date
is_on_or_after_split = df['date'] >= split_date
train = df.loc[is_before_split]
test = df.loc[is_on_or_after_split]

In [8]:
# Separate the 'sales' target from the remaining columns for both splits.
target_col = 'sales'
train_features = train.drop(target_col, axis=1)
train_target = train[target_col]
test_features = test.drop(target_col, axis=1)
test_target = test[target_col]

<b>Enrich features</b>

In [9]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

In [10]:
# Build the enricher with the 'date' column registered as its only search key
# and cv set to CVType.time_series, then fit it on the training split while
# passing the later test split as the evaluation set.
date_search_keys = {'date': SearchKey.DATE}
enricher = FeaturesEnricher(search_keys=date_search_keys, cv=CVType.time_series)

holdout_eval_set = [(test_features, test_target)]
enricher.fit(train_features, train_target, eval_set=holdout_eval_set)

Detected task type: ModelTaskType.REGRESSION


Column name,Status,Description
date,All valid,All values in this column are good to go
target,All valid,All values in this column are good to go


Running search request with search_id=713a613b-e8f4-454b-a458-f5f6858d7bc2
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
33 relevant feature(s) found with the search keys: ['date'][0m


Unnamed: 0,feature_name,shap_value,coverage %,type
0,item,0.488097,100.0,CHARACTER
1,store,0.172502,100.0,CHARACTER
2,f_weather_pca_0_94efd18d,0.056469,100.0,NUMERIC
3,f_week_sin1_a71d22f6,0.044848,100.0,NUMERIC
4,f_week_cos1_d3d56d7f,0.029807,100.0,NUMERIC
5,f_weather_umap_48_66a91289,0.025439,100.0,NUMERIC
6,f_weather_umap_24_409427e4,0.018721,100.0,NUMERIC
7,f_weather_umap_33_b9760f68,0.014879,100.0,NUMERIC
8,f_year_cos1_cd165f8c,0.011822,100.0,NUMERIC
9,f_dow_jones_89547e1d,0.007005,100.0,NUMERIC


In [11]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# Quiet, seeded CatBoost regressor that writes no files to disk; it is handed to
# the enricher as the estimator used to score the features with MAPE.
model = CatBoostRegressor(random_state=0, verbose=False, allow_writing_files=False)

metrics_options = {
    'eval_set': [(test_features, test_target)],
    'estimator': model,
    'scoring': 'mean_absolute_percentage_error',
}
enricher.calculate_metrics(train_features, train_target, **metrics_options)

Start calculating metrics
Done


Unnamed: 0,match_rate,baseline mean_absolute_percentage_error,enriched mean_absolute_percentage_error,uplift
,,,,
train,100.0,0.255844,0.170663,0.085181
eval 1,100.0,0.243877,0.131175,0.112701


In [12]:
# Enrich both splits with the fitted enricher; keep_input=True is passed to each
# call (the preview below shows date/store/item alongside the added f_* columns).
transform_options = {'keep_input': True}
enriched_train_features = enricher.transform(train_features, **transform_options)
enriched_test_features = enricher.transform(test_features, **transform_options)
enriched_train_features.head()

90.39637% of the rows are fully duplicated


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=5d0352ca-4700-44dc-8409-3214696e02fd
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Executing transform step
Done
90.36176% of the rows are fully duplicated


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=7596d9e0-b4a5-423d-ab65-9dcd4eb7f713
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Executing transform step
Done


Unnamed: 0,date,store,item,f_weather_pca_0_94efd18d,f_week_sin1_a71d22f6,f_week_cos1_d3d56d7f,f_weather_umap_48_66a91289,f_weather_umap_24_409427e4,f_weather_umap_33_b9760f68,f_year_cos1_cd165f8c,...,f_weather_umap_45_b348f420,f_c2c_fraud_score_5028232e,f_mlending_approve_score_d4c33397,f_silver_7d_to_7d_1y_shift_ccbd2abf,f_cbpol_umap_6_f175da9a,f_cbpol_pca_3_2e94b9bf,f_cpi_umap_4_83d3b2a6,f_transaction_fraud_union_score_c1a2808b,f_bank_approve_score_f3797f4b,f_gold_7d_to_1y_1df66550
0,2013-01-01,7,5,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,4.923654,0.285548,0.338412,1.072025,1.367325,-0.323471,10.153208,0.057741,0.332893,0.994608
1,2013-01-01,4,9,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,4.923654,0.285548,0.338412,1.072025,1.367325,-0.323471,10.153208,0.057741,0.332893,0.994608
2,2013-01-01,1,33,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,4.923654,0.285548,0.338412,1.072025,1.367325,-0.323471,10.153208,0.057741,0.332893,0.994608
3,2013-01-01,3,41,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,4.923654,0.285548,0.338412,1.072025,1.367325,-0.323471,10.153208,0.057741,0.332893,0.994608
4,2013-01-01,5,24,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,4.923654,0.285548,0.338412,1.072025,1.367325,-0.323471,10.153208,0.057741,0.332893,0.994608


In [13]:
# Baseline: train on the original (non-enriched) features and score the test
# split with SMAPE.
preds = model.fit(train_features, train_target).predict(test_features)
actual_sales = test_target.values
eval_metric(actual_sales, preds, 'SMAPE')

[37.65141857448004]

In [14]:
# Enriched run: refit the same regressor on the enriched training features and
# score the enriched test split with SMAPE for comparison with the baseline.
enriched_preds = model.fit(enriched_train_features, train_target).predict(enriched_test_features)
actual_sales = test_target.values
eval_metric(actual_sales, enriched_preds, 'SMAPE')

[14.520546401124]