# Final Project Assignment
---
Complete the Kaggle "Store Sales - Time Series Forecasting" getting-started competition:
https://www.kaggle.com/competitions/store-sales-time-series-forecasting/

## Problem statement
text here

## Problem solution

In [1]:
#import appropriate modules
import pandas as pd
import seaborn as sns
import numpy as np
sns.set()

In [2]:
#read data
import glob
from pathlib import Path

# Load every CSV in the competition folder into a dict keyed by the file's
# base name without extension (e.g. 'train', 'test', 'stores').
# Path(...).stem is used instead of splitting on '/' so that the key is still
# the bare file name when glob returns paths with a different separator.
dfs = dict()

for df_name in glob.glob('./Data/store-sales-time-series-forecasting/*.csv'):
    print(df_name)
    dfs[Path(df_name).stem] = pd.read_csv(df_name)

./Data/store-sales-time-series-forecasting/submission.csv
./Data/store-sales-time-series-forecasting/test.csv
./Data/store-sales-time-series-forecasting/oil.csv
./Data/store-sales-time-series-forecasting/transactions.csv
./Data/store-sales-time-series-forecasting/sample_submission.csv
./Data/store-sales-time-series-forecasting/train.csv
./Data/store-sales-time-series-forecasting/holidays_events.csv
./Data/store-sales-time-series-forecasting/stores.csv


### Basic Data Preprocessing
text of explanation of basic transformations

In [3]:
#merge dfs
# Stack the train rows followed by the test rows under a fresh 0..n-1 index,
# then discard the 'id' column.
train_and_test = [dfs['train'], dfs['test']]
full_df = pd.concat(train_and_test, ignore_index=True).drop(columns=['id'])

In [4]:
#work with date features
# Parse the 'date' strings, expand them into calendar components, and drop the
# raw column afterwards.
# No explicit `format` is passed: the previous '%Y-%m-%d %H:%M:%S' requires a
# time-of-day part, and pandas >= 2.0 enforces the format strictly, so
# date-only strings would raise ValueError.
# NOTE(review): assumes all 'date' strings share one consistent layout - confirm.
parsed_date = pd.to_datetime(full_df['date'])

full_df = full_df.assign(
    year=parsed_date.dt.year,
    month=parsed_date.dt.month,
    day_of_month=parsed_date.dt.day,
    day_of_week=parsed_date.dt.day_of_week,
).drop(columns=['date'])

In [5]:
#encode categorical columns
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

# Columns replaced by their ordinal codes. The assignment target below uses
# the same list as the encoder input, so adding a column here is enough;
# previously the target was hard-coded to 'family'.
features_to_encode = ['family']
full_df[features_to_encode] = enc.fit_transform(full_df[features_to_encode])

### Baseline Model Training

In [6]:
#define X and y
# full_df holds the train rows first and the test rows after them, so split at
# the train/test boundary. Slicing from the boundary (rather than with a
# negative test-length offset) also behaves correctly when the test frame is
# empty: iloc[-0:] would return the whole frame.
n_train_rows = dfs['train'].shape[0]
df_local = full_df.iloc[:n_train_rows]
df_kaggle = full_df.iloc[n_train_rows:]

# Features / target for local training and evaluation.
X = df_local.drop(columns=['sales'])
y = df_local['sales']
# Features of the rows to predict for the Kaggle submission.
X_kaggle_test = df_kaggle.drop(columns=['sales'])

In [7]:
#split data
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit()

# Walk through every split; after the loop train_idx / test_idx hold the
# index arrays of the final one.
for train_idx, test_idx in tscv.split(X):
    pass

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [8]:
#define unordered categorical features
cols = list(X.columns)
unord_cat_cols = ['store_nbr', 'family']
# One boolean per feature column, in column order: True where the column is
# one of the unordered categorical features.
cat_mask = X.columns.isin(unord_cat_cols).tolist()

In [9]:
#create baseline model
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor

# Baseline: library-default hyperparameters, with the categorical columns
# flagged through cat_mask and a fixed seed. fit() returns the estimator
# itself, so construction and training are chained.
hgbr = HistGradientBoostingRegressor(
    categorical_features=cat_mask,
    random_state=43,
).fit(X_train, y_train)

In [10]:
#test results
from sklearn.metrics import mean_squared_log_error

# The metric takes logarithms, so negative predictions are clamped to 0 first.
y_pred = np.clip(hgbr.predict(X_test), 0, None)

# RMSLE = sqrt(MSLE). The root is taken explicitly because the `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
round(float(np.sqrt(mean_squared_log_error(y_test, y_pred))), ndigits=3)

1.069

### Exploratory analysis and further feature generation

In [11]:
#code here

### Hyperparameter tuning

In [12]:
# List every parameter of the current `hgbr` estimator, including the ones
# left at their defaults.
hgbr.get_params()

{'categorical_features': [True, True, False, False, False, False, False],
 'early_stopping': 'auto',
 'l2_regularization': 0.0,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_bins': 255,
 'max_depth': None,
 'max_iter': 100,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'monotonic_cst': None,
 'n_iter_no_change': 10,
 'quantile': None,
 'random_state': 43,
 'scoring': 'loss',
 'tol': 1e-07,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

## Dumb (manual) grid search on the last TimeSeriesSplit fold (train ≈ 5/6 of X)

In [13]:
from itertools import product

from sklearn.metrics import mean_squared_error

# Values tried for each hyperparameter (2 x 2 x 2 = 8 combinations).
param_grid = {
    'learning_rate' : np.linspace(0.01, 0.1, 2),
    'max_iter' : range(100, 200, 50),
    'max_leaf_nodes' : range(30, 50, 10)
}

# Start from the already-fitted `hgbr`: a grid candidate replaces it only if
# it achieves a strictly lower RMSE on the held-out fold.
best_hgbr = hgbr

y_pred = np.clip(hgbr.predict(X_test), 0, None)

# RMSE is computed as sqrt(MSE) because the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
best_score = np.sqrt(mean_squared_error(y_test, y_pred))

best_params = {
    'learning_rate' : hgbr.learning_rate,
    'max_iter' : hgbr.max_iter,
    'max_leaf_nodes' : hgbr.max_leaf_nodes
}

# product() visits the combinations in the same order as three nested loops.
for learning_rate, max_iter, max_leaf_nodes in product(
    param_grid['learning_rate'],
    param_grid['max_iter'],
    param_grid['max_leaf_nodes'],
):
    # A loop-local name is used so the global `hgbr` is NOT rebound to
    # whichever candidate happens to be fitted last; later cells that
    # reference `hgbr` keep seeing the same estimator as before this cell.
    candidate = HistGradientBoostingRegressor(
        learning_rate=learning_rate,
        max_iter=max_iter,
        max_leaf_nodes=max_leaf_nodes,
        categorical_features=cat_mask,
        random_state=43,
    )
    candidate.fit(X_train, y_train)
    y_pred = np.clip(candidate.predict(X_test), 0, None)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    if score < best_score:
        best_score = score
        best_hgbr = candidate
        best_params = {
            'learning_rate' : learning_rate,
            'max_iter' : max_iter,
            'max_leaf_nodes' : max_leaf_nodes
        }

print("Best score:", best_score)
print("Best parametors:", best_params)

Best score: 356.2861272003729
Best parametors: {'learning_rate': 0.1, 'max_iter': 100, 'max_leaf_nodes': 40}


## Generator GridSearch on the last TimeSeriesSplit fold (train ≈ 5/6 of X)

In [14]:
from sklearn.model_selection import GridSearchCV

# Values tried for each hyperparameter (2 x 2 x 2 = 8 combinations).
param_grid = {
    'learning_rate' : np.linspace(0.01, 0.1, 2),
    'max_iter' : range(100, 200, 50),
    'max_leaf_nodes' : range(30, 50, 10)
}

def cv_gen():
    """Yield a single (train indices, test indices) pair: the train_idx / test_idx split defined earlier."""
    yield train_idx, test_idx

# refit=False: the search only ranks the candidates; the winning parameters
# are refitted by hand on X_train below.
gs = GridSearchCV(estimator=hgbr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv_gen(), verbose=3, refit=False)

gs.fit(X, y)

best_params = gs.best_params_

new_best_hgbr = HistGradientBoostingRegressor(categorical_features=cat_mask, random_state=43, **best_params)

new_best_hgbr.fit(X_train, y_train)
# The metric takes logarithms, so negative predictions are clamped to 0 first.
y_pred = np.clip(new_best_hgbr.predict(X_test), 0, None)

# RMSLE = sqrt(MSLE). The root is taken explicitly because the `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, y_pred))

Fitting 1 folds for each of 8 candidates, totalling 8 fits
[CV 1/1] END learning_rate=0.01, max_iter=100, max_leaf_nodes=30;, score=-592977.431 total time=   9.3s
[CV 1/1] END learning_rate=0.01, max_iter=100, max_leaf_nodes=40;, score=-570178.178 total time=  10.5s
[CV 1/1] END learning_rate=0.01, max_iter=150, max_leaf_nodes=30;, score=-401071.602 total time=  13.2s
[CV 1/1] END learning_rate=0.01, max_iter=150, max_leaf_nodes=40;, score=-378875.166 total time=  15.2s
[CV 1/1] END learning_rate=0.1, max_iter=100, max_leaf_nodes=30;, score=-127375.640 total time=   8.1s
[CV 1/1] END learning_rate=0.1, max_iter=100, max_leaf_nodes=40;, score=-126947.616 total time=   8.9s
[CV 1/1] END learning_rate=0.1, max_iter=150, max_leaf_nodes=30;, score=-128205.964 total time=  13.2s
[CV 1/1] END learning_rate=0.1, max_iter=150, max_leaf_nodes=40;, score=-129180.719 total time=  12.2s


1.0788717348828145

In [15]:
# Show the parameter combination selected by the search fitted in the previous cell.
gs.best_params_

{'learning_rate': 0.1, 'max_iter': 100, 'max_leaf_nodes': 40}

In [16]:
from sklearn.model_selection import GridSearchCV

# Values tried for each hyperparameter (2 x 2 x 2 = 8 combinations).
param_grid = {
    'learning_rate' : np.linspace(0.01, 0.1, 2),
    'max_iter' : range(100, 200, 50),
    'max_leaf_nodes' : range(30, 50, 10)
}

# Same grid as before, but scored on every split produced by `tscv` instead of
# a single fold. refit=False: the search only ranks the candidates; the
# winning parameters are refitted by hand on X_train below.
gs = GridSearchCV(estimator=hgbr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=tscv, verbose=3, refit=False)

gs.fit(X, y)

best_params = gs.best_params_

new_best_hgbr = HistGradientBoostingRegressor(categorical_features=cat_mask, random_state=43, **best_params)

new_best_hgbr.fit(X_train, y_train)
# The metric takes logarithms, so negative predictions are clamped to 0 first.
y_pred = np.clip(new_best_hgbr.predict(X_test), 0, None)

# RMSLE = sqrt(MSLE). The root is taken explicitly because the `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, y_pred))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=30;, score=-313232.959 total time=   4.1s
[CV 2/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=30;, score=-365041.595 total time=   4.3s
[CV 3/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=30;, score=-546964.301 total time=   6.0s
[CV 4/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=30;, score=-535092.081 total time=   7.1s
[CV 5/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=30;, score=-592977.431 total time=   9.5s
[CV 1/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=40;, score=-309641.201 total time=   3.4s
[CV 2/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=40;, score=-348973.748 total time=   4.6s
[CV 3/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=40;, score=-523787.733 total time=   7.4s
[CV 4/5] END learning_rate=0.01, max_iter=100, max_leaf_nodes=40;, score=-504991.631 total time=   8.2s
[CV 

1.0554867495168694

In [17]:
# Show the parameter combination selected by the search fitted in the previous cell.
gs.best_params_

{'learning_rate': 0.1, 'max_iter': 150, 'max_leaf_nodes': 30}

### Kaggle prediction evaluation

In [14]:
#for Kaggle
# Predict sales for the Kaggle test rows, zero out negative predictions, store
# them in the 'sales' column of the sample submission frame and write that
# frame to 'submission.csv' without the index.
# NOTE(review): `hgbr` is whichever estimator that name was last bound to by
# the cells above - not necessarily the tuned `new_best_hgbr`. Confirm which
# model the submission is meant to use.
y_kaggle_pred = hgbr.predict(X_kaggle_test)
np.putmask(y_kaggle_pred, y_kaggle_pred < 0, 0)  # in place: negatives -> 0
dfs['sample_submission']['sales'] = y_kaggle_pred
dfs['sample_submission'].to_csv('submission.csv', index=False)

## Wrap up and results analysis
text here