In [65]:
# Challange: https://www.kaggle.com/competitions/store-sales-time-series-forecasting

#Step 0: intall the required packages
!pip install pandas numpy pygments matplotlib seaborn scikit-learn
!pip install xgboost



# Step 1: Importing the required libraries
# Step 2: Load the data
# Step 3: Data Preprocessing
# Step 4: Merge the data
# Step 5: Check the data types of the merged data
# Step 6:  Prediction
# Step 7: Model Selection and Tuning
# Step 8: Model Evaluation by comparing the models using the best parameters
# Step 9: use the extra trees model to predict the sales for the test data


In [66]:
#Step 1: Importing the required libraries

# Data Analysis
import numpy as np
import pandas as pd
import datetime

# Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker 
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, r2_score, mean_absolute_error, mean_squared_log_error
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# For data preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, PowerTransformer

# For model selection and tuning
from sklearn.model_selection import GridSearchCV, cross_val_score

#for sample splitting
from sklearn.model_selection import train_test_split

In [67]:
# warnings
import warnings
warnings.filterwarnings('ignore')

#display formating
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)




In [68]:

#Step 2: Load the data

# read the data
holidays_events = pd.read_csv('/Users/judymac/Documents/1. To keep /C. Kaggle data/1. store-sales-time-series-forecasting/holidays_events.csv')
oil = pd.read_csv('/Users/judymac/Documents/1. To keep /C. Kaggle data/1. store-sales-time-series-forecasting/oil.csv')
stores = pd.read_csv('/Users/judymac/Documents/1. To keep /C. Kaggle data/1. store-sales-time-series-forecasting/stores.csv')
test = pd.read_csv('/Users/judymac/Documents/1. To keep /C. Kaggle data/1. store-sales-time-series-forecasting/test.csv')
train = pd.read_csv('/Users/judymac/Documents/1. To keep /C. Kaggle data/1. store-sales-time-series-forecasting/train.csv')
transactions = pd.read_csv('/Users/judymac/Documents/1. To keep /C. Kaggle data/1. store-sales-time-series-forecasting/transactions.csv')


# print the first 10 rows of the data
print(holidays_events.head(10))



         date     type    locale locale_name                    description  \
0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobamba   
5  2012-05-12  Holiday     Local        Puyo         Cantonizacion del Puyo   
6  2012-06-23  Holiday     Local    Guaranda      Cantonizacion de Guaranda   
7  2012-06-25  Holiday  Regional    Imbabura  Provincializacion de Imbabura   
8  2012-06-25  Holiday     Local   Latacunga     Cantonizacion de Latacunga   
9  2012-06-25  Holiday     Local     Machala           Fundacion de Machala   

   transferred  
0        False  
1        False  
2        False  
3        False  
4        False  
5        False  
6        Fa

In [69]:
#Step 3: Data Preprocessing

# check for missing values
print('holidays_events')
print(holidays_events.isnull().sum())
print('/n')
print('oil')
print(oil.isnull().sum())
print('/n')
print('stores')
print(stores.isnull().sum())
print('/n')
print('test')
print(test.isnull().sum())
print('/n')
print('train')
print(train.isnull().sum())
print('/n')
print('transactions')
print(transactions.isnull().sum())


holidays_events
date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64
/n
oil
date           0
dcoilwtico    43
dtype: int64
/n
stores
store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64
/n
test
id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64
/n
train
id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64
/n
transactions
date            0
store_nbr       0
transactions    0
dtype: int64


In [70]:
# check for duplicates
print('holidays_events')
print(holidays_events.duplicated().sum())
print('/n')
print('oil')
print(oil.duplicated().sum())
print('/n')
print('stores')
print(stores.duplicated().sum())
print('/n')
print('test')
print(test.duplicated().sum())
print('/n')
print('train')
print(train.duplicated().sum())
print('/n')
print('transactions')
print(transactions.duplicated().sum())

holidays_events
0
/n
oil
0
/n
stores
0
/n
test
0
/n
train
0
/n
transactions
0


In [71]:
# show null values in the data
print('oil null values')
print(oil[oil.isnull().any(axis=1)])



oil null values
            date  dcoilwtico
0     2013-01-01         NaN
14    2013-01-21         NaN
34    2013-02-18         NaN
63    2013-03-29         NaN
104   2013-05-27         NaN
132   2013-07-04         NaN
174   2013-09-02         NaN
237   2013-11-28         NaN
256   2013-12-25         NaN
261   2014-01-01         NaN
274   2014-01-20         NaN
294   2014-02-17         NaN
338   2014-04-18         NaN
364   2014-05-26         NaN
393   2014-07-04         NaN
434   2014-09-01         NaN
497   2014-11-27         NaN
517   2014-12-25         NaN
522   2015-01-01         NaN
534   2015-01-19         NaN
554   2015-02-16         NaN
588   2015-04-03         NaN
624   2015-05-25         NaN
653   2015-07-03         NaN
699   2015-09-07         NaN
757   2015-11-26         NaN
778   2015-12-25         NaN
783   2016-01-01         NaN
794   2016-01-18         NaN
814   2016-02-15         NaN
843   2016-03-25         NaN
889   2016-05-30         NaN
914   2016-07-04         Na

In [72]:
#replace missing values with the mean
imputer = SimpleImputer(strategy='mean')
oil['dcoilwtico'] = imputer.fit_transform(oil[['dcoilwtico']])
print(oil.isnull().sum())


date          0
dcoilwtico    0
dtype: int64


In [73]:
# Step 4: merge the data

# merge the train data and rename it to train_df, use inner to avoid null values
train_df = pd.merge(train, stores.rename(columns={'type': 'type_stores'}), how='inner', on='store_nbr')
train_df = pd.merge(train_df, oil.rename(columns={'type': 'type_oil'}), how='inner', on='date')
# train_df = pd.merge(train_df, transactions.rename(columns={'type': 'type_transacti
# no details about the transcactions data from the database (https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data?select=transactions.csv)
train_df = pd.merge(train_df, holidays_events.rename(columns={'type': 'type_holidays_events'}), how='inner', on='date')

#print the first 10 rows of the merged data
print(train_df.head(10))





   id        date  store_nbr        family  sales  onpromotion   city  \
0   0  2013-01-01          1    AUTOMOTIVE    0.0            0  Quito   
1   1  2013-01-01          1     BABY CARE    0.0            0  Quito   
2   2  2013-01-01          1        BEAUTY    0.0            0  Quito   
3   3  2013-01-01          1     BEVERAGES    0.0            0  Quito   
4   4  2013-01-01          1         BOOKS    0.0            0  Quito   
5   5  2013-01-01          1  BREAD/BAKERY    0.0            0  Quito   
6   6  2013-01-01          1   CELEBRATION    0.0            0  Quito   
7   7  2013-01-01          1      CLEANING    0.0            0  Quito   
8   8  2013-01-01          1         DAIRY    0.0            0  Quito   
9   9  2013-01-01          1          DELI    0.0            0  Quito   

       state type_stores  cluster  dcoilwtico type_holidays_events    locale  \
0  Pichincha           D       13   67.714366              Holiday  National   
1  Pichincha           D       13   

In [74]:
# merge the test data and rename it to test_df, use inner to avoid null values
test_df = pd.merge(test, stores.rename(columns={'type': 'type_stores'}), how='inner', on='store_nbr')
test_df = pd.merge(test_df, oil.rename(columns={'type': 'type_oil'}), how='inner', on='date')
# test_df = pd.merge(test_df, transactions.rename(columns={'type': 'type_transactions'}), how='inner', on=['date', 'store_nbr'])
test_df = pd.merge(test_df, holidays_events.rename(columns={'type': 'type_holidays_events'}), how='inner', on='date')

# #print the first 10 rows of the merged data
print(test_df.head(10))

        id        date  store_nbr        family  onpromotion   city  \
0  3015144  2017-08-24          1    AUTOMOTIVE            0  Quito   
1  3015145  2017-08-24          1     BABY CARE            0  Quito   
2  3015146  2017-08-24          1        BEAUTY            0  Quito   
3  3015147  2017-08-24          1     BEVERAGES           26  Quito   
4  3015148  2017-08-24          1         BOOKS            0  Quito   
5  3015149  2017-08-24          1  BREAD/BAKERY            1  Quito   
6  3015150  2017-08-24          1   CELEBRATION            0  Quito   
7  3015151  2017-08-24          1      CLEANING           12  Quito   
8  3015152  2017-08-24          1         DAIRY           13  Quito   
9  3015153  2017-08-24          1          DELI           11  Quito   

       state type_stores  cluster  dcoilwtico type_holidays_events locale  \
0  Pichincha           D       13       47.24              Holiday  Local   
1  Pichincha           D       13       47.24              Holid

In [75]:
# Step 5: check the data types of the merged data
# show the data types of the merged data
print('train_df data types')
print(train_df.dtypes)


train_df data types
id                        int64
date                     object
store_nbr                 int64
family                   object
sales                   float64
onpromotion               int64
city                     object
state                    object
type_stores              object
cluster                   int64
dcoilwtico              float64
type_holidays_events     object
locale                   object
locale_name              object
description              object
transferred                bool
dtype: object


In [76]:

print('test_df data types')
print(test_df.dtypes)

test_df data types
id                        int64
date                     object
store_nbr                 int64
family                   object
onpromotion               int64
city                     object
state                    object
type_stores              object
cluster                   int64
dcoilwtico              float64
type_holidays_events     object
locale                   object
locale_name              object
description              object
transferred                bool
dtype: object


In [77]:
# Show the size of train_df and test_df

print('train_df size')
print(train_df.shape)

print('test_df size')
print(test_df.shape)


train_df size
(352836, 16)
test_df size
(1782, 15)


In [78]:
#step 6: prediction



# create a dictionary of models, including all popular models like 'Linear', 'Ridge', 'Lasso', 'ElasticNet', 'Extra Tree', 'Gradient Boosting', 'XGradientBoosting', 'DecisionTree', 'KNeighbors'
models = {

    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Extra Tree': ExtraTreesRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGradientBoosting': XGBRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNeighbors': KNeighborsRegressor()

}





In [79]:
# see data types of the data

print('train_df data types')
print(train_df.dtypes)

print('test_df data types')
print(test_df.dtypes)


train_df data types
id                        int64
date                     object
store_nbr                 int64
family                   object
sales                   float64
onpromotion               int64
city                     object
state                    object
type_stores              object
cluster                   int64
dcoilwtico              float64
type_holidays_events     object
locale                   object
locale_name              object
description              object
transferred                bool
dtype: object
test_df data types
id                        int64
date                     object
store_nbr                 int64
family                   object
onpromotion               int64
city                     object
state                    object
type_stores              object
cluster                   int64
dcoilwtico              float64
type_holidays_events     object
locale                   object
locale_name              object
description        

In [80]:
# conconvet date to float
train_df['date'] = pd.to_datetime(train_df['date'])
train_df['date'] = train_df['date'].map(datetime.datetime.toordinal)

test_df['date'] = pd.to_datetime(test_df['date'])
test_df['date'] = test_df['date'].map(datetime.datetime.toordinal)

print(train_df['date'].head(10))

0    734869
1    734869
2    734869
3    734869
4    734869
5    734869
6    734869
7    734869
8    734869
9    734869
Name: date, dtype: int64


In [81]:
X_train = train_df.drop('sales', axis=1)
y_train = train_df['sales']

In [82]:
# get categorical columns for train_df and test_df
train_categorical_cols = X_train.select_dtypes(include=[object]).columns.tolist()


# get numerical columns for train_df and test_df, include int64 and float64
train_numerical_cols = X_train.select_dtypes(include=[np.int64, np.float64]).columns.tolist()

In [83]:
# use sk learn ColumnTransformer to transform the data
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# create a column transformer for the train data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), train_numerical_cols),
        ('cat', OneHotEncoder(), train_categorical_cols)
    ]
)


print(train_df.columns)

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'city',
       'state', 'type_stores', 'cluster', 'dcoilwtico', 'type_holidays_events',
       'locale', 'locale_name', 'description', 'transferred'],
      dtype='object')


    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Extra Tree': ExtraTreesRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGradientBoosting': XGBRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNeighbors': KNeighborsRegressor()    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Extra Tree': ExtraTreesRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGradientBoosting': XGBRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNeighbors': KNeighborsRegressor()

In [84]:

# print(X_train.head(10))
print(y_train.head(10))

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: sales, dtype: float64


In [85]:
#show null values in the data

print('train_df null values')
print(train_df.isnull().sum())


train_df null values
id                      0
date                    0
store_nbr               0
family                  0
sales                   0
onpromotion             0
city                    0
state                   0
type_stores             0
cluster                 0
dcoilwtico              0
type_holidays_events    0
locale                  0
locale_name             0
description             0
transferred             0
dtype: int64


In [86]:
# rename the train_df to train_full_df

train_full_df = train_df.copy()

# sample 10% of the train_full_df as train_df
train_df = train_full_df.sample(frac=0.1, random_state=1)

In [87]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Assuming 'sales' is the target variable and train_df is your original data
X_train_orig = train_df.drop('sales', axis=1)
y_train = train_df['sales']

# Preprocess the training data
X_train = preprocessor.fit_transform(X_train_orig)

#step 7: Model Selection and Tuning

#step 7.1 Ridge Regression

# Initialize a Ridge regressor
ridge = Ridge()

# Define the grid of values for alpha
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Use GridSearchCV to find the best alpha value
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)  # 5-fold cross-validation
ridge_cv.fit(X_train, y_train)

# Print the best alpha value
print("Best alpha: ", ridge_cv.best_params_)
print("Best score: ", ridge_cv.best_score_)
print("Best estimator: ", ridge_cv.best_estimator_)
print("Best index: ", ridge_cv.best_index_)




Best alpha:  {'alpha': 10}
Best score:  0.5085249054912283
Best estimator:  Ridge(alpha=10)
Best index:  4


In [88]:
# 7.2 Lasso Regression

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Initialize a Lasso regressor
lasso = Lasso()

# Define a simplified grid of values for alpha
param_grid_lasso = {
    'alpha': [0.01, 0.1, 1, 10, 100]  # Reduced range of alpha values
}

# Use GridSearchCV to find the best alpha value
lasso_cv = GridSearchCV(lasso, param_grid_lasso, cv=5, n_jobs=-1)  # 5-fold cross-validation and parallel processing
lasso_cv.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for Lasso: ", lasso_cv.best_params_)
print("Best score for Lasso: ", lasso_cv.best_score_)


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Best parameters for Lasso:  {'alpha': 0.1}
Best score for Lasso:  0.5086569616946288


In [89]:

# 7.3 ElasticNet Regression

from sklearn.linear_model import ElasticNet

# Initialize an ElasticNet regressor
elastic_net = ElasticNet()

# Define the grid of values for alpha and l1_ratio
param_grid_elastic_net = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1]  # l1_ratio is the mix ratio between Lasso and Ridge
}

# Use GridSearchCV to find the best alpha and l1_ratio values
elastic_net_cv = GridSearchCV(elastic_net, param_grid_elastic_net, cv=5, n_jobs=-1)  # 5-fold cross-validation and parallel processing
elastic_net_cv.fit(X_train, y_train)

# Print the best parameters
print("Best parameters for ElasticNet: ", elastic_net_cv.best_params_)
print("Best score for ElasticNet: ", elastic_net_cv.best_score_)



  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Best parameters for ElasticNet:  {'alpha': 0.1, 'l1_ratio': 1}
Best score for ElasticNet:  0.5086569616946288


In [90]:
# 7.4 ExtraTreesRegressor

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

# Initialize ExtraTreesRegressor
extra_trees = ExtraTreesRegressor()

# Define the hyperparameter grid
param_grid_extra_trees = {
    'n_estimators': [50, 100],        # Number of trees in the forest
    'max_features': ['sqrt'],         # Number of features to consider at each split
    'max_depth': [10, 20],            # Maximum depth of the tree
    'min_samples_split': [2, 5],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]        # Minimum number of samples required to be at a leaf node
}

# Perform grid search with cross-validation
extra_trees_cv = GridSearchCV(extra_trees, param_grid_extra_trees, cv=5, n_jobs=-1)
extra_trees_cv.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for ExtraTreesRegressor: ", extra_trees_cv.best_params_)
print("Best score for ExtraTreesRegressor: ", extra_trees_cv.best_score_)


Best parameters for ExtraTreesRegressor:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best score for ExtraTreesRegressor:  0.6429973935685289


In [91]:
# 7.5 GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Initialize GradientBoostingRegressor
gbr = GradientBoostingRegressor()

# Define the hyperparameter grid
param_grid_gbr = {
    'n_estimators': [50, 100],        # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1],     # Step size shrinkage used in update to prevent overfitting
    'max_depth': [3, 5],              # Maximum depth of the individual regression estimators
    'min_samples_split': [2, 5],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]        # Minimum number of samples required to be at a leaf node
}

# Perform grid search with cross-validation
gbr_cv = GridSearchCV(gbr, param_grid_gbr, cv=5, n_jobs=-1)
gbr_cv.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for GradientBoostingRegressor: ", gbr_cv.best_params_)
print("Best score for GradientBoostingRegressor: ", gbr_cv.best_score_)


Best parameters for GradientBoostingRegressor:  {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best score for GradientBoostingRegressor:  0.702108281843524


In [92]:
# 7.6 XGBRegressor

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Initialize XGBRegressor
xgb = XGBRegressor()

# Define the hyperparameter grid
param_grid_xgb = {
    'n_estimators': [50, 100],        # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1],     # Step size shrinkage used in update to prevent overfitting
    'max_depth': [3, 5],              # Maximum depth of the individual regression estimators
    'min_child_weight': [1, 2]        # Minimum sum of instance weight needed in a child
}

# Perform grid search with cross-validation
xgb_cv = GridSearchCV(xgb, param_grid_xgb, cv=5, n_jobs=-1)
xgb_cv.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for XGBRegressor: ", xgb_cv.best_params_)
print("Best score for XGBRegressor: ", xgb_cv.best_score_)


Best parameters for XGBRegressor:  {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 100}
Best score for XGBRegressor:  0.705338934341343


In [93]:
#7.7 DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Initialize DecisionTreeRegressor
decision_tree = DecisionTreeRegressor()

# Define the hyperparameter grid
param_grid_decision_tree = {
    'max_depth': [3, 5],              # Maximum depth of the tree
    'min_samples_split': [2, 5],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]        # Minimum number of samples required to be at a leaf node
}

# Perform grid search with cross-validation
decision_tree_cv = GridSearchCV(decision_tree, param_grid_decision_tree, cv=5, n_jobs=-1)
decision_tree_cv.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for DecisionTreeRegressor: ", decision_tree_cv.best_params_)
print("Best score for DecisionTreeRegressor: ", decision_tree_cv.best_score_)


Best parameters for DecisionTreeRegressor:  {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best score for DecisionTreeRegressor:  0.491075122048448


In [94]:
# 7.8 KNeighborsRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Initialize KNeighborsRegressor
knn = KNeighborsRegressor()

# Define the hyperparameter grid
param_grid_knn = {
    'n_neighbors': [3, 5, 7],          # Number of neighbors to use
    'weights': ['uniform', 'distance']  # Weight function used in prediction
}

# Perform grid search with cross-validation
knn_cv = GridSearchCV(knn, param_grid_knn, cv=5, n_jobs=-1)

knn_cv.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for KNeighborsRegressor: ", knn_cv.best_params_)
print("Best score for KNeighborsRegressor: ", knn_cv.best_score_)



Best parameters for KNeighborsRegressor:  {'n_neighbors': 7, 'weights': 'distance'}
Best score for KNeighborsRegressor:  0.378755980547664


In [95]:
# print the best parameters and best score for all the models

print("Best parameters for Ridge: ", ridge_cv.best_params_)
print("Best score for Ridge: ", ridge_cv.best_score_)
print("Best parameters for Lasso: ", lasso_cv.best_params_)
print("Best score for Lasso: ", lasso_cv.best_score_)
print("Best parameters for ElasticNet: ", elastic_net_cv.best_params_)
print("Best score for ElasticNet: ", elastic_net_cv.best_score_)
print("Best parameters for ExtraTreesRegressor: ", extra_trees_cv.best_params_)
print("Best score for ExtraTreesRegressor: ", extra_trees_cv.best_score_)
print("Best parameters for GradientBoostingRegressor: ", gbr_cv.best_params_)
print("Best score for GradientBoostingRegressor: ", gbr_cv.best_score_)
print("Best parameters for XGBRegressor: ", xgb_cv.best_params_)
print("Best score for XGBRegressor: ", xgb_cv.best_score_)
print("Best parameters for DecisionTreeRegressor: ", decision_tree_cv.best_params_)
print("Best score for DecisionTreeRegressor: ", decision_tree_cv.best_score_)
print("Best parameters for KNeighborsRegressor: ", knn_cv.best_params_)
print("Best score for KNeighborsRegressor: ", knn_cv.best_score_)



Best parameters for Ridge:  {'alpha': 10}
Best score for Ridge:  0.5085249054912283
Best parameters for Lasso:  {'alpha': 0.1}
Best score for Lasso:  0.5086569616946288
Best parameters for ElasticNet:  {'alpha': 0.1, 'l1_ratio': 1}
Best score for ElasticNet:  0.5086569616946288
Best parameters for ExtraTreesRegressor:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best score for ExtraTreesRegressor:  0.6429973935685289
Best parameters for GradientBoostingRegressor:  {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best score for GradientBoostingRegressor:  0.702108281843524
Best parameters for XGBRegressor:  {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 100}
Best score for XGBRegressor:  0.705338934341343
Best parameters for DecisionTreeRegressor:  {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best score for DecisionT

In [96]:
# Step 8 Model Evaluation by comparing the models using the best parameters

# 8.1 Create a dictionary of models with the best parameters

models = {
    'Ridge': Ridge(alpha=0.01),
    'Lasso': Lasso(alpha=0.01),
    'ElasticNet': ElasticNet(alpha=0.01, l1_ratio=0.1),
    'Extra Trees': ExtraTreesRegressor(max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=100),
    'Gradient Boosting': GradientBoostingRegressor(learning_rate=0.1, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
    'XGBoost': XGBRegressor(learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100),
    'Decision Tree': DecisionTreeRegressor(max_depth=5, min_samples_leaf=1, min_samples_split=2),
    'KNeighbors': KNeighborsRegressor(n_neighbors=7, weights='distance')
}


In [97]:
# 8.2 use another 10% of the data to evaluate the models and rename the database as evaluation_df

evaluation_df = train_full_df.sample(frac=0.1, random_state=2)


In [98]:
# 8.3 show the evaluation_df type
print('evaluation_df data types')
print(evaluation_df.dtypes)

# show the train_df type

print('train_df data types')
print(train_df.dtypes)

evaluation_df data types
id                        int64
date                      int64
store_nbr                 int64
family                   object
sales                   float64
onpromotion               int64
city                     object
state                    object
type_stores              object
cluster                   int64
dcoilwtico              float64
type_holidays_events     object
locale                   object
locale_name              object
description              object
transferred                bool
dtype: object
train_df data types
id                        int64
date                      int64
store_nbr                 int64
family                   object
sales                   float64
onpromotion               int64
city                     object
state                    object
type_stores              object
cluster                   int64
dcoilwtico              float64
type_holidays_events     object
locale                   object
locale_name  

In [101]:
# use the evaluation_df to evaluate the models to get the r2, rmse, mse, and mae. Show the result with best R2 score

for name, model in models.items():
    # Preprocess the evaluation data
    X_eval = evaluation_df.drop('sales', axis=1)
    y_eval = evaluation_df['sales']
    X_eval = preprocessor.transform(X_eval)
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_eval)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_eval, y_pred)
    rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
    mse = mean_squared_error(y_eval, y_pred)
    mae = mean_absolute_error(y_eval, y_pred)
    
    # Print the metrics
    print(f"Model: {name}")
    print(f"R^2: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"MAE: {mae:.2f}")
    print("")

    

    

Model: Ridge
R^2: 0.5921
RMSE: 725.69
MSE: 526619.47
MAE: 315.27

Model: Lasso
R^2: 0.5921
RMSE: 725.67
MSE: 526592.07
MAE: 315.03

Model: ElasticNet
R^2: 0.5717
RMSE: 743.58
MSE: 552910.89
MAE: 297.15

Model: Extra Trees
R^2: 0.7814
RMSE: 531.19
MSE: 282163.46
MAE: 209.50

Model: Gradient Boosting
R^2: 0.8492
RMSE: 441.17
MSE: 194635.32
MAE: 152.79

Model: XGBoost
R^2: 0.8320
RMSE: 465.72
MSE: 216893.53
MAE: 154.93

Model: Decision Tree
R^2: 0.6705
RMSE: 652.20
MSE: 425359.72
MAE: 247.09

Model: KNeighbors
R^2: 0.5293
RMSE: 779.52
MSE: 607648.64
MAE: 296.70



In [102]:
# Step 9: use the extra trees model to predict the sales for the test data

# Preprocess the test data
X_test = test_df
X_test = preprocessor.transform(X_test)

# Fit the Extra Trees model
extra_trees = ExtraTreesRegressor(max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=100)
extra_trees.fit(X_train, y_train)

# Predict the sales
y_pred_test = extra_trees.predict(X_test)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({'id': test_df['id'], 'sales': y_pred_test})

# Save the predictions to a CSV file
predictions_df.to_csv('store_sales_time_series_forecasting_predictions.csv', index=False)

# Display the first few rows of the predictions

print(predictions_df.head())


        id        sales
0  3015144   150.585198
1  3015145   114.335088
2  3015146   118.413302
3  3015147  3304.529703
4  3015148   185.077539
