# Task 2 - Prediction of store sales
- Preprocessing
- Building models with sklearn pipelines
- Choosing a loss function
- Post-prediction analysis
- Serializing models
- Building a model with deep learning


In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

import os
import sys
sys.path.append(os.path.abspath(os.path.join('..','src')))
from eda import EDA
from utility import ANALYSIS
from model_utility import PREDICTION


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Initialize the project's PREDICTION helper from 'train_store.csv'.
# NOTE(review): the path is given relative to the working directory; confirm
# in model_utility whether PREDICTION resolves it against a data folder.
pred = PREDICTION('train_store.csv')

In [4]:
# New features: ask the helper to build its engineered features, then fetch
# the frame again so `train` reflects them and print its schema.
# NOTE(review): presumably new_features() updates the helper's internal
# DataFrame in place (the frame is re-fetched right after) — confirm in
# model_utility.
pred.new_features()
train = pred.get_dataframe()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 25 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Unnamed: 0                 1017209 non-null  object        
 1   Store                      1017209 non-null  object        
 2   StoreType                  1017209 non-null  object        
 3   Assortment                 1017209 non-null  object        
 4   CompetitionDistance        1017209 non-null  object        
 5   CompetitionOpenSinceMonth  1017209 non-null  object        
 6   CompetitionOpenSinceYear   1017209 non-null  object        
 7   Promo2                     1017209 non-null  object        
 8   Promo2SinceWeek            1017209 non-null  object        
 9   Promo2SinceYear            1017209 non-null  object        
 10  PromoInterval              1017209 non-null  object        
 11  DayOfWeek                  1017209 no

In [5]:
# Change the column data types through the helper, then re-fetch the frame
# and print its schema to check the result.
# NOTE(review): in the saved output, info() reports 30 columns here versus 25
# in the previous cell, so this step apparently also adds/replaces columns
# (e.g. the StateHoliday_a/b/c columns selected in the next cell) — confirm
# in model_utility.
pred.change_dataType()
train = pred.get_dataframe()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 30 columns):
 #   Column                          Non-Null Count    Dtype         
---  ------                          --------------    -----         
 0   Store                           1017209 non-null  object        
 1   Assortment                      1017209 non-null  int32         
 2   CompetitionDistance             1017209 non-null  object        
 3   CompetitionOpenSinceMonth       1017209 non-null  object        
 4   CompetitionOpenSinceYear        1017209 non-null  object        
 5   Promo2                          1017209 non-null  object        
 6   Promo2SinceWeek                 1017209 non-null  object        
 7   Promo2SinceYear                 1017209 non-null  object        
 8   DayOfWeek                       1017209 non-null  object        
 9   Date                            1017209 non-null  datetime64[ns]
 10  Sales                           1017209 no

In [6]:
# Quick sanity check of the holiday-related columns.
holiday_cols = [
    'StateHoliday_a',
    'StateHoliday_b',
    'StateHoliday_c',
    'nextHolidayDays',
    'pastHolidayDays',
]
new_df = train[holiday_cols]

# Show the distinct values taken by 'pastHolidayDays'.
new_df['pastHolidayDays'].unique()

array([57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
       40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
       23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
       5, 4, 3, 2, 1, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75,
       74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58,
       106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92,
       91, 90, 89, 88, 0], dtype=object)

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
# --- Sampling ---------------------------------------------------------------
# Work on a reproducible 10% sample so the grid search below stays tractable.
sample = train.sample(frac=0.1, random_state=42)
x_sample = sample.drop(['Sales', 'Date'], axis=1)
y_sample = sample['Sales']

# --- Train / test split -----------------------------------------------------
# Split the *unscaled* DataFrame. Scaling is done by the 'scaler' step inside
# each pipeline, so it is fitted on the training folds only (no leakage from
# the held-out rows) and the features are not scaled twice. Keeping the
# splits as DataFrames also preserves the column names that the feature
# importance analysis relies on.
x_train, x_test, y_train, y_test = train_test_split(
    x_sample, y_sample, test_size=0.2, random_state=42
)

# --- Hyper-parameter grids --------------------------------------------------
# Keys are prefixed with 'regressor__' to address the pipeline's final step.
random_para = {
    'regressor__n_estimators': [100, 200, 500],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 5, 10],
    # 1.0 means "use all features", which is what 'auto' meant for forest
    # regressors; 'auto' itself was removed in scikit-learn 1.3.
    'regressor__max_features': [1.0, 'sqrt', 'log2'],
}

decision_para = {
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 5, 10],
    # Regression criteria; 'gini'/'entropy' only exist for classifiers.
    'regressor__criterion': ['squared_error', 'friedman_mse'],
}

# --- Pipelines --------------------------------------------------------------
random_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42)),
])

decision_pipline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', DecisionTreeRegressor(random_state=42)),
])

# --- Grid search ------------------------------------------------------------
# 3 * 4 * 3 * 3 * 3 = 324 candidates, each evaluated with 3-fold CV (972 fits).
# With no `scoring` argument the estimator's own score (R^2) is used.
random_model = GridSearchCV(
    estimator=random_pipeline,
    param_grid=random_para,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# decision_model = GridSearchCV(estimator=decision_pipline, param_grid=decision_para, cv=5, n_jobs=-1, verbose=2)

In [8]:
# --- Initial grid search on all features -------------------------------------
random_model.fit(x_train, y_train)
best_random_model = random_model.best_estimator_

# The pipeline's final step is registered as 'regressor' (there is no
# 'classifier' step), so that is the key to look up.
feature_importance = best_random_model.named_steps['regressor'].feature_importances_

# Column names come from x_sample; wrapping the splits in DataFrames makes
# the column selection below work whether x_train/x_test arrive as DataFrames
# or as bare ndarrays (which carry no column names).
feature_names = list(x_sample.columns)
x_train_df = pd.DataFrame(x_train, columns=feature_names)
x_test_df = pd.DataFrame(x_test, columns=feature_names)

feature_importance_df = pd.DataFrame({
    'features': feature_names,
    'importance_score': feature_importance
}).sort_values(by='importance_score', ascending=False)

print(feature_importance_df)

# --- Keep the 10 most important features -------------------------------------
top_features = feature_importance_df['features'].head(10).tolist()

x_train_main = x_train_df[top_features]
x_test_main = x_test_df[top_features]

# --- Re-run the search on the reduced feature set ----------------------------
# Note: this repeats the full grid search, so it costs as much as the first fit.
random_model.fit(x_train_main, y_train)
best_random_model = random_model.best_estimator_
y_hat = best_random_model.predict(x_test_main)

# --- Evaluate ----------------------------------------------------------------
# Sales is a continuous target, so report regression metrics; accuracy_score
# is a classification metric and raises on continuous values.
r2 = best_random_model.score(x_test_main, y_test)  # R^2 for regressors
rmse = float(np.sqrt(np.mean((np.asarray(y_test, dtype=float) - y_hat) ** 2)))
print(f'R^2:  {r2:.4f}')
print(f'RMSE: {rmse:.2f}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [32]:
# pred.close_logs()