In [1]:
# imports
import os
import sys
import dvc.api
import mlflow
import pandas as pd
from IPython.display import Image
import warnings
warnings.filterwarnings('ignore')

In [2]:
# adding and setting up scripts
sys.path.append('.')
sys.path.append('..')
sys.path.insert(1, '../scripts/')
import defaults as defs
import dataCleaner as dc
import dataVisualizer as dv

cleaner = dc.dataCleaner('base-line modeling notebook')
visualizer = dv.dataVisualizer('base-line modeling notebook')

logger <Logger dataCleaner (DEBUG)> created at path: ../logs/cleaner_root.log
Data cleaner in action
logger <Logger dataVisualizer (DEBUG)> created at path: ../logs/visualizer_root.log
Data visualizer in action


In [3]:
# pandas settings
pd.set_option('display.max_columns', 30)

# version of the data
# v1 : gdrive 
# v2 : local messed up store
# v3 : local correct store
# v4 : local store: merged data file generated, data preparation completed
version = 'v4'

# read data sets using dvc api
test_data_url = dvc.api.get_url(path = defs.test_local_path, 
                                repo = defs.repo, 
                                rev = version)

train_data_url = dvc.api.get_url(path = defs.train_local_path, 
                                repo = defs.repo, 
                                rev = version)

store_data_url = dvc.api.get_url(path = defs.store_local_path, 
                                repo = defs.repo, 
                                rev = version)

full_data_url = dvc.api.get_url(path = defs.merged_local_path, 
                                repo = defs.repo, 
                                rev = version)
print('test data path: ' + test_data_url, '\ntrain data path: ' + train_data_url, '\nstore data path: ' + store_data_url, '\nmerged data path: ' + full_data_url)

test data path: C:\Users\f0x-tr0t\Documents\education\datascience\dvc-storage\d5\d6373bc6743cde41c22b4599dce90c 
train data path: C:\Users\f0x-tr0t\Documents\education\datascience\dvc-storage\1f\38ace4291877fe3b742e8b7a111377 
store data path: C:\Users\f0x-tr0t\Documents\education\datascience\dvc-storage\21\ffe65b18f58f4b58193c968654814c 
merged data path: C:\Users\f0x-tr0t\Documents\education\datascience\dvc-storage\2c\45c8bfea2787f193cb56fe88d8df04


In [4]:
# reading csv files
DateCols = ['Date']
missing_values = ["n/a", "na", "undefined", '?', 'NA', 'undefined']

test_data = pd.read_csv(test_data_url, na_values=missing_values, parse_dates=DateCols, low_memory=False)
train_data = pd.read_csv(train_data_url, na_values=missing_values, parse_dates=DateCols, low_memory=False)
full_data = pd.read_csv(full_data_url, na_values=missing_values, parse_dates=DateCols, low_memory=False)
store_data = pd.read_csv(store_data_url, na_values=missing_values, low_memory=False)

# Base line modeling

In [5]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 27 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1017209 non-null  int64         
 1   DayOfWeek                  1017209 non-null  int64         
 2   Date                       1017209 non-null  datetime64[ns]
 3   Sales                      1017209 non-null  int64         
 4   Customers                  1017209 non-null  int64         
 5   Open                       1017209 non-null  int64         
 6   Promo                      1017209 non-null  int64         
 7   StateHoliday               1017209 non-null  int64         
 8   SchoolHoliday              1017209 non-null  int64         
 9   Year                       1017209 non-null  int64         
 10  Month                      1017209 non-null  int64         
 11  DayOfMonth                 1017209 no

In [6]:
full_data.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'DayOfMonth',
       'WeekOfYear', 'weekday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'PromoInterval0', 'PromoInterval1',
       'PromoInterval2', 'PromoInterval3'],
      dtype='object')

In [7]:
full_data.weekday.value_counts()

1    727749
0    289460
Name: weekday, dtype: int64

* As we can see all of the data preprocessing tasks of this project were handled prior to this stage.
* The data preprocessing phase has been completed in the data preparation stage.

*  Every data type in full data set is numerical and good to go to ml

In [90]:
# setting up mlflow experiment
mlflow.set_experiment('base line modeling')
mlflow.start_run()

mlflow.log_param('full data columns', full_data.columns.to_list())
mlflow.log_param('full data shape', full_data.shape)
mlflow.log_param('full data version', version)
mlflow.log_param('full data input_rows', full_data.shape[0])
mlflow.log_param('full data input_cols', full_data.shape[1])

mlflow.log_param('store data columns', store_data.columns.to_list())
mlflow.log_param('store data shape', store_data.shape)
mlflow.log_param('store data version', version)
mlflow.log_param('store data input_rows', store_data.shape[0])
mlflow.log_param('store data input_cols', store_data.shape[1])

mlflow.log_param('test data columns', test_data.columns.to_list())
mlflow.log_param('test data shape', test_data.shape)
mlflow.log_param('test data version', version)
mlflow.log_param('test data input_rows', test_data.shape[0])
mlflow.log_param('test data input_cols', test_data.shape[1])

In [27]:
print(f"lowest date: {train_data['Date'].min()}, highest date: {train_data['Date'].max()}")

lowest date: 2013-01-01 00:00:00, highest date: 2015-07-31 00:00:00


* We have 31 months of data.

## Setting up pipelines and building the model

### Set up the dataset

In [99]:
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, cross_validate

In [74]:
train_data_ts = train_data.copy()

In [75]:
train_data_ts

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,DayOfMonth,WeekOfYear,weekday
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,7,31,31,1
1,2,5,2015-07-31,6064,625,1,1,0,1,2015,7,31,31,1
2,3,5,2015-07-31,8314,821,1,1,0,1,2015,7,31,31,1
3,4,5,2015-07-31,13995,609,1,1,0,1,2015,7,31,31,1
4,5,5,2015-07-31,4822,559,1,1,0,1,2015,7,31,31,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1111,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1
1017205,1112,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1
1017206,1113,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1
1017207,1114,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1


In [76]:
X_train = train_data_ts.loc[train_data_ts['Date']<'2015-01-01']
X_test = train_data_ts.loc[train_data_ts['Date']>='2015-01-01']

In [77]:
X_train

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,DayOfMonth,WeekOfYear,weekday
236380,1,3,2014-12-31,2605,327,1,0,0,1,2014,12,31,1,1
236381,2,3,2014-12-31,2269,252,1,0,0,1,2014,12,31,1,1
236382,3,3,2014-12-31,3804,408,1,0,0,1,2014,12,31,1,1
236383,4,3,2014-12-31,10152,1311,1,0,0,1,2014,12,31,1,1
236384,5,3,2014-12-31,1830,217,1,0,0,1,2014,12,31,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1111,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1
1017205,1112,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1
1017206,1113,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1
1017207,1114,2,2013-01-01,0,0,0,0,1,1,2013,1,1,1,1


In [78]:
X_test

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,DayOfMonth,WeekOfYear,weekday
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,7,31,31,1
1,2,5,2015-07-31,6064,625,1,1,0,1,2015,7,31,31,1
2,3,5,2015-07-31,8314,821,1,1,0,1,2015,7,31,31,1
3,4,5,2015-07-31,13995,609,1,1,0,1,2015,7,31,31,1
4,5,5,2015-07-31,4822,559,1,1,0,1,2015,7,31,31,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236375,1111,4,2015-01-01,0,0,0,0,1,1,2015,1,1,1,1
236376,1112,4,2015-01-01,0,0,0,0,1,1,2015,1,1,1,1
236377,1113,4,2015-01-01,0,0,0,0,1,1,2015,1,1,1,1
236378,1114,4,2015-01-01,0,0,0,0,1,1,2015,1,1,1,1


In [79]:
X_train['Year'].value_counts()

2013    406974
2014    373855
Name: Year, dtype: int64

In [80]:
X_test['Year'].value_counts()

2015    236380
Name: Year, dtype: int64

* We set up the training and testing data sets like this because this is a time series data set.
* We are going to use the data prior to 2015 as training and the data after 2015 as testing data.

In [81]:
print(f"X_train and X_test: {X_train.shape + X_test.shape},\nTOtal: {X_train.shape[0] + X_test.shape[0]}")

X_train and X_test: (780829, 14, 236380, 14),
TOtal: 1017209


* 780,829(X_train rows) + 236,380(X_test rows) = 1,017,209(total training data set rows)
* Train and test split account for all the data inside the training data set.

In [82]:
# setup train and test data
y_train = X_train.loc[ : , 'Sales']
X_train = X_train.drop(['Sales', 'Date'], axis=1)


y_test = X_test.loc[ : , 'Sales']
X_test = X_test.drop(['Sales', 'Date'], axis=1)

In [91]:
mlflow.log_param('features', X_train.columns.to_list())
mlflow.log_param('target', y_train)

### Create cross folds, pipelines, scoring and models

In [134]:
# create standard scaler
std_scaler = StandardScaler()

# create the model parameters
rf_params = {'n_estimators' : 50, 'n_jobs' : -1, 'random_state' : 777}
mlflow.log_param('random forest parameters', rf_params)

# create the model and set the parameters to the model
rf_model = RandomForestRegressor(n_estimators=50,
                                 n_jobs = -1,
                                 random_state=777)

# create a pipeline
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
rf_pipe = Pipeline([('scaler', std_scaler),
                    ('random forest', rf_model)],
                     verbose=True)

### Choosing loss functions for regression

In [108]:
# shuffle split using 5 folds and tet size of 20%
cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 77)

# scoring metrics
scoring = ['r2', 'neg_mean_squared_error', 'accuracy', 'precision']
mlflow.log_param('scoring', scoring)
mlflow.log_param('cross_validation', cv)

In [109]:
# random forest cross validation
rf_results = cross_validate(rf_pipe, X_train, y_train, cv=cv, scoring=scoring, 
                                return_train_score=False, return_estimator=True,
                                n_jobs=-1, Verbose=3)
# 49min

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   1.0s
[Pipeline] ..... (step 2 of 2) Processing random forest, total= 5.3min
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   7.5s
[Pipeline] ..... (step 2 of 2) Processing random forest, total= 4.5min
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   1.1s
[Pipeline] ..... (step 2 of 2) Processing random forest, total= 4.2min
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.9s
[Pipeline] ..... (step 2 of 2) Processing random forest, total=10.0min
[Pipeline] ............ (step 1 of 2) Processing scaler, total=   1.7s
[Pipeline] ..... (step 2 of 2) Processing random forest, total= 4.3min


### Fit the model and get scores

In [146]:
# fit the pipeline
rf_pipe._final_estimator.fit(X_train, y_train)
# took 4m 29s with 50 as the default n_estimators

# get the score
test_score = rf_pipe._final_estimator.score(X_test, y_test) 
test_score

0.9224562968773111

* Test_score = 0.9224562968773111
* The default mse score of the random forest

### Save and log model

In [116]:
# save the model
joblib.dump(rf_pipe._final_estimator, '../models/rf_model.pkl') 
mlflow.log_artifact("../models/rf_model.pkl")

In [144]:
rf_pipe

In [147]:
rf_pipe._final_estimator

In [148]:
rf_results

{'fit_time': array([347.73115754, 346.35609293, 335.47637033, 816.948838  ,
        375.49969101]),
 'score_time': array([ 79.86410213,  45.47661901,  70.56799674, 501.07304931,
          6.74499965]),
 'estimator': [Pipeline(steps=[('scaler', StandardScaler()),
                  ('random forest',
                   RandomForestRegressor(n_estimators=50, n_jobs=-1,
                                         random_state=777))],
           verbose=True),
  Pipeline(steps=[('scaler', StandardScaler()),
                  ('random forest',
                   RandomForestRegressor(n_estimators=50, n_jobs=-1,
                                         random_state=777))],
           verbose=True),
  Pipeline(steps=[('scaler', StandardScaler()),
                  ('random forest',
                   RandomForestRegressor(n_estimators=50, n_jobs=-1,
                                         random_state=777))],
           verbose=True),
  Pipeline(steps=[('scaler', StandardScaler()),
              

### Predict the test data using the model

#### update the test dataset to use the same features as the train data

In [125]:
#test_data = transform_date_column(df = test_data, column = 'Date')

test_data['Year'] = test_data['Date'].apply(lambda x: x.year)
test_data['Month'] = test_data['Date'].apply(lambda x: x.month)
test_data['DayOfMonth'] = test_data['Date'].apply(lambda x: x.day)
test_data['WeekOfYear'] = test_data['Date'].apply(lambda x: x.weekofyear)
test_data['weekday'] = test_data['Date'].apply(lambda x: 0 if (x in [6, 7]) else 1)

test_data.drop(['Date', 'Id'], axis = 1, inplace = True)

In [131]:
train_data['Customers'].value_counts()#.max()

0      172869
609     40369
560      2414
576      2363
603      2337
        ...  
8           1
87          1
13          1
67          1
36          1
Name: Customers, Length: 1424, dtype: int64

In [129]:
test_data.shape

(41088, 11)

#### add customers feature, becaus the standard scaler expects the features to be the same

In [132]:
random_customers = pd.np.random.randint(low = 1, high = 172869, size = 41088)
test_data['Customers'] = random_customers
test_data

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,DayOfMonth,WeekOfYear,weekday,Customers
0,1,4,1,1,0,0,2015,9,17,38,1,142776
1,3,4,1,1,0,0,2015,9,17,38,1,6048
2,7,4,1,1,0,0,2015,9,17,38,1,24141
3,8,4,1,1,0,0,2015,9,17,38,1,155296
4,9,4,1,1,0,0,2015,9,17,38,1,80520
...,...,...,...,...,...,...,...,...,...,...,...,...
41083,1111,6,1,0,0,0,2015,8,1,31,1,70445
41084,1112,6,1,0,0,0,2015,8,1,31,1,84507
41085,1113,6,1,0,0,0,2015,8,1,31,1,170994
41086,1114,6,1,0,0,0,2015,8,1,31,1,40471


In [149]:
y_pred = rf_pipe._final_estimator.predict(test_data)
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

### Save prediction

In [137]:
# saving predictions
data = {"Id" : range(0,41088,1), "Sales" : y_pred}
predictions = pd.DataFrame(data = data)
predictions.to_csv('../predictions/rf_predictions.csv', index = False)
mlflow.log_param('rf predictions', predictions)

In [141]:
mlflow.end_run()