In [1]:
import pandas as pd
import sys  
sys.path.insert(0, '../scripts')
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import clean_data
import loading_data
import utilities
from sklearn.neighbors import LocalOutlierFactor
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


In [2]:
# load data
df_store = loading_data.load_csv('../data/store.csv')
df_train = loading_data.load_csv('../data/train.csv')
df_test = loading_data.load_csv('../data/test.csv')
df_submission = loading_data.load_csv('../data/sample_submission.csv')



  


  This is separate from the ipykernel package so we can avoid doing imports until


  after removing the cwd from sys.path.


  """


# display some data from training and testing data
### The testing data contains 8 columns, it doesn't include the sales because that is what we will be predicting.



In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Customers      1017209 non-null  int64 
 5   Open           1017209 non-null  int64 
 6   Promo          1017209 non-null  int64 
 7   StateHoliday   1017209 non-null  object
 8   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 69.8+ MB


## Preprocessing Step
### Process the data into a format where it can be fed to a machine learning model. This typically means converting all non-numeric columns to numeric, handling NaN values and generating new features from already existing features. 

### In our case, you have a few datetime columns to preprocess. you can extract the following from them:
- weekdays
- weekends 
- number of days to holidays
- Number of days after holiday
- Beginning of month, mid month and ending of month
(think of more features to extract), extra marks for it
			


In [4]:
# converting columns to numerical data types
# Encoding state holiday values  
df_train = utilities.format_datetime(df_train,"Date")
df_test = utilities.format_datetime(df_test,"Date")

# extracting numerical information from the date columns
# the year
df_train_copy = df_train.copy()
df_train_copy["Year"] = df_train_copy['Date'].dt.year
# which part of the month it is where 0 is begining, 1 is mid and 2 is end
df_train_copy["Part of the month"] = df_train_copy['Date'].dt.day.apply(lambda x: x // 10)
df_train_copy.loc[(df_train_copy["Date"].dt.day == 31), "Part of the month"] = 2
# How many days before or after holidays

# Encoding holiday values to numerical ones
holidays = {'0': 0, 'a': 1, 'b':2, 'c':3}
df_train_copy["StateHoliday"] = df_train_copy['StateHoliday'].map(lambda x: holidays[x])
df_train_copy = df_train_copy.drop(columns=["Date"])
df_train_copy.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype
---  ------             --------------    -----
 0   Store              1017209 non-null  int64
 1   DayOfWeek          1017209 non-null  int64
 2   Sales              1017209 non-null  int64
 3   Customers          1017209 non-null  int64
 4   Open               1017209 non-null  int64
 5   Promo              1017209 non-null  int64
 6   StateHoliday       1017209 non-null  int64
 7   SchoolHoliday      1017209 non-null  int64
 8   Year               1017209 non-null  int64
 9   Part of the month  1017209 non-null  int64
dtypes: int64(10)
memory usage: 77.6 MB


#### Now that we have all our data converted to numerical values, We need to take out sales as our target variable and the others as our features.

In [39]:
# preparing our features
y_train = np.array(df_train_copy['Sales']).reshape(-1, 1) 
Store_ids = list(df_train_copy['Store'])
X_train = np.array(df_train_copy.drop(columns=['Sales','Store']))

y_train.shape, X_train.shape
# X_train

((1017209, 1), (1017209, 8))

### As a final thing, we have to scale the data. This helps with predictions especially when using machine learning algorithms that use Euclidean distances. you can use the standard scaler in sklearn for this.


In [40]:
# using standard scalar to scale our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train

array([[ 0.50148416, -0.16826876,  0.45239852, ...,  2.14421115,
         1.50207687,  1.03142726],
       [ 0.50148416, -0.01754036,  0.45239852, ...,  2.14421115,
         1.50207687,  1.03142726],
       [ 0.50148416,  0.40449914,  0.45239852, ...,  2.14421115,
         1.50207687,  1.03142726],
       ...,
       [-1.00047591, -1.36332959, -2.21044047, ...,  2.14421115,
        -1.07061593, -1.28096673],
       [-1.00047591, -1.36332959, -2.21044047, ...,  2.14421115,
        -1.07061593, -1.28096673],
       [-1.00047591, -1.36332959, -2.21044047, ...,  2.14421115,
        -1.07061593, -1.28096673]])

### A reasonable starting point will be to use any of the tree based algorithms. Random forests Regressor will make for a good start. Before training our random forest model, we will transform, encode and scale our data. We will use 

In [42]:
clf_decisions = tree.DecisionTreeClassifier()
decision_clf_trained = clf_decisions.fit(X_train[:100000,],y_train[:100000])


array([[0.00000000e+00, 6.66666667e-01, 7.51218192e-02, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [8.97666068e-04, 6.66666667e-01, 8.45966432e-02, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.79533214e-03, 6.66666667e-01, 1.11126151e-01, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       ...,
       [9.98204668e-01, 1.66666667e-01, 0.00000000e+00, ...,
        3.33333333e-01, 1.00000000e+00, 0.00000000e+00],
       [9.99102334e-01, 1.66666667e-01, 0.00000000e+00, ...,
        3.33333333e-01, 1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.66666667e-01, 0.00000000e+00, ...,
        3.33333333e-01, 1.00000000e+00, 0.00000000e+00]])

### Now that we have scaled our data, we will 