In [1]:
#Initial Model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

In [2]:
#Load the Data
# Sales table: read it, report how many rows came back, then preview the top rows.
df1 = pd.read_csv('Datasets/Walmart_Sales_Forecasting.csv')
print(df1.shape[0])
df1.head()

421570


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [3]:
# Store details table: read it, report the row count, then preview the top rows.
df2 = pd.read_csv('Datasets/walmart_store_details.csv')
print(df2.shape[0])
df2.head()

45


Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [4]:
# Features table: read it, report the row count, then preview the top rows.
df3 = pd.read_csv('Datasets/Walmart_features.csv')
print(df3.shape[0])
df3.head()

8190


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [5]:
# Summarise dtypes and non-null counts of the features table to see which columns have gaps.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
Store           8190 non-null int64
Date            8190 non-null object
Temperature     8190 non-null float64
Fuel_Price      8190 non-null float64
MarkDown1       4032 non-null float64
MarkDown2       2921 non-null float64
MarkDown3       3613 non-null float64
MarkDown4       3464 non-null float64
MarkDown5       4050 non-null float64
CPI             7605 non-null float64
Unemployment    7605 non-null float64
IsHoliday       8190 non-null bool
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 711.9+ KB


In [6]:
# Attach the store details to every sales row. The join key is spelled out
# instead of being inferred from whichever column names happen to overlap.
df = pd.merge(df1, df2, on='Store')
# The join should neither drop nor duplicate sales rows.
assert len(df) == len(df1), 'merge with store details changed the row count'
print(len(df))
df.head(2)

421570


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size
0,1,1,2010-02-05,24924.5,False,A,151315
1,1,1,2010-02-12,46039.49,True,A,151315


In [7]:
# Attach the features table. Store, Date and IsHoliday are the columns both
# frames share, i.e. the keys an implicit merge would have picked; naming them
# keeps the join stable if either frame later gains an overlapping column.
rows_before_merge = len(df)
df = pd.merge(df, df3, on=['Store', 'Date', 'IsHoliday'])
# The join should neither drop nor duplicate sales rows.
assert len(df) == rows_before_merge, 'merge with features changed the row count'
print(len(df))
df.head(2)

421570


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106
1,1,2,2010-02-05,50605.27,False,A,151315,42.31,2.572,,,,,,211.096358,8.106


In [8]:
# Summarise dtypes and non-null counts of the merged frame before any preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 16 columns):
Store           421570 non-null int64
Dept            421570 non-null int64
Date            421570 non-null object
Weekly_Sales    421570 non-null float64
IsHoliday       421570 non-null bool
Type            421570 non-null object
Size            421570 non-null int64
Temperature     421570 non-null float64
Fuel_Price      421570 non-null float64
MarkDown1       150681 non-null float64
MarkDown2       111248 non-null float64
MarkDown3       137091 non-null float64
MarkDown4       134967 non-null float64
MarkDown5       151432 non-null float64
CPI             421570 non-null float64
Unemployment    421570 non-null float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 51.9+ MB


In [9]:
import numpy as np

In [10]:
#Absolute Minimum Preprocessing
# Every missing value becomes 0.
df = df.fillna(0)

#Define the problem
# Target is Weekly_Sales; predictors are all other int64/float64 columns.
y_feat = 'Weekly_Sales'
numeric_dtypes = [np.int64, np.float64]
x_feats = [col for col in df.columns if col != y_feat and df[col].dtype in numeric_dtypes]
X = df[x_feats]
y = df[y_feat]

In [11]:
#Initial Models and Evaluation
# Fixed seed so the split, and therefore the printed metrics, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
models = [LinearRegression(), Lasso(), Ridge()]
names = ['OLS', 'Lasso', 'Ridge']
for model, name in zip(models, names):
    model.fit(X_train, y_train)
    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)
    # Report R^2 and MSE on both splits so over/under-fitting is visible side by side.
    print('Model Stats for: {}'.format(name))
    print('Train R^2:', model.score(X_train, y_train))
    print('Test R^2:', model.score(X_test, y_test))
    print('Training MSE: {}'.format(mean_squared_error(y_train, y_hat_train)))
    print('Testing MSE: {}'.format(mean_squared_error(y_test, y_hat_test)))
    print('\n')

Model Stats for: OLS
Train R^2: 0.0859374571534
Test R^2: 0.0861852175659
Training MSE: 472091962.3200303
Testing MSE: 469477228.44418436


Model Stats for: Lasso
Train R^2: 0.0859374443532
Test R^2: 0.0861841120037
Training MSE: 472091968.93105274
Testing MSE: 469477796.4326916


Model Stats for: Ridge
Train R^2: 0.0859374571534
Test R^2: 0.0861852159926
Training MSE: 472091962.3200483
Testing MSE: 469477229.2524677




In [12]:
# Preview the feature matrix that the models above were fit on.
X.head(2)

Unnamed: 0,Store,Dept,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
1,1,2,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106


# Comment: 
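Pretty poor results: all three models explain less than 9% of the variance in weekly sales (R^2 ≈ 0.086 on both the training and test sets).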

## Thoughts:

* Store is misleading: it's an integer, but it is really a categorical variable. Use dummy variables.
* Dept has the same problem as Store.
* We can engineer some potentially useful features from our date field. The most obvious is month.
* We can make another dummy variable for the IsHoliday feature

Start there and repeat.

In [13]:
# Preview the merged frame again before engineering new features from it.
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
1,1,2,2010-02-05,50605.27,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106


In [14]:
# Parse the date column so calendar parts can be pulled out, then derive the month.
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
# Store and Dept are identifiers: cast them to strings for creating dummy variables.
for id_col in ('Store', 'Dept'):
    df[id_col] = df[id_col].astype(str)
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2
1,1,2,2010-02-05,50605.27,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2


In [15]:
#Define the problem
# Target column, then every other int64/float64 column, then the identifier
# and holiday columns that are added by name.
y_feat = 'Weekly_Sales'
numeric_feats = [col for col in df.columns if col != y_feat and df[col].dtype in (np.int64, np.float64)]
x_feats = numeric_feats + ['Store', 'Dept', 'IsHoliday']

y = df[y_feat]
# Expand the string-valued columns into dummy (indicator) columns.
X = pd.get_dummies(df[x_feats])
print(X.columns)
X.head()

Index(['Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       ...
       'Dept_90', 'Dept_91', 'Dept_92', 'Dept_93', 'Dept_94', 'Dept_95',
       'Dept_96', 'Dept_97', 'Dept_98', 'Dept_99'],
      dtype='object', length=138)


Unnamed: 0,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
1,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
2,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
3,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
4,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#Second Models and Evaluation
# Fixed seed so the split, and therefore the printed metrics, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
models = [LinearRegression(), Lasso(), Ridge()]
names = ['OLS', 'Lasso', 'Ridge']
for model, name in zip(models, names):
    model.fit(X_train, y_train)
    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)
    # Report R^2 and MSE on both splits so over/under-fitting is visible side by side.
    print('Model Stats for: {}'.format(name))
    print('Train R^2:', model.score(X_train, y_train))
    print('Test R^2:', model.score(X_test, y_test))
    print('Training MSE: {}'.format(mean_squared_error(y_train, y_hat_train)))
    print('Testing MSE: {}'.format(mean_squared_error(y_test, y_hat_test)))
    print('\n')

Model Stats for: OLS
Train R^2: 0.657385086927
Test R^2: 0.662639700184
Training MSE: 177726355.95465162
Testing MSE: 171032615.63090432






Model Stats for: Lasso
Train R^2: 0.657334696714
Test R^2: 0.662620762537
Training MSE: 177752495.12312382
Testing MSE: 171042216.5095258


Model Stats for: Ridge
Train R^2: 0.657386594991
Test R^2: 0.662653080781
Training MSE: 177725573.6686188
Testing MSE: 171025832.02714646




# Comments: 
Still not a perfect model by any means, but we have drastically improved results from a measly 0.08 R^2 value all the way to roughly 0.66 R^2. Nice! Let's see if we can squeeze out some more performance!

# Impact of Normalization

In [21]:
#Third Round Models and Evaluation
# Fixed seed so the split, and therefore the printed metrics, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer releases this cell needs an explicit scaling step instead.
models = [LinearRegression(normalize=True), Lasso(normalize=True), Ridge(normalize=True)]
names = ['OLS', 'Lasso', 'Ridge']
for model, name in zip(models, names):
    model.fit(X_train, y_train)
    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)
    # Report R^2 and MSE on both splits so over/under-fitting is visible side by side.
    print('Model Stats for: {}'.format(name))
    print('Train R^2:', model.score(X_train, y_train))
    print('Test R^2:', model.score(X_test, y_test))
    print('Training MSE: {}'.format(mean_squared_error(y_train, y_hat_train)))
    print('Testing MSE: {}'.format(mean_squared_error(y_test, y_hat_test)))
    print('\n')

Model Stats for: OLS
Train R^2: 0.658694111804
Test R^2: 0.658604995604
Training MSE: 176265154.21642897
Testing MSE: 175428032.26096848


Model Stats for: Lasso
Train R^2: 0.600546001385
Test R^2: 0.600934145801
Training MSE: 206295358.80672494
Testing MSE: 205062571.63479063


Model Stats for: Ridge
Train R^2: 0.499666174283
Test R^2: 0.501441083593
Training MSE: 258394074.05435443
Testing MSE: 256187725.4949077




In [24]:
#Using Cross Validation
def run_cross_validation_model(model, X, y, random_state=42):
    """Fit a cross-validated estimator on a train split and print its metrics.

    Parameters
    ----------
    model : estimator
        Unfitted estimator providing ``fit``, ``predict`` and ``score`` and,
        once fitted, an ``alpha_`` attribute (the notebook passes LassoCV and
        RidgeCV).
    X, y :
        Features and target, handed straight to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so repeated runs report the
        same numbers. Pass ``None`` for an unseeded split.

    Returns
    -------
    None
        Everything is reported via ``print``: R^2 and MSE on both splits, the
        estimator's repr, and the selected ``alpha_``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    model.fit(X_train, y_train)
    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)
    # Label the report with the estimator's own class name; a fixed 'LassoCV'
    # label would mislabel every other estimator passed in.
    print('Model Stats for: {}'.format(type(model).__name__))
    print('Train R^2:', model.score(X_train, y_train))
    print('Test R^2:', model.score(X_test, y_test))
    print('Training MSE: {}'.format(mean_squared_error(y_train, y_hat_train)))
    print('Testing MSE: {}'.format(mean_squared_error(y_test, y_hat_test)))
    print('Model details:', model)
    print('Cross Validation Optimal Alpha Value for Regularization', model.alpha_)
    print('\n')


# Fit and report each cross-validated estimator in turn.
cv_estimators = (LassoCV(normalize=True), RidgeCV(normalize=True))
for model in cv_estimators:
    run_cross_validation_model(model, X, y)

Model Stats for: LassoCV
Train R^2: 0.657861675011
Test R^2: 0.661180567972
Training MSE: 177345635.5320944
Testing MSE: 172169999.91136828
Model details: LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=True, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)
Cross Validation Optimal Alpha Value for Regularization 0.013168994176


Model Stats for: LassoCV
Train R^2: 0.650235389754
Test R^2: 0.662749959824
Training MSE: 180765246.93405983
Testing MSE: 172915207.5505919
Model details: RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None,
    normalize=True, scoring=None, store_cv_values=False)
Cross Validation Optimal Alpha Value for Regularization 0.1




# Attempt Additional Feature Engineering

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import seaborn as sns

In [26]:
def plot_polynomial_reg(model, X, y, degree_min=2, degree_max=7, random_state=42):
    """Plot train/test R^2 of a polynomial regression for a range of degrees.

    For every degree in ``[degree_min, degree_max]`` a pipeline of
    ``PolynomialFeatures(degree)`` followed by ``model`` is fit on a train
    split and scored on both splits; the scores are then drawn on a new figure.

    Parameters
    ----------
    model : estimator
        Final regression step of each pipeline. The same instance is refit
        for every degree.
    X, y :
        Features and target, handed straight to ``train_test_split``.
    degree_min, degree_max : int
        Inclusive range of polynomial degrees to evaluate.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the plot is reproducible.

    Returns
    -------
    None

    Notes
    -----
    The number of polynomial features grows combinatorially with both the
    column count of ``X`` and the degree, so wide inputs can exhaust memory
    even at low degrees.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    degrees = []
    train_scores = []
    test_scores = []
    for degree in range(degree_min, degree_max + 1):
        # Wrap the ORIGINAL estimator each time. Rebinding `model` to the
        # pipeline would nest the previous pipeline inside the next one, so
        # the expansions would stack and the effective degree would no longer
        # match the value plotted on the x axis.
        poly_model = make_pipeline(PolynomialFeatures(degree, interaction_only=False), model)
        poly_model.fit(X_train, y_train)
        # R^2 on held-out and training data for this degree.
        test_scores.append(poly_model.score(X_test, y_test))
        train_scores.append(poly_model.score(X_train, y_train))
        degrees.append(degree)
    # Draw on a fresh figure so repeated calls do not pile onto the same axes.
    fig, ax = plt.subplots()
    ax.scatter(degrees, train_scores, label='Train R^2')
    ax.scatter(degrees, test_scores, label='Test R^2')
    ax.set_title('Train and Test Accuracy vs Model Complexity')
    ax.set_xlabel('Maximum Degree of Polynomial Regression')
    ax.set_ylabel('R^2')
    ax.legend()

# Evaluate each regulariser in turn. The loop variable itself is passed on, so
# Lasso is actually evaluated rather than Ridge being fit twice.
# NOTE: the recorded output of this cell is a MemoryError.
for model in [Ridge(), Lasso()]:
    plot_polynomial_reg(model, X, y)

MemoryError: 

# Final Pipeline

Choose current best model.