In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
sales = pd.read_csv("C:\\Users\\DHRUBAJIT\\Desktop\\AnalyticsVidhya\\Big Mart Sales\\new_sales_csv.csv")
sales.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_type_new
0,FDA15,weights2,Low Fat,low,Dairy,249.8092,OUT049,med,High,Tier 1,Supermarket Type1,3735.138,other
1,DRC01,weights1,Regular,low,Soft Drinks,48.2692,OUT018,low,High,Tier 2,Supermarket Type2,443.4228,Drinks
2,FDN15,weights1,Low Fat,low,Meat,141.618,OUT049,med,High,Tier 1,Supermarket Type1,2097.27,other
3,FDX07,weights1,Regular,low,Fruits and Vegetables,182.095,OUT010,med,High,Tier 2,Grocery Store,732.38,other
4,NCD19,weights1,Low Fat,low,Household,53.8614,OUT013,med,High,Tier 2,Supermarket Type1,994.7052,other


In [3]:
#deleting unwanted features 
#also deleted Item_Fat_Content because from hypothesis testing,we found that there is no difference in the means.
sales = sales.drop(['Item_Identifier','Item_Type','Outlet_Identifier','Item_Fat_Content'], axis=1)

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

encoder_list = ['Item_Weight', 'Item_Visibility',
       'Outlet_Establishment_Year', 'Outlet_Size',
       'Outlet_Location_Type', 'Outlet_Type', 
       'Item_type_new']

for i in encoder_list:
    sales[i] = le.fit_transform(sales[i])

In [5]:
sales = pd.get_dummies(sales, columns=encoder_list)

In [6]:
#avoiding dummy variable trap
sales = sales.drop(['Item_Weight_0','Item_Visibility_0','Outlet_Establishment_Year_0','Outlet_Size_0',
                   'Outlet_Location_Type_0','Outlet_Type_0','Item_type_new_0'], axis=1)

In [7]:
X = sales.drop(['Item_Outlet_Sales'], axis=1)
y = sales.Item_Outlet_Sales

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
sc = StandardScaler()
XX = sc.fit_transform(X)

In [9]:
from sklearn.linear_model import SGDRegressor
from sklearn.feature_selection import SelectFromModel

lsvc = SGDRegressor(penalty="l2").fit(XX, y)
model = SelectFromModel(lsvc, prefit=True)
XXX = model.transform(XX)
print("Original number of features : %d" %XX.shape[1])
print("Selected number of features : %d" %XXX.shape[1])

Original number of features : 11
Selected number of features : 4




#### Test dataset

In [11]:
test = pd.read_csv("C:\\Users\\DHRUBAJIT\\Desktop\\AnalyticsVidhya\\Big Mart Sales\\new_testsales_csv.csv")

item_id = test['Item_Identifier']
out_id = test['Outlet_Identifier']

test = test.drop(['Item_Identifier','Item_Type','Outlet_Identifier','Item_Fat_Content'], axis=1)

for i in encoder_list:
    test[i] = le.fit_transform(test[i])
    
test = pd.get_dummies(test, columns=encoder_list)

test = test.drop(['Item_Weight_0','Item_Visibility_0','Outlet_Establishment_Year_0','Outlet_Size_0',
                   'Outlet_Location_Type_0','Outlet_Type_0','Item_type_new_0'], axis=1)

test = sc.fit_transform(test)

In [12]:
test = model.transform(test)

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def validation_cross(model, x,y):
    model.fit(x, y)

    #Predict training set:
    pred = model.predict(x)

    #Perform cross-validation:
    cv_score = cross_val_score(model, x, y, cv=10, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))

    #Print model report:
    print ("\nModel Report")
    print ("RMSE : %.4g" % np.sqrt(mean_squared_error(y.values, pred)))
    print ("CV Score : Mean - %.4g | Std - %.4g" % (np.mean(cv_score),np.std(cv_score)))
    

def validation_cross1(model, x,y):
    model.fit(x, y)

    #Predict training set:
    pred = model.predict(x)

    #Perform cross-validation:
    cv_score = cross_val_score(model, x, y, cv=10, scoring=None)
    cv_score = np.sqrt(np.abs(cv_score))

    #Print model report:
    print ("\nModel Report")
    print ("RMSE : %.4g" % np.sqrt(mean_squared_error(y.values, pred)))
    print ("CV Score : Mean - %.4g | Std - %.4g" % (np.mean(cv_score),np.std(cv_score)))

### Baseline Model with l2-based feature selection

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
validation_cross1(gbr, XXX, y)


Model Report
RMSE : 1052
CV Score : Mean - 0.7674 | Std - 0.01833


#### Model Tuning

In [13]:
# 1. n_estimators

param_test1 = {'n_estimators':range(50,1050,50)}
gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(), param_grid = param_test1, n_jobs=-1, iid=False, cv=10)
gsearch1.fit(XXX,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.59126, std: 0.02739, params: {'n_estimators': 50},
  mean: 0.58913, std: 0.02794, params: {'n_estimators': 100},
  mean: 0.58483, std: 0.02795, params: {'n_estimators': 150},
  mean: 0.58052, std: 0.02769, params: {'n_estimators': 200},
  mean: 0.57581, std: 0.02849, params: {'n_estimators': 250},
  mean: 0.57158, std: 0.02879, params: {'n_estimators': 300},
  mean: 0.56796, std: 0.02916, params: {'n_estimators': 350},
  mean: 0.56494, std: 0.03097, params: {'n_estimators': 400},
  mean: 0.56187, std: 0.03047, params: {'n_estimators': 450},
  mean: 0.55844, std: 0.03084, params: {'n_estimators': 500},
  mean: 0.55509, std: 0.03029, params: {'n_estimators': 550},
  mean: 0.55245, std: 0.03043, params: {'n_estimators': 600},
  mean: 0.55001, std: 0.03113, params: {'n_estimators': 650},
  mean: 0.54824, std: 0.03076, params: {'n_estimators': 700},
  mean: 0.54600, std: 0.03115, params: {'n_estimators': 750},
  mean: 0.54409, std: 0.03192, params: {'n_estimators': 800},
  mean: 0

In [14]:
gbr1 = GradientBoostingRegressor(n_estimators = 50)
validation_cross1(gbr1, XXX, y)


Model Report
RMSE : 1066
CV Score : Mean - 0.7687 | Std - 0.01788


In [16]:
# 2. max_depth

param_test2 = {'max_depth':range(1,10,1)}
gsearch2 = GridSearchCV(estimator = GradientBoostingRegressor(), param_grid = param_test2,n_jobs=-1,iid=False, cv=10)
gsearch2.fit(XXX,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_



([mean: 0.50672, std: 0.01936, params: {'max_depth': 1},
  mean: 0.58588, std: 0.02668, params: {'max_depth': 2},
  mean: 0.58922, std: 0.02804, params: {'max_depth': 3},
  mean: 0.58529, std: 0.02766, params: {'max_depth': 4},
  mean: 0.57477, std: 0.02923, params: {'max_depth': 5},
  mean: 0.56432, std: 0.03004, params: {'max_depth': 6},
  mean: 0.54888, std: 0.03209, params: {'max_depth': 7},
  mean: 0.53518, std: 0.03380, params: {'max_depth': 8},
  mean: 0.51602, std: 0.03772, params: {'max_depth': 9}],
 {'max_depth': 3},
 0.5892210987917601)

In [17]:
gbr2 = GradientBoostingRegressor(max_depth = 3)
validation_cross1(gbr2, XXX, y)


Model Report
RMSE : 1052
CV Score : Mean - 0.7673 | Std - 0.01839


In [19]:
# 3. min_samples_split & min_samples_leaf

param_test3 = {'min_samples_split':range(2,20,1), 'min_samples_leaf':range(1,20,1)}
gsearch3 = GridSearchCV(estimator = GradientBoostingRegressor(), param_grid = param_test3,n_jobs=-1,iid=False, cv=10)
gsearch3.fit(XXX,y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: 0.58915, std: 0.02797, params: {'min_samples_leaf': 1, 'min_samples_split': 2},
  mean: 0.58947, std: 0.02807, params: {'min_samples_leaf': 1, 'min_samples_split': 3},
  mean: 0.58970, std: 0.02810, params: {'min_samples_leaf': 1, 'min_samples_split': 4},
  mean: 0.58934, std: 0.02761, params: {'min_samples_leaf': 1, 'min_samples_split': 5},
  mean: 0.58946, std: 0.02762, params: {'min_samples_leaf': 1, 'min_samples_split': 6},
  mean: 0.58970, std: 0.02744, params: {'min_samples_leaf': 1, 'min_samples_split': 7},
  mean: 0.58981, std: 0.02760, params: {'min_samples_leaf': 1, 'min_samples_split': 8},
  mean: 0.58975, std: 0.02759, params: {'min_samples_leaf': 1, 'min_samples_split': 9},
  mean: 0.58944, std: 0.02758, params: {'min_samples_leaf': 1, 'min_samples_split': 10},
  mean: 0.58971, std: 0.02766, params: {'min_samples_leaf': 1, 'min_samples_split': 11},
  mean: 0.58971, std: 0.02772, params: {'min_samples_leaf': 1, 'min_samples_split': 12},
  mean: 0.59009, std: 0.02786

In [20]:
gbr3 = GradientBoostingRegressor(min_samples_split = 17, min_samples_leaf = 8)
validation_cross1(gbr3, XXX, y)


Model Report
RMSE : 1055
CV Score : Mean - 0.7706 | Std - 0.01806


In [24]:
# 4. max_features

param_test4 = {'max_features':range(1,4,1)}
gsearch4 = GridSearchCV(estimator = GradientBoostingRegressor(),param_grid = param_test4,n_jobs=4,iid=False, cv=10)
gsearch4.fit(XXX,y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.59452, std: 0.02760, params: {'max_features': 1},
  mean: 0.59357, std: 0.02731, params: {'max_features': 2},
  mean: 0.59140, std: 0.02747, params: {'max_features': 3}],
 {'max_features': 1},
 0.59452386539240731)

In [25]:
gbr4 = GradientBoostingRegressor(max_features = 1)
validation_cross1(gbr4, XXX, y)


Model Report
RMSE : 1053
CV Score : Mean - 0.7705 | Std - 0.01792


In [27]:
# 4. max_features - strings

param_test4 = {'max_features':['auto','sqrt','log2']}
gsearch4 = GridSearchCV(estimator = GradientBoostingRegressor(),param_grid = param_test4,n_jobs=4,iid=False, cv=10)
gsearch4.fit(XXX,y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.58914, std: 0.02805, params: {'max_features': 'auto'},
  mean: 0.59374, std: 0.02713, params: {'max_features': 'sqrt'},
  mean: 0.59322, std: 0.02700, params: {'max_features': 'log2'}],
 {'max_features': 'sqrt'},
 0.59373891795703515)

In [28]:
gbr4 = GradientBoostingRegressor(max_features = 'sqrt')
validation_cross1(gbr4, XXX, y)


Model Report
RMSE : 1053
CV Score : Mean - 0.7698 | Std - 0.0179


In [33]:
# 5. subsample

param_test5 = {'subsample': [0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0]}
gsearch5 = GridSearchCV(estimator = GradientBoostingRegressor(max_features = 1),param_grid = param_test5,n_jobs=-1,iid=False, cv=10)
gsearch5.fit(XXX,y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.59221, std: 0.02794, params: {'subsample': 0.6},
  mean: 0.59302, std: 0.02685, params: {'subsample': 0.65},
  mean: 0.59292, std: 0.02950, params: {'subsample': 0.7},
  mean: 0.59201, std: 0.02831, params: {'subsample': 0.75},
  mean: 0.59253, std: 0.02670, params: {'subsample': 0.8},
  mean: 0.59444, std: 0.02761, params: {'subsample': 0.85},
  mean: 0.59346, std: 0.02796, params: {'subsample': 0.9},
  mean: 0.59368, std: 0.02783, params: {'subsample': 0.95},
  mean: 0.59443, std: 0.02694, params: {'subsample': 1.0}],
 {'subsample': 0.85},
 0.59443636156738844)

In [34]:
gbr5 = GradientBoostingRegressor(max_features = 1, subsample = 0.85)
validation_cross1(gbr5, XXX, y)


Model Report
RMSE : 1054
CV Score : Mean - 0.7701 | Std - 0.0182


In [36]:
# 6. learning_rate

param_test6 = {'learning_rate':[0.0001,0.001,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4]}
gsearch6 = GridSearchCV(estimator = GradientBoostingRegressor(max_features = 1, subsample = 0.85),param_grid = param_test6,n_jobs=-1,iid=False, cv=10)
gsearch6.fit(XXX,y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: 0.00696, std: 0.00076, params: {'learning_rate': 0.0001},
  mean: 0.06964, std: 0.00227, params: {'learning_rate': 0.001},
  mean: 0.07102, std: 0.00164, params: {'learning_rate': 0.001},
  mean: 0.27439, std: 0.01014, params: {'learning_rate': 0.005},
  mean: 0.41673, std: 0.00918, params: {'learning_rate': 0.01},
  mean: 0.59103, std: 0.02473, params: {'learning_rate': 0.05},
  mean: 0.59379, std: 0.02896, params: {'learning_rate': 0.1},
  mean: 0.58516, std: 0.02774, params: {'learning_rate': 0.2},
  mean: 0.57864, std: 0.03138, params: {'learning_rate': 0.3},
  mean: 0.57067, std: 0.03375, params: {'learning_rate': 0.4}],
 {'learning_rate': 0.1},
 0.59379177357654567)

In [38]:
gbr6 = GradientBoostingRegressor(max_features = 1, subsample = 0.85,learning_rate= 0.1)
validation_cross1(gbr6, XXX, y)


Model Report
RMSE : 1051
CV Score : Mean - 0.7701 | Std - 0.01826


In [40]:
# 7. loss
param_test7 = {'loss':['ls','lad','huber','quantile']}
gsearch7 = GridSearchCV(estimator = GradientBoostingRegressor(max_features = 1, subsample = 0.85),param_grid = param_test7,n_jobs=-1,iid=False, cv=10)
gsearch7.fit(XXX,y)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_



([mean: 0.59471, std: 0.02779, params: {'loss': 'ls'},
  mean: 0.59335, std: 0.02354, params: {'loss': 'lad'},
  mean: 0.59338, std: 0.02524, params: {'loss': 'huber'},
  mean: -0.09399, std: 0.16519, params: {'loss': 'quantile'}],
 {'loss': 'ls'},
 0.59470687483435847)

In [13]:
gbr7 = GradientBoostingRegressor(max_features = 1, subsample = 0.85,loss= 'ls')
validation_cross1(gbr7, XXX, y)


Model Report
RMSE : 1052
CV Score : Mean - 0.7709 | Std - 0.01847
