In [None]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation, metrics

In [None]:
#read the train and test data.add a source column to specify its test/train data.combine them for data preprocessing
traindata = pd.read_csv("/home/arko/Downloads/AnalyticsVidya/BigmartsalesPrediction/Train_UWu5bXk.csv")
testdata = pd.read_csv("/home/arko/Downloads/AnalyticsVidya/BigmartsalesPrediction/Test_u94Q5KV.csv")
traindata['source']='train'
testdata['source']='test'
salesdata = pd.concat([traindata,testdata],ignore_index=True)
print traindata.shape,testdata.shape,salesdata.shape

In [None]:
#observe the numeric columns
salesdata.describe()

In [None]:
#rather than the establishment year, we can have a computed column saying how old the shop is. that makes more sense.
#year under consideration is 2013
salesdata['outlet_year'] = 2013 - salesdata['Outlet_Establishment_Year']
del salesdata['Outlet_Establishment_Year']

In [None]:
#the minimum value of item visibility = 0 which does not make sense.
#replace the zero values with mean for every product. 
#We need to identify the products by Product identifier
visibility_avg_byProduct = salesdata.pivot_table(values = 'Item_Visibility',index = 'Item_Identifier')
miss_bool = (salesdata['Item_Visibility']==0)
salesdata.loc[miss_bool,'Item_Visibility'] = salesdata.loc[miss_bool,'Item_Identifier'].apply(lambda x : 
                                                                                              visibility_avg_byProduct[x])

In [None]:
#Determine another variable with means ratio which will give 
#some idea about how much importance was given to that product in a store as compared to other stores
salesdata['ItemVisibilityMeanRatio'] = salesdata.apply(lambda x : 
                                       x['Item_Visibility']/visibility_avg_byProduct[x['Item_Identifier']],axis = 1)

In [None]:
#replace the missing values of item Weight with the mean
#again for this, we need to identify items by item identifier
itemweight_avg_byProduct = salesdata.pivot_table(values = 'Item_Weight',index = 'Item_Identifier')
miss_bool = salesdata['Item_Weight'].isnull()
salesdata.loc[miss_bool,'Item_Weight'] = salesdata.loc[miss_bool,'Item_Identifier'].apply(
    lambda x : itemweight_avg_byProduct[x])

In [None]:
#for outlet size, replace with mode
#need to segregate by outlet type
outlet_size_avg = salesdata.pivot_table(values='Outlet_Size', columns='Outlet_Type',aggfunc= (lambda x : mode(x).mode[0]) )
miss_bool = salesdata['Outlet_Size'].isnull()
salesdata.loc[miss_bool,'Outlet_Size'] = salesdata.loc[miss_bool,'Outlet_Type'].apply(lambda x : outlet_size_avg[x])

In [None]:
#get the number of Item types
salesdata['Item_Type'].value_counts()

In [None]:
#Item_Type variable has 16 categories which might prove to be very useful in analysis.
#So its a good idea to combine them. One way could be to manually assign a new category to each. 
#But there’s a catch here. If you look at the Item_Identifier, i.e. the unique ID of each item, it starts 
#with either FD, DR or NC. If you see the categories, these look like being Food, Drinks and Non-Consumables. 
#So I’ve used the Item_Identifier variable to create a new column
salesdata['ItemTypeModified'] = salesdata['Item_Identifier'].apply(lambda x : x[0:2])
#rename these
salesdata['ItemTypeModified'] = salesdata['ItemTypeModified'].map({'FD':'Food',
                                                                'NC':'Non-Consumable',
                                                                  'DR':'Drinks'})

In [None]:
#Correcting typos and difference in representation in categories of Item_Fat_Content
salesdata['Item_Fat_Content'] = salesdata['Item_Fat_Content'].replace({'LF': 'Low Fat',
                                                                      'low fat': 'Low Fat',
                                                                      'reg': 'Regular'})
#there were some non-consumables as well and a fat-content should not be specified for them. 
#So we can also create a separate category for such kind of observations
salesdata.loc[salesdata['ItemTypeModified']=='Non-Consumable','Item_Fat_Content'] = 'Non-Edible'

In [None]:
#Since scikit-learn accepts only numerical variables, convert all categories of nominal variables into numeric types.
#Also, need Outlet_Identifier as a variable as well. So I created a new variable ‘Outlet’ same as Outlet_Identifier 
#and coded that. Outlet_Identifier should remain as it is, because it will be required in the submission file
le = LabelEncoder()
salesdata['Outlet'] = le.fit_transform(salesdata['Outlet_Identifier'])
variables_to_modify = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','ItemTypeModified','Outlet_Type','Outlet']
for i in variables_to_modify:
    salesdata[i] = le.fit_transform(salesdata[i])
#Now do One-Hot encoding for these variables
salesdata = pd.get_dummies(salesdata,columns = variables_to_modify)

In [None]:
salesdata.dtypes

In [None]:
#now that the preprocessing is done,we re-divide to train and test and delete the unnecessary columns
del salesdata['Item_Type']
traindata = salesdata.loc[salesdata['source']=='train']
testdata = salesdata.loc[salesdata['source']=='test']
del traindata['source']
testdata.drop(['source','Item_Outlet_Sales'],axis = 1, inplace = True) #for multiple columns. del won't work

In [50]:
#writing a generic function for running a model,cross validating and getting predictions
target = 'Item_Outlet_Sales'
IDcols = ['Item_Identifier','Outlet_Identifier']
def get_modelfit(alg,dtrain,dtest,predictors,target = target,IDcols = IDcols):
    #run the algorithm on train data
    alg.fit(dtrain[predictors],dtrain[target])
    #predict on train data
    dtrain_preds = alg.predict(dtrain[predictors])
    #perform cross validation
    cv_score = cross_validation.cross_val_score(alg,dtrain[predictors],dtrain[target],cv=20,scoring = 'mean_squared_error')
    cv_score = np.square(np.abs(cv_score))
    #get the model report
    print "\n Model Report \n"
    print "RMSE : %.4g " % np.sqrt(metrics.mean_squared_error(dtrain[target].values,dtrain_preds))
    print "CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),
                                                    np.std(cv_score),np.min(cv_score),np.max(cv_score))
    #predict on test data
    dtest[target] = alg.predict(dtest[predictors])
    #generate the prediction file containing the Item and Outlet identifiers and the predictions
    IDcols.append(target)
    resultdata = pd.DataFrame({x:dtest[x] for x in IDcols})
    return resultdata

In [51]:
#lets try out Linear regression model
from sklearn.linear_model import LinearRegression
predictors = [x for x in traindata.columns if x not in [target]+ IDcols]
alg1 = LinearRegression(normalize = True)
result1 = get_modelfit(alg1,traindata,testdata,predictors)


 Model Report 

RMSE : 1128 
CV Score : Mean - 1.641e+12 | Std - 2.576e+11 | Min - 1.336e+12 | Max - 2.152e+12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [80]:
result1.head()

Unnamed: 0,Item_Identifier,Item_Outlet_Sales,Outlet_Identifier
8523,FDW58,1842.0,OUT049
8524,FDW14,1583.5,OUT017
8525,NCN55,1912.5,OUT010
8526,FDQ58,2578.5,OUT017
8527,FDY38,5235.5,OUT027


In [52]:
#trying Decision tree regressor
from sklearn.tree import DecisionTreeRegressor
predictors = [x for x in traindata.columns if x not in [target]+ IDcols]
alg2 = DecisionTreeRegressor(max_depth = 15, min_samples_leaf = 100)
result2 = get_modelfit(alg2,traindata,testdata,predictors)


 Model Report 

RMSE : 1058 
CV Score : Mean - 1.434e+12 | Std - 2.382e+11 | Min - 1.011e+12 | Max - 1.981e+12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
result2.head()

In [89]:
#trying out random forest model
from sklearn.ensemble import RandomForestRegressor
predictors = [x for x in traindata.columns if x not in [target]+IDcols]
alg3 = RandomForestRegressor(n_estimators=1000,max_depth=6, min_samples_leaf=100,n_jobs=4)
result3 = get_modelfit(alg3, traindata, testdata, predictors)


 Model Report 

RMSE : 1068 
CV Score : Mean - 1.388e+12 | Std - 2.26e+11 | Min - 1.082e+12 | Max - 1.816e+12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [96]:
#we onbserve that decision tree performs the best in terms of RMSE, followed by Random forest and Linear regression.
#lets try an ensemble of the tree with decreasing order of importances
def get_Ensemblemodelfit(alg1,alg2,dtrain,dtest,predictors,target = target,IDcols = IDcols):
    dtrain_preds = pd.DataFrame()
    for i in range(1,3):
        alg = eval('alg'+str(i))
        #run the algorithm on train data
        alg.fit(dtrain[predictors],dtrain[target])
        #predict on train data
        dtrain_preds[i] = alg.predict(dtrain[predictors])
        #perform cross validation
        cv_score = cross_validation.cross_val_score(alg,dtrain[predictors],dtrain[target],cv=20,
                                                    scoring = 'mean_squared_error')
        cv_score = np.square(np.abs(cv_score))
        #get the model report
        print "\n Model Report for Algorithm ",i, "\n"
        print "CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),
                                                        np.std(cv_score),np.min(cv_score),np.max(cv_score))
        #predict on test data
        dtest[eval(repr(target+str(i)))] = alg.predict(dtest[predictors])
    #get the combined RMSE score
    dtrain_preds['final'] = dtrain_preds.mean(axis=1)
    print "Overall RMSE : %.4g " % np.sqrt(metrics.mean_squared_error(dtrain[target].values,dtrain_preds['final']))
    #finally take weighted average as the target column
    #I assigned the weights after trail and error to get minimum RMSE 
    dtest[target] = (dtest['Item_Outlet_Sales1']*0.7) + (dtest['Item_Outlet_Sales2']*0.3)
    #generate the prediction file containing the Item and Outlet identifiers and the predictions
    IDcols.append(target)
    resultdata = pd.DataFrame({x:dtest[x] for x in IDcols})
    return resultdata


In [92]:
#weightage for ensemble : LR .2, DT .5, RF .3
result4 = get_Ensemblemodelfit(alg1,alg2,alg3,traindata,testdata,predictors)


 Model Report for Algorithm  1 

CV Score : Mean - 1.641e+12 | Std - 2.576e+11 | Min - 1.336e+12 | Max - 2.152e+12

 Model Report for Algorithm  2 

CV Score : Mean - 1.434e+12 | Std - 2.382e+11 | Min - 1.011e+12 | Max - 1.981e+12

 Model Report for Algorithm  3 

CV Score : Mean - 1.388e+12 | Std - 2.262e+11 | Min - 1.08e+12 | Max - 1.817e+12
Overall RMSE : 1066 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy





In [99]:
result5.to_csv("BigmartSales.csv", index=False)

In [97]:
#weightage for ensemble : DT .7, RF .3
result5 = get_Ensemblemodelfit(alg2,alg3,traindata,testdata,predictors)


 Model Report for Algorithm  1 

CV Score : Mean - 1.434e+12 | Std - 2.382e+11 | Min - 1.011e+12 | Max - 1.981e+12

 Model Report for Algorithm  2 

CV Score : Mean - 1.387e+12 | Std - 2.261e+11 | Min - 1.076e+12 | Max - 1.815e+12
Overall RMSE : 1058 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



