In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('train_v9rqX0R.csv')
test = pd.read_csv('test_AbJTz2l.csv')

In [3]:
#Top 3 data
train.head(3)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27


In [4]:
# List of unique products
len(train['Item_Identifier'].unique())

1559

In [5]:
# Data description
train.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [6]:
# Finding number of null values in each column
train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [7]:
train['Item_Fat_Content'].unique() # Item fat content is only 2 - Low fat and Normal, replace others with a standard one

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [8]:
train.replace({'reg':'Regular','LF':'Low Fat','low fat':'Low Fat'},inplace = True)
test.replace({'reg':'Regular','LF':'Low Fat','low fat':'Low Fat'},inplace = True)

In [9]:
#replace null values with mean in train data
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].mean())

In [10]:
#replace null values with mean in test data
test['Item_Weight'] = test['Item_Weight'].fillna(test['Item_Weight'].mean())

In [11]:
train.isna().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [12]:
train['Outlet_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [13]:
train.loc[train['Outlet_Type']=='Grocery Store', 'Outlet_Size'] = 'Small'
train.loc[train['Outlet_Type']=='Supermarket Type1', 'Outlet_Size'] = train['Outlet_Size'].fillna('Small')
train.loc[train['Outlet_Type']=='Supermarket Type2', 'Outlet_Size'] = train['Outlet_Size'].fillna('Medium')

In [14]:
train.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [15]:
test.loc[test['Outlet_Type']=='Grocery Store', 'Outlet_Size'] = 'Small'
test.loc[test['Outlet_Type']=='Supermarket Type1', 'Outlet_Size'] = test['Outlet_Size'].fillna('Small')
test.loc[test['Outlet_Type']=='Supermarket Type2', 'Outlet_Size'] = test['Outlet_Size'].fillna('Medium')

In [16]:
test.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [17]:
train['Outlet_Age'] = 2013 - train['Outlet_Establishment_Year']
test['Outlet_Age'] = 2013 - test['Outlet_Establishment_Year']

In [18]:
categorical_columns = list(train.columns[train.dtypes == 'object'])
usefull = ['Item_Identifier','Outlet_Identifier']
for i in usefull:
    categorical_columns.remove(i)

In [19]:
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder() 
  
for column in categorical_columns:
    train[column]= label_encoder.fit_transform(train[column])

In [20]:
label_encoder = preprocessing.LabelEncoder() 
  
for column in categorical_columns:
    test[column]= label_encoder.fit_transform(test[column])

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics

In [22]:
del train['Outlet_Establishment_Year']
del test['Outlet_Establishment_Year']

In [23]:
IDcol = ['Item_Identifier','Outlet_Identifier']
predictors = [x for x in train.columns if x not in ['Item_Outlet_Sales']+IDcol]

In [24]:
X_train = train[predictors]
Y_train = train['Item_Outlet_Sales']
X_test = test[predictors]

#### Linear Regression

In [25]:
regressor = LinearRegression(normalize=True)  
regressor.fit(X_train, Y_train) #training the algorithm
print(regressor.intercept_)
#For retrieving the slope:
print(regressor.coef_)
y_pred = regressor.predict(X_test)
test['Item_Outlet_Sales']=y_pred
sample = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]

-481.8482129792187
[-1.05012427e+00  5.73471488e+01 -1.67861714e+03 -7.88218207e-01
  1.55769063e+01 -1.34921375e+02 -2.99972889e+02  9.42500972e+02
 -1.66865392e+00]


In [26]:
ridge = Ridge(alpha=0.05,normalize=True)
ridge.fit(X_train, Y_train) 
ridge_prediction = ridge.predict(X_test)
test['Item_Outlet_Sales']=ridge_prediction
ridgePred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
ridgePred

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1932.687773
1,FDW14,OUT017,1266.505104
2,NCN55,OUT010,2241.769863
3,FDQ58,OUT017,2251.784484
4,FDY38,OUT027,4906.878297
...,...,...,...
5676,FDB58,OUT046,2351.034417
5677,FDD47,OUT018,3045.545473
5678,NCO17,OUT045,1606.071229
5679,FDJ26,OUT017,3217.122865


#### Decision Tree

In [27]:
from sklearn.tree import DecisionTreeRegressor
DT = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100)
DT.fit(X_train, Y_train) 
DT_prediction = DT.predict(X_test)
test['Item_Outlet_Sales']=DT_prediction
DTPred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
DTPred

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1691.907802
1,FDW14,OUT017,1412.156205
2,NCN55,OUT010,584.757765
3,FDQ58,OUT017,2222.671282
4,FDY38,OUT027,6280.890880
...,...,...,...
5676,FDB58,OUT046,2245.822001
5677,FDD47,OUT018,2232.847022
5678,NCO17,OUT045,1771.688516
5679,FDJ26,OUT017,3627.549891


In [28]:
from sklearn.tree import DecisionTreeRegressor
DT1 = DecisionTreeRegressor(max_depth=8, min_samples_leaf=150)
DT1.fit(X_train, Y_train) 
DT1_prediction = DT1.predict(X_test)
test['Item_Outlet_Sales']=DT1_prediction
DT1Pred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
DT1Pred

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1612.413634
1,FDW14,OUT017,1367.982548
2,NCN55,OUT010,542.975540
3,FDQ58,OUT017,2384.015126
4,FDY38,OUT027,5669.163351
...,...,...,...
5676,FDB58,OUT046,2200.928318
5677,FDD47,OUT018,2346.648787
5678,NCO17,OUT045,1874.430960
5679,FDJ26,OUT017,3790.505601


#### Random Forest

In [29]:
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators=200,max_depth=5, min_samples_leaf=100,n_jobs=4)
print(RF.get_params())
RF.fit(X_train, Y_train) 
RF_prediction = RF.predict(X_test)
test['Item_Outlet_Sales']=RF_prediction
RFPred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': 4, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [30]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [31]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [32]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, Y_train) 
model_prediction = model.predict(X_test)
test['Item_Outlet_Sales']=model_prediction
modelPred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]

In [33]:
from xgboost import XGBRegressor
xg = XGBRegressor()
xg.fit(X_train, Y_train) 
xg_prediction = xg.predict(X_test)
test['Item_Outlet_Sales']=xg_prediction
xgPred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]

In [34]:
#! pip install tpot 
import tpot

In [35]:
from tpot import TPOTRegressor

tpot = TPOTRegressor(verbosity = 2,
                     generations=7,
                     population_size=100)

In [36]:
tpot.fit(X_train, Y_train)
tpot_pred = tpot.predict(X_test)
print(tpot_pred)
test['Item_Outlet_Sales'] = tpot_pred
tpotPred = test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
tpotPred.to_csv('tpotSubmission.csv',index=False)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=800.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: -1163696.0638291948

Generation 2 - Current best internal CV score: -1163696.0638291948

Generation 3 - Current best internal CV score: -1163696.0638291948

Generation 4 - Current best internal CV score: -1163172.7825385972

Generation 5 - Current best internal CV score: -1163172.7825385972

Generation 6 - Current best internal CV score: -1162742.7275459096

Generation 7 - Current best internal CV score: -1162248.2055551845

Best pipeline: ExtraTreesRegressor(RidgeCV(LinearSVR(input_matrix, C=25.0, dual=True, epsilon=0.001, loss=epsilon_insensitive, tol=1e-05)), bootstrap=True, max_features=0.45, min_samples_leaf=19, min_samples_split=19, n_estimators=100)
[1830.33421697 1475.20120259  672.32573728 ... 1818.45777791 3920.07417996
 1349.81463232]
