In [1]:
#importing pandas 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#reading the dataset

data=pd.read_csv("train.csv")

In [3]:
#dimensions of the dataset

data.shape

(8523, 12)

In [4]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Dummification 
* To avoid dimension errors perform the dummification before splitting the dataset and assigning the train and test datasets values, otherwise dummification has to be done at each step 
* puts dummy values in place of string values which is required inorder to fit the data onto the prediction model 
* The prediction model only accepts int and float values  

In [6]:
data=pd.get_dummies(data)

In [18]:
data.shape

(8523, 1605)

In [7]:
#making the train dataset 

train=data[0:7999]

In [8]:
#making the test dataset

test=data[8000:]

In [9]:
test.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
8000,7.02,0.081329,150.0734,2002,4454.202,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
8001,7.42,0.020388,247.1092,2004,4233.1564,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
8002,17.25,0.113518,253.5724,1997,5033.448,0,0,0,0,0,...,0,0,1,1,0,0,0,1,0,0
8003,18.75,0.052917,190.6504,2002,1342.2528,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
8004,20.25,0.018911,220.5772,2007,2446.1492,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


# Notes 
* Target variable : Item_Outlet_Sales
* sklearn algorithms (linear regression/logistic regression) requires **two parameters - the indepedent variable and dependent variables separately**
* The independent variable and dependent variable are available separately in x_train and y_train respectively
* Test dataset with only the independent variables to make predictions on and test our model is required and stored as x_test
* The **target variable is separated out of the test data** and stored to later evaluate the model in 'true_p' \[True predictions\]

In [10]:
x_train=train.drop('Item_Outlet_Sales', axis=1)

In [11]:
y_train=train['Item_Outlet_Sales']

In [12]:
x_test=test.drop('Item_Outlet_Sales', axis=1)

In [13]:
true_p=test['Item_Outlet_Sales']

# About sklearn
* It first creates an object and then the methods are fitted to that object 'lreg'
* fit() puts the data onto our training model 

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
lreg=LinearRegression()

In [16]:
lreg.fit(x_train,y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# Note 
* The ValueError string value not accepted is fixed by dummification 
* The ValueError Input contains NAN is fixed by using missing value treatment and replacing missing values with 0

In [19]:
x_train.fillna(0,inplace=True)

In [20]:
x_test.fillna(0,inplace=True)

In [21]:
lreg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [22]:
pred=lreg.predict(x_test)

In [24]:
#performance of our model -r2 [R Squared method]

lreg.score(x_test,true_p)

0.4023191793527754

In [25]:
lreg.score(x_train,y_train)

0.6497748754585606

# Evaluation of prediction model using RMSE

In [26]:
rmse_test = np.sqrt(np.mean(np.power((np.array(true_p)-np.array(pred)),2)))

In [27]:
rmse_train = np.sqrt(np.mean(np.power((np.array(y_train)-np.array(lreg.predict(x_train))),2)))

In [28]:
print(rmse_train)
print(rmse_test)

1013.0021806094803
1255.3481412890485


# Inference 
* Either the prediction model has been overfitted with the train data 
* or the test dataset is not a representation of the train dataset 