**Sales Prediction for Big Mart Outlets**

In [1]:
# importing the required libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training CSV into a DataFrame.
train_csv_path = "/content/train_XnW6LSF.csv"
train_data = pd.read_csv(train_csv_path)

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Drop the outlet/item identifier columns so they are not passed on as features.
train_data = train_data.drop(columns=['Outlet_Identifier', 'Item_Identifier'])

**Preprocessing**

In [5]:
# Count the missing values in each column (nothing is imputed in this cell).
train_data.isnull().sum()

Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
# Mean-impute the numeric columns only. numeric_only=True is needed because the
# frame still holds string columns at this point (Item_Fat_Content, Item_Type,
# ...); recent pandas versions raise a TypeError when asked for their mean.
numeric_means = train_data.mean(numeric_only=True)
train_data = train_data.fillna(numeric_means)
# Re-count the NaNs to verify; the string columns are not touched by this step.
train_data.isnull().sum()

Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [7]:
# Fill the missing Outlet_Size entries with the most frequent category.
# value_counts() sorts by frequency, so index[0] is the most common label.
# Passing a dict restricts the fill to Outlet_Size; a bare scalar would write
# that label into every NaN of every column.
most_common_size = train_data['Outlet_Size'].value_counts().index[0]
train_data = train_data.fillna({'Outlet_Size': most_common_size})
train_data.isnull().sum()

Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

**Converting non-numeric values to numeric values**

In [8]:
# Split the column names into a hand-listed numeric group and the remaining
# (categorical) group.
col = train_data.columns.tolist()
numeric_col = [
    'Item_MRP',
    'Item_Weight',
    'Item_Visibility',
    'Outlet_Establishment_Year',
    'Item_Outlet_Sales',
]
# Everything not listed as numeric is treated as categorical.
categorical_col = [name for name in col if name not in numeric_col]
print('\nNumeric columns: ', numeric_col, sep='\n')
print('\nCategorical columns: ', categorical_col, sep='\n')


Numeric columns: 
['Item_MRP', 'Item_Weight', 'Item_Visibility', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']

Categorical columns: 
['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [9]:
from sklearn.preprocessing import LabelEncoder

# A single encoder object is reused and re-fitted for every categorical column,
# so after the loop it only holds the classes of the last column processed.
le = LabelEncoder()
for col in categorical_col:
    # Replace the string categories of this column with integer codes.
    codes = le.fit_transform(train_data[col])
    train_data[col] = codes

In [10]:
# Preview the frame again: the categorical columns now hold integer codes.
train_data.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,1,0.016047,4,249.8092,1999,1,0,1,3735.138
1,5.92,2,0.019278,14,48.2692,2009,1,2,2,443.4228
2,17.5,1,0.01676,10,141.618,1999,1,0,1,2097.27
3,19.2,2,0.0,6,182.095,1998,1,2,0,732.38
4,8.93,1,0.0,9,53.8614,1987,0,2,1,994.7052


In [11]:
# Split the frame into model inputs and the prediction target.
# X: every column except the target.
X = train_data.drop(columns=['Item_Outlet_Sales'])
# y: the sales column the model is trained to predict.
y = train_data.loc[:, 'Item_Outlet_Sales']

**Applying the same preprocessing to the test data**

In [12]:
# Load the test data once and keep an untouched copy instead of reading the
# same file from disk twice. test_data is preprocessed in the following cells;
# original_data keeps the identifier columns needed for the submission file.
test_data = pd.read_csv("/content/test_FewQE9B.csv")
original_data = test_data.copy()
test_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [13]:
# Drop the same identifier columns that were removed from the training frame.
test_data = test_data.drop(columns=['Outlet_Identifier', 'Item_Identifier'])

In [14]:
# Count the missing values in each test column (nothing is imputed in this cell).
test_data.isnull().sum()

Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [15]:
# Mean-impute the numeric columns only. numeric_only=True is needed because the
# frame still holds string columns at this point; recent pandas versions raise
# a TypeError when asked for their mean.
numeric_means = test_data.mean(numeric_only=True)
test_data = test_data.fillna(numeric_means)
# Re-count the NaNs to verify; the string columns are not touched by this step.
test_data.isnull().sum()

Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [16]:
# Fill the missing Outlet_Size entries with the most frequent category.
# value_counts() sorts by frequency, so index[0] is the most common label.
# Passing a dict restricts the fill to Outlet_Size; a bare scalar would write
# that label into every NaN of every column.
most_common_size = test_data['Outlet_Size'].value_counts().index[0]
test_data = test_data.fillna({'Outlet_Size': most_common_size})
test_data.isnull().sum()

Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [17]:
# Encode the test categories with the mapping learned from the TRAINING data.
# Re-fitting the encoder on the test frame would derive the codes from the
# labels present in the test file, which are not guaranteed to match the codes
# the model is trained on.
# The raw training labels are re-read here because train_data has already been
# overwritten with integer codes by this point.
train_categories = pd.read_csv("/content/train_XnW6LSF.csv", usecols=categorical_col)
for col in categorical_col:
    # dropna() keeps only real labels; the missing Outlet_Size values were
    # imputed with an existing label in training, so the class set is the same.
    le.fit(train_categories[col].dropna())
    # transform() raises ValueError for a label never seen in training rather
    # than silently assigning it a code.
    test_data[col] = le.transform(test_data[col])

**Architecture of The Model**

In [18]:
import keras
import tensorflow as tf

In [19]:
# importing the sequential model
from keras.models import Sequential

In [20]:
# importing different layers from keras
from keras.layers import InputLayer, Dense 

In [21]:
# Number of feature columns in X.
X.shape[1]

9

In [22]:
# One input neuron per feature column of X.
input_layer_neurons = X.shape[1]

In [23]:
# Width of the final layer: one unit, i.e. one predicted value per row.
output_layer_neurons=1

In [24]:
# Widths of the fully connected hidden layers, listed in the order they are
# stacked (first hidden layer first).
hidden_layer_sizes = [1000, 950, 900, 850, 700, 650, 600, 550, 500, 450]
num_hidden_layers = len(hidden_layer_sizes)
(
    hidden_layer_1,
    hidden_layer_2,
    hidden_layer_3,
    hidden_layer_4,
    hidden_layer_5,
    hidden_layer_6,
    hidden_layer_7,
    hidden_layer_8,
    hidden_layer_9,
    hidden_layer_10,
) = hidden_layer_sizes

In [25]:
# Build the network: an input layer, ten ReLU hidden layers stacked in order,
# and a linear output layer.
hidden_units = [
    hidden_layer_1,
    hidden_layer_2,
    hidden_layer_3,
    hidden_layer_4,
    hidden_layer_5,
    hidden_layer_6,
    hidden_layer_7,
    hidden_layer_8,
    hidden_layer_9,
    hidden_layer_10,
]
model = Sequential()
model.add(InputLayer(input_shape=(input_layer_neurons,)))
for units in hidden_units:
    model.add(Dense(units=units, activation='relu'))
# Linear activation on the output so the prediction is an unbounded real value.
model.add(Dense(units=output_layer_neurons, activation='linear'))

In [26]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1000)              10000     
                                                                 
 dense_1 (Dense)             (None, 950)               950950    
                                                                 
 dense_2 (Dense)             (None, 900)               855900    
                                                                 
 dense_3 (Dense)             (None, 850)               765850    
                                                                 
 dense_4 (Dense)             (None, 700)               595700    
                                                                 
 dense_5 (Dense)             (None, 650)               455650    
                                                                 
 dense_6 (Dense)             (None, 600)               3

**Training the Model**

In [27]:
# Configure training: mean squared error loss, Adam optimizer, and RMSE
# reported as an additional metric.
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
model.compile(
    loss='mse',
    optimizer='Adam',
    metrics=[rmse_metric],
)

In [28]:
# Train on the full training frame for 100 epochs and keep fit()'s return
# value in model_history.
# NOTE(review): no validation data is passed, so the reported loss/RMSE are
# training-set figures only — consider a validation split to watch overfitting.
model_history = model.fit(X, y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [29]:
# Predict sales for the preprocessed test frame (this is the test set, not a
# validation split).
prediction = model.predict(test_data)

**Converting to CSV**

In [30]:
# Wrap the model output in a DataFrame with a single Item_Outlet_Sales column.
answer=pd.DataFrame(prediction, columns = ['Item_Outlet_Sales'])

In [31]:
# Start an empty frame that the submission columns are added to below.
ans=pd.DataFrame()

In [32]:
# Take the item identifiers from the untouched copy of the test file
# (they were dropped from test_data before prediction).
ans['Item_Identifier']=original_data['Item_Identifier']

In [33]:
# Take the outlet identifiers from the untouched copy of the test file as well.
ans['Outlet_Identifier']=original_data['Outlet_Identifier']

In [34]:
# Attach the predicted sales; rows are matched on the DataFrame index.
ans['Item_Outlet_Sales']=answer['Item_Outlet_Sales']

In [35]:
# Inspect the last rows of the assembled submission frame.
ans.tail()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
5676,FDB58,OUT046,2153.766357
5677,FDD47,OUT018,3072.425537
5678,NCO17,OUT045,1669.059448
5679,FDJ26,OUT017,3344.544922
5680,FDU37,OUT045,1119.414917


In [36]:
# Write the submission file; index=False keeps the row index out of the CSV.
ans.to_csv('Submission.csv', index=False)