# Problem Statement:
The data scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. The aim is to build a predictive model and find out the sales of each product at a particular store.

Using this model, BigMart will try to understand the properties of products and stores which play a key role in increasing the sales of their products.

# Importing Required Libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Importing Dataset:

In [None]:
# Read the training CSV into `df`; the bare `df` on the last line renders the
# frame as the cell output.
df=pd.read_csv("bigdatamart_Train.csv")
df

# EDA:

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.nunique()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
sns.heatmap(df.isnull())

In [None]:
df.describe()

In [None]:
df.columns

# Data Visualization:

In [None]:
# Name the column with `x=`: in current seaborn the first positional argument
# is `data`, so a positionally passed Series is no longer read as the x variable.
sns.countplot(x="Item_Fat_Content", data=df)

In [None]:
# Map the alternative spellings onto the two canonical labels in one pass.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(fat_content_aliases)

In [None]:
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Item_Fat_Content", data=df)

In [None]:
plt.figure(figsize=[15,10])
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Item_Type", data=df)
# Rotate the category labels so the item-type names do not overlap.
plt.xticks(rotation=90)
plt.title("Item_Type")

In [None]:
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Outlet_Size", data=df)

In [None]:
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Outlet_Location_Type", data=df)

In [None]:
plt.figure(figsize=[15,10])
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Outlet_Type", data=df)
plt.title("Outlet_Type")

In [None]:
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale draws the equivalent histogram-plus-curve figure.
sns.histplot(df['Item_Outlet_Sales'], kde=True, stat="density")

In [None]:
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale draws the equivalent histogram-plus-curve figure.
sns.histplot(df['Item_Weight'], kde=True, stat="density")

In [None]:
# Item weight per outlet type, coloured by the sales value.
fig, ax = plt.subplots(figsize=[5,10])
sns.scatterplot(x='Item_Weight', y='Outlet_Type', hue='Item_Outlet_Sales', data=df, ax=ax)

In [None]:
# Sales against item weight, coloured by item type.
fig, ax = plt.subplots(figsize=(5,10))
sns.scatterplot(
    data=df,
    x='Item_Weight',
    y='Item_Outlet_Sales',
    hue='Item_Type',
    ax=ax,
)
plt.show()

In [None]:
# Item type per outlet location tier, coloured by the sales value.
fig, ax = plt.subplots(figsize=(5,10))
sns.scatterplot(
    data=df,
    x='Outlet_Location_Type',
    y='Item_Type',
    hue='Item_Outlet_Sales',
    ax=ax,
)
plt.show()

In [None]:
# No `hue`: Item_Outlet_Sales is a continuous target, and using it as hue makes
# every distinct sales value its own category (one colour, legend entry and
# density curve each). The target is still shown as one of the plotted variables.
sns.pairplot(df)

# Correlation:

In [None]:
# Correlate numeric columns only: the frame also holds string columns, and
# pandas >= 2.0 raises on them instead of silently dropping them.
df.select_dtypes(include="number").corr()

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,7))
# Correlate numeric columns only: pandas >= 2.0 raises on the string columns
# instead of silently dropping them.
sns.heatmap(df.select_dtypes(include="number").corr(),annot=True,linewidth=0.5,fmt='.2f')

# Data Cleaning:

In [None]:
# Fill missing weights with the column mean. The result is assigned back to the
# column: calling fillna(inplace=True) on `df[...]` acts on a temporary object
# and does not update `df` under pandas copy-on-write.
df["Item_Weight"] = df["Item_Weight"].fillna(df["Item_Weight"].mean())

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing outlet sizes with the most frequent size.
emb = SimpleImputer(strategy="most_frequent")
# The imputer expects a 2-D input, hence the single-column reshape.
outlet_size_2d = df['Outlet_Size'].to_numpy().reshape(-1, 1)
df['Outlet_Size'] = emb.fit_transform(outlet_size_2d)

In [None]:
df.isnull().sum()

In [None]:
sns.heatmap(df.isnull())

# Checking Outliers:

In [None]:
df.plot(kind='box',fontsize=10,figsize=(20,10))

The box plot shows that `Item_Visibility` has outliers.

# Removing Outliers:

In [None]:
col=df[["Item_Visibility"]]

In [None]:
from scipy.stats import zscore
# Absolute z-score of every value in `col`.
z=np.abs(zscore(col))
# Keep the rows whose z-scores are all below 3. `.copy()` makes df_new an
# independent frame, so the column assignments done on it later are not writes
# into a slice of `df` (SettingWithCopyWarning / silently lost updates).
df_new=df[(z<3).all(axis=1)].copy()
df_new

In [None]:
df.shape

In [None]:
df_new.shape

# Percentage of Data Loss:

In [None]:
# Percentage of rows dropped by the outlier filter, computed from the frames
# themselves so the figure stays correct if the data or the threshold changes.
Data_loss = (len(df) - len(df_new)) / len(df) * 100
Data_loss

# Skewness:

In [None]:
# Skewness is only defined for numeric columns; df_new still contains string
# columns at this point and pandas >= 2.0 raises on them without this flag.
df_new.skew(numeric_only=True)

In [None]:
sk=["Item_Visibility"]

In [None]:
from sklearn.preprocessing import PowerTransformer
Scaler=PowerTransformer(method='yeo-johnson')

In [None]:
# Fit `Scaler` on the columns listed in `sk` and overwrite those columns in
# df_new with the transformed values.
df_new[sk] = Scaler.fit_transform(df_new[sk].values)

# Encoding the Data:

In [None]:
from sklearn.preprocessing import LabelEncoder as LE

In [None]:
cols=["Item_Identifier", "Item_Fat_Content", "Item_Type", "Outlet_Identifier", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type"]

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder per categorical column, kept in `label_encoders`, so every
# fitted label -> integer mapping stays available (for inverse_transform or for
# encoding new data the same way). A single shared encoder is refit on each
# column and ends up remembering only the last one.
label_encoders = {}
for cat_col in cols:
    label_encoders[cat_col] = LabelEncoder()
    df_new[cat_col] = label_encoders[cat_col].fit_transform(df_new[cat_col])

In [None]:
df_new

# Data Preprocessing:

In [None]:
# Target vector first, then the feature matrix with the target column removed.
y = df_new["Item_Outlet_Sales"]
x = df_new.drop(columns=["Item_Outlet_Sales"])

# Scaling:

In [None]:
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler()
# Rebuild the frame with x's own row labels. Without `index=` the scaled frame
# gets a fresh 0..n-1 index while `y` keeps the labels left after outlier
# removal, so any label-based alignment of x and y would pair the wrong rows.
x=pd.DataFrame(mms.fit_transform(x),columns=x.columns,index=x.index)
x

# Finding Best Random State:

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# For random_state 1..199: make a 70/30 split, fit a LinearRegression on the
# training part and score it with R2 on the test part; remember the best score
# (maxAccu) and the seed that produced it (maxRS).
# NOTE(review): the seed is chosen by its test-set score, so the printed
# maximum is an optimistic estimate. The split made right after this cell uses
# random_state=10 and test_size=.20 rather than maxRS — confirm this is intended.
maxAccu=0
maxRS=0
for i in range(1,200):
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.30, random_state=i)
    mod = LinearRegression()
    mod.fit(x_train, y_train)
    pred = mod.predict(x_test)
    acc=r2_score(y_test, pred)
    # Keep only strictly better scores, so ties resolve to the earliest seed.
    if acc>maxAccu:
        maxAccu=acc
        maxRS=i
print("Maximum r2 score is ",maxAccu," on Random_state ",maxRS)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=10,test_size=.20)

# Random Forest Regressor:

In [None]:
# Fit a default random forest and report R2 / MAE / MSE on the held-out split.
RFR = RandomForestRegressor()
pred = RFR.fit(x_train, y_train).predict(x_test)
acc = r2_score(y_test, pred)
print('R2_Score:', acc)
print('MAE:', mean_absolute_error(y_test, pred))
print('MSE:', mean_squared_error(y_test, pred))

# Decision Tree Regressor:

In [None]:
# Fit a decision tree (this section previously instantiated a second
# RandomForestRegressor, so no decision tree was ever evaluated) and report
# R2 / MAE / MSE on the held-out split.
DTR=DecisionTreeRegressor()
DTR.fit(x_train,y_train)
pred=DTR.predict(x_test)
acc=r2_score(y_test,pred)
print('R2_Score:',acc)
print('MAE:',mean_absolute_error(y_test, pred))
print('MSE:',mean_squared_error(y_test, pred))

# KNeighbors Regressor:

In [None]:
# Fit a default k-nearest-neighbours regressor and report the same three
# metrics on the held-out split.
KN = KNeighborsRegressor()
KN.fit(x_train, y_train)
pred = KN.predict(x_test)
acc = r2_score(y_test, pred)
for label, metric in (
    ('R2_Score:', r2_score),
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
):
    print(label, metric(y_test, pred))

# Linear Regression:

In [None]:
# Fit ordinary least squares and report R2 / MAE / MSE on the held-out split.
LR = LinearRegression().fit(x_train, y_train)
predLR = LR.predict(x_test)
print('R2_score:', r2_score(y_test, predLR))
print('MAE:', mean_absolute_error(y_test, predLR))
print('MSE:', mean_squared_error(y_test, predLR))

# Cross Validation Score:

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validate the `DTR` estimator on the full x/y with 2 to 7 folds and
# print the mean fold score multiplied by 100 for each fold count.
for i in range(2,8):
    cr=cross_val_score(DTR,x,y,cv=i)
    cr_mean=cr.mean()
    print("at cv= ", i)
    print('cross val score = ',cr_mean*100)

In [None]:
print(cross_val_score(RFR,x,y,cv=6).mean())

In [None]:
print(cross_val_score(DTR,x,y,cv=6).mean())

In [None]:
print(cross_val_score(KN,x,y,cv=6).mean())

In [None]:
print(cross_val_score(LR,x,y,cv=6).mean())

# Saving:

In [None]:
import joblib
# Serialize the fitted `RFR` estimator to "Big_Data_Mart_Sales.pkl" in the
# current working directory.
joblib.dump(RFR,"Big_Data_Mart_Sales.pkl")

# Now Let's Import Test Dataset:

In [None]:
df_test=pd.read_csv("bigdatamart_test.csv")
df_test.head()

In [None]:
df_test.shape

In [None]:
df_test.dtypes

In [None]:
df_test.info()

In [None]:
df_test.nunique()

In [None]:
df_test.isnull().sum()

In [None]:
sns.heatmap(df_test.isnull())

# Data Visualization:

In [None]:
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Item_Fat_Content", data=df_test)

In [None]:
# Map the alternative spellings onto the two canonical labels in one pass.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
df_test["Item_Fat_Content"] = df_test["Item_Fat_Content"].replace(fat_content_aliases)

In [None]:
# Column is given by keyword because seaborn's first positional argument is
# `data`, not `x`.
sns.countplot(x="Item_Fat_Content", data=df_test)

# Data Cleaning:

In [None]:
# Fill missing weights with the column mean. The result is assigned back to the
# column: calling fillna(inplace=True) on `df_test[...]` acts on a temporary
# object and does not update `df_test` under pandas copy-on-write.
df_test["Item_Weight"] = df_test["Item_Weight"].fillna(df_test["Item_Weight"].mean())

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing outlet sizes with the most frequent size.
emb = SimpleImputer(strategy="most_frequent")
# The imputer expects a 2-D input, hence the single-column reshape.
outlet_size_2d = df_test['Outlet_Size'].to_numpy().reshape(-1, 1)
df_test['Outlet_Size'] = emb.fit_transform(outlet_size_2d)

In [None]:
sns.heatmap(df_test.isnull())

# Outliers:

In [None]:
df_test.plot(kind='box',fontsize=10,figsize=(20,10))

# Removing the Outliers:

In [None]:
col=df_test[["Item_Visibility"]]

In [None]:
from scipy.stats import zscore
# Absolute z-score of every value in `col`.
z=np.abs(zscore(col))
# Keep the rows whose z-scores are all below 3. `.copy()` makes df_test_new an
# independent frame, so the column assignments done on it later are not writes
# into a slice of `df_test`.
# NOTE(review): rows dropped here receive no prediction — confirm that is acceptable.
df_test_new=df_test[(z<3).all(axis=1)].copy()
df_test_new

In [None]:
df_test.shape

In [None]:
df_test_new.shape

# Percentage of Data Loss:

In [None]:
# Percentage of test rows dropped by the outlier filter, computed from the
# frames themselves instead of hard-coded row counts.
Data_loss = (len(df_test) - len(df_test_new)) / len(df_test) * 100

In [None]:
Data_loss

# Skewness:

In [None]:
# Skewness is only defined for numeric columns; the frame still contains string
# columns at this point and pandas >= 2.0 raises on them without this flag.
df_test_new.skew(numeric_only=True)

In [None]:
from sklearn.preprocessing import PowerTransformer
Scaler=PowerTransformer(method='yeo-johnson')

In [None]:
sk=['Item_Visibility']

In [None]:
# Transform the *test* frame. This cell previously operated on df_new, which
# power-transformed the training column a second time and left the test column
# untouched.
# NOTE(review): `Scaler` is refit on the test data here; applying the
# transformer fitted on the training data would keep both sets on one scale.
df_test_new[sk] = Scaler.fit_transform(df_test_new[sk].values)

# Encoding:

In [None]:
from sklearn.preprocessing import LabelEncoder as LE

In [None]:
cols=["Item_Identifier", "Item_Fat_Content", "Item_Type", "Outlet_Identifier", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type"]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the test categoricals with the label -> integer mapping of the
# training rows. Encoders fitted on the test data assign different integers to
# the same label whenever the two label sets differ, which feeds the model
# codes it was not trained on.
for cat_col in cols:
    LE=LabelEncoder()
    # `df` is never label-encoded (only df_new is), so it still holds the string
    # labels; restrict it to the rows that were kept in df_new.
    LE.fit(df.loc[df_new.index, cat_col])
    train_codes = {label: code for code, label in enumerate(LE.classes_)}
    # Labels that do not occur in the training rows get the sentinel code -1.
    df_test_new[cat_col] = df_test_new[cat_col].map(train_codes).fillna(-1).astype(int)

# Scaling:

In [None]:
# Scale the test features with `mms`, the MinMaxScaler already fitted on the
# training features. Fitting a new scaler on the test set maps the same raw
# value to a different scaled value than the one the model was trained on.
# `index=` keeps the test row labels so predictions can be traced back to rows.
df_test_new=pd.DataFrame(mms.transform(df_test_new),columns=df_test_new.columns,index=df_test_new.index)
df_test_new


Now let's load our trained model.

In [217]:
Model=joblib.load("Big_Data_Mart_Sales.pkl")

In [211]:
prdct = Model.predict(x_test)

In [212]:
pred = pd.DataFrame(prdct)

In [215]:
pred

Unnamed: 0,0
0,3895.702328
1,4049.921582
2,174.466232
3,856.292038
4,834.626906
...,...
1681,1427.009140
1682,2404.809678
1683,292.958658
1684,1721.385952


In [218]:
# Predict for the prepared test set. This cell previously predicted on x_train,
# so the file saved at the end of the notebook contained predictions for
# training rows rather than for the imported test data.
prdct=Model.predict(df_test_new)

In [None]:
# Wrap the predictions in a frame. The variable is `prdct` (lower-case);
# `Prdct` is never defined and raised a NameError.
pred2=pd.DataFrame(prdct)
pred2

Now, let's save the predicted values.

In [None]:
pred2.to_csv('Sales.csv')