In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import *
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor

warnings.filterwarnings('ignore')

<h3>Column lists</h3>

* Store ID:  (Index) ID of the particular store.

* Store_Area:  Physical area of the store in square yards.

* Items_Available:  Number of different items available in the corresponding store.

* Daily_Customer_Count:  Average number of customers who visited the store over a month.

* Store_Sales:  Sales (in US $) that the store made.

In [None]:
# Load the store data and discard the store identifier column, which this
# analysis treats as irrelevant (note the trailing space in the raw column name).
df = pd.read_csv('/kaggle/input/stores-area-and-sales-data/Stores.csv').drop(columns='Store ID ')
df

In [None]:
# Inspect column dtypes and non-null counts; the author's check found no null values here.

df.info()

In [None]:
# Summary statistics, transposed so that each feature is shown as one row.
df.describe().transpose()

In [None]:
# Pairwise correlations, rounded to two decimals for the cell annotations.
corr_matrix = df.corr().round(2)

plt.figure(figsize=(5, 5))
# Fix the colour scale to the full [-1, 1] correlation range.
ax = sns.heatmap(corr_matrix, annot=True, cmap='RdPu', vmin=-1, vmax=1)

> As the correlation between Items_Available and Store_Area is 1 (after rounding to two decimals), the two features have an almost perfectly linear relationship.

In [None]:
# Scatter of Store_Area against Items_Available to inspect their relationship visually.
sns.scatterplot(x=df["Store_Area"], y=df["Items_Available"])

<div style="color:black;
           display:fill;
           border-radius:5px;
           background-color:#b4e6fa;
           font-size:300%;
           font-family:Verdana;
           letter-spacing:1px">
<a class="anchor" id="1"></a> 
<p style="font-size:30px;text-align:left">Exploratory data analysis📊</p>
</div>

In [None]:
# Columns to explore; this list is reused by later cells.
features = ['Store_Area', 'Items_Available', 'Daily_Customer_Count', 'Store_Sales']

# 2x2 grid: one histogram (with a KDE overlay) per feature.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
fig.tight_layout(pad=4.0)

for feature_name, panel in zip(features, axs.flat):
    sns.histplot(data=df, x=feature_name, kde=True, ax=panel)
    panel.set_title(f'Feature:{feature_name}')

Judging from the histograms, all four features appear to be approximately normally distributed.

In [None]:
# 2x2 grid: one box plot per feature, to show spread and potential outliers.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 10))
fig.tight_layout(pad=4.0)

for feature_name, panel in zip(features, axs.flat):
    sns.boxplot(data=df, y=feature_name, ax=panel)
    panel.set_title(f'Feature:{feature_name}')

In [None]:
# Restrict the frame to the explored features and plot every pairwise relationship.
df_drop_id = df.loc[:, features]
sns.pairplot(data=df_drop_id)

Apart from the Store_Area / Items_Available pair,

the other feature pairs are generally densely concentrated around the center of their ranges.

A contour (density) plot is better suited to describing the relationship between those features.

<H3>Focusing on Daily_Customer_Count and Store_Sales...</H3>

In [None]:
# Bivariate density of Store_Sales (x) against Store_Area (y).
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
sns.kdeplot(x=df.Store_Sales, y=df.Store_Area, cmap="Blues", fill=True, thresh=0, cbar=True)
plt.grid()

In [None]:
# Bivariate density of Store_Sales (x) against Items_Available (y).
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
sns.kdeplot(x=df.Store_Sales, y=df.Items_Available, cmap="Blues", fill=True, thresh=0, cbar=True)
plt.grid()

In [None]:
# Bivariate density of Store_Sales (x) against Daily_Customer_Count (y).
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
sns.kdeplot(x=df.Store_Sales, y=df.Daily_Customer_Count, cmap="Blues", fill=True, thresh=0, cbar=True)
plt.grid()

In [None]:
# Bivariate density of Daily_Customer_Count (x) against Items_Available (y).
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
sns.kdeplot(x=df.Daily_Customer_Count, y=df.Items_Available, cmap="Blues", fill=True, thresh=0, cbar=True)
plt.grid()

In [None]:
# Bivariate density of Daily_Customer_Count (x) against Store_Sales (y).
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
sns.kdeplot(x=df.Daily_Customer_Count, y=df.Store_Sales, cmap="Blues", fill=True, thresh=0, cbar=True)
plt.grid()

The contour plots show where the observations are most densely concentrated.

In general:

<h4>For Store_Sales from 40,000 to 80,000:</h4>


* Store_Area ranges from 1250 to 1750

* Items_Available ranges from 1500 to 2000

* Daily_Customer_Count ranges from 500 to 1000


<h4>For Daily_Customer_Count from 500 to 1,000:</h4>


* Items_Available ranges from 1500 to 2000

* Store_Sales ranges from 40000 to 80000

<div style="color:black;
           display:fill;
           border-radius:5px;
           background-color:#f7b96d;
           font-size:300%;
           font-family:Verdana;
           letter-spacing:1px">
<a class="anchor" id="2"></a> 
<p style="font-size:30px;text-align:left">Sales Prediction🎯</p>
</div>

The machine learning models used are:

* LinearRegression
* RandomForest
* SupportVectorMachine
* GradientBoosting
* XGBoost
* LGBM
* ElasticNet

> None of the models has been tuned (all use their default hyper-parameters), so there is likely room for improvement.

In [None]:
# Features are every column except the last; the target is the last column
# of `df` (expected to be Store_Sales).
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Hold out part of the data for evaluation. Scoring a model on the very rows it
# was fitted on only measures how well it memorised them, which makes flexible
# models (forests, boosting) look far better than they are on unseen stores.
SEED = 42  # fixed seed so the split and the stochastic models are reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SEED
)

lg_reg = LinearRegression()
rf_reg = RandomForestRegressor(random_state=SEED)
svm_reg = SVR()
gb_reg = GradientBoostingRegressor(random_state=SEED)
xgb_reg = XGBRegressor(random_state=SEED)
lgbm_reg = LGBMRegressor(random_state=SEED)
en_reg = ElasticNet()

reg_list = [lg_reg, rf_reg, svm_reg, gb_reg, xgb_reg, lgbm_reg, en_reg]

for reg in reg_list:

    # Fit on the training split only.
    reg.fit(x_train, y_train)

    # Predict the held-out rows; all error metrics below are test-set metrics.
    y_pred = reg.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    score = reg.score(x_test, y_test)            # R^2 on unseen data
    train_score = reg.score(x_train, y_train)    # R^2 on fitted data, to expose overfitting

    print(
        'Regressor:{}\nMSE:{:.2f}\nRMSE:{:.2f}\nMAE:{:.2f}\nScore:{:.4f}\nTrain score:{:.4f}\n\n'.format(
            str(reg), mse, rmse, mae, score, train_score
        )
    )

🙌🙌

Measured on the same data the models were fitted on (i.e. in-sample, not on unseen stores), the best model is XGBoost, with

MSE:  22500856.64

RMSE:  4743.51

MAE:  3478.12

Score:  0.9238



<p style="text-align:center;"><img src="https://i.pinimg.com/originals/21/08/6a/21086a10e749f90dddf38297b949895d.gif" width="300" height="200"></p>

Thanks for viewing my notebook!

I hope you enjoy it.

Please upvote if you like it or find it useful.

Feel free to leave a comment and check out my other notebooks😃. 