In [1]:
# Basic Libraries
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor



# Model Evaluation
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the dataset
df = pd.read_csv("Datasets\BigMart Sales Data.csv")

In [3]:
# Check the first few rows
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [5]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
df.shape

(8523, 12)

## Handling the missing values

In [7]:
# Fill missing values for Item_Weight with the median
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())

# Fill missing values for Outlet_Size with the mode (most frequent value)
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0])

# Verify if all missing values are handled
print("Missing values after filling:", df.isnull().sum())

Missing values after filling: Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


In [8]:
df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [9]:
Selected_columns = [ 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type']

In [10]:
numerical_columns = df.select_dtypes(include=[float, int]).columns

In [11]:
numerical_columns

Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Item_Outlet_Sales'],
      dtype='object')

In [12]:
categorical_columns =df.select_dtypes(include=[object]).columns

In [13]:
categorical_columns

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [14]:
for column in categorical_columns:
    print(f"Total Unique values in column {column}:{len(df[column].unique())}")
   

Total Unique values in column Item_Identifier:1559
Total Unique values in column Item_Fat_Content:5
Total Unique values in column Item_Type:16
Total Unique values in column Outlet_Identifier:10
Total Unique values in column Outlet_Size:3
Total Unique values in column Outlet_Location_Type:3
Total Unique values in column Outlet_Type:4


In [15]:
df.Item_Fat_Content.value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

## It needed to be cleaned !!!

In [16]:
# Clean 'Item_Fat_Content' by replacing variations
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular'
})

# Check if the cleaning was successful
print(df['Item_Fat_Content'].value_counts())


Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64


In [17]:
print(df['Outlet_Type'].value_counts())


Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: count, dtype: int64


In [18]:
print(df['Outlet_Location_Type'].value_counts())

Outlet_Location_Type
Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: count, dtype: int64


In [19]:
print(df['Outlet_Size'].value_counts())

Outlet_Size
Medium    5203
Small     2388
High       932
Name: count, dtype: int64


In [20]:
print(df['Outlet_Location_Type'].dtype)


object


In [21]:
df['Outlet_Size'].isnull().sum()

0

In [22]:
## label encoding for Outlet_Location_Type,Outlet_Size, Item_Type, Outlet Identifier

from sklearn.preprocessing import OrdinalEncoder

# Define the order of categories
categories_lt = ['Tier 1', 'Tier 2', 'Tier 3']
categories_s = ['Small', 'Medium', 'High']

# Create an instance of OrdinalEncoder
ordinal_encoder_lt = OrdinalEncoder(categories=[categories_lt])
ordinal_encoder_s = OrdinalEncoder(categories=[categories_s])
ordinal_encoder_t = OrdinalEncoder()
ordinal_encoder_I = OrdinalEncoder()

# Fit and transform the 'Outlet_Location_Type' column
df['Outlet_Location_Type'] = ordinal_encoder_lt.fit_transform(df[['Outlet_Location_Type']])

# Fit and transform the 'Outlet_Size' column
df['Outlet_Size'] = ordinal_encoder_s.fit_transform(df[['Outlet_Size']])

# Fit and transform the 'Item_Type' column
df['Item_Type'] = ordinal_encoder_t.fit_transform(df[['Item_Type']])


# Fit and transform the 'Outlet_Identifier' column
df['Outlet_Identifier'] = ordinal_encoder_I.fit_transform(df[['Outlet_Identifier']])

# Check the unique values after encoding
print(df['Outlet_Location_Type'].unique())

# Check the unique values after encoding
print(df['Outlet_Size'].unique())

# Check the unique values after encoding
print(df['Item_Type'].unique())

# Check the unique values after encoding
print(df['Outlet_Identifier'].unique())






[0. 2. 1.]
[1. 2. 0.]
[ 4. 14. 10.  6.  9.  0. 13.  5.  2.  8.  7.  3.  1. 15. 11. 12.]
[9. 3. 0. 1. 5. 7. 2. 8. 6. 4.]


## We will do one hot encoding for the other categorical columns:


In [23]:
# OH encoding for Item_Fat_Content,Outlet_Type

# One-Hot Encoding for 'Item_Fat_Content' and 'Outlet_Type'
df_encoded = pd.get_dummies(df, columns=['Item_Fat_Content', 'Outlet_Type'], drop_first=True)

# Display the encoded DataFrame
print(df_encoded)

     Item_Identifier  Item_Weight  Item_Visibility  Item_Type  Item_MRP  \
0              FDA15        9.300         0.016047        4.0  249.8092   
1              DRC01        5.920         0.019278       14.0   48.2692   
2              FDN15       17.500         0.016760       10.0  141.6180   
3              FDX07       19.200         0.000000        6.0  182.0950   
4              NCD19        8.930         0.000000        9.0   53.8614   
...              ...          ...              ...        ...       ...   
8518           FDF22        6.865         0.056783       13.0  214.5218   
8519           FDS36        8.380         0.046982        0.0  108.1570   
8520           NCJ29       10.600         0.035186        8.0   85.1224   
8521           FDN46        7.210         0.145221       13.0  103.1332   
8522           DRG01       14.800         0.044878       14.0   75.4670   

      Outlet_Identifier  Outlet_Establishment_Year  Outlet_Size  \
0                   9.0         

In [24]:
df_encoded.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year',
       'Outlet_Size', 'Outlet_Location_Type', 'Item_Outlet_Sales',
       'Item_Fat_Content_Regular', 'Outlet_Type_Supermarket Type1',
       'Outlet_Type_Supermarket Type2', 'Outlet_Type_Supermarket Type3'],
      dtype='object')

In [25]:
df_scaled = df_encoded.copy()

## Scaling the data(for linear regression)

In [26]:
from sklearn.preprocessing import StandardScaler

# Select the columns that need to be scaled
numeric_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
target = ['Item_Outlet_Sales']

# Initialize the scaler
scaler_f = StandardScaler()
scaler_t = StandardScaler()

# Apply scaling to the numeric columns
df_scaled[numeric_columns] = scaler_f.fit_transform(df_scaled[numeric_columns])
df_scaled[target] = scaler_t.fit_transform(df_scaled[target])


In [27]:
df_scaled.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Item_Fat_Content_Regular,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,FDA15,-0.831187,-0.970732,4.0,1.747454,9.0,0.139541,1.0,0.0,0.910601,False,True,False,False
1,DRC01,-1.63081,-0.908111,14.0,-1.489023,3.0,1.334103,1.0,2.0,-1.01844,True,False,True,False


In [28]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Item_Identifier                8523 non-null   object 
 1   Item_Weight                    8523 non-null   float64
 2   Item_Visibility                8523 non-null   float64
 3   Item_Type                      8523 non-null   float64
 4   Item_MRP                       8523 non-null   float64
 5   Outlet_Identifier              8523 non-null   float64
 6   Outlet_Establishment_Year      8523 non-null   int64  
 7   Outlet_Size                    8523 non-null   float64
 8   Outlet_Location_Type           8523 non-null   float64
 9   Item_Outlet_Sales              8523 non-null   float64
 10  Item_Fat_Content_Regular       8523 non-null   bool   
 11  Outlet_Type_Supermarket Type1  8523 non-null   bool   
 12  Outlet_Type_Supermarket Type2  8523 non-null   b

## Using Linear Regression

In [67]:
X_lr = df_scaled.drop(["Item_Identifier","Item_Outlet_Sales"], axis = "columns")

In [68]:
y_lr = df_scaled.Item_Outlet_Sales

In [69]:
def get_x_y(X,y):
    X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size =.2)
    return X_train, X_test, y_train, y_test

In [70]:
X_train, X_test, y_train, y_test = get_x_y(X_lr,y_lr)

In [71]:
model = LinearRegression()

In [72]:
model.fit(X_train,y_train)

In [73]:
# Example code to inverse transform predictions
y_pred_scaled = model.predict(X_test)
y_pred_scaled[:5]


array([-0.90181861, -0.08523486, -0.44095169,  0.58596297, -1.32660688])

In [74]:
y_pred_scaled = y_pred_scaled.reshape(-1, 1) # converting 1D to 2D becuse inverse_transform expects a 2D.
y_pred = scaler_t.inverse_transform(y_pred_scaled)
y_pred[:5]
y_test_scaled = y_test
# Convert y_test to NumPy array and reshape it
y_test = y_test.values  # Convert Series to NumPy array
if len(y_test.shape) == 1:  # Check if y_test is 1D
    y_test = y_test.reshape(-1, 1)
    
# Inverse transform the y_test data
y_test = scaler_t.inverse_transform(y_test)

# Print first 5 values of both y_pred and y_test to compare
print(y_pred[:5])
print(y_test[:5])

[[ 642.42608463]
 [2035.84419081]
 [1428.84916413]
 [3181.17583195]
 [ -82.4324132 ]]
[[ 327.5736]
 [3323.0078]
 [ 808.947 ]
 [4262.4516]
 [ 339.558 ]]


In [75]:
mse_lr = mean_squared_error(y_test,y_pred)
r2_lr = r2_score(y_test,y_pred)

In [76]:
print(f"Mean Squared Error for linear Regression is: {mse_lr}")

Mean Squared Error for linear Regression is: 1346938.8267293454


In [77]:
print(f"R2 score for linear Regression is: {r2_lr}")

R2 score for linear Regression is: 0.5750267851048723


In [78]:
score_train= model.score(X_train,y_train)
score_train

0.5588577713274163

In [79]:
score_test= model.score(X_test,y_test_scaled)
score_test

0.5750267851048723

## using Random Forest Regressor

In [52]:
X_rf = df_encoded.drop(["Item_Identifier","Item_Outlet_Sales"], axis = "columns")

In [53]:
y_rf = df_encoded.Item_Outlet_Sales

In [54]:
X_train, X_test, y_train, y_test = get_x_y(X_rf,y_rf)

In [55]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
y_predicted_rf = model.predict(X_test)
R2_score_rf = r2_score(y_test, y_predicted_rf)
mse = mean_squared_error(y_test, y_predicted_rf)

In [56]:
print(f"Mean Squared Error for Random Forest Regression is: {mse}")

Mean Squared Error for Random Forest Regression is: 1296791.2312902422


In [57]:
print(f"R2 score for Random Forest Regression is: {R2_score_rf}")

R2 score for Random Forest Regression is: 0.5569342320041304


## Cross val Score (K-Stratified fold)

In [58]:
from sklearn.model_selection import cross_val_score

In [59]:
scores_lr = cross_val_score(LinearRegression(), X_lr , y_lr, cv= 10 )

In [61]:
average_r2_l = scores_lr.mean()
print(f'Average R² Score for linear Regression: {average_r2_l}')


Average R² Score for linear Regression: 0.5597859789562751


In [62]:
scores_rf = cross_val_score(RandomForestRegressor(n_estimators= 100, random_state=0), X_rf , y_rf, cv= 10 )

In [63]:
average_r2_rf = scores_rf.mean()
print(f'Average R² Score for Random Forest Regression: {average_r2_rf}')

Average R² Score for Random Forest Regression: 0.554871145995373


## Can do proper tuning by changing the value of n_estimators and thus the Random Forest model can be improved.

## Hypertuning

In [70]:
model_params = {
 
    'random_forest': {
        'model': RandomForestRegressor(),
        'params' : {
            'n_estimators': [70,80,90]
        }
    }
    
}

In [71]:
from sklearn.model_selection import GridSearchCV
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(df_encoded.drop(["Item_Identifier","Item_Outlet_Sales"], axis = "columns"), df_encoded.Item_Outlet_Sales)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,random_forest,0.550449,{'n_estimators': 70}
