## 1. Setup and Imports

In [41]:
import pandas as pd
import numpy as np
import kagglehub
import os
import plotly.express as px
import nbformat
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

## 2. Data Loading

In [42]:
path = kagglehub.dataset_download("akashdeepkuila/big-mart-sales")

print("Path to dataset files:", path)
print(os.listdir(path))
path = os.path.join(path, "Train-Set.csv")

Using Colab cache for faster access to the 'big-mart-sales' dataset.
Path to dataset files: /kaggle/input/big-mart-sales
['Test-Set.csv', 'Train-Set.csv']


### 2.1 Dataset Structure and Summary

In [43]:
df = pd.read_csv(path)

print("\nFirst 5 rows: ")
display(df.head())
print("\nLast 5 rows: ")
display(df.tail())

print(f"Dataset rows: {df.shape[0]}, Columns: {df.shape[1]}\n")

print("Columns:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2}. {col:<20}"
          f"Type: {str(df[col].dtype):<7}"
          f" | Missing: {df[col].isna().sum():<4} "
          f"({df[col].isnull().sum()/df.shape[0]*100:5.2f}%)"
          f" | Unique: {df[col].nunique():<7}"
          f" | duplicates: {df.duplicated().sum()}")


df.drop(["ProductID", "OutletID"], axis=1, inplace=True)



First 5 rows: 


Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052



Last 5 rows: 


Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.38,Regular,0.046982,Baking Goods,108.157,OUT045,2002,,Tier 2,Supermarket Type1,549.285
8520,NCJ29,10.6,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976
8522,DRG01,14.8,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Small,Tier 1,Supermarket Type1,765.67


Dataset rows: 8523, Columns: 12

Columns:
   1. ProductID           Type: object  | Missing: 0    ( 0.00%) | Unique: 1559    | duplicates: 0
   2. Weight              Type: float64 | Missing: 1463 (17.17%) | Unique: 415     | duplicates: 0
   3. FatContent          Type: object  | Missing: 0    ( 0.00%) | Unique: 5       | duplicates: 0
   4. ProductVisibility   Type: float64 | Missing: 0    ( 0.00%) | Unique: 7880    | duplicates: 0
   5. ProductType         Type: object  | Missing: 0    ( 0.00%) | Unique: 16      | duplicates: 0
   6. MRP                 Type: float64 | Missing: 0    ( 0.00%) | Unique: 5938    | duplicates: 0
   7. OutletID            Type: object  | Missing: 0    ( 0.00%) | Unique: 10      | duplicates: 0
   8. EstablishmentYear   Type: int64   | Missing: 0    ( 0.00%) | Unique: 9       | duplicates: 0
   9. OutletSize          Type: object  | Missing: 2410 (28.28%) | Unique: 3       | duplicates: 0
  10. LocationType        Type: object  | Missing: 0    ( 0.00%) | 

### 2.2 Data Quality Check

In [44]:
missing_count = df.isna().sum().sum()
if missing_count == 0:
    print("No missing values in the dataset.")
else:
    print(f"\n  Number of missing values in the dataset: {missing_count} in following columns:")
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"    {col}")

duplicate_count = df.duplicated().sum()
if duplicate_count == 0:
    print("\nNo duplicate rows in the dataset.")
else:
    print(f"\n  Number of duplicate rows in the dataset: {duplicate_count} in following rows:")
    duplicated_df = df[df.duplicated()]
    for col in duplicated_df.columns:
        print(f"    {col}")



  Number of missing values in the dataset: 3873 in following columns:
    Weight
    OutletSize

No duplicate rows in the dataset.


In [45]:
cat_columns = df.select_dtypes(include=['object'])
for col in cat_columns:
    print(f"\nValue counts for column {df[col].value_counts(dropna=False)}:")



Value counts for column FatContent
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64:

Value counts for column ProductType
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: count, dtype: int64:

Value counts for column OutletSize
Medium    2793
NaN       2410
Small     2388
High       932
Name: count, dtype: int64:

Value counts for column LocationType
Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: count, dtype: int64:

Value counts for column OutletType
Supermarket Type1    5577
Grocery Store     

In [46]:
num_summary = df.select_dtypes(include=['number']).describe()
print("\nStatistical summary of numerical columns:")
display(num_summary)


Statistical summary of numerical columns:


Unnamed: 0,Weight,ProductVisibility,MRP,EstablishmentYear,OutletSales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


## 3. Data Cleaning and Preprocessing

### 3.1 Column Renaming

In [47]:
df.rename(columns={
    "OutletType": "StoreCategory",
    "LocationType": "CityTier",
    "OutletSize": "StoreSize",
    "OutletSales": "StoreSales"
}, inplace=True)

df["FatContent"] = df["FatContent"].replace({
    "LF": "Low Fat",
    "low fat": "Low Fat",
    "reg": "Regular",
})

print("\nUpdated fat content values:\n")
print(df["FatContent"].value_counts(dropna=False))


Updated fat content values:

FatContent
Low Fat    5517
Regular    3006
Name: count, dtype: int64


### 3.2 Missing Store Size Values Analysis

In [48]:
display(pd.crosstab(df["StoreCategory"], df["StoreSize"], dropna=False))
display(pd.crosstab(df["CityTier"], df["StoreSize"], dropna=False))

size_counts = df.groupby(
    ["StoreCategory", "CityTier"],
)["StoreSize"].value_counts(dropna=False).unstack(fill_value=0)

display(size_counts)

agg_sales = df.groupby(
    ["StoreCategory", "CityTier", "StoreSize"],
    dropna=False
)["StoreSales"].agg(["count", "mean", "median"])

display(agg_sales)

StoreSize,High,Medium,Small,NaN
StoreCategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Grocery Store,0,0,528,555
Supermarket Type1,932,930,1860,1855
Supermarket Type2,0,928,0,0
Supermarket Type3,0,935,0,0


StoreSize,High,Medium,Small,NaN
CityTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tier 1,0,930,1458,0
Tier 2,0,0,930,1855
Tier 3,932,1863,0,555


Unnamed: 0_level_0,StoreSize,High,Medium,Small,NaN
StoreCategory,CityTier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Grocery Store,Tier 1,0,0,528,0
Grocery Store,Tier 3,0,0,0,555
Supermarket Type1,Tier 1,0,930,930,0
Supermarket Type1,Tier 2,0,0,930,1855
Supermarket Type1,Tier 3,932,0,0,0
Supermarket Type2,Tier 3,0,928,0,0
Supermarket Type3,Tier 3,0,935,0,0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,median
StoreCategory,CityTier,StoreSize,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Grocery Store,Tier 1,Small,528,340.329723,265.3213
Grocery Store,Tier 3,,555,339.351662,250.3408
Supermarket Type1,Tier 1,Medium,930,2348.354635,1966.1074
Supermarket Type1,Tier 1,Small,930,2277.844267,1945.8005
Supermarket Type1,Tier 2,Small,930,2438.841866,2109.2544
Supermarket Type1,Tier 2,,1855,2266.410119,1946.7992
Supermarket Type1,Tier 3,High,932,2298.995256,2050.664
Supermarket Type2,Tier 3,Medium,928,1995.498739,1655.1788
Supermarket Type3,Tier 3,Medium,935,3694.038558,3364.9532


In [49]:

df_plot = df.copy()
df_plot["StoreSize"] = df_plot["StoreSize"].fillna("Missing")

counts = (
    df_plot.groupby(["StoreCategory","CityTier","StoreSize"])
           .size().reset_index(name="Count")
)

color_map = {
    "Small": "#4DB6FF",     # light blue
    "Medium": "#FFB74D",    # orange-gold
    "High": "#81C784",      # lime green
    "Missing": "#F06292"    # pink/red
}

fig = px.bar(
    counts,
    x="StoreCategory", y="Count",
    color="StoreSize",
    facet_col="CityTier",
    barmode="group",
    color_discrete_map=color_map,
    category_orders={
        "CityTier": ["Tier 1","Tier 2","Tier 3"],
        "StoreCategory": ["Grocery Store","Supermarket Type1","Supermarket Type2","Supermarket Type3"],
        "StoreSize": ["Small","Medium","High","Missing"]
    },
    title="Value counts of Store Size by Store Category and City Tier"
)
fig.update_layout(
    width=1000, height=550, bargap=0.15,
    plot_bgcolor="black",
    paper_bgcolor="black",
    font=dict(color="white"),
    title_font=dict(size=22, color="white"),
    legend=dict(
        title="Store Size",
        font=dict(color="white"),
        bgcolor="black"
    )
)
fig.show()

fig = px.box(
    df_plot[df_plot["StoreSize"].isin(["Small","Missing"])],
    x="StoreSize", y="StoreSales", color="StoreSize",
    color_discrete_map=color_map,
    title="Store Sales Distribution in Small vs Missing"
)
fig.update_layout(
    xaxis_title="Store Size",
    yaxis_title="Store Sales",
    plot_bgcolor="black",
    paper_bgcolor="black",
    font=dict(color="white"),
    title_font=dict(size=22, color="white"),
    legend=dict(
        title="Store Size",
        font=dict(color="white"),
        bgcolor="black"
    )
)
fig.show()


### 3.3 Store Size Imputation

In [50]:
df.loc[(df["StoreSize"].isna()) &
       (df["StoreCategory"] == "Grocery Store") &
       (df["CityTier"] == "Tier 3"), "StoreSize"] = "Small"

df.loc[(df["StoreSize"].isna()) &
       (df["StoreCategory"] == "Supermarket Type1") &
       (df["CityTier"] == "Tier 2"), "StoreSize"] = "Small"

print(df["StoreSize"].value_counts(dropna=False))


StoreSize
Small     4798
Medium    2793
High       932
Name: count, dtype: int64


### 3.4 Product Weight Analysis

In [51]:
weight_mean = df["Weight"].mean()
weight_median = df["Weight"].median()
print(f"\nWeight total mean: {weight_mean:.2f}, total median: {weight_median:.2f}")

weight_stats = df.groupby("ProductType")["Weight"].agg(["count","mean","median","std"])
display(weight_stats.head())



Weight total mean: 12.86, total median: 12.60


Unnamed: 0_level_0,count,mean,median,std
ProductType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Baking Goods,536,12.277108,11.65,4.773622
Breads,204,11.346936,10.6,4.44054
Breakfast,89,12.768202,10.695,5.038131
Canned,539,12.305705,12.15,4.586564
Dairy,566,13.426069,13.35,4.686532


### 3.5 Weight Imputation

In [52]:
df["Weight"] = df["Weight"].fillna(
    df.groupby("ProductType")["Weight"].transform("median")
)

### 3.6 Product Visibility Analysis

In [53]:
visibility_mean = df["ProductVisibility"].mean()
visibility_median = df["ProductVisibility"].median()
print(f"Visibility total mean: {visibility_mean}, total median: {visibility_median}")

visibility_stats = df.groupby("ProductType")["ProductVisibility"].agg(["count","mean","median","std"])
display(visibility_stats.head())

Visibility total mean: 0.06613202877895108, total median: 0.053930934


Unnamed: 0_level_0,count,mean,median,std
ProductType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Baking Goods,648,0.069169,0.058725,0.052248
Breads,251,0.066255,0.055873,0.048816
Breakfast,110,0.085723,0.068893,0.052807
Canned,649,0.068129,0.050786,0.05373
Dairy,682,0.072427,0.06322,0.053205


### 3.7 Product Visibility Imputation

In [54]:
df["ProductVisibility"] = df["ProductVisibility"].replace(0, np.nan)
df["ProductVisibility"] = df["ProductVisibility"].fillna(
    df.groupby("ProductType")["ProductVisibility"].transform("median")
)

## 4. Outlier Detection

In [55]:
num_cols = ["Weight","MRP","ProductVisibility","StoreSales"]
for col in num_cols:
    fig = px.box(df, y=col,title=f"Boxplot of {col}")
    fig.show()
    fig = px.histogram(df, x=col, nbins=40, title=f"Histogram of {col}")
    fig.show()



### 4.1 IQR-Based Outlier Removal


In [56]:
cols_to_check = ["ProductVisibility", "StoreSales"]
previous_rows = df.shape[0]
for col in cols_to_check:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]

    print(f"\nColumn: {col}")
    print(f"Q1: {Q1:.4f}, Q3: {Q3:.4f}, IQR: {IQR:.4f}")
    print(f"Lower Bound: {lower_bound:.4f}, Upper Bound: {upper_bound:.4f}")
    print(f"Number of outliers: {outliers.shape[0]}")
    print(f"Percentage: {100 * outliers.shape[0] / df.shape[0]:.2f}%")

    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]



Column: ProductVisibility
Q1: 0.0331, Q3: 0.0946, IQR: 0.0615
Lower Bound: -0.0592, Upper Bound: 0.1868
Number of outliers: 173
Percentage: 2.03%

Column: StoreSales
Q1: 882.8508, Q3: 3131.9232, IQR: 2249.0724
Lower Bound: -2490.7578, Upper Bound: 6505.5318
Number of outliers: 185
Percentage: 2.22%


### 4.2 Outlier Removal Summary


In [57]:
print(f"\nRows before outlier removal: {previous_rows}, after: {df.shape[0]}")
print(f"Total rows removed: {previous_rows - df.shape[0]} ({100 * (previous_rows - df.shape[0]) / previous_rows:.2f}%)")


Rows before outlier removal: 8523, after: 8165
Total rows removed: 358 (4.20%)


## 5. Exploratory Data Analysis and Feature Engineering

### 5.1 Numerical Feature Correlations

In [58]:
corr = df.select_dtypes(include=[np.number]).corr()
display(corr)

Unnamed: 0,Weight,ProductVisibility,MRP,EstablishmentYear,StoreSales
Weight,1.0,-0.011316,0.02836,0.00612,0.010683
ProductVisibility,-0.011316,1.0,-0.000404,-0.03221,-0.068752
MRP,0.02836,-0.000404,1.0,0.026817,0.565211
EstablishmentYear,0.00612,-0.03221,0.026817,1.0,-0.012391
StoreSales,0.010683,-0.068752,0.565211,-0.012391,1.0


In [59]:
corr = df.select_dtypes(include=[np.number]).corr()
fig = px.imshow(
    corr,
    text_auto=True,
    aspect="auto",
    color_continuous_scale="Viridis",
    title="Correlation Heatmap of Numerical Features"
)

fig.update_layout(
    width=800, height=600,
    font=dict(color="black"),
    title_font=dict(size=22, color="black"),
)
fig.show()

### 5.1 Categorical Feature Correlations

In [60]:
cat_cols = ["StoreCategory", "CityTier", "StoreSize", "FatContent", "ProductType"]

for col in cat_cols:
    fig = px.box(
        df, x=col, y="StoreSales", color=col,
        title=f"StoreSales by {col}"
    )
    fig.show()

### 5.2 Distribution Analysis
Examine the distribution of key variables to understand their characteristics.

In [61]:
px.histogram(df,'ProductVisibility', nbins = 80)

In [62]:
px.histogram(df,'StoreSales', nbins = 80)

### Feature Engineering

In [63]:
df['SqrtProductVisibility'] = np.sqrt(df['ProductVisibility'])

group_map = {
    "Fruits and Vegetables": "Fresh Produce",
    "Meat": "Fresh Produce",
    "Seafood": "Fresh Produce",

    "Snack Foods": "Snacks & Beverages",
    "Soft Drinks": "Snacks & Beverages",
    "Hard Drinks": "Snacks & Beverages",

    "Breads": "Grains & Bakery",
    "Baking Goods": "Grains & Bakery",
    "Breakfast": "Grains & Bakery",
    "Starchy Foods": "Grains & Bakery",

    "Dairy": "Dairy & Frozen",
    "Frozen Foods": "Dairy & Frozen",

    "Canned": "Canned & Packaged",
    "Others": "Canned & Packaged",

    "Household": "Household & Hygiene",
    "Health and Hygiene": "Household & Hygiene"
}

df["ProductGroup"] = df["ProductType"].map(group_map)


In [64]:
px.histogram(df,'SqrtProductVisibility', nbins = 80)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8165 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Weight                 8165 non-null   float64
 1   FatContent             8165 non-null   object 
 2   ProductVisibility      8165 non-null   float64
 3   ProductType            8165 non-null   object 
 4   MRP                    8165 non-null   float64
 5   EstablishmentYear      8165 non-null   int64  
 6   StoreSize              8165 non-null   object 
 7   CityTier               8165 non-null   object 
 8   StoreCategory          8165 non-null   object 
 9   StoreSales             8165 non-null   float64
 10  SqrtProductVisibility  8165 non-null   float64
 11  ProductGroup           8165 non-null   object 
dtypes: float64(5), int64(1), object(6)
memory usage: 829.3+ KB


In [66]:
df.drop('ProductVisibility', axis = 1)

Unnamed: 0,Weight,FatContent,ProductType,MRP,EstablishmentYear,StoreSize,CityTier,StoreCategory,StoreSales,SqrtProductVisibility,ProductGroup
0,9.300,Low Fat,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.1380,0.126678,Dairy & Frozen
1,5.920,Regular,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228,0.138846,Snacks & Beverages
2,17.500,Low Fat,Meat,141.6180,1999,Medium,Tier 1,Supermarket Type1,2097.2700,0.129461,Fresh Produce
3,19.200,Regular,Fruits and Vegetables,182.0950,1998,Small,Tier 3,Grocery Store,732.3800,0.242443,Fresh Produce
4,8.930,Low Fat,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052,0.221397,Household & Hygiene
...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,Snack Foods,214.5218,1987,High,Tier 3,Supermarket Type1,2778.3834,0.238293,Snacks & Beverages
8519,8.380,Regular,Baking Goods,108.1570,2002,Small,Tier 2,Supermarket Type1,549.2850,0.216754,Grains & Bakery
8520,10.600,Low Fat,Health and Hygiene,85.1224,2004,Small,Tier 2,Supermarket Type1,1193.1136,0.187580,Household & Hygiene
8521,7.210,Regular,Snack Foods,103.1332,2009,Medium,Tier 3,Supermarket Type2,1845.5976,0.381078,Snacks & Beverages


In [67]:
sales_pt = df.groupby("ProductType", as_index=False)["StoreSales"].sum().sort_values("StoreSales", ascending=False)

fig = px.bar(
    sales_pt, x="ProductType", y="StoreSales",
    title="Total Store Sales by Product Type",
    text="StoreSales"
)
fig.update_traces(texttemplate="%{text:.0f}", textposition="outside")
fig.update_layout(width=1100, height=600, xaxis_tickangle=45)
fig.show()

sales_pg = df.groupby("ProductGroup", as_index=False)["StoreSales"].sum()
sales_pg["Share"] = sales_pg["StoreSales"] / sales_pg["StoreSales"].sum()

fig = px.pie(
    sales_pg, names="ProductGroup", values="StoreSales",
    title="Product Group Share of Total Sales"
)
fig.show()


In [68]:
avg_size = df.groupby("StoreSize", as_index=False)["StoreSales"].mean().sort_values("StoreSales", ascending=False)

fig = px.bar(
    avg_size, x="StoreSize", y="StoreSales",
    title="Average Sales per Store Size",
    text="StoreSales", color="StoreSize"
)
fig.update_traces(texttemplate="%{text:.1f}")
fig.show()

In [69]:
sales_tier = df.groupby("CityTier", as_index=False)["StoreSales"].sum().sort_values("StoreSales", ascending=False)

fig = px.bar(
    sales_tier, x="CityTier", y="StoreSales",
    title="Total Sales by City Tier",
    text="StoreSales", color="CityTier"
)
fig.update_traces(texttemplate="%{text:.0f}")
fig.show()

In [70]:
# Bin continuous features
df["MRP_bin"] = pd.qcut(df["MRP"], 4, labels=["Low","Mid-Low","Mid-High","High"])
df["Vis_bin"] = pd.qcut(df["ProductVisibility"], 4, labels=["Low","Mid-Low","Mid-High","High"])

# Average sales grouped by all 3
pivot = df.groupby(["MRP_bin","Vis_bin","FatContent"], as_index=False)["StoreSales"].mean()

fig = px.density_heatmap(
    pivot, x="MRP_bin", y="Vis_bin", z="StoreSales",
    text_auto=True, color_continuous_scale="Viridis",
    title="Avg Sales by MRP & Visibility"
)
fig.show()





In [71]:
fig = px.violin(
    df, x="StoreSize", y="MRP", color="StoreSize", box=True, points="all",
    title="MRP Distribution by Store Size"
)
fig.show()

In [72]:
sales_fc = df.groupby("FatContent", as_index=False)["StoreSales"].sum().sort_values("StoreSales", ascending=False)

fig = px.bar(
    sales_fc, x="FatContent", y="StoreSales",
    title="Total Sales by Fat Content",
    text="StoreSales", color="FatContent"
)
fig.update_traces(texttemplate="%{text:.0f}")
fig.show()

## 6. Feature Encoding and Data Preparation

In [73]:
group_map = {
    "Fruits and Vegetables": "Fresh Produce",
    "Meat": "Fresh Produce",
    "Seafood": "Fresh Produce",

    "Snack Foods": "Snacks & Beverages",
    "Soft Drinks": "Snacks & Beverages",
    "Hard Drinks": "Snacks & Beverages",

    "Breads": "Grains & Bakery",
    "Baking Goods": "Grains & Bakery",
    "Breakfast": "Grains & Bakery",
    "Starchy Foods": "Grains & Bakery",

    "Dairy": "Dairy & Frozen",
    "Frozen Foods": "Dairy & Frozen",

    "Canned": "Canned & Packaged",
    "Others": "Canned & Packaged",

    "Household": "Household & Hygiene",
    "Health and Hygiene": "Household & Hygiene"
}

df["ProductGroup"] = df["ProductType"].map(group_map)


In [74]:
enc = LabelEncoder()
df["ProductType"] = enc.fit_transform(df["ProductType"])

categorical_cols = ["StoreSize", "CityTier", "StoreCategory", "FatContent", "ProductGroup"]
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', one_hot_encoder, categorical_cols)
    ],
    remainder='passthrough'
)

df_encoded = preprocessor.fit_transform(df)

onehot_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_cols)
remaining_feature_names = [col for col in df.columns if col not in categorical_cols]

new_column_names = list(onehot_feature_names) + remaining_feature_names

df = pd.DataFrame(df_encoded, columns=new_column_names)

In [75]:
df.head(20)

Unnamed: 0,StoreSize_High,StoreSize_Medium,StoreSize_Small,CityTier_Tier 1,CityTier_Tier 2,CityTier_Tier 3,StoreCategory_Grocery Store,StoreCategory_Supermarket Type1,StoreCategory_Supermarket Type2,StoreCategory_Supermarket Type3,...,ProductGroup_Snacks & Beverages,Weight,ProductVisibility,ProductType,MRP,EstablishmentYear,StoreSales,SqrtProductVisibility,MRP_bin,Vis_bin
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,9.3,0.016047,4,249.8092,1999,3735.138,0.126678,High,Low
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,5.92,0.019278,14,48.2692,2009,443.4228,0.138846,Low,Low
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,17.5,0.01676,10,141.618,1999,2097.27,0.129461,Mid-High,Low
3,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,19.2,0.058778,6,182.095,1998,732.38,0.242443,Mid-High,Mid-High
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,8.93,0.049017,9,53.8614,1987,994.7052,0.221397,Low,Mid-Low
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,10.395,0.062343,0,51.4008,2009,556.6088,0.249687,Low,Mid-High
6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,13.65,0.012741,13,57.6588,1987,343.5528,0.112876,Low,Low
7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,13.15,0.12747,13,107.7622,1985,4022.7636,0.357029,Mid-Low,High
8,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,16.2,0.016687,5,96.9726,2002,1076.5986,0.129179,Mid-Low,Low
9,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,19.2,0.09445,5,187.8214,2007,4710.535,0.307327,High,High


In [76]:
df.describe()

Unnamed: 0,StoreSize_High,StoreSize_Medium,StoreSize_Small,CityTier_Tier 1,CityTier_Tier 2,CityTier_Tier 3,StoreCategory_Grocery Store,StoreCategory_Supermarket Type1,StoreCategory_Supermarket Type2,StoreCategory_Supermarket Type3,...,ProductGroup_Snacks & Beverages,Weight,ProductVisibility,ProductType,MRP,EstablishmentYear,StoreSales,SqrtProductVisibility,MRP_bin,Vis_bin
count,8165.0,8165.0,8165.0,8165.0,8165.0,8165.0,8165.0,8165.0,8165.0,8165.0,...,8165.0,8165.0,8165.0,8165,8165.0,8165,8165.0,8165.0,8165,8165
unique,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,416.0,7549.0,16,5750.0,9,3299.0,7549.0,4,4
top,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,13.15,0.058778,6,172.0422,1985,958.752,0.242443,Low,Mid-Low
freq,7248.0,5491.0,4574.0,5886.0,5427.0,5017.0,7249.0,5493.0,7241.0,7333.0,...,6383.0,364.0,82.0,1172,7.0,1273,17.0,82.0,2042,2104


In [77]:
print(df.dtypes)

StoreSize_High                      object
StoreSize_Medium                    object
StoreSize_Small                     object
CityTier_Tier 1                     object
CityTier_Tier 2                     object
CityTier_Tier 3                     object
StoreCategory_Grocery Store         object
StoreCategory_Supermarket Type1     object
StoreCategory_Supermarket Type2     object
StoreCategory_Supermarket Type3     object
FatContent_Low Fat                  object
FatContent_Regular                  object
ProductGroup_Canned & Packaged      object
ProductGroup_Dairy & Frozen         object
ProductGroup_Fresh Produce          object
ProductGroup_Grains & Bakery        object
ProductGroup_Household & Hygiene    object
ProductGroup_Snacks & Beverages     object
Weight                              object
ProductVisibility                   object
ProductType                         object
MRP                                 object
EstablishmentYear                   object
StoreSales 

### 7 Linear Regression Model

In [78]:
X = df.drop("StoreSales", axis=1)
y = df["StoreSales"]

X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

X = X.drop(["MRP_bin", "Vis_bin"], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

model = sm.OLS(y_train, X_train).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             StoreSales   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.552
Method:                 Least Squares   F-statistic:                     424.8
Date:                Sat, 20 Sep 2025   Prob (F-statistic):               0.00
Time:                        17:27:13   Log-Likelihood:                -54402.
No. Observations:                6532   AIC:                         1.088e+05
Df Residuals:                    6512   BIC:                         1.090e+05
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   