# Importing Libraries

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.impute import SimpleImputer

# Loading the Data

In [3]:
# Read the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('./datasets/Train-Set.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Understanding the Features of the Dataset


----------
**Features:**

    :ProductID: unique product ID
    :Weight: weight of products
    :FatContent: specifies whether the product is low fat or regular
    :ProductVisibility: percentage of total display area of all products in a store allocated to the particular product
    :ProductType: the category to which the product belongs
    :MRP: Maximum Retail Price (listed price) of the products
    :OutletID: unique store ID
    :EstablishmentYear: year of establishment of the outlets
    :OutletSize: the size of the store in terms of ground area covered
    :LocationType: the type of city in which the store is located
    :OutletType: specifies whether the outlet is just a grocery store or some sort of supermarket
    :OutletSales: (target variable) sales of the product in the particular store
---------

# Exploratory Data Analysis

In [5]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Preview the last rows of the dataset.
df.tail()

Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.38,Regular,0.046982,Baking Goods,108.157,OUT045,2002,,Tier 2,Supermarket Type1,549.285
8520,NCJ29,10.6,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976
8522,DRG01,14.8,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Small,Tier 1,Supermarket Type1,765.67


In [6]:
# List every column name in the DataFrame.
print(df.columns.tolist())

['ProductID', 'Weight', 'FatContent', 'ProductVisibility', 'ProductType', 'MRP', 'OutletID', 'EstablishmentYear', 'OutletSize', 'LocationType', 'OutletType', 'OutletSales']


In [7]:
# (number of rows, number of columns)
df.shape

(8523, 12)

In [9]:
# Total number of missing cells in the frame, computed through both
# isna() and isnull() so the two counts can be compared side by side.
missing_via_isna = df.isna().to_numpy().sum()
missing_via_isnull = df.isnull().to_numpy().sum()
print(f'{missing_via_isna} ---- {missing_via_isnull}')

3873 ---- 3873


In [13]:
# Finding the columns with the NaN values
# Count the missing values once per column instead of re-scanning each
# column inside the loop.
nan_counts = df.isna().sum()
nan_cols = []
print("-" * 50)

for col, n_missing in nan_counts.items():
    # `> 0` rather than `> 1`: a column with exactly one missing value
    # still needs to be reported and imputed.
    if n_missing > 0:
        nan_cols.append(col)
        print(f'Column: {col} | Number of NaN values: {n_missing}\n{"-" * 50}')

--------------------------------------------------
Column: Weight | Number of NaN values: 1463
--------------------------------------------------
Column: OutletSize | Number of NaN values: 2410
--------------------------------------------------


In [16]:
# Function to handle missing values in columns
def impute_missing_values(column: pd.Series, method: str = None, value: str = None) -> pd.Series:
    """Return ``column`` with its missing values filled in.

    Parameters
    ----------
    column : pd.Series
        The column to impute. It is not modified in place.
    method : str, optional
        Passed as ``strategy`` to ``SimpleImputer`` (the notebook uses
        ``'median'``). Only consulted when ``value`` is None.
    value : optional
        Constant used to fill missing entries. Takes precedence over
        ``method`` whenever it is not None, including falsy constants
        such as ``0`` or ``''``.

    Returns
    -------
    pd.Series
        The imputed column, carrying the same index and name as ``column``.

    Raises
    ------
    ValueError
        If neither ``method`` nor ``value`` is supplied.
    """
    # Test against None instead of truthiness so that falsy fill values
    # (0, '', False) are honoured rather than silently ignored.
    if value is not None:
        return column.fillna(value)

    if method is None:
        raise ValueError("Either 'method' or 'value' must be provided.")

    imputer = SimpleImputer(missing_values=np.nan, strategy=method)
    imputed = imputer.fit_transform(column.to_numpy().reshape(-1, 1))

    # fit_transform yields an (n, 1) array; flatten it and restore the
    # original index/name so the function returns the Series it declares.
    return pd.Series(imputed.ravel(), index=column.index, name=column.name)

# Handling the missing values in Weight and OutletSize columns
# Weight is numeric: fill the gaps using the 'median' imputation strategy.
df['Weight'] = impute_missing_values(column=df['Weight'], method='median')
# OutletSize is categorical: fill the gaps with the constant 'Medium'.
df['OutletSize'] = impute_missing_values(column=df['OutletSize'], value='Medium')

# Preview the frame after imputation.
df.head()

Unnamed: 0,ProductID,Weight,FatContent,ProductVisibility,ProductType,MRP,OutletID,EstablishmentYear,OutletSize,LocationType,OutletType,OutletSales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [17]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ProductID          8523 non-null   object 
 1   Weight             8523 non-null   float64
 2   FatContent         8523 non-null   object 
 3   ProductVisibility  8523 non-null   float64
 4   ProductType        8523 non-null   object 
 5   MRP                8523 non-null   float64
 6   OutletID           8523 non-null   object 
 7   EstablishmentYear  8523 non-null   int64  
 8   OutletSize         8523 non-null   object 
 9   LocationType       8523 non-null   object 
 10  OutletType         8523 non-null   object 
 11  OutletSales        8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [18]:
# Checking for duplicates: True if any row is an exact repeat of an earlier row
df.duplicated().any()

False

## Understanding the FatContent Feature

In [19]:
# Count how often each distinct FatContent label occurs.
df['FatContent'].value_counts()

FatContent
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [20]:
# Handling FatContent feature input values
# Map the inconsistent spellings onto the two canonical labels. Values that
# are not keys of this mapping (including 'Low Fat' and 'Regular' themselves)
# pass through unchanged.
fat_content_aliases = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}

# Vectorised replacement instead of an element-by-element Python loop.
fat_content = df['FatContent'].replace(fat_content_aliases)

df['FatContent'] = fat_content
df['FatContent'].value_counts()

FatContent
Low Fat    5517
Regular    3006
Name: count, dtype: int64

In [21]:
# Visualizing the fat content classes
# Build the histogram and label its y-axis in a single chained expression;
# update_layout hands back the same figure, so `fatcon` is still the figure.
fatcon = px.histogram(
    data_frame=df,
    x='FatContent',
    color='FatContent',
    title='Big Mart Products Fat Content Categories',
    color_discrete_sequence=px.colors.qualitative.Antique,
    height=550,
    width=1000,
).update_layout(yaxis_title='Number of Products')
fatcon.show()