<a href="https://colab.research.google.com/github/AshikSathiya/Prediction-of-Product-Sales/blob/main/Prediction_of_Product_Sales_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import pandas and numpy for data handling
import pandas as pd
import numpy as np


# Preprocessing building blocks: pipelines, imputation, scaling and encoding
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Train/test splitting and column-wise transformer utilities
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, ColumnTransformer

# Ask scikit-learn transformers to return pandas DataFrames from transform()
from sklearn import set_config
set_config(transform_output='pandas')

In [3]:
# Location of the sales CSV on the mounted Google Drive
fpath = '/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023.csv'
# Read the CSV into a DataFrame
df = pd.read_csv(fpath)
# Show the loaded frame as the cell output
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


# Cleaning Process

## Drop Duplicates

In [4]:
# Build a boolean mask that is True for every duplicated row
duplicated_rows = df.duplicated()

# Each True counts as 1, so the sum is the number of duplicated rows
duplicated_rows.sum()

0

There are no duplicate rows in the dataset.

## Inconsistencies

In [10]:
# Names of all string (object-dtype) columns
object_cols = df.select_dtypes(include='object').columns


In [11]:
# Count the distinct values found in each string column
object_nunique = df.loc[:, object_cols].nunique()
object_nunique

Item_Identifier         1559
Item_Fat_Content           5
Item_Type                 16
Outlet_Identifier         10
Outlet_Size                3
Outlet_Location_Type       3
Outlet_Type                4
dtype: int64

It appears that Item_Fat_Content, Outlet_Size, Outlet_Location_Type, and Outlet_Type each have only a small number of unique values, so we will look more closely at these columns.

In [12]:
# Drop the string columns that were NOT singled out for closer inspection
# (Item_Identifier: 1559 unique values, Item_Type: 16, Outlet_Identifier: 10).
# Reassigning with errors="ignore" instead of inplace=True keeps this cell
# idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(
    columns=["Item_Identifier", "Item_Type", "Outlet_Identifier"],
    errors="ignore",
)
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,182.095,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [13]:
# Rebuild the list of string columns, since some were dropped earlier
string_columns = df.select_dtypes(include='object').columns

# Print the value counts of every string column, each followed by a blank line
for col in string_columns:
    print(df[col].value_counts(), end="\n\n")

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64



Inconsistent values are present in the Item_Fat_Content column.

In [14]:
# Tally each distinct Item_Fat_Content label to expose inconsistent spellings
Item_Fat_Content_counts = df.loc[:, 'Item_Fat_Content'].value_counts()
print(Item_Fat_Content_counts)

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



Rather than having 5 values, we should only have 2 (Low Fat and Regular).

In [15]:
# Standardize the labels in the Item_Fat_Content column so that only the two
# canonical categories ("Low Fat" and "Regular") remain. All variant spellings
# are remapped in a single replace() call.
fat_content_fixes = {
    "LF": "Low Fat",
    "low fat": "Low Fat",
    "reg": "Regular",
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)

# Check the value counts again to confirm
Item_Fat_Content_counts = df['Item_Fat_Content'].value_counts()
print(Item_Fat_Content_counts)

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64


## Split Data

In [16]:
# Separate the prediction target from the feature columns
target_col = 'Item_Outlet_Sales'
X = df.drop(columns=target_col)
y = df[target_col]

# Show the target first, then the feature matrix
display(y)
display(X)

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.300,Low Fat,0.016047,249.8092,1999,Medium,Tier 1,Supermarket Type1
1,5.920,Regular,0.019278,48.2692,2009,Medium,Tier 3,Supermarket Type2
2,17.500,Low Fat,0.016760,141.6180,1999,Medium,Tier 1,Supermarket Type1
3,19.200,Regular,0.000000,182.0950,1998,,Tier 3,Grocery Store
4,8.930,Low Fat,0.000000,53.8614,1987,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,0.056783,214.5218,1987,High,Tier 3,Supermarket Type1
8519,8.380,Regular,0.046982,108.1570,2002,,Tier 2,Supermarket Type1
8520,10.600,Low Fat,0.035186,85.1224,2004,Small,Tier 2,Supermarket Type1
8521,7.210,Regular,0.145221,103.1332,2009,Medium,Tier 3,Supermarket Type2


In [17]:
# Hold out a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Summarise the training features first, then the training target
for train_part in (X_train, y_train):
    train_part.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                5285 non-null   float64
 1   Item_Fat_Content           6392 non-null   object 
 2   Item_Visibility            6392 non-null   float64
 3   Item_MRP                   6392 non-null   float64
 4   Outlet_Establishment_Year  6392 non-null   int64  
 5   Outlet_Size                4580 non-null   object 
 6   Outlet_Location_Type       6392 non-null   object 
 7   Outlet_Type                6392 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 449.4+ KB
<class 'pandas.core.series.Series'>
Int64Index: 6392 entries, 4776 to 7270
Series name: Item_Outlet_Sales
Non-Null Count  Dtype  
--------------  -----  
6392 non-null   float64
dtypes: float64(1)
memory usage: 99.9 KB


## Create a preprocessing object to prepare the dataset for Machine Learning

In [18]:
# Preview only the string (object-dtype) training columns
X_train.select_dtypes(include='object')

Unnamed: 0,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Low Fat,Medium,Tier 3,Supermarket Type2
7510,Regular,Medium,Tier 3,Supermarket Type2
5828,Regular,Medium,Tier 1,Supermarket Type1
5327,Low Fat,Small,Tier 2,Supermarket Type1
4810,Low Fat,,Tier 2,Supermarket Type1
...,...,...,...,...
5734,Regular,,Tier 3,Grocery Store
5191,Low Fat,,Tier 2,Supermarket Type1
5390,Low Fat,,Tier 2,Supermarket Type1
860,Low Fat,,Tier 2,Supermarket Type1


It appears that Outlet Size and Outlet Location Type are our ordinal values.

### Ordinal Pipeline

In [19]:
# Number of distinct Outlet_Size categories (missing values are not counted)
df['Outlet_Size'].nunique(dropna=True)


3

In [20]:
# Columns whose categories are treated as ranked
ord_col = ['Outlet_Size', 'Outlet_Location_Type']

# Category rankings, given in the same order as the columns in ord_col
size_order = ['Small', 'Medium', 'High']
loc_order = ['Tier 1', 'Tier 2', 'Tier 3']

# Pipeline steps: fill missing values with the most frequent category,
# encode each category as its position in the ranking, then scale the codes
impute_common = SimpleImputer(strategy='most_frequent')
ord_encoder = OrdinalEncoder(categories=[size_order, loc_order])
scaler = StandardScaler()

# Chain the steps and display the resulting pipeline
ord_pipe = make_pipeline(impute_common, ord_encoder, scaler)
ord_pipe


### Categorical Pipeline

In [21]:
# Nominal columns: the string columns that the ordinal pipeline does not cover
cat_cols = X_train.select_dtypes('object').columns.drop(ord_col)

# Step 1: replace missing entries with the literal placeholder "MISSING"
impute_missing = SimpleImputer(strategy='constant', fill_value='MISSING')

# Step 2: one-hot encode into a dense result, ignoring categories that were
# not seen during fitting
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Chain the steps and display the resulting pipeline
ohe_pipe = make_pipeline(impute_missing, ohe_encoder)
ohe_pipe