<a href="https://colab.research.google.com/github/AhmedAnasHaouari/sales-predictions/blob/main/Sales_predictions_part5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PROJECT 1**: Sales Predictions

- Haouari Ahmed Anas

In [98]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [99]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames (with column names)
# so the processed data can be inspected with .dtypes / .describe() below.
set_config(transform_output='pandas')

In [100]:
# Load the sales dataset from the mounted Drive and preview the first rows.
filename = "/content/drive/MyDrive/datasets/sales_predictions_2023.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [101]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

There are no duplicate rows in this dataframe.

In [102]:
# Count missing values in each column.
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

The `Item_Weight` column has 1463 missing values and the `Outlet_Size` column has 2410 missing values.

In [103]:
# Inspect the labels used in Item_Fat_Content to spot inconsistent spellings
# (this cell only looks at them; it does not change the data).
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [104]:
# Collapse the inconsistent spellings into the two canonical labels in one pass.
fat_content_fixes = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)


In [105]:
# Check the Item_Type labels and how often each one occurs.
df['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [106]:
# Columns left out of the modeling frame; df itself is not modified.
dropped_cols = [
    'Item_Identifier',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Establishment_Year',
]
df_copy = df.drop(columns=dropped_cols)

In [107]:
# Preview the reduced frame to confirm which columns remain.
df_copy.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,249.8092,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,48.2692,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,141.618,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,182.095,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,53.8614,High,Tier 3,Supermarket Type1,994.7052


# Separate the features (X) from the target (y)

In [108]:
# The target is Item_Outlet_Sales; every remaining column is a feature.
target_col = 'Item_Outlet_Sales'
y = df_copy[target_col]
X = df_copy.drop(columns=target_col)

# Split the data into training and test sets

In [109]:
# Hold out a test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [110]:
# Check feature dtypes to decide which columns are numeric and which are categorical.
X_train.dtypes

Item_Weight             float64
Item_Fat_Content         object
Item_Visibility         float64
Item_MRP                float64
Outlet_Size              object
Outlet_Location_Type     object
Outlet_Type              object
dtype: object

# Create a ColumnTransformer to preprocess the data

- Create column selectors and lists of column names

In [142]:
# Column selections for the three preprocessing branches.

# Numeric features: picked by dtype at fit time.
num_df = make_column_selector(dtype_include='number')

# Ordinal feature: the selector pattern is a regular expression that is
# searched within each column name, so anchor it to match only the exact
# 'Outlet_Size' column. The ordinal encoder is configured with a single
# category list, so selecting more than one column would break fitting.
ord_df = make_column_selector(pattern=r'^Outlet_Size$')

# Nominal features: listed explicitly by name.
cat_df = ['Outlet_Type', 'Outlet_Location_Type', 'Item_Fat_Content']

In [143]:
# Numeric branch: fill missing values with the column mean, then standardize.
mean_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
num_pipe = make_pipeline(mean_imputer, scaler)
num_pipe

In [144]:
# (name, transformer, columns) entry passed to the ColumnTransformer.
num_tuple = ('numeric', num_pipe, num_df)
num_tuple

('numeric',
 Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 <sklearn.compose._column_transformer.make_column_selector at 0x7dac7446eb00>)

- Categorical

In [145]:
# Nominal branch: dense one-hot encoding. handle_unknown='ignore' means a
# category not seen during fit does not raise an error at transform time.
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_pipe = make_pipeline(ohe_encoder)
ohe_pipe

In [146]:
# (name, transformer, columns) entry passed to the ColumnTransformer.
cat_tuple = ('categorical', ohe_pipe, cat_df)
cat_tuple

('categorical',
 Pipeline(steps=[('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 ['Outlet_Type', 'Outlet_Location_Type', 'Item_Fat_Content'])

- Ordinal

In [147]:
# Ordinal branch: label missing values explicitly, encode the labels with an
# explicit category order, then standardize the resulting integer codes.
impute_ord = SimpleImputer(strategy='constant', fill_value='MISSING')
# The list order defines the integer codes, lowest first.
# NOTE(review): this ranks 'MISSING' below 'Small' — confirm that is the
# intended treatment of unknown outlet sizes.
order_ord_df = ['MISSING', 'Small', 'Medium', 'High']
# OrdinalEncoder expects one category list per encoded column.
ord_cat = [order_ord_df]
ord_encoder = OrdinalEncoder(categories=ord_cat)
scaler_ord = StandardScaler()
ord_pipe = make_pipeline(impute_ord, ord_encoder, scaler_ord)
ord_pipe

In [148]:
# (name, transformer, columns) entry passed to the ColumnTransformer.
ord_tuple = ('Ordinal', ord_pipe, ord_df)

- Create a ColumnTransformer

In [149]:
# Combine the numeric, categorical and ordinal branches into one preprocessor.
# verbose_feature_names_out=False keeps the output column names free of the
# transformer-name prefix (e.g. 'Item_Weight' rather than 'numeric__Item_Weight').
col_transformer = ColumnTransformer([num_tuple, cat_tuple, ord_tuple],
                                    verbose_feature_names_out=False)

# Fit the ColumnTransformer on training data.

In [150]:
# Fit on the training split only, so the imputation values, scaling statistics
# and encoder categories are not derived from the test data.
col_transformer.fit(X_train)

# Transform the training and test data

In [151]:
# Apply the already-fitted transformer to both splits (no refitting on the test data).
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

# Check the transformed training data

In [152]:
# Confirm every processed column is numeric.
X_train_processed.dtypes

Item_Weight                      float64
Item_Visibility                  float64
Item_MRP                         float64
Outlet_Type_Grocery Store        float64
Outlet_Type_Supermarket Type1    float64
Outlet_Type_Supermarket Type2    float64
Outlet_Type_Supermarket Type3    float64
Outlet_Location_Type_Tier 1      float64
Outlet_Location_Type_Tier 2      float64
Outlet_Location_Type_Tier 3      float64
Item_Fat_Content_Low Fat         float64
Item_Fat_Content_Regular         float64
Outlet_Size                      float64
dtype: object

In [153]:
# Summary statistics: scaled columns should show mean ~0 and std ~1,
# one-hot columns should stay within [0, 1].
X_train_processed.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Outlet_Size
count,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0
mean,0.0,-0.0,0.0,0.12,0.65,0.11,0.11,0.27,0.33,0.4,0.65,0.35,0.0
std,1.0,1.0,1.0,0.33,0.48,0.31,0.32,0.45,0.47,0.49,0.48,0.48,1.0
min,-1.98,-1.29,-1.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.28
25%,-0.81,-0.76,-0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.28
50%,0.0,-0.23,0.03,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.26
75%,0.76,0.56,0.72,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.75
max,2.0,5.13,1.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.76


In [None]:
# Count missing values remaining in each processed column.
X_train_processed.isna().sum()