In [None]:
# Mount Google Drive at /content/drive; the CSV loaded later in this notebook is read from under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn import set_config
set_config(display='diagram')
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the raw sales data from the mounted Drive folder.
sales_predictions = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/02 Week 2: Pandas/sales_predictions.csv'
)

# pandas display settings used by the printed previews in this notebook.
for option_name, option_value in (
    ('display.width', 250),
    ('display.max_columns', 15),
    ('display.min_rows', 50),
):
    pd.set_option(option_name, option_value)

# Quick sanity check: dimensions followed by the first two records.
print(sales_predictions.shape)
print(sales_predictions.head(2))


(8523, 12)
  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility    Item_Type  Item_MRP Outlet_Identifier  Outlet_Establishment_Year Outlet_Size Outlet_Location_Type        Outlet_Type  Item_Outlet_Sales
0           FDA15         9.30          Low Fat         0.016047        Dairy  249.8092            OUT049                       1999      Medium               Tier 1  Supermarket Type1          3735.1380
1           DRC01         5.92          Regular         0.019278  Soft Drinks   48.2692            OUT018                       2009      Medium               Tier 3  Supermarket Type2           443.4228


In [None]:
#1) How many rows and columns?
# .shape is a (rows, columns) tuple; unpack it for readability and print it back in the same form.
row_count, column_count = sales_predictions.shape
print('Shape: ', (row_count, column_count))


Shape:  (8523, 12)


In [None]:
#2) What are the datatypes of each variable?
# One dtype per column, printed as a Series indexed by column name.
column_dtypes = sales_predictions.dtypes
print(column_dtypes)


Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object


In [None]:
#3) Are there duplicates? If so, drop any duplicates.
# Flag every row that exactly repeats an earlier row, then count the flags.
# The bare last expression is what the cell displays.
duplicate_row_mask = sales_predictions.duplicated()
duplicate_row_mask.sum()


0

In [None]:
#4) Identify missing values.
# Boolean mask of missing cells, summed per column to give a missing-value count for each variable.
missing_value_mask = sales_predictions.isna()
missing_value_mask.sum()


Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
#Copy dataframe for processing
# .copy() creates an independent frame. Plain assignment would only bind a
# second name to the same object, so cleaning dfml would also change
# sales_predictions.
dfml = sales_predictions.copy()

#Find Inconsistent Categories
# Print the value counts of every text (object) column so that spelling or
# capitalisation variants of the same category can be spotted by eye.
for column_name in dfml.select_dtypes(include='object').columns:
  print('\n\n', column_name)
  print('Categories: \n', dfml[column_name].value_counts())



 Series([], Name: Item_Identifier, dtype: object)
Categories: 
 FDW13    10
FDG33    10
DRE49     9
FDW26     9
FDO19     9
NCQ06     9
FDP25     9
FDX04     9
FDF52     9
NCJ30     9
FDV38     9
NCY18     9
NCB18     9
DRN47     9
FDV60     9
FDX20     9
FDG09     9
FDD38     9
NCI54     9
FDW49     9
FDT07     9
NCF42     9
NCL31     9
FDF56     9
FDX31     9
         ..
FDU43     2
FDF38     2
NCM42     2
FDB10     2
FDU09     2
FDM38     2
NCW05     2
FDR57     2
FDT33     2
FDR03     2
FDH22     2
NCX53     2
NCV18     2
FDA48     2
FDM16     2
FDE38     2
FDQ60     1
FDE52     1
FDT35     1
FDO33     1
DRF48     1
FDK57     1
FDC23     1
FDY43     1
FDN52     1
Name: Item_Identifier, Length: 1559, dtype: int64


 Series([], Name: Item_Fat_Content, dtype: object)
Categories: 
 Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64


 Series([], Name: Item_Type, dtype: object)
Categories: 
 Fruits and Vegetables    123

In [None]:

#Fix Inconsistent Categories in "Item_Fat_Content": 'LF' and 'low fat' should be renamed 'Low Fat', and 'reg' should be renamed 'Regular'
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}

# Apply all renames in one pass and rebind dfml to a NEW frame instead of
# replacing in place, so any other variable that still refers to the same
# DataFrame object is left unchanged.
dfml = dfml.assign(Item_Fat_Content=dfml['Item_Fat_Content'].replace(fat_content_fixes))
print('\n\nCleansed DataFrame: \n',dfml['Item_Fat_Content'].value_counts())




Cleansed DataFrame: 
 Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64


In [None]:
#Identify the features (X) and target (y): "Item_Outlet_Sales" is the target; the features matrix is every
#other column except the two identifier columns and Outlet_Establishment_Year.
non_feature_columns = ['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year']
y = dfml['Item_Outlet_Sales']
X = dfml.drop(columns=non_feature_columns)

#Perform a train test split (random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Column selectors: pick out the text (object) columns and the numeric columns by dtype.
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

# Imputers: fill missing numeric values with the column mean and missing
# categorical values with the most frequent category.
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases removed the old name. Try the current keyword first and
# fall back to the legacy one so the cell runs on both old and new versions.
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
#Loading the pipeline with preprocessors
# Numeric branch: the imputer runs first, then the scaler.
num_pipe = make_pipeline(mean_imputer, scaler)
# Categorical branch: the imputer runs first, then the one-hot encoder.
cat_pipe = make_pipeline(freq_imputer, ohe_encoder)

In [None]:
#Pairing the datatype with the pipelines
# Each tuple is (pipeline, column selector): the selector decides which columns the pipeline receives.
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

#Load the transformer
# Combines both branches into a single preprocessing step.
column_transformer = make_column_transformer(num_tuple, cat_tuple)

#Load the analytical model
lin_reg = LinearRegression()

In [None]:
#Create the final pipe linking all the instructions
# Preprocessing (column_transformer) runs first and feeds the linear regression.
pipe = make_pipeline(column_transformer, lin_reg)

#Fit the data into the pipeline
# Fitted on the training split only; as the last expression in the cell, the fitted pipeline is also displayed.
pipe.fit(X_train, y_train)

In [None]:
# Evaluate model: R^2 on the training split and on the held-out test split
train_r2 = r2_score(y_train, pipe.predict(X_train))
test_r2 = r2_score(y_test, pipe.predict(X_test))
print(f'Train R2: {train_r2}')
print(f'Test R2: {test_r2}')


Train R2: 0.560615819190381
Test R2: 0.5656202563241928


Variable Name	Description
Item_Identifier	Unique product ID
Item_Weight	Weight of product
Item_Fat_Content	Whether the product is low fat or regular
Item_Visibility	The percentage of total display area of all products in a store allocated to the particular product
Item_Type	The category to which the product belongs
Item_MRP	Maximum Retail Price (list price) of the product
Outlet_Identifier	Unique store ID
Outlet_Establishment_Year	The year in which store was established
Outlet_Size	The size of the store in terms of ground area covered
Outlet_Location_Type	The type of area in which the store is located
Outlet_Type	Whether the outlet is a grocery store or some sort of supermarket
Item_Outlet_Sales	Sales of the product in the particular store. This is the target variable to be predicted.