<a href="https://colab.research.google.com/github/BrianArradondo/AI-predictions/blob/main/Project_1_Part_5_Brian_Arradondo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [36]:
# Typical Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Modeling & Preprocessing import
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')

# Read in the Data

In [37]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/sales_predictions.csv")

In [38]:
df.head() # This line of code reveals a look at the first 5 lines of my Data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [39]:
df.describe(include="number") # This line of code gives a descriptive look at the numbers and stats in our data initially

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [40]:
df.isna().sum() # This line of code reveals how many missing values we have in our untouched data

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

- We have missing data in two columns 

In [41]:
df.shape # This line of code tells us the general shape of our data

(8523, 12)

In [42]:
print(f"There are {df.shape[0]}, rows and {df.shape[1]} columns in our dataframe")

There are 8523, rows and 12 columns in our dataframe


In [43]:
df.duplicated().sum() # This line of code reveals there are no duplicates in our data so there is no need to drop any duplicates

0

In [44]:
# We will approach our missing values 
df["Item_Weight"].value_counts()

12.150    86
17.600    82
13.650    77
11.800    76
15.100    68
          ..
7.275      2
7.685      1
9.420      1
6.520      1
5.400      1
Name: Item_Weight, Length: 415, dtype: int64

In [45]:
df["Item_Fat_Content"].value_counts() # These rows need to be renamed to reflect the names utilized for those designations

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [46]:
df = df.replace({"LF":"Low Fat", "low fat":"Low Fat","reg":"Regular"}) # This line of code helps to fix inconsistencies in our categorical data 
df["Item_Fat_Content"].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [47]:
#check for unique values
df.nunique()

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                2
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [65]:
# This line will drop the Item Identifier column as it has very high cardinality and will negatively impact our results without describing anything about the products that will help predict how many are sold
df = df.drop(columns=["Item_Identifier"])
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Format for ML and Train Test Split

In [49]:
X = df.drop(columns = "Item_Outlet_Sales") # This line of code identifies the features and separates from the target
y = df["Item_Outlet_Sales"]
X_train,X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

In [50]:
df["Outlet_Type"].value_counts() # This line of code is just to confirm there is no redundancy or need to adjust this category 

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

###Rename the unique row names to match the standards being used

#Making Column Selectors

##Category Selectors

In [51]:
cat_selector = make_column_selector(dtype_include="object")

cat_selector(df) # This line of code reveals the columns that have categorical Data

['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

###Number Selectors

In [52]:
num_selector = make_column_selector(dtype_include='number')
num_selector(df)

['Item_Weight',
 'Item_Visibility',
 'Item_MRP',
 'Outlet_Establishment_Year',
 'Item_Outlet_Sales']

#Match Transformer with Columns

In [53]:
#Add One Hot encoder
ohe = OneHotEncoder(sparse_output= False,handle_unknown="ignore") # this line of code was adjusted to make the final preprocessed data appear as a Numpy array
#add the Scaler
scaler = StandardScaler()

num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)

# **Preprocessing Object for Data to Prep for ML**

## Instantiate transformers

In [54]:
# Add Imputers
freq_imputer = SimpleImputer(strategy="most_frequent")
mean_imputer = SimpleImputer(strategy="mean")


##Instantiate Pipelines 

In [55]:
# Numerical Pipeline
num_pipe = make_pipeline(mean_imputer,scaler)
num_pipe

In [56]:
#Categorical Pipeline
cat_pipe = make_pipeline(freq_imputer, ohe)
cat_pipe

# Instantiate the Column transformer

In [57]:
# Tuples for Column Transformer
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)
# ColumnTransformer
preprocessor = make_column_transformer(num_tuple,cat_tuple)
preprocessor

#Transformer Data

In [58]:
preprocessor.fit(X_train)

In [59]:
# transform train and test
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [60]:
X_train_processed.shape

(6392, 1592)

In [73]:
print(np.isnan(X_train_processed).sum().sum(),"missing values in training data") # This line of code confirms there are no missing values in the processed data
print("All data in X_train_processed are", X_train_processed.dtype) # This line of code confirms the categorical data was one hot encoded

0 missing values in training data
All data in X_train_processed are float64


In [61]:
X_train_processed

array([[ 0.81724868, -0.71277507,  1.82810922, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.5563395 , -1.29105225,  0.60336888, ...,  0.        ,
         1.        ,  0.        ],
       [-0.13151196,  1.81331864,  0.24454056, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.11373638, -0.92052713,  1.52302674, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.76600931, -0.2277552 , -0.38377708, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.81724868, -0.95867683, -0.73836105, ...,  1.        ,
         0.        ,  0.        ]])

In [62]:
X_train_final =pd.DataFrame(X_train_processed)
X_train_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591
0,0.817249,-0.712775,1.828109,1.327849,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.55634,-1.291052,0.603369,1.327849,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.131512,1.813319,0.244541,0.136187,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.169219,-1.004931,-0.952591,0.732018,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.528819,-0.965484,-0.33646,0.493686,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [63]:
X_train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6392 entries, 0 to 6391
Columns: 1592 entries, 0 to 1591
dtypes: float64(1592)
memory usage: 77.6 MB
