<a href="https://colab.research.google.com/github/Deanne-Blair/Sales_Predictions/blob/main/Project_1_Part_5_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1 - Part 5 (Core)

---

- Deanne Blair
- December 2022



---

- Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data.

- Identify the features (X) and target (y): 

- Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.

- Perform a train test split

- Create a preprocessing object to prepare the dataset for Machine Learning

- Make sure your imputation of missing values occurs after the train test split using SimpleImputer.

- Commit your work to GitHub.

- Turn in a link to your GitHub repo! 

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Ask scikit-learn to show estimators as diagrams when a cell displays them
# (num_pipe, cat_pipe and preprocessor are displayed this way further down).
set_config(display = 'diagram')

# Load Data

In [2]:
# Read the sales-prediction CSV into a DataFrame and preview its first rows.
# NOTE(review): `path` is an absolute location under /content/drive; presumably
# Google Drive has to be mounted before this cell runs, and no mount cell is
# visible in this notebook — confirm, or make the location configurable.
path = '/content/drive/MyDrive/01 Week One Intro to Machine Learning/DF/sales_predictions.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Explore Data

In [3]:
# Column names, non-null counts and dtypes: a first look at where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
# Number of missing entries in each column.
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
# Grand total of missing cells across the whole frame.
print(f"{df.isna().sum().sum()} missing values")

3873 missing values


In [6]:
# Shape of the subset of rows that contain at least one missing value.
df.loc[df.isna().any(axis="columns")].shape

(3873, 12)

In [7]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

# Fix inconsistencies in categorical data.

In [8]:
# List every distinct label in Item_Fat_Content to reveal inconsistent spellings.
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [9]:
# Map the alternate spellings found in value_counts() onto the two canonical
# labels, 'Low Fat' and 'Regular'.
replace_dict = {'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'}
# Assign the result back to the column rather than calling
# replace(..., inplace=True) on df['Item_Fat_Content']: the in-place form acts
# on an intermediate Series, which triggers a FutureWarning in pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled. Assignment is also safe
# to re-run.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(replace_dict)
df['Item_Fat_Content']

0       Low Fat
1       Regular
2       Low Fat
3       Regular
4       Low Fat
         ...   
8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: Item_Fat_Content, Length: 8523, dtype: object

# Identify the features (X) and target (y):

In [10]:
# The sales figure is the target; every other column is kept as a feature.
# NOTE(review): this keeps Item_Identifier among the features, and as an object
# column it will be one-hot encoded later — confirm that is intended.
target = "Item_Outlet_Sales"
y = df[target]
X = df.drop(columns=[target])

# Split before any imputer/scaler is fitted; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a preprocessing object to prepare the dataset for Machine Learning

In [11]:
# Column selectors keyed on dtype, so the ColumnTransformer can route numeric
# columns and object (text) columns to different pipelines.
num_selector = make_column_selector(dtype_include = 'number')
cat_selector = make_column_selector(dtype_include = 'object')

# Imputation of missing values using SimpleImputer.

In [12]:
# Individual transformers that the pipelines below are assembled from.
freq_imputer = SimpleImputer(strategy='most_frequent')  # fill with the most common value
mean_imputer = SimpleImputer(strategy='mean')           # fill with the column mean
scaler = StandardScaler()

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises TypeError on current
# releases. Try the new spelling first and fall back for older installs.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Pipelines

In [13]:
# Numeric pipeline: mean imputation followed by standard scaling.
num_pipe = make_pipeline(mean_imputer, scaler)
num_pipe

In [14]:
# Categorical pipeline: most-frequent imputation followed by one-hot encoding.
cat_pipe = make_pipeline(freq_imputer, ohe)
cat_pipe

# ColumnTransformer

In [15]:
# Pair each pipeline with the selector that picks its columns, then combine the
# pairs into a single ColumnTransformer that preprocesses the whole feature matrix.
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)
preprocessor = make_column_transformer(num_tuple, cat_tuple)
preprocessor

In [16]:
# Fit the preprocessing steps on the training split only, so imputation values,
# scaling statistics and category lists are not learned from the test rows.
preprocessor.fit(X_train, y_train)

In [17]:

# Run both splits through the already-fitted preprocessor (train first, then test).
X_train_preprocessed, X_test_preprocessed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

# Transform Data

In [19]:
# Preview the first five rows of the preprocessed training matrix.
X_train_preprocessed[:5]

array([[ 0.81724868, -0.71277507,  1.82810922, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.5563395 , -1.29105225,  0.60336888, ...,  0.        ,
         1.        ,  0.        ],
       [-0.13151196,  1.81331864,  0.24454056, ...,  1.        ,
         0.        ,  0.        ],
       [-1.1692189 , -1.00493112, -0.95259072, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.52881915, -0.96548425, -0.33646004, ...,  1.        ,
         0.        ,  0.        ]])