<a href="https://colab.research.google.com/github/Deanne-Blair/Sales_Predictions/blob/main/Project_1_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Project 1 Part 5

---
- Deanne Blair
- January 2023


Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data.* (*There is a way to do this after the split, but for this project, you may perform this step before the split)
Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.
Perform a train test split
Create a preprocessing object to prepare the dataset for Machine Learning
Make sure your imputation of missing values occurs after the train test split using SimpleImputer.

#Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
set_config(display='diagram')

#Read in data

In [2]:
path = '/content/sales_predictions.xlsx'
df = pd.read_excel(path)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999.0,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009.0,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999.0,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998.0,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987.0,High,Tier 3,Supermarket Type1,994.7052


In [3]:
#Making a copy
eda_ml = df.copy()

# Check for Missing Values

In [4]:
total = df.isna().sum().sum()
print(total, f"missing values")

3873 missing values


##What Columns are missing values

In [5]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

###There are missing values in columns "Item_Weight" and "Outlet_Size". We will need to use simple imputer

#Check the datatypes for each column

In [6]:
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year    float64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

##There are 7 object/categorical data types and 5 numeric column datatypes

#Drop duplicates

In [7]:
df.drop_duplicates(inplace=True)

##Confirm duplicates are dropped

In [8]:
drop = df.duplicated().sum()
print(drop, f'remaining duplicates')

0 remaining duplicates


##Check data information

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   float64
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(5), object(7)
memory usage: 865.6+ KB


##There are missing values in outlet size and item weight columns

#Display descriptive statistics

In [10]:
df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


##No unusual data observed

In [11]:
X= df.drop(columns=['Item_Outlet_Sales', 'Item_Identifier'])
y= df['Item_Outlet_Sales']
X_train,X_test, y_train, y_test=train_test_split(X,y, random_state=42)

In [12]:
scaler = StandardScaler()
mean_imputer= SimpleImputer(strategy='mean')
freq_imputer= SimpleImputer(strategy='most_frequent')
ohe= OneHotEncoder(sparse = False, handle_unknown='ignore')

In [13]:
num_pipe= make_pipeline(mean_imputer, scaler)
cat_pipe= make_pipeline(freq_imputer, ohe)

In [14]:
num_selector=make_column_selector(dtype_include='number')
cat_selector=make_column_selector(dtype_include='object')

In [15]:
cat_tuple=(cat_pipe, cat_selector)
num_tuple=(num_pipe, num_selector)

In [16]:
preprocessor=make_column_transformer(cat_tuple, num_tuple, remainder='drop')
preprocessor

## **Linear Regression Model**

In [17]:
# Instantiate a linear regression model
linreg = LinearRegression()
# Combine the preprocessing ColumnTransformer and the linear regression model in a Pipeline
linreg_pipe = make_pipeline(preprocessor, linreg)
linreg_pipe

In [18]:
# Fit the model pipeline on the training data
linreg_pipe.fit(X_train, y_train)
# Make predictions using the training and testing data
train_pred = linreg_pipe.predict(X_train)
test_pred = linreg_pipe.predict(X_test)
train_pred[:10]

array([3860., 2672., 2608., 1488., 1800.,  -88., 1584., 5584., 4216.,
       2056.])

In [19]:
#Calculating R^2
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)
print(f'Model Training R2: {train_r2}')
print(f'Model Testing R2: {test_r2}')

Model Training R2: 0.5605802245120629
Model Testing R2: 0.5643597910827562


In [20]:
#Calculating RMSE
train_RMSE = np.sqrt(np.mean(np.abs(train_pred - y_train)**2))
test_RMSE= np.sqrt(np.mean(np.abs(test_pred - y_test)**2))
print(f'Model Training RMSE: {train_RMSE}')
print(f'Model Testing RMSE: {test_RMSE}')

Model Training RMSE: 1140.3698352944782
Model Testing RMSE: 1096.3217785203492


## **Regression Tree Model**

In [21]:
dec_tree = DecisionTreeRegressor(random_state=42)
dec_tree_pipe = make_pipeline(preprocessor, dec_tree)
#Fit using training data
dec_tree_pipe.fit(X_train, y_train)

In [22]:
#Predict Values for train and test
train_preds = dec_tree_pipe.predict(X_train)
test_preds = dec_tree_pipe.predict(X_test)

In [24]:
#Evaluate the model
train_score = dec_tree_pipe.score(X_train, y_train)
test_score = dec_tree_pipe.score(X_test, y_test)
print(train_score)
print(test_score)

1.0
0.20588262720759032


In [32]:
#Tune the Model
dec_tree_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer', ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f74cc399760>),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                 

In [33]:
#Check depth of tree
dec_tree_pipe.get_depth()

AttributeError: ignored

In [28]:
depths = list(range(2, max_depth+1))
scores = pd.Dataframe(index = depths, columns =['Test Score', 'Train Score'])
for depth in depths:
  dec_tree=DecisionTreeRegressor(max_depth, random_state=42)
  dec_tree.fit(X_train, y_train)
  train_score = dec_tree.score(X_train, y_train)
  test_score = dec_tree.score(x_test, y_test)
  scores.loc[depth, 'Train Score']= train_score
  scores.loc[depth, 'Test Score'] = test_score

NameError: ignored

In [None]:
#Sort datafrane to find best score for test
sorted_scores = scores.sort_values(by= 'Test Score', ascending=False)
sorted_scores.head()

In [None]:
#Run model with optimized value for max_depth
dec_tree_# = DecesionTreeRegressor(max-depth = #, random_state=42)
dec_tree_#.fit(X_train, y_train)
train_#_score= dec_tree_#.score(X_train, y_train)
test_9_score =dec_tree_#.score(X_test, y_test)
print(train_9_score)
print(test_9_score)