# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report

## The Data

Let's start by reading in the train.csv file into a pandas dataframe.

In [2]:
# Location of the Black Friday training split (raw CSV hosted on GitHub).
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/shwetachandel/Black-Friday-Dataset/master/Black%20Friday%20Dataset/train.csv'
df = pd.read_csv(TRAIN_CSV_URL)

In [3]:
# Preview the first five rows to check the column names and value formats.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


# Exploratory Data Analysis

Let's begin some exploratory data analysis! We'll start by checking out missing data!

## Missing Data
We can use `isnull().sum()` to count the missing values in each column and `isnull().mean()` to get the fraction of rows that are missing.

In [4]:
# Absolute number of missing values in each column.
df.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [5]:
# Fraction of rows that are missing in each column (mean of the boolean null mask).
df.isnull().mean()

User_ID                       0.000000
Product_ID                    0.000000
Gender                        0.000000
Age                           0.000000
Occupation                    0.000000
City_Category                 0.000000
Stay_In_Current_City_Years    0.000000
Marital_Status                0.000000
Product_Category_1            0.000000
Product_Category_2            0.315666
Product_Category_3            0.696727
Purchase                      0.000000
dtype: float64

Roughly 32 percent of the Product_Category_2 values are missing. That proportion is likely small enough to handle with some form of imputation. Product_Category_3, on the other hand, is missing in about 70 percent of the rows, which is too much to do something useful with at a basic level, so we will drop that column below.

## Data Cleaning

We want to fill in the missing Product_Category_2 values instead of dropping the rows that lack them. Because Product_Category_2 is a discrete (categorical) feature, one way to do this is to label the missing values as a separate 'Others' category.

In [6]:
# Label missing Product_Category_2 values as their own 'Others' category.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed Series: that form is chained
# assignment, which operates on a temporary object and leaves `df` unchanged
# when pandas copy-on-write is active.
df['Product_Category_2'] = df['Product_Category_2'].fillna('Others')

In [7]:
# Product_Category_3 is mostly empty, so remove the column entirely.
df = df.drop(columns=['Product_Category_3'])

We can also drop User_ID and Product_ID, which we assume do not affect the purchase amount.

In [8]:
# Remove the two identifier columns; they are not used as model features.
df = df.drop(columns=['User_ID', 'Product_ID'])

Product_Category_1 and Occupation are discrete (categorical) features, but pandas reads them as continuous integers. To prevent this, we convert both columns to the str type.

In [9]:
# Cast the integer category codes to strings so they are treated as labels, not numbers.
df = df.astype({"Product_Category_1": str})

In [10]:
# Cast the integer occupation codes to strings so they are treated as labels, not numbers.
df = df.astype({"Occupation": str})

In [11]:
# Preview the frame after the cleaning steps above.
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,F,0-17,10,A,2,0,3,Others,8370
1,F,0-17,10,A,2,0,1,6.0,15200
2,F,0-17,10,A,2,0,12,Others,1422
3,F,0-17,10,A,2,0,12,14.0,1057
4,M,55+,16,C,4+,0,8,Others,7969


## Converting Categorical Features

We'll need to convert categorical features to dummy variables using pandas! Otherwise our machine learning algorithm won't be able to directly take in those features as inputs.

In [12]:
# Show column dtypes and non-null counts before one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   Gender                      550068 non-null  object
 1   Age                         550068 non-null  object
 2   Occupation                  550068 non-null  object
 3   City_Category               550068 non-null  object
 4   Stay_In_Current_City_Years  550068 non-null  object
 5   Marital_Status              550068 non-null  int64 
 6   Product_Category_1          550068 non-null  object
 7   Product_Category_2          550068 non-null  object
 8   Purchase                    550068 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 37.8+ MB


In [13]:
# One-hot encode every object-typed column; the int64 columns
# (Marital_Status, Purchase) pass through unchanged.
# dtype is pinned to uint8 because pandas 2.0 changed the default dummy dtype
# from uint8 to bool. Pinning it keeps the 0/1 indicator columns identical
# regardless of the installed pandas version.
df = pd.get_dummies(df, dtype='uint8')

In [14]:
# Preview the one-hot encoded frame.
df.head()

Unnamed: 0,Marital_Status,Purchase,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,...,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0,Product_Category_2_Others
0,0,8370,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,15200,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1422,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1057,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,7969,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Great! Our data is ready for our model!

In [15]:
# Fit a MinMaxScaler on the encoded frame.
# NOTE(review): the scaler is fitted on the whole frame - including the Purchase
# target and the rows that later become the test set - before the train/test split.
scaler = MinMaxScaler()
scaler.fit(df)

MinMaxScaler()

In [16]:
# Apply the fitted scaler. transform() returns a bare NumPy array, so the frame
# is rebuilt with the original column labels and index; without them the
# columns would be anonymous integers (0, 1, 2, ...).
# NOTE(review): scalerData is not referenced again in the visible cells - the
# models below are trained on the unscaled `df`.
scalerData = scaler.transform(df)
scalerData = pd.DataFrame(scalerData, columns=df.columns, index=df.index)

## Train Test Split

In [17]:
# Feature matrix: every column except the Purchase target.
x = df.drop(columns=['Purchase'])

In [18]:
# Target vector: the Purchase column that the models below are fitted against.
y = df['Purchase']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Regression

In [20]:
def LinearReg(train_x, train_y, test_x):
    """Fit a LinearRegression model and predict targets for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training target values.
        test_x: Feature matrix to predict for.

    Returns:
        The model's predictions for ``test_x``.
    """
    fitted = LinearRegression().fit(train_x, train_y)
    return fitted.predict(test_x)
def DecisionTreeReg(train_x, train_y, test_x):
    """Fit a seeded DecisionTreeRegressor and predict targets for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training target values.
        test_x: Feature matrix to predict for.

    Returns:
        The model's predictions for ``test_x``.
    """
    tree = DecisionTreeRegressor(random_state=42)
    return tree.fit(train_x, train_y).predict(test_x)
def RandomForestReg(train_x, train_y, test_x):
    """Fit a depth-2, seeded RandomForestRegressor and predict targets for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training target values.
        test_x: Feature matrix to predict for.

    Returns:
        The model's predictions for ``test_x``.
    """
    forest = RandomForestRegressor(max_depth=2, random_state=42)
    return forest.fit(train_x, train_y).predict(test_x)
def GradientBoostingReg(train_x, train_y, test_x):
    """Fit a GradientBoostingRegressor (100 depth-2 trees, seeded) and predict for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training target values.
        test_x: Feature matrix to predict for.

    Returns:
        The model's predictions for ``test_x``.
    """
    booster = GradientBoostingRegressor(
        n_estimators=100,
        max_depth=2,
        random_state=42,
    )
    return booster.fit(train_x, train_y).predict(test_x)
def AdaBoostReg(train_x, train_y, test_x):
    """Fit a seeded AdaBoostRegressor and predict targets for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training target values.
        test_x: Feature matrix to predict for.

    Returns:
        The model's predictions for ``test_x``.
    """
    # This is a regression helper, so it must build AdaBoostRegressor. The
    # previous code instantiated AdaBoostClassifier, which treats every
    # distinct target value as a separate class label.
    modelabr = AdaBoostRegressor(random_state=42)
    modelabr.fit(train_x, train_y)
    return modelabr.predict(test_x)

## Classification: user-defined functions

In [21]:
def LogisticReg(train_x, train_y, test_x):
    """Fit a default LogisticRegression classifier and predict labels for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training class labels.
        test_x: Feature matrix to predict for.

    Returns:
        The predicted labels for ``test_x``.
    """
    classifier = LogisticRegression()
    return classifier.fit(train_x, train_y).predict(test_x)
def DecisionTreeClf(train_x, train_y, test_x):
    """Fit a seeded DecisionTreeClassifier and predict labels for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training class labels.
        test_x: Feature matrix to predict for.

    Returns:
        The predicted labels for ``test_x``.
    """
    classifier = DecisionTreeClassifier(random_state=42)
    return classifier.fit(train_x, train_y).predict(test_x)
def RandomForestClf(train_x, train_y, test_x):
    """Fit a depth-2, seeded RandomForestClassifier and predict labels for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training class labels.
        test_x: Feature matrix to predict for.

    Returns:
        The predicted labels for ``test_x``.
    """
    classifier = RandomForestClassifier(max_depth=2, random_state=42)
    return classifier.fit(train_x, train_y).predict(test_x)
def GradientBoostingClf(train_x, train_y, test_x):
    """Fit a GradientBoostingClassifier (100 depth-2 trees, seeded) and predict for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training class labels.
        test_x: Feature matrix to predict for.

    Returns:
        The predicted labels for ``test_x``.
    """
    classifier = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=2,
        random_state=42,
    )
    return classifier.fit(train_x, train_y).predict(test_x)
def AdaBoostClf(train_x, train_y, test_x):
    """Fit an AdaBoostClassifier (random_state=45) and predict labels for ``test_x``.

    Args:
        train_x: Training feature matrix.
        train_y: Training class labels.
        test_x: Feature matrix to predict for.

    Returns:
        The predicted labels for ``test_x``.
    """
    classifier = AdaBoostClassifier(random_state=45)
    return classifier.fit(train_x, train_y).predict(test_x)

In [22]:
# Run the linear-regression helper on the split and keep its test-set predictions.
predictions = LinearReg(train_x=train_x, train_y=train_y, test_x=test_x)

### Linear Regression

In [23]:
# Fit a plain linear model on the training split.
model = LinearRegression()
model.fit(X=train_x, y=train_y)

LinearRegression()

In [24]:
# Display the linear model's predictions for the held-out rows.
model.predict(test_x)

array([13846., 13482.,  7140., ...,  7110.,  4434.,  9698.])

In [25]:
# Root-mean-squared error of the linear model on the held-out split.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_error(test_y, model.predict(test_x)))

3020.9936244392647

### Decision Tree Regression

In [26]:
# Fit a seeded decision-tree regressor on the training split.
modeldtr = DecisionTreeRegressor(random_state=42)
modeldtr.fit(X=train_x, y=train_y)

DecisionTreeRegressor(random_state=42)

In [27]:
# Root-mean-squared error of the decision tree on the held-out split.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_error(test_y, modeldtr.predict(test_x)))

3152.7088455753583

### Random Forest Regression

In [28]:
# Fit a seeded random forest with trees limited to depth 2.
regressor = RandomForestRegressor(max_depth=2, random_state=42)
regressor.fit(X=train_x, y=train_y)

RandomForestRegressor(max_depth=2, random_state=42)

In [29]:
# Root-mean-squared error of the random forest on the held-out split.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_error(test_y, regressor.predict(test_x)))

4016.5357733997535

### Gradient Boosting Regressor

In [30]:
# Fit a seeded gradient-boosting regressor: 100 boosting stages of depth-2 trees.
modelgbr = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=2,
    random_state=42,
)
modelgbr.fit(X=train_x, y=train_y)

GradientBoostingRegressor(max_depth=2, random_state=42)

In [31]:
# Root-mean-squared error of the gradient-boosting model on the held-out split.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_error(test_y, modelgbr.predict(test_x)))

3128.982680252029

### AdaBoostRegressor

In [32]:
# Fit a seeded AdaBoost regressor with 100 estimators.
regr = AdaBoostRegressor(n_estimators=100, random_state=42)
regr.fit(X=train_x, y=train_y)

AdaBoostRegressor(n_estimators=100, random_state=42)

In [33]:
# Root-mean-squared error of the AdaBoost model on the held-out split.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_error(test_y, regr.predict(test_x)))

3777.8869873474932

In [34]:
import pickle

In [35]:
# Persist the fitted random-forest model to disk. The context manager closes
# the file handle as soon as the dump finishes; the previous code opened the
# file and never closed it, so the pickle was not guaranteed to be flushed.
# NOTE(review): per the RMSE values shown above, this max_depth=2 forest scored
# worst of the five regressors - confirm it is the model intended for export.
with open('random_forest_regression_model.pkl', 'wb') as file:
    pickle.dump(regressor, file)