# Principal Component Analysis - Black Friday dataset
* Dimensionality reduction with PCA
* K-Fold data split

In [1]:
import pandas as pd

## Load Data
* BlackFriday dataset (available from Kaggle)

In [2]:
bfri = pd.read_csv("../../Downloads/BlackFriday.csv")

## Explore Data

In [3]:
bfri.shape

(537577, 12)

* there are 12 variables
* there are 537577 variables

In [4]:
bfri.head(10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871


In [5]:
bfri.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,537577.0,537577.0,537577.0,537577.0,370591.0,164278.0,537577.0
mean,1002992.0,8.08271,0.408797,5.295546,9.842144,12.66984,9333.859853
std,1714.393,6.52412,0.491612,3.750701,5.087259,4.124341,4981.022133
min,1000001.0,0.0,0.0,1.0,2.0,3.0,185.0
25%,1001495.0,2.0,0.0,1.0,5.0,9.0,5866.0
50%,1003031.0,7.0,0.0,5.0,9.0,14.0,8062.0
75%,1004417.0,14.0,1.0,8.0,15.0,16.0,12073.0
max,1006040.0,20.0,1.0,18.0,18.0,18.0,23961.0


* count the number of observations in each range of a categorical variable - using value.counts( )

In [6]:
bfri['Stay_In_Current_City_Years'].value_counts()

1     189192
2      99459
3      93312
4+     82889
0      72725
Name: Stay_In_Current_City_Years, dtype: int64

* count the number of null values for each variable

In [7]:
bfri.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            166986
Product_Category_3            373299
Purchase                           0
dtype: int64

## Clean Data
* address the null values in these 2 variables
* fill them with the max value, from each column

In [8]:
b = ['Product_Category_2','Product_Category_3']

for i in b:
    exec("bfri.%s.fillna(bfri.%s.value_counts().idxmax(), inplace=True)" %(i,i))

## Set X & y
* the Target is the Purchase variable
* X, will be every other column

In [9]:
X = bfri.drop(["Purchase"], axis=1)

* apply a label encoder to this data

In [10]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

X = X.apply(LE.fit_transform)

* convert Categorical data into Numerical

In [11]:
X.Gender = pd.to_numeric(X.Gender)
X.Age = pd.to_numeric(X.Age)
X.Occupation = pd.to_numeric(X.Occupation)
X.City_Category = pd.to_numeric(X.City_Category)
X.Stay_In_Current_City_Years = pd.to_numeric(X.Stay_In_Current_City_Years)
X.Marital_Status = pd.to_numeric(X.Marital_Status)
X.Product_Category_1 = pd.to_numeric(X.Product_Category_1)
X.Product_Category_2 = pd.to_numeric(X.Product_Category_2)
X.Product_Category_3 = pd.to_numeric(X.Product_Category_3)

* set the Target, y

In [12]:
y = bfri["Purchase"]

## Standardize Data
* center and scale

In [14]:
from sklearn.preprocessing import StandardScaler

SS = StandardScaler()

In [15]:
Xs = SS.fit_transform(X)

## Principal Component Analysis
* speeds up ML algorithms
* replaces original variables with, a smaller number of, Principal Components (PC)
* run PCA on Training data alone
* centering is required for PCA - Not for sparse data

In [16]:
from sklearn.decomposition import PCA

pc = PCA(4)

* 4 - represents the number of PCs desired

In [17]:
principalComponents = pc.fit_transform(X)

In [18]:
pc.explained_variance_ratio_

array([7.35761510e-01, 2.64215790e-01, 1.11932596e-05, 6.28064654e-06])

In [19]:
principalDf = pd.DataFrame(data = principalComponents, columns = ["component 1", "component 2", "component 3", 
                                                                 "component 4"])

#### Split dataset into k folds
* each fold serves as a validation set - while remaining folds (k - 1) are used for Training

In [20]:
from sklearn.model_selection import KFold

kf = KFold(20)

* use folds to train/test split the data 

In [21]:
for a,b in kf.split(principalDf):
    X_train, X_test = Xs[a], Xs[b]
    y_train, y_test = y[a], y[b]

## Using the Principal Components - Compare several models
* Linear Regression
* Decision Tree Regression
* Random Forest Regression
* Gradient Boost Regression

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


In [23]:
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()

#### Fit the Training data to each Regressor

In [24]:
fit1 = lr.fit(X_train, y_train)
fit2 = dtr.fit(X_train, y_train)
fit3 = rfr.fit(X_train, y_train)
fit4 = gbr.fit(X_train, y_train)

#### Check the Accuracy of the models on the Training data

In [25]:
print("Accuracy Score of Linear regression on train set", fit1.score(X_train, y_train)*100)
print("Accuracy Score of Decision Tree on train set", fit2.score(X_train, y_train)*100)
print("Accuracy Score of Random Forests on train set", fit3.score(X_train, y_train)*100)
print("Accuracy Score of Gradient Boosting on train set", fit4.score(X_train, y_train)*100)

Accuracy Score of Linear regression on train set 11.79662402594145
Accuracy Score of Decision Tree on train set 100.0
Accuracy Score of Random Forests on train set 94.24207748036615
Accuracy Score of Gradient Boosting on train set 65.51860129045335


#### Check the Accuracy on the models on the Test data

In [26]:
print("Accuracy Score of Linear regression on test set", fit1.score(X_test, y_test)*100)
print("Accuracy Score of Decision Tree on test set", fit2.score(X_test, y_test)*100)
print("Accuracy Score of Random Forests on test set", fit3.score(X_test, y_test)*100)
print("Accuracy Score of Gradient Boosting on testset", fit4.score(X_test, y_test)*100)

Accuracy Score of Linear regression on test set 12.490705794515025
Accuracy Score of Decision Tree on test set 41.17176733157753
Accuracy Score of Random Forests on test set 66.04072431959615
Accuracy Score of Gradient Boosting on testset 64.51076408418355
