In [1]:
# Importing libraries
import numpy as np
import pandas as pd
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix

# CV and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# sample example -- Principal Component Analysis
# Generally this is called a data reduction technique.

In [3]:
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv(url, names=names)
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# copy data into arrays
array = df.values
X = array[:,0:8]
y = array[:,8]

In [5]:
# feature extraction
pca = PCA(n_components=3)

fit = pca.fit(X)

# summarize components
# The amount of variance explained by each of the selected components.
print("Explained Variance: ", fit.explained_variance_ratio_)

Explained Variance:  [0.88854663 0.06159078 0.02579012]


In [6]:
fit.components_

array([[-2.02176587e-03,  9.78115765e-02,  1.60930503e-02,
         6.07566861e-02,  9.93110844e-01,  1.40108085e-02,
         5.37167919e-04, -3.56474430e-03],
       [-2.26488861e-02, -9.72210040e-01, -1.41909330e-01,
         5.78614699e-02,  9.46266913e-02, -4.69729766e-02,
        -8.16804621e-04, -1.40168181e-01],
       [-2.24649003e-02,  1.43428710e-01, -9.22467192e-01,
        -3.07013055e-01,  2.09773019e-02, -1.32444542e-01,
        -6.39983017e-04, -1.25454310e-01]])

In [7]:
# so where are the features, which features ???

# PCA does not "discard" or "retain" any of your pre-defined features 
# (encoded by the columns you specify). It mixes all of them 
# (by weighted sums) to find directions of maximum variance.

In [8]:
# Example - using iris dataset ****************************************

In [9]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"  
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']  
dataset = pd.read_csv(url, names=names)  

In [10]:
dataset.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [11]:
# the first preprocessing step is to divide the dataset into a feature set and 
# corresponding labels. 

X = dataset.drop('Class', 1)  
y = dataset['Class']  

In [12]:
# The next preprocessing step is to divide data into training and test sets. 

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [13]:
# PCA performs best with a normalized feature set. We will perform standard 
# scalar normalization to normalize our feature set.

In [14]:
sc = StandardScaler()  

X_train_std = sc.fit_transform(X_train)  

X_test_std = sc.transform(X_test)  

In [15]:
# PCA depends only upon the feature set and not the label data. 
# PCA can be considered as an unsupervised machine learning technique.

In [16]:
# Performing PCA using Scikit-Learn is a two-step process:

# Initialize the PCA class by passing the number of components to the constructor.
# Call the fit and then transform methods by passing the feature set to these methods. 
# The transform method returns the specified number of principal components.

In [17]:
# with all 4 dimensions ********************************************

In [18]:
# We did not specify the number of components in the constructor. 
# Hence, all four of the features in the feature set will be returned for both the 
# training and test sets.
pca = PCA()  

X_train = pca.fit_transform(X_train_std)  

X_test  = pca.transform(X_test_std)  

In [19]:
# The PCA class contains explained_variance_ratio_ which returns the variance caused by 
# each of the principal components. 

explained_variance = pca.explained_variance_ratio_  

explained_variance

array([0.72226528, 0.23974795, 0.03338117, 0.0046056 ])

In [20]:
# It can be seen that first principal component is responsible for 72.22% variance. 
# Similarly, the second principal component causes 23.9% variance in the dataset. 
# Collectively we can say that (72.22 + 23.9) 96.21% percent of the classification information 
# contained in the feature set is captured by the first two principal components.


In [21]:
# Let's first try to use 1 principal component to train our algorithm.

In [22]:
pca1 = PCA(n_components = 1)  

X_train1 = pca.fit_transform(X_train_std)  
X_test1  = pca.transform(X_test_std)  

In [23]:
len(X_train1)

120

In [24]:
X_train1[:5, :]

array([[ 1.27228206,  0.35787873,  0.18033696, -0.21868382],
       [ 0.15223177, -0.29983528,  0.65712963, -0.24932075],
       [-2.18764183,  0.61715426, -0.18088705,  0.00366971],
       [ 0.9419134 ,  0.01218105,  0.32961625, -0.01239281],
       [ 1.76227837, -0.27106639,  0.33667892,  0.09299476]])

In [25]:
classifier = RandomForestClassifier(max_depth=2, random_state=0)  
classifier.fit(X_train1, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test1)  



In [26]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print(accuracy_score(y_test, y_pred))  

[[11  0  0]
 [ 0 10  3]
 [ 0  1  5]]
0.8666666666666667


In [27]:
# It can be seen from the output that with 4 components, the random forest algorithm is able to correctly predict 28 out of 30 instances, 
# resulting in 86.66% accuracy.

In [28]:
# Results with 2 dimensions **********************************************

In [29]:
pca2 = PCA(n_components=2)

In [30]:
X_train2 = pca2.fit_transform(X_train_std)  
X_test2 = pca2.transform(X_test_std)  

In [31]:
explained_variance = pca2.explained_variance_ratio_  

explained_variance

array([0.72226528, 0.23974795])

In [32]:
X_train2[:5]

array([[ 1.27228206,  0.35787873],
       [ 0.15223177, -0.29983528],
       [-2.18764183,  0.61715426],
       [ 0.9419134 ,  0.01218105],
       [ 1.76227837, -0.27106639]])

In [33]:
classifier = RandomForestClassifier(max_depth=2, random_state=0)  
classifier.fit(X_train2, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test2)  



In [34]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print(accuracy_score(y_test, y_pred))  

[[11  0  0]
 [ 0 10  3]
 [ 0  2  4]]
0.8333333333333334


In [35]:
# With two principal components the classification accuracy decreases to 83.33% 
# compared to 86.66% for 1 component.

In [36]:
# Results with 3 dimensions **********************************************

In [37]:
pca3 = PCA(n_components=3)

In [38]:
X_train3 = pca3.fit_transform(X_train_std)  
X_test3  = pca3.transform(X_test_std)  

In [39]:
explained_variance = pca3.explained_variance_ratio_  

explained_variance

array([0.72226528, 0.23974795, 0.03338117])

In [40]:
classifier = RandomForestClassifier(max_depth=2, random_state=0)  
classifier.fit(X_train3, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test3)  



In [41]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print(accuracy_score(y_test, y_pred))  

[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
0.9333333333333333


In [42]:
pca.explained_variance_ratio_


array([0.72226528, 0.23974795, 0.03338117, 0.0046056 ])

In [43]:
# With three principal components the classification accuracy again increases to 93.33%

In [44]:
# A general rule of thumb is to take number of principal of principal components that contribute 
# to significant variance and ignore those with diminishing variance returns. 

# A good way is to plot the variance against principal components and ignore the principal 
# components with diminishing values 