# Principal component analysis

In [7]:
import pandas as pd
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits dataset; the returned object is inspected in the next cells
digit = load_digits()

In [13]:
# List the public attributes exposed by the dataset object
dir(digit)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [25]:
# The same fields are also reachable as dictionary-style keys
digit.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [31]:
# Wrap the raw feature array in a DataFrame, labelling each column with its feature name
df = pd.DataFrame(data=digit.data, columns=digit.feature_names)

In [33]:
# Feature matrix: the pixel-column DataFrame (an alias of df, not a copy)
X = df
# Target vector: one label per row of X
y = digit.target
# End the cell with the value itself so that running it reproduces the array shown as output
y

array([0, 1, 2, ..., 8, 9, 8])

In [39]:
# Display the feature matrix; pandas renders a truncated preview of its rows and columns
X

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


### Train/test split and feature scaling

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [46]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn per-feature mean and standard deviation from the training rows only,
# then apply those same statistics to both splits so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Now train the model

In [62]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression fitted on all standardized pixel features
# (fit() returns the estimator itself, so construction and training can be chained)
log_reg = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

# Mean accuracy on the held-out, identically scaled test split
log_reg.score(X_test_scaled, y_test)

0.9722222222222222

### Use PCA to reduce the number of features

In [67]:
from sklearn.decomposition import PCA

**Use components such that 95% of variance is retained**

In [73]:
# A float n_components between 0 and 1 asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%)
# NOTE(review): PCA is fit on the full, unscaled X before any train/test split, so the
# test rows influence the components — consider fitting on the training split only
pc = PCA(n_components=0.95)
X_pc = pc.fit_transform(X)

# (number of samples, number of components kept)
X_pc.shape

(1797, 29)

## PCA reduced the 64 original columns to 29 components

In [77]:
# Split the 95%-variance PCA projection using the same test size and seed as the raw-feature split
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pc, y, test_size=0.2, random_state=42
)

# Standardize the components with statistics learned from the training rows only
sc = StandardScaler().fit(X_train_pca)
X_train_pca_scaled = sc.transform(X_train_pca)
X_test_pca_scaled = sc.transform(X_test_pca)

In [79]:
# Train and score with the labels produced by the SAME split as the PCA features
# (y_train_pca / y_test_pca). Reusing y_train / y_test from other cells only lines up
# while those cells happen to use an identical test_size and random_state, and silently
# pairs rows with the wrong labels once another cell rebinds those names.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_pca_scaled, y_train_pca)

# Mean accuracy on the held-out PCA features
model.score(X_test_pca_scaled, y_test_pca)

0.9666666666666667

In [83]:
# Project the data onto just 2 principal components to see how much accuracy is lost
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# (number of samples, 2)
X_pca.shape

(1797, 2)

In [85]:
# Split the 2-component projection (this cell uses seed 30 and applies no standardization).
# NOTE: this rebinds X_train_pca, X_test_pca, y_train, y_test and model defined by earlier cells.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=30
)

# fit() returns the estimator, so the classifier is built and trained in one expression
model = LogisticRegression(max_iter=1000).fit(X_train_pca, y_train)

# Mean accuracy on the held-out 2-component features
model.score(X_test_pca, y_test)

0.6083333333333333

We get lower accuracy (~60%) because only 2 components do not retain much of the feature information. However, in real life you will find many cases where using 2 or just a few PCA components can still give you pretty good accuracy.