# ISLR Sec. 10-4 page 401

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

# Read the USArrests table from the repository's Data directory.
df = pd.read_csv(filepath_or_buffer='../Data/USArrests.csv')
# Preview the first three rows to confirm the columns loaded as expected.
df.head(n=3)

Unnamed: 0,State,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0


In [2]:
# Per-column means of the numeric features. numeric_only=True skips the
# string State column, which pandas >= 2.0 no longer drops silently and
# would otherwise reject with a TypeError.
df.mean(numeric_only=True)

Murder        7.788
Assault     170.760
UrbanPop     65.540
Rape         21.232
dtype: float64

In [3]:
# Per-column sample variances of the numeric features. numeric_only=True
# skips the string State column, which pandas >= 2.0 no longer drops
# silently and would otherwise reject with a TypeError.
df.var(numeric_only=True)

Murder        18.970465
Assault     6945.165714
UrbanPop     209.518776
Rape          87.729159
dtype: float64

In [4]:
# Standardize the features before running PCA.
# Column 0 is the State label, so only the columns from position 1 onward
# are taken as the numeric feature matrix.
data = df.iloc[:, 1:].values
# Reproducible row permutation. After this step, row i of the array no
# longer corresponds to row i of df, so scores cannot be matched to states
# by position.
data_shuffled = shuffle(data, random_state=0)
# Center every column to zero mean and rescale it to unit variance.
data_scaled = scale(data_shuffled, axis=0, with_mean=True, with_std=True)

In [6]:
# After standardization each column mean should be zero up to
# floating-point round-off.
print(np.mean(data_scaled, axis=0))

[  1.02140518e-16   1.73194792e-16  -4.35207426e-16  -3.10862447e-16]


In [7]:
# After standardization each column should have unit (population) standard
# deviation.
print(np.std(data_scaled, axis=0))

[ 1.  1.  1.  1.]


In [8]:
# Compute the principal components of the standardized data.
pca = PCA()
# Scores: one row per observation, one column per component.
principle_components = pca.fit_transform(data_scaled)
# components_ stores one component per row; it is transposed below so that
# each column of df_loadings is a component and each row an original feature.
loadings = pca.components_
# Build the labels from the number of fitted components instead of
# hard-coding four, so the cell keeps working if the feature count changes.
col = ['PC%d' % k for k in range(1, loadings.shape[0] + 1)]
df_PCA = pd.DataFrame(principle_components, columns=col)
# Column 0 of df is the State label, so the feature names start at position 1.
df_loadings = pd.DataFrame(loadings.T, index=df.columns[1:], columns=col)

### Principal Components

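Note that the sign of each principal component is indeterminate: flipping the sign of a loading vector (and of the corresponding scores) gives an equally valid solution.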

In [12]:
# Sanity check: one row of scores per observation, one column per component.
df_PCA.shape

(50, 4)

In [10]:
# Show the component scores of the first ten (shuffled) observations.
df_PCA.head(n=10)

Unnamed: 0,PC1,PC2,PC3,PC4
0,-2.383915,0.018082,-0.036855,0.033137
1,-1.6398,-0.210973,-0.259801,0.499104
2,-0.912657,1.57046,-0.050782,-0.902807
3,0.999742,-0.860251,-0.188083,-0.652864
4,1.763164,0.745957,-0.054781,0.834653
5,2.874395,0.7756,-1.16338,-0.314515
6,-0.863772,1.491978,1.369946,0.613569
7,1.682577,0.823184,0.643075,0.013484
8,-1.692682,0.632261,-0.15307,-0.067317
9,2.52398,1.542934,-0.598557,0.341996


### Loadings

In [13]:
# Sanity check: one row per original feature, one column per component.
df_loadings.shape

(4, 4)

In [14]:
# Display the loading matrix: each column is one component's loading vector,
# indexed by the original feature names.
df_loadings

Unnamed: 0,PC1,PC2,PC3,PC4
Murder,0.535899,-0.418181,0.341233,-0.649228
Assault,0.583184,-0.187986,0.268148,0.743407
UrbanPop,0.278191,0.872806,0.378016,-0.133878
Rape,0.543432,0.167319,-0.817778,-0.089024


After we scale the data, PCA concentrates the variance in the leading principal components. The dimension of the feature space can then be reduced by dropping the principal components whose variance is small enough.

In [15]:
# Variances of the original (unscaled) features, for comparison with the
# component variances. numeric_only=True skips the string State column,
# which pandas >= 2.0 would otherwise reject with a TypeError.
df.var(numeric_only=True)

Murder        18.970465
Assault     6945.165714
UrbanPop     209.518776
Rape          87.729159
dtype: float64

In [16]:
# Sample variance (ddof=1, the pandas default) of each component's scores.
df_PCA.var(ddof=1)

PC1    2.530859
PC2    1.009964
PC3    0.363840
PC4    0.176969
dtype: float64

### Proportion of Variance Explained (PVE)

In [17]:
# Fraction of the total variance captured by each component, reshaped into
# a single row so it can be shown as a one-row table with the PC labels.
PVE = pca.explained_variance_ratio_.reshape(1, -1)
df_PVE = pd.DataFrame(PVE, columns=col)
df_PVE

Unnamed: 0,PC1,PC2,PC3,PC4
0,0.62006,0.247441,0.089141,0.043358


Notice how most of the variance is concentrated in the first principal component, `PC1`, while the fourth principal component, `PC4`, accounts for the smallest proportion of the variance.