### 67 PCA (principal component analysis)

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
np.random.seed(42)

# Toy dataset, one tuple per sample: three numeric features followed by a
# binary class label stored as a float.
samples = [
    (5.1, 1.4, 0.2, 0.0),
    (4.9, 1.4, 0.2, 0.0),
    (4.7, 1.3, 0.2, 0.0),
    (4.6, 1.5, 0.2, 0.0),
    (5.0, 1.4, 0.2, 0.0),
    (5.4, 1.7, 0.4, 1.0),
    (4.6, 1.4, 0.3, 1.0),
    (5.0, 1.5, 0.2, 1.0),
]
df = pd.DataFrame(samples, columns=['var1', 'var2', 'var3', 'class'])
df

Unnamed: 0,var1,var2,var3,class
0,5.1,1.4,0.2,0.0
1,4.9,1.4,0.2,0.0
2,4.7,1.3,0.2,0.0
3,4.6,1.5,0.2,0.0
4,5.0,1.4,0.2,0.0
5,5.4,1.7,0.4,1.0
6,4.6,1.4,0.3,1.0
7,5.0,1.5,0.2,1.0


In [4]:
# Separate the label column from the feature columns; df itself is left untouched.
y = df['class'].copy()
X = df.drop(columns='class')

# Standardize each feature column (StandardScaler removes the mean and
# scales to unit variance) so that no feature dominates the PCA by scale.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std

array([[ 0.72932496, -0.4472136 , -0.53881591],
       [-0.04862166, -0.4472136 , -0.53881591],
       [-0.82656829, -1.34164079, -0.53881591],
       [-1.2155416 ,  0.4472136 , -0.53881591],
       [ 0.34035165, -0.4472136 , -0.53881591],
       [ 1.89624489,  2.23606798,  2.33486893],
       [-1.2155416 , -0.4472136 ,  0.89802651],
       [ 0.34035165,  0.4472136 , -0.53881591]])

### 68 decomposition, eigenvalues & eigenvectors

In [14]:
# Covariance matrix of the standardized features. np.cov treats rows as
# variables by default, so the samples-by-features matrix is transposed.
# np.cov normalizes by n-1 while StandardScaler uses n, which is why the
# diagonal is 8/7 (about 1.143) rather than exactly 1.
cov = np.cov(X_std.T)
cov

array([[1.14285714, 0.7206672 , 0.52895264],
       [0.7206672 , 1.14285714, 0.82616845],
       [0.52895264, 0.82616845, 1.14285714]])

In [15]:
# Eigen-decomposition of the covariance matrix. The eigenvalue eig_vals[i]
# belongs to the eigenvector stored in the COLUMN eig_vecs[:, i].
# NOTE(review): np.linalg.eig does not promise any ordering of the eigenvalues;
# since cov is symmetric, np.linalg.eigh is also applicable (it returns them in
# ascending order) - consider switching and refreshing the outputs.
decomposition = np.linalg.eig(cov)
eig_vals, eig_vecs = decomposition[0], decomposition[1]
eig_vals, eig_vecs

(array([2.53374217, 0.62287275, 0.27195651]),
 array([[-0.53822284, -0.78400934,  0.30926612],
        [-0.61876769,  0.11844348, -0.77659365],
        [-0.57222611,  0.60934432,  0.54886863]]))

In [16]:
# Pair each eigenvalue magnitude with its eigenvector. Eigenvectors are the
# columns of eig_vecs, so iterate over the rows of the transpose.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))
eig_pairs

[(2.533742170806442, array([-0.53822284, -0.61876769, -0.57222611])),
 (0.6228727503870667, array([-0.78400934,  0.11844348,  0.60934432])),
 (0.2719565073779216, array([ 0.30926612, -0.77659365,  0.54886863]))]

In [17]:
# Order the (eigenvalue, eigenvector) pairs from largest to smallest eigenvalue.
# Sort on the eigenvalue only: comparing whole tuples falls through to comparing
# the ndarray eigenvectors whenever two eigenvalues are equal, which raises
# "ValueError: The truth value of an array with more than one element is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
eig_pairs

[(2.533742170806442, array([-0.53822284, -0.61876769, -0.57222611])),
 (0.6228727503870667, array([-0.78400934,  0.11844348,  0.60934432])),
 (0.2719565073779216, array([ 0.30926612, -0.77659365,  0.54886863]))]

In [19]:
# Projection matrix W: the eigenvectors of the two largest eigenvalues, placed
# side by side as columns (shape: n_features x 2).
# np.column_stack turns each 1-D eigenvector into a column, so the number of
# features no longer has to be hard-coded as in reshape(3, 1).
W = np.column_stack([vec for _, vec in eig_pairs[:2]])
W

array([[-0.53822284, -0.78400934],
       [-0.61876769,  0.11844348],
       [-0.57222611,  0.60934432]])

In [20]:
# Project the standardized samples onto the two selected principal axes.
X_pca = X_std @ W
X_pca

array([[ 0.1925065 , -0.95309152],
       [ 0.61121514, -0.34317411],
       [ 1.58336643,  0.16080423],
       [ 0.68583546,  0.67764108],
       [ 0.40186082, -0.64813282],
       [-3.74028189,  0.2009131 ],
       [ 0.41707935,  1.44723378],
       [-0.15158182, -0.54219375]])

### 69 group X_pca and class

In [22]:
# Wrap the projected coordinates in a DataFrame, one column per principal component.
pca_columns = ['pca_1', 'pca_2']
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca

Unnamed: 0,pca_1,pca_2
0,0.192507,-0.953092
1,0.611215,-0.343174
2,1.583366,0.160804
3,0.685835,0.677641
4,0.401861,-0.648133
5,-3.740282,0.200913
6,0.417079,1.447234
7,-0.151582,-0.542194


In [24]:
# Attach the class label to each projected sample (rows are matched by index).
df_pca = df_pca.assign(**{'class': df['class']})
df_pca

Unnamed: 0,pca_1,pca_2,class
0,0.192507,-0.953092,0.0
1,0.611215,-0.343174,0.0
2,1.583366,0.160804,0.0
3,0.685835,0.677641,0.0
4,0.401861,-0.648133,0.0
5,-3.740282,0.200913,1.0
6,0.417079,1.447234,1.0
7,-0.151582,-0.542194,1.0


### 70 sklearn PCA

In [25]:
from sklearn.decomposition import PCA
# Same reduction as the manual eigen-decomposition above, done with scikit-learn.
# The first component comes out with the opposite sign to the manual result;
# an eigenvector's sign is arbitrary, so both describe the same axis.
# Note: this rebinds X_pca, replacing the manually computed projection.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)
X_pca

array([[-0.1925065 , -0.95309152],
       [-0.61121514, -0.34317411],
       [-1.58336643,  0.16080423],
       [-0.68583546,  0.67764108],
       [-0.40186082, -0.64813282],
       [ 3.74028189,  0.2009131 ],
       [-0.41707935,  1.44723378],
       [ 0.15158182, -0.54219375]])

In [26]:
# Wrap the scikit-learn projection in a DataFrame, one column per principal component.
pca_columns = ['pca_1', 'pca_2']
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca

Unnamed: 0,pca_1,pca_2
0,-0.192507,-0.953092
1,-0.611215,-0.343174
2,-1.583366,0.160804
3,-0.685835,0.677641
4,-0.401861,-0.648133
5,3.740282,0.200913
6,-0.417079,1.447234
7,0.151582,-0.542194


In [27]:
# Attach the class label to each projected sample (rows are matched by index).
df_pca = df_pca.assign(**{'class': df['class']})
df_pca

Unnamed: 0,pca_1,pca_2,class
0,-0.192507,-0.953092,0.0
1,-0.611215,-0.343174,0.0
2,-1.583366,0.160804,0.0
3,-0.685835,0.677641,0.0
4,-0.401861,-0.648133,0.0
5,3.740282,0.200913,1.0
6,-0.417079,1.447234,1.0
7,0.151582,-0.542194,1.0


### 71 PCA distribution of variance

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Second toy dataset: five samples, ten numeric features (var5 holds integers).
# Note: this rebinds df, replacing the labelled dataset used in the sections above.
feature_names = [f'var{i}' for i in range(1, 11)]
feature_values = [
    [14.23, 13.20, 13.16, 14.37, 13.24],
    [1.71, 1.78, 2.36, 1.95, 2.59],
    [2.43, 2.14, 2.67, 2.50, 2.87],
    [15.6, 11.2, 18.6, 16.8, 21.0],
    [127, 100, 101, 113, 118],
    [2.8, 2.65, 2.8, 3.85, 2.8],
    [3.06, 2.76, 3.24, 3.49, 2.69],
    [0.28, 0.26, 0.3, 0.24, 0.39],
    [2.29, 1.28, 2.81, 2.18, 1.82],
    [5.64, 4.38, 5.68, 7.8, 4.32],
]
# Build column by column so each feature keeps its own dtype.
df = pd.DataFrame(dict(zip(feature_names, feature_values)))
df

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32


In [30]:
# Plain NumPy view of the feature table (samples x features) for scikit-learn.
data = df.to_numpy()
data

array([[ 14.23,   1.71,   2.43,  15.6 , 127.  ,   2.8 ,   3.06,   0.28,
          2.29,   5.64],
       [ 13.2 ,   1.78,   2.14,  11.2 , 100.  ,   2.65,   2.76,   0.26,
          1.28,   4.38],
       [ 13.16,   2.36,   2.67,  18.6 , 101.  ,   2.8 ,   3.24,   0.3 ,
          2.81,   5.68],
       [ 14.37,   1.95,   2.5 ,  16.8 , 113.  ,   3.85,   3.49,   0.24,
          2.18,   7.8 ],
       [ 13.24,   2.59,   2.87,  21.  , 118.  ,   2.8 ,   2.69,   0.39,
          1.82,   4.32]])

In [32]:
# Standardize the ten features, then project onto the first three principal components.
sc = StandardScaler()
data_std = sc.fit_transform(data)
# Note: this rebinds pca, replacing the two-component model fitted earlier.
pca = PCA(n_components=3)
data_pca = pca.fit_transform(data_std)
# The dataset has only five rows, so this slice displays the whole projection.
data_pca[:10]

array([[-1.08691029,  0.14280811, -1.67387628],
       [ 0.13230315,  3.58522339,  0.46236011],
       [ 0.75722232, -1.14202395,  1.82214925],
       [-3.07482769, -1.41918363,  0.19886055],
       [ 3.27221251, -1.16682392, -0.80949364]])

In [34]:
# Fraction of the total variance captured by each retained component.
results = pd.Series(pca.explained_variance_ratio_, name='variance_ratio').to_frame()
results

Unnamed: 0,variance_ratio
0,0.438684
1,0.35108
2,0.140614


In [36]:
# Add the running total of explained variance and a 1-based component number.
results = results.assign(
    cumulative=results['variance_ratio'].cumsum(),
    component=results.index + 1,
)
results

Unnamed: 0,variance_ratio,cumulative,component
0,0.438684,0.438684,1
1,0.35108,0.789764,2
2,0.140614,0.930378,3


### 72 95% variance ratio

In [37]:
# Re-display the standardized feature matrix used as PCA input
# (it has only five rows, so the slice shows all of it).
data_std[:10]

array([[ 1.08997918, -1.07836773, -0.37692258, -0.31778288,  1.48083171,
        -0.41015156,  0.04029297, -0.26923077,  0.42057591,  0.06020259],
       [-0.81286583, -0.87324344, -1.56504811, -1.6622489 , -1.14959304,
        -0.75194452, -0.96703116, -0.65384615, -1.56438516, -0.93789306],
       [-0.88676272,  0.82635788,  0.60635372,  0.5988985 , -1.0521699 ,
        -0.41015156,  0.64468744,  0.11538462,  1.44253606,  0.09188817],
       [ 1.3486183 , -0.37508443, -0.09013366,  0.04888967,  0.11690777,
         1.98239919,  1.48412422, -1.03846154,  0.20439203,  1.77122371],
       [-0.73896893,  1.50033771,  1.42575063,  1.3322436 ,  0.60402346,
        -0.41015156, -1.20207346,  1.84615385, -0.50311885, -0.98542142]])

In [38]:
# A float n_components between 0 and 1 asks scikit-learn to keep the smallest
# number of components whose cumulative explained variance exceeds that
# fraction (here 95%), instead of fixing the component count up front.
pca = PCA(n_components=0.95)
data_pca = pca.fit_transform(data_std)
# The dataset has only five rows, so this slice displays the whole projection.
data_pca[:10]

array([[-1.08691029,  0.14280811, -1.67387628,  1.09820867],
       [ 0.13230315,  3.58522339,  0.46236011, -0.35670337],
       [ 0.75722232, -1.14202395,  1.82214925,  0.88955187],
       [-3.07482769, -1.41918363,  0.19886055, -0.93026422],
       [ 3.27221251, -1.16682392, -0.80949364, -0.70079295]])

In [40]:
# Number of components the fitted model actually kept to reach the 95% target.
pca.n_components_

4