## PCA 【Principal Component Analysis】主成分分析

### 使用numpy 模拟 PCA 计算过程

In [1]:
import numpy as np

In [2]:
# Toy data set: five samples (rows), two features (columns) on very different scales.
A = np.array(
    [
        [3, 2000],
        [2, 3000],
        [4, 5000],
        [5, 8000],
        [1, 2000],
    ],
    dtype='float32',
)

# Mean-centering: subtract each column's mean so every feature averages to zero.
mean = A.mean(axis=0)
centered = A - mean

# Scaling: divide each column by its range (max - min) so both features are comparable.
scope = centered.max(axis=0) - centered.min(axis=0)
norm = centered / scope
norm

array([[ 0.        , -0.33333334],
       [-0.25      , -0.16666667],
       [ 0.25      ,  0.16666667],
       [ 0.5       ,  0.6666667 ],
       [-0.5       , -0.33333334]], dtype=float32)

In [3]:
# 2x2 matrix norm^T . norm of the centered, scaled data; its singular vectors
# are the directions the data is projected onto below.
gram = norm.T.dot(norm)

# Note: np.linalg.svd returns the right-hand factor already transposed,
# so `V` here holds V^T rather than V.
U, S, V = np.linalg.svd(gram)
U

array([[-0.6771095, -0.7358823],
       [-0.7358823,  0.6771095]], dtype=float32)

In [4]:
# Keep only the first column of U (the leading direction) as a column vector.
# Slicing with `:1` preserves the 2-D shape (n_features, 1) without hard-coding
# the number of features, unlike reshape(2, 1).
U_reduce = U[:, :1]
U_reduce

array([[-0.6771095],
       [-0.7358823]], dtype=float32)

In [5]:
# Project every normalized sample onto the retained direction:
# (n_samples, 2) . (2, 1) -> (n_samples, 1), i.e. the reduced 1-D representation.
R = norm.dot(U_reduce)
R

array([[ 0.2452941 ],
       [ 0.29192442],
       [-0.29192442],
       [-0.8291429 ],
       [ 0.58384883]], dtype=float32)

In [7]:
# Map the 1-D coordinates back into the 2-D normalized space:
# (n_samples, 1) . (1, 2) -> (n_samples, 2). This is an approximation of `norm`.
Z = R.dot(U_reduce.T)
Z

array([[-0.16609095, -0.18050757],
       [-0.1976648 , -0.21482201],
       [ 0.1976648 ,  0.21482201],
       [ 0.56142056,  0.6101516 ],
       [-0.3953296 , -0.42964402]], dtype=float32)

In [8]:
# Undo the preprocessing: multiply by the per-column range and add the mean
# back, giving the reconstruction of A in its original units.
A_approx = Z * scope + mean
A_approx

array([[2.3356361e+00, 2.9169546e+03],
       [2.2093408e+00, 2.7110679e+03],
       [3.7906592e+00, 5.2889321e+03],
       [5.2456822e+00, 7.6609092e+03],
       [1.4186816e+00, 1.4221360e+03]], dtype=float32)

### 使用 sklearn 进行 PCA 降维运算

In [9]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [14]:
def std_PCA(**argv):
    """Build a two-step pipeline: min-max scaling followed by PCA.

    Args:
        **argv: Keyword arguments forwarded unchanged to ``PCA``
            (for example ``n_components``).

    Returns:
        A ``Pipeline`` whose steps are ``'scaler'`` (a default
        ``MinMaxScaler``) and ``'pca'`` (``PCA(**argv)``), in that order.
    """
    steps = [
        ('scaler', MinMaxScaler()),
        ('pca', PCA(**argv)),
    ]
    return Pipeline(steps)

In [15]:
# Build the MinMaxScaler -> PCA pipeline keeping a single component, then fit
# it on A and project A onto that component in one step.
# The sign of a principal component is arbitrary, so R2 may come out as the
# negation of the projection computed by hand with numpy.
pca = std_PCA(n_components=1)
R2 = pca.fit_transform(A)
R2

array([[-0.24529408],
       [-0.29192442],
       [ 0.29192442],
       [ 0.829143  ],
       [-0.58384883]], dtype=float32)

In [16]:
# Send the 1-D projection back through the fitted pipeline to approximate the
# original rows of A in their original units.
A_restored = pca.inverse_transform(R2)
A_restored

array([[2.3356361e+00, 2.9169546e+03],
       [2.2093408e+00, 2.7110679e+03],
       [3.7906592e+00, 5.2889321e+03],
       [5.2456822e+00, 7.6609097e+03],
       [1.4186816e+00, 1.4221359e+03]], dtype=float32)