주성분 분석

In [2]:
# Find all principal components of the training set with singular value
# decomposition (SVD). This cell first builds the small noisy 3-D dataset.
import numpy as np

np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
cos_a, sin_a = np.cos(angles), np.sin(angles)

# The three randn draws must stay in this order to reproduce the same data.
x0 = cos_a + sin_a / 2 + noise * np.random.randn(m) / 2
x1 = sin_a * 0.7 + noise * np.random.randn(m) / 2
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

X = np.empty((m, 3))
X[:, 0], X[:, 1], X[:, 2] = x0, x1, x2

In [9]:
# Center the data and decompose it; the first two rows of Vt
# (i.e. the first two columns of Vt.T) are taken as PC1 and PC2.
X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1, c2 = Vt[0], Vt[1]  # PC1, PC2

In [24]:
# Display the first principal component (PC1).
c1

array([0.93636116, 0.29854881, 0.18465208])

In [13]:
# Expand the 1-D vector of singular values into a full matrix with the
# same shape as X_centered so that U, S and Vt can be multiplied back together.
m, n = X.shape  # (60, 3)

S = np.zeros(X_centered.shape)
diag_idx = np.arange(n)
S[diag_idx, diag_idx] = s  # values only on the main diagonal, zeros elsewhere

In [15]:
# Check that U @ S @ Vt reproduces the centered data.
np.allclose(X_centered, U @ S @ Vt)

True

In [19]:
# Project the training set onto the plane defined by the first two
# principal components.
W2 = Vt[:2].T  # the first two principal components as columns
X2D = X_centered @ W2

In [20]:
# Display the 2-D projection computed from the SVD.
X2D

array([[-1.26203346, -0.42067648],
       [ 0.08001485,  0.35272239],
       [-1.17545763, -0.36085729],
       [-0.89305601,  0.30862856],
       [-0.73016287,  0.25404049],
       [ 1.10436914, -0.20204953],
       [-1.27265808, -0.46781247],
       [ 0.44933007, -0.67736663],
       [ 1.09356195,  0.04467792],
       [ 0.66177325,  0.28651264],
       [-1.04466138,  0.11244353],
       [ 1.05932502, -0.31189109],
       [-1.13761426, -0.14576655],
       [-1.16044117, -0.36481599],
       [ 1.00167625, -0.39422008],
       [-0.2750406 ,  0.34391089],
       [ 0.45624787, -0.69707573],
       [ 0.79706574,  0.26870969],
       [ 0.66924929, -0.65520024],
       [-1.30679728, -0.37671343],
       [ 0.6626586 ,  0.32706423],
       [-1.25387588, -0.56043928],
       [-1.04046987,  0.08727672],
       [-1.26047729, -0.1571074 ],
       [ 1.09786649, -0.38643428],
       [ 0.7130973 , -0.64941523],
       [-0.17786909,  0.43609071],
       [ 1.02975735, -0.33747452],
       [-0.94552283,

In [21]:
# Same reduction using scikit-learn
from sklearn.decomposition import PCA

# Keep two components; fit_transform is given the raw X rather than X_centered.
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [22]:
X2D[:5] # With scikit-learn both axes come out flipped (only the signs differ from the SVD result).

array([[ 1.26203346,  0.42067648],
       [-0.08001485, -0.35272239],
       [ 1.17545763,  0.36085729],
       [ 0.89305601, -0.30862856],
       [ 0.73016287, -0.25404049]])

In [23]:
pca.components_[0]  # unit vector of PC1 (first row of components_)

array([-0.93636116, -0.29854881, -0.18465208])

In [26]:
# Explained variance ratio
pca.explained_variance_ratio_

# 84.2% of the dataset's variance lies along the first PC and 14.6% along the second PC.

array([0.84248607, 0.14631839])

In [29]:
# Choosing the right number of dimensions
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

X, y = mnist["data"], mnist["target"]

# NOTE(review): no random_state is passed, so the split (and the numbers
# derived from it below) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit PCA with all components kept, then pick the smallest number of
# components whose cumulative explained variance ratio reaches 95%.
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)  # cumulative sum
d = np.argmax(cumsum >= 0.95) + 1
d

  warn(


153

In [30]:
# Display the cumulative explained variance ratio per number of components.
cumsum

array([0.09792858, 0.16918255, 0.23058781, 0.28488202, 0.33394139,
       0.37697755, 0.40965576, 0.43855474, 0.46609225, 0.48952161,
       0.51047203, 0.5308378 , 0.54786123, 0.56475213, 0.5806216 ,
       0.5954588 , 0.60867273, 0.6214041 , 0.63328935, 0.64482663,
       0.65548608, 0.6656263 , 0.67523811, 0.68437167, 0.69322418,
       0.70162074, 0.70970978, 0.71757011, 0.7249473 , 0.73182003,
       0.73834683, 0.74479121, 0.7508099 , 0.75671383, 0.76238186,
       0.76783612, 0.77286595, 0.77773048, 0.78251842, 0.78719169,
       0.79174491, 0.79619636, 0.80036573, 0.8043255 , 0.80815976,
       0.81190639, 0.81551796, 0.81899999, 0.8223805 , 0.82558925,
       0.82876061, 0.83185067, 0.83483074, 0.83769869, 0.84052413,
       0.84321858, 0.84590974, 0.84848407, 0.85102041, 0.85345062,
       0.85585449, 0.85823931, 0.86052105, 0.86273756, 0.86485904,
       0.86693298, 0.86895776, 0.87091498, 0.87282984, 0.87472256,
       0.87658385, 0.87836792, 0.8801357 , 0.88186572, 0.88350

In [31]:
# Let PCA pick the number of components itself from a target variance ratio.
pca = PCA(n_components = 0.95)  # fraction of variance to preserve
X_reduced = pca.fit_transform(X_train)
pca.n_components_

153

In [34]:
# Shape of the reduced training set.
X_reduced.shape

(52500, 153)

In [32]:
# Total fraction of variance kept by the selected components.
pca.explained_variance_ratio_.sum()

0.9500608823439146

In [43]:
# Reconstruct the data from its PCA-compressed form
pca = PCA(n_components=153)
X_reduced = pca.fit_transform(X_train)
# Map the reduced data back to the original feature space.
X_recovered = pca.inverse_transform(X_reduced)
X_recovered.shape

(52500, 784)

In [42]:
# Randomized PCA: uses a stochastic algorithm
# When d << n it is faster than full SVD
rpca = PCA(n_components=153, svd_solver='randomized')
X_reduced_r = rpca.fit_transform(X_train)
X_reduced_r.shape

(52500, 153)

In [44]:
# Incremental PCA: split the training set into mini-batches and feed them in one at a time
# Motivation: SVD-based PCA needs the whole training set loaded in memory
from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=153)
for X_batch in np.array_split(X_train, n_batches) :  # 100 mini-batches
    inc_pca.partial_fit(X_batch)
    
# Project the whole training set with the incrementally fitted model.
X_reduced_i = inc_pca.transform(X_train)

In [45]:
# Compare the PCA projection with the IncrementalPCA projection.
# NOTE(review): the recorded result is False; presumably because the incremental
# fit only approximates the full-batch solution (or differs in sign) — confirm.
np.allclose(X_reduced, X_reduced_i)

False

In [46]:
# Using memmap
filename = "my_mnist.data"
m, n = X_train.shape

# Open a file-backed float32 array of the training set's shape and copy the data into it.
X_mm = np.memmap(filename, dtype='float32', mode='write', shape=(m, n))
X_mm[:] = X_train

In [47]:
# Drop the writable memmap before the file is re-opened read-only
# (numpy documents that deleting a memmap flushes pending changes to disk).
del X_mm

In [48]:
# Re-open the file read-only and fit IncrementalPCA on the memmap, letting
# it process the data in batches of batch_size rows.
X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=(m,n))
batch_size = m//n_batches
# 153 components, matching the dimensionality used by the other reductions
# in this notebook (this cell previously used 154).
inc_pca = IncrementalPCA(n_components=153, batch_size=batch_size)
inc_pca.fit(X_mm)

커널 PCA

In [52]:
from sklearn.datasets import make_swiss_roll

# Generate 1000 noisy swiss-roll samples; X holds the points and t is the
# second array returned by make_swiss_roll (used later to derive labels).
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

In [53]:
from sklearn.decomposition import KernelPCA
# Reduce the swiss roll to 2 components with an RBF-kernel PCA.
rbf_pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.04)
X_reduced_rbf = rbf_pca.fit_transform(X)

In [58]:
# Shapes before and after the kernel PCA reduction.
X.shape, X_reduced_rbf.shape

((1000, 3), (1000, 2))

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Binary labels obtained by thresholding t.
y = t > 6.9

# Kernel PCA down to 2 components, followed by a logistic regression classifier.
steps = [
    ('kpca', KernelPCA(n_components=2)),
    ('lr', LogisticRegression(solver='lbfgs')),
]
clf = Pipeline(steps)

# Search over the kernel type and gamma of the kPCA step.
params = [{
    'kpca__gamma': np.linspace(0.03, 0.05, 10),
    'kpca__kernel': ['rbf', 'sigmoid'],
}]

grid = GridSearchCV(clf, params, cv=3)
grid.fit(X, y)

In [65]:
# Show the kernel/gamma combination selected by the grid search.
print(grid.best_params_)

{'kpca__gamma': 0.043333333333333335, 'kpca__kernel': 'rbf'}


In [69]:
# Refit the RBF kernel PCA with (approximately) the best gamma found by the grid search.
rbf_pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.0433,
                   fit_inverse_transform=True) # KernelPCA has no inverse_transform unless this is enabled
X_reduced = rbf_pca.fit_transform(X)
# Map the reduced points back to the original space (the pre-images).
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [71]:
# Reconstruction pre-image error
from sklearn.metrics import mean_squared_error
mean_squared_error(X, X_preimage)

32.786308795766104

지역 선형 임베딩(LLE) : 투영에 의존하지 않는 매니폴드 학습
- 각 훈련샘플이 가장 가까운 이웃에 얼마나 선형적으로 연관되어 있는지 측정

In [72]:
from sklearn.manifold import LocallyLinearEmbedding
# Embed X into 2 components with LLE, using 10 neighbors per sample.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced_lle = lle.fit_transform(X)

# 연습문제

In [73]:
# 9) Train a random forest classifier on MNIST and evaluate the model, then
# use PCA to reduce dimensionality so that the explained variance is 95%.
# The first 60,000 instances form the training set, the remainder the test set.
n_train = 60000

X_train, X_test = mnist['data'][:n_train], mnist['data'][n_train:]
y_train, y_test = mnist['target'][:n_train], mnist['target'][n_train:]

In [74]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed random_state keeps runs repeatable.
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [75]:
import time

# Time the random forest fit on the original (unreduced) features.
# perf_counter() is the clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted mid-run.
t0 = time.perf_counter()
rnd_clf.fit(X_train, y_train)
t1 = time.perf_counter()

print("Training took {:.2f}s".format(t1 - t0))

Training took 70.51s


In [76]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the forest trained on the original features.
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9705

In [77]:
from sklearn.decomposition import PCA

# Fit PCA on the training set only, keeping 95% of the variance.
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)

In [78]:
# Train an identically configured forest on the PCA-reduced features and time it.
# perf_counter() is the clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted mid-run.
rnd_clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
t0 = time.perf_counter()
rnd_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()

print("Training took {:.2f}s".format(t1 - t0)) # slower than on the original features

Training took 322.22s


In [79]:
# Project the test set with the PCA fitted on the training set (transform only, no refit).
X_test_reduced = pca.transform(X_test)

# Test-set accuracy of the forest trained on the reduced features.
y_pred = rnd_clf2.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

0.9481

In [80]:
from sklearn.linear_model import LogisticRegression

# Time a multinomial logistic regression fit on the original features.
# perf_counter() is the clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted mid-run.
# NOTE(review): the recorded run stops at the lbfgs iteration limit (see the
# warning below this cell), so the timing and accuracy describe a model that
# has not converged; consider raising max_iter or scaling the inputs.
log_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=42)
t0 = time.perf_counter()
log_clf.fit(X_train, y_train)
t1 = time.perf_counter()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
# Report the elapsed time between t0 and t1.
print("Training took {:.2f}s".format(t1 - t0))

Training took 11.02s


In [82]:
# Test-set accuracy of the logistic regression trained on the original features.
y_pred = log_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9255

In [84]:
# Train the same logistic regression on the PCA-reduced features and time it.
# perf_counter() is the clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted mid-run.
# NOTE(review): the recorded run also stops at the lbfgs iteration limit, so
# this model has not converged either; consider raising max_iter or scaling.
log_clf2 = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=42)
t0 = time.perf_counter()
log_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()
print("Training took {:.2f}s".format(t1 - t0))

Training took 3.57s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [85]:
# Test-set accuracy of the logistic regression trained on the reduced features.
y_pred = log_clf2.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

0.9201