# Principal component analysis (PCA)
- PCA는 feature 공간에서 데이터의 분산이 최대가 되는, 서로 직교하는 방향(주성분)을 찾는 것이 목적

In [None]:
import pandas as pd

# Download the raw Wine data from the UCI repository. header=None tells
# pandas not to treat the first record as column names.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)

# Offline fallback: if the UCI machine learning repository is temporarily
# unavailable, un-comment the next line to load the dataset from a local
# path instead.
# df_wine = pd.read_csv('wine.data', header=None)

# Name the columns explicitly: the class label first, then the features.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Last expression of the cell: shows the first rows as the cell output.
df_wine.head()

## Data processing
- 학습 / 테스트 데이터셋으로 분리
- 데이터 standardization

In [None]:
from sklearn.model_selection import train_test_split

# Column 0 is the class label; every remaining column is a feature.
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values

# Hold out 30% of the samples for testing. The split is stratified on y and
# uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaling to both splits so no test-set statistics are used for fitting.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

## 공분산행렬(covariance matrix) 에 대한 eigen-decomposition
- 행렬 $A$에 대한 eigen-decomposition은 다음과 같이 계산됨  
$$
    A=Q\Lambda Q^T
$$

- $Q$는 $A$의 고유벡터(eigenvector)를 열(column)로 가지는 orthonormal matrix
- $\Lambda$ 는 고유값들(eigenvalues)로 이루어진 대각행렬(diagonal matrix)
- 위와 같이 분해되고 모든 고유값이 실수가 되려면 $A$는 대칭행렬이어야 함 (공분산행렬은 항상 대칭)

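- 데이터 행렬 $X \in \mathbb{R}^{n \times d}$ 를 고유벡터(주성분 방향) $v \in \mathbb{R}^{d}$ 에 사영(projection)하면, 각 샘플 $x^{(i)}$ 의 주성분 값 $z^{(i)} = x^{(i)T} v$ 를 얻음

$$
z := \begin{bmatrix} z^{(1)} \\ \vdots \\ z^{(n)} \end{bmatrix} = Xv \in
\mathbb{R}^n
$$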

In [None]:
import numpy as np
# Covariance matrix of the standardized training features. The data is
# transposed so that each feature becomes one row, which is what np.cov
# treats as a variable by default.
cov_mat = np.cov(X_train_std.T)

print('Dim of X_train_std', np.shape(X_train_std))
print('Dim of cov matrix', np.shape(cov_mat))

# A covariance matrix is symmetric, so use eigh instead of eig: eigh is
# designed for symmetric/Hermitian input and always returns real
# eigenvalues, whereas eig can return complex values because of round-off.
# NOTE: eigh returns eigenvalues in ascending order; the cells that consume
# eigen_vals / eigen_vecs sort them explicitly, so the order is harmless.
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)

print('\nEigenvalues \n', eigen_vals)

## Total and explained variance

In [None]:
# Total variance is the sum of all eigenvalues.
tot = sum(eigen_vals)

# Fraction of the total variance carried by each eigenvalue, largest first.
var_exp = [eig_val / tot for eig_val in sorted(eigen_vals, reverse=True)]

# Running total of those fractions.
cum_var_exp = np.cumsum(var_exp)

In [None]:
# Cell output: the individual explained-variance ratios, largest first.
var_exp

In [None]:
import matplotlib.pyplot as plt


# Derive the x positions from the data instead of hard-coding 13 features,
# so the plot keeps working if the number of features changes.
pc_idx = range(1, len(var_exp) + 1)

# Bars: variance explained by each component.
# Step line: cumulative variance explained.
plt.bar(pc_idx, var_exp, align='center',
        label='Individual explained variance')
plt.step(pc_idx, cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_02.png', dpi=300)
plt.show()

## Feature transformation (Projection onto PCs)

In [None]:
# Pair each eigenvalue magnitude with its eigenvector. Eigenvectors are the
# columns of eigen_vecs, so iterate over the rows of its transpose.
eigen_pairs = [(np.abs(eig_val), eig_vec)
               for eig_val, eig_vec in zip(eigen_vals, eigen_vecs.T)]

# Order the pairs by eigenvalue, largest first.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Projection matrix W: the two leading eigenvectors placed side by side
# as columns.
w = np.column_stack((eigen_pairs[0][1], eigen_pairs[1][1]))
print('Matrix W:\n', w)

In [None]:
# Cell output: the (eigenvalue, eigenvector) pairs, sorted high to low.
eigen_pairs

In [None]:
# Cell output: the first standardized training sample next to its
# projection through W.
X_train_std[0], X_train_std[0].dot(w)

In [None]:
# Project the whole standardized training set through W.
X_train_pca = X_train_std @ w
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

# One scatter series per class so each gets its own color, marker and
# legend entry.
for label, color, marker in zip(np.unique(y_train), colors, markers):
    in_class = y_train == label
    plt.scatter(X_train_pca[in_class, 0],
                X_train_pca[in_class, 1],
                c=color, label=f'Class {label}', marker=marker)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_03.png', dpi=300)
plt.show()

## scikit-learn 라이브러리를 통한 PCA

In [None]:
from sklearn.decomposition import PCA

# Fit PCA without passing n_components (scikit-learn's default applies),
# then project the standardized training data onto the fitted components.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
# Cell output: fraction of the variance explained by each fitted component.
pca.explained_variance_ratio_

- scikit-learn 라이브러리의 PCA에서도 위에서 직접 계산한 것과 동일한 explained variance ratio를 얻음을 확인할 수 있음

In [None]:
# Derive the component indices from the fitted model instead of
# hard-coding 13, so the plot adapts to the number of fitted components.
explained = pca.explained_variance_ratio_
pc_idx = range(1, len(explained) + 1)

# Bars: per-component ratio. Step line: cumulative ratio.
plt.bar(pc_idx, explained, align='center')
plt.step(pc_idx, np.cumsum(explained), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')

plt.show()

In [None]:
# Keep only two components. The projection is fitted on the training data
# and then reused unchanged for the test data.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Plain scatter (no class coloring) of the training samples in the
# PC 1 / PC 2 plane.
pc1_train, pc2_train = X_train_pca[:, 0], X_train_pca[:, 1]
plt.scatter(pc1_train, pc2_train)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over the first two features.

    The classifier is evaluated on a regular grid spanning the data range
    (padded by one unit on each side). The predicted labels are drawn as
    filled contours and the samples are overlaid, one scatter series per
    class. Everything is drawn with the module-level ``plt`` interface;
    nothing is returned.

    Parameters
    ----------
    X : array
        Samples; only columns 0 and 1 are read.
    y : array
        Class label for each row of ``X``.
    classifier : object
        Must provide ``predict`` accepting an ``(n_points, 2)`` array.
    test_idx : optional
        Accepted for interface compatibility; not used by this function.
    resolution : float
        Step between neighbouring grid points.
    """
    # Marker and color per class index. Only five of each are defined, so
    # at most five distinct classes can be drawn.
    marker_cycle = ('o', 's', '^', 'v', '<')
    palette = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    class_labels = np.unique(y)
    region_cmap = ListedColormap(palette[:len(class_labels)])

    # Grid bounds: data range on each axis, widened by one unit per side.
    x1_lo = X[:, 0].min() - 1
    x1_hi = X[:, 0].max() + 1
    x2_lo = X[:, 1].min() - 1
    x2_hi = X[:, 1].max() + 1
    grid_x1, grid_x2 = np.meshgrid(np.arange(x1_lo, x1_hi, resolution),
                                   np.arange(x2_lo, x2_hi, resolution))

    # Predict a label for every grid point, then fold the flat result back
    # into the grid shape for contouring.
    grid_points = np.array([grid_x1.ravel(), grid_x2.ravel()]).T
    region = classifier.predict(grid_points).reshape(grid_x1.shape)
    plt.contourf(grid_x1, grid_x2, region, alpha=0.3, cmap=region_cmap)
    plt.xlim(grid_x1.min(), grid_x1.max())
    plt.ylim(grid_x2.min(), grid_x2.max())

    # Overlay the samples, one series per class.
    for idx, cl in enumerate(class_labels):
        members = y == cl
        plt.scatter(x=X[members, 0],
                    y=X[members, 1],
                    alpha=0.8,
                    c=palette[idx],
                    marker=marker_cycle[idx],
                    label=f'Class {cl}',
                    edgecolor='black')

### Training logistic regression classifier using the first 2 principal components.



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Reduce the standardized data to its first two principal components; the
# projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# One-vs-rest logistic regression. LogisticRegression's `multi_class`
# argument is deprecated since scikit-learn 1.5; wrapping the estimator in
# OneVsRestClassifier is the documented way to keep the OvR scheme and
# works on both older and newer releases.
lr = OneVsRestClassifier(LogisticRegression(random_state=1, solver='lbfgs'))
lr = lr.fit(X_train_pca, y_train)

- trainset plot

In [None]:
# Decision regions of the fitted classifier, overlaid with the training
# samples in the two-component PCA space.
plot_decision_regions(X_train_pca, y_train, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_04.png', dpi=300)
plt.show()

- testset plot

In [None]:
# Same decision regions, now overlaid with the held-out test samples.
plot_decision_regions(X_test_pca, y_test, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_05.png', dpi=300)
plt.show()

# Exercise

### 문제 1. 
- PCA를 활용해 아래에 주어진 toy 데이터셋을 5개의 principal components로 이루어진 feature 공간으로 변환하시오. (Hint: 데이터 정규화(standardization) 필요)
- 아래 plot은 단순히 데이터 X의 0번째 feature와 1번째 feature를 그린 것이므로, feature transformation이 적용된 결과가 아님!

In [None]:
from sklearn.datasets import make_blobs
# Generate the toy example: 200 samples, 20 features, 5 blob centers.
X, y = make_blobs(n_samples=200, n_features=20, centers=5, random_state=777)
# X, y = make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=1.0)

# Plot the generated dataset (raw features 0 and 1 only, colored by y).
scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm')
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Toy Example Dataset')
# A bare plt.legend() finds no labeled artists here and produces an empty
# legend plus a warning; build the entries from the scatter's color mapping.
plt.legend(*scatter.legend_elements(), title='Cluster')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Exercise 1-1. Split X, y into train / test sets (use train_test_split).
# NOTE: `...` is a placeholder to be replaced; as written, the Ellipsis
# cannot be unpacked into four names, so this cell fails until filled in.

X_train, X_test, y_train, y_test = ...


# Exercise 1-2. Standardize the dataset (use StandardScaler).

X_train_std = ...
X_test_std = ...

In [None]:
# Exercise 1-3. Apply the feature transformation using scikit-learn's PCA.
# The `...` placeholders below are to be replaced with the solution.

pca = ...

X_train_pca = ...
X_test_pca = ...

In [None]:
# Exercise 1-4. Train a classifier for this dataset using the
# RandomForestClassifier covered in the previous class.

rf_clf = ... # Declare the model
# Train the model
...

# Evaluate the trained model
# NOTE: the print below reads `accuracy`, which this template does not
# assign; the evaluation code written here must define it.
...
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))

### 문제 2.

- 아래 주어진 데이터셋 (breast cancer wisconsin dataset) 에 대해 PCA를 활용해 5차원 이하로 차원축소를 진행한 뒤, 랜덤포레스트 분류기 모델을 학습하고 성능평가를 진행하시오.

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
dataset = load_breast_cancer()

In [None]:
# Feature matrix and target vector taken from the loaded dataset object.
X, y = dataset.data, dataset.target

In [None]:
# Work out the solution here.
# NOTE: the print below reads `accuracy`, which this template does not
# assign; the solution code must define it.
...
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))