# 제 7 장 __비지도학습의 이해와 활용__
___

## __사전설정__
---

(1) 저장소 데이터 가져오기

In [None]:
# Delete /content/BizDataAnalysis/ (if it exists) so the clone below starts clean.
!rm -rf /content/BizDataAnalysis/

In [None]:
# Clone the BizDataAnalysis repository into the current working directory.
!git clone https://github.com/BizStat/BizDataAnalysis.git


(2) matplotlib 환경에서 한글 사용

In [None]:
# Install the Nanum font package (used for Korean text in plots).
!sudo apt-get install -y fonts-nanum
# Rebuild the system font cache (-f: force, -v: verbose).
!sudo fc-cache -f -v
# Remove matplotlib's cache directory; presumably so its font list is rebuilt
# after the session restart.
!rm ~/.cache/matplotlib -rf

런타임 메뉴에서 '세션 다시 시작' 후 다음의 명령문 실행

In [None]:
from matplotlib import rc

# Use a Nanum font so Korean labels render. 'NanumGothicCoding' remains the
# first choice; 'NanumGothic' is a fallback for environments where the coding
# variant is not installed (NOTE(review): it appears to be packaged separately
# from fonts-nanum on some distributions — verify on the target runtime).
rc('font', family=['NanumGothicCoding', 'NanumGothic'])
# Render the minus sign as ASCII '-' rather than the Unicode minus glyph.
rc('axes', unicode_minus=False)

(3) 구글 드라이브 연결

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive/.
drive.mount('/content/gdrive/')

___

## 7.1 __차원축소의 이해와 활용__

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### (1) 주성분 분석 사례 : Red Wine Quality data

* 데이터 가져오기

In [None]:
# Load the comma-separated red-wine table from the cloned repository.
wine = pd.read_csv('/content/BizDataAnalysis/DATA/winequality-red.csv')

In [None]:
# Print a summary of the DataFrame's columns.
wine.info()

In [None]:
# Preview the first rows of the table.
wine.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
wine.shape

* 데이터 살펴보기

In [None]:
import seaborn as sns

# Pairwise scatter plots of every column except the first, colored by "quality".
sns.pairplot(data=wine.iloc[:, 1:], hue="quality")
plt.show()

* 분석 데이터

In [None]:
# Feature matrix: every column of wine except the last one.
X = wine.iloc[:, :-1]

In [None]:
# Preview the feature matrix.
X.head()

* 주성분 수 결정

In [None]:
from sklearn.preprocessing import StandardScaler

# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the fitted scaler displayed as the cell output.
scaler = StandardScaler().fit(X)
scaler

In [None]:
# Show the mean the scaler learned for each input column.
for feature_name, feature_mean in zip(scaler.feature_names_in_, scaler.mean_):
  print(f'{feature_name} : {feature_mean}')

In [None]:
# Show the variance the scaler learned for each input column.
for feature_name, feature_var in zip(scaler.feature_names_in_, scaler.var_):
  print(f'{feature_name} : {feature_var}')

In [None]:
# Apply the fitted scaler to X; Xt is the input to the PCA fitted below.
Xt = scaler.transform(X)

In [None]:
from sklearn.decomposition import PCA

# PCA() with no arguments; fit() returns the estimator, and the trailing
# expression keeps the fitted model displayed as the cell output.
wine_pca = PCA().fit(Xt)
wine_pca

In [None]:
# Variance explained by each principal component.
wine_pca.explained_variance_

In [None]:
# Scree plot: explained variance per component. The x-axis is the 0-based
# array index, so position 0 corresponds to the first component.
plt.plot(wine_pca.explained_variance_)
plt.show()

* 주성분 분석 결과 탐색

In [None]:
# Principal axes of the fitted PCA. Per scikit-learn's PCA documentation this
# array has shape (n_components, n_features): one row per component.
wine_pca.components_

In [None]:
# Project the standardized data onto the fitted principal components.
Xprj = wine_pca.transform(Xt)
# Display the dimensions of the projected data.
Xprj.shape

In [None]:
# Scores on the first two components (columns 0 and 1 of Xprj).
Xprj[:,0:2]

In [None]:
# Add the first two score columns to the table as PC1 and PC2 and rebind wine.
# NOTE(review): wine now has two extra trailing columns, so re-running the
# earlier `X = wine.iloc[:,0:-1]` cell after this one would no longer select
# the same feature columns.
wine = wine.assign(PC1=Xprj[:,0],PC2=Xprj[:,1])

In [None]:
# Scatter of the first two principal-component scores, colored by quality.
plt.scatter(wine['PC1'],wine['PC2'],c=wine['quality'])
# Axis labels are Korean for "principal component 1" / "principal component 2".
plt.xlabel("주성분1")
plt.ylabel("주성분2")
plt.grid()
plt.show()

In [None]:
#@title PCA 결과 시각화를 위한 biplot 함수 (출처: https://sukhbinder.wordpress.com/2015/08/05/biplot-with-python/)
def biplot(score,coeff,pcax,pcay,labels=None):
  """Draw a PCA biplot: range-scaled scores plus one loading arrow per variable.

  Draws on the current pyplot figure and does not call plt.show().

  Args:
    score: 2-D array of projected observations, one column per principal
      component (e.g. the output of PCA.transform).
    coeff: 2-D array of loadings indexed as coeff[variable, component].
      For a fitted scikit-learn PCA this is `pca.components_.T`, because
      `components_` itself is laid out as (n_components, n_features).
    pcax: 1-based index of the component shown on the x-axis.
    pcay: 1-based index of the component shown on the y-axis.
    labels: optional sequence of variable names, one per row of coeff.
      When None, variables are labeled "Var1", "Var2", ...
  """
  col_x = pcax - 1
  col_y = pcay - 1
  xs = score[:, col_x]
  ys = score[:, col_y]

  # Divide each axis by its range so the scores fit inside the fixed [-1, 1]
  # window shared with the arrows. A constant column (zero range) is left
  # unscaled instead of dividing by zero.
  x_range = xs.max() - xs.min()
  y_range = ys.max() - ys.min()
  scalex = 1.0 / x_range if x_range else 1.0
  scaley = 1.0 / y_range if y_range else 1.0
  plt.scatter(xs * scalex, ys * scaley)

  # One arrow per variable, i.e. per row of coeff. The number of score columns
  # is the number of components, which differs from the number of variables
  # whenever PCA keeps fewer components than features.
  for i in range(coeff.shape[0]):
    dx = coeff[i, col_x]
    dy = coeff[i, col_y]
    plt.arrow(0, 0, dx, dy, color='r', alpha=0.5)
    name = "Var" + str(i + 1) if labels is None else labels[i]
    # Place the label slightly beyond the arrow tip.
    plt.text(dx * 1.15, dy * 1.15, name, color='g', ha='center', va='center')

  plt.xlim(-1, 1)
  plt.ylim(-1, 1)
  plt.xlabel("주성분{}".format(pcax))
  plt.ylabel("주성분{}".format(pcay))
  plt.grid()

In [None]:
# biplot indexes its coeff argument as coeff[variable, component], whereas
# PCA.components_ is laid out as (n_components, n_features). Transpose it so
# each arrow shows the labeled feature's loadings on PC1 and PC2.
biplot(Xprj, wine_pca.components_.T, 1, 2, labels=scaler.feature_names_in_)
plt.show()

### (2) 주성분 분석 사례 : Boston house-price data