# 1. K-means로 학습하기

In [147]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [148]:
# data load

iris = load_iris()

iris.data = pd.DataFrame(iris.data)
iris.target = pd.DataFrame(iris.target)

In [149]:
# 10번 반복하는 pipeline 함수 제작

from sklearn.pipeline import Pipeline

def pipe(model: Pipeline, train, target):
    result = []
    for i in range(0,10,1):
        model.fit(train, target)
        result.append(model.predict(train))
    return result

In [150]:
# normal_iris_kmeans accuracy

from sklearn.cluster import KMeans

pipeline = Pipeline([('Kmeans', KMeans(n_clusters=3, init='random'))]) # normal predict 생성
result_norm = pipe(pipeline, iris.data, iris.target)


from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score # Adjusted rand score는 음수가 나올수 있으므로 백분율 계산을 위해서 Normalized mutual information을 사용했다. 

score = []
for i in result_norm:
    score.append(normalized_mutual_info_score(iris.target[0], i))

print(np.mean(score))

0.7581756800057784


In [151]:
# PCA _iris_kemans accuracy

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

# 각 정규화 된 값들을 PCA로 차원축소 시키고 값이 어떻게 변화하는지 살펴본다.
# 일단 컴포넌트는 2로 하고 1~3중 어느값이 가장 좋은 결과를 가지는지 학습시켜본다.
pipeline_minmax = Pipeline([('MinMax', MinMaxScaler()),('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_maxabs = Pipeline([('MaxAbs', MaxAbsScaler()),('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_standard = Pipeline([('Standard', StandardScaler()),('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_robust = Pipeline([('Robust', RobustScaler()),('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_normalize = Pipeline([('Normalize', Normalizer()),('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_log = Pipeline([('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])

result_minmax = pipe(pipeline_minmax, iris.data, iris.target)
result_maxabs = pipe(pipeline_maxabs, iris.data, iris.target)
result_standard = pipe(pipeline_standard, iris.data, iris.target)
result_robust = pipe(pipeline_robust, iris.data, iris.target)
result_normalize = pipe(pipeline_normalize, iris.data, iris.target)
result_log = pipe(pipeline_log, np.log1p(iris.data), iris.target)

score_minmax, score_maxabs, score_standard, score_robust, score_normalize, score_log = [], [], [], [], [], []
for i in range(0,10,1):
    score_minmax.append(normalized_mutual_info_score(iris.target[0], result_minmax[i]))
    score_maxabs.append(normalized_mutual_info_score(iris.target[0], result_maxabs[i]))
    score_standard.append(normalized_mutual_info_score(iris.target[0], result_standard[i]))
    score_robust.append(normalized_mutual_info_score(iris.target[0], result_robust[i]))
    score_normalize.append(normalized_mutual_info_score(iris.target[0], result_normalize[i]))
    score_log.append(normalized_mutual_info_score(iris.target[0], result_log[i]))

print('minmax_pca:',np.mean(score_minmax))
print('maxabs_pca:',np.mean(score_maxabs))
print('standard_pca:',np.mean(score_standard))
print('robust_pca:',np.mean(score_robust))
print('normalize_pca:',np.mean(score_normalize))
print('log_scaled_pca:',np.mean(score_log))

# normalizer의 accuracy가 가장 높으므로 normalizer로 전처리된 값을 이용해 PCA의 컴포넌트를 1~3까지 넣어본다.

minmax_pca: 0.7419116631817836
maxabs_pca: 0.8641855068202222
standard_pca: 0.6548417952241196
robust_pca: 0.6162206156967885
normalize_pca: 0.8996935451597474
log_scaled_pca: 0.8464828103876364


In [152]:
# PCA _iris_kemans accuracy (change_component)

pipeline1 = Pipeline([('Normalize', Normalizer()),('PCA', PCA(n_components=1)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline2 = Pipeline([('Normalize', Normalizer()),('PCA', PCA(n_components=2)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline3 = Pipeline([('Normalize', Normalizer()),('PCA', PCA(n_components=3)),('Kmeans', KMeans(n_clusters=3, init='random'))])

result1 = pipe(pipeline1, iris.data, iris.target)
result2 = pipe(pipeline2, iris.data, iris.target)
result3 = pipe(pipeline3, iris.data, iris.target)

score1, score2, score3 = [], [], []

for i in range(0,10,1):
    score1.append(normalized_mutual_info_score(iris.target[0], result1[i]))
    score2.append(normalized_mutual_info_score(iris.target[0], result2[i]))
    score3.append(normalized_mutual_info_score(iris.target[0], result3[i]))

print('component=1:',np.mean(score1))
print('component=2:',np.mean(score2))
print('component=3:',np.mean(score3))

# component는 1개일 때 가장 좋은 결과를 보였다. 다만 dataset 마다 결과는 다를 것으로보인다.

component=1: 0.9305506621576433
component=2: 0.8996935451597474
component=3: 0.8996935451597474


In [153]:
from sklearn.manifold import TSNE

pipeline_minmax = Pipeline([('MinMax', MinMaxScaler()),('t-SNE', TSNE(random_state=0)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_maxabs = Pipeline([('MaxAbs', MaxAbsScaler()),('t-SNE', TSNE(random_state=0)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_standard = Pipeline([('Standard', StandardScaler()),('t-SNE', TSNE(random_state=0)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_robust = Pipeline([('Robust', RobustScaler()),('t-SNE', TSNE(random_state=0)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_normalize = Pipeline([('Normalize', Normalizer()),('t-SNE', TSNE(random_state=0)),('Kmeans', KMeans(n_clusters=3, init='random'))])
pipeline_log = Pipeline([('t-SNE', TSNE(random_state=0)),('Kmeans', KMeans(n_clusters=3, init='random'))])

result_minmax = pipe(pipeline_minmax, iris.data, iris.target)
result_maxabs = pipe(pipeline_maxabs, iris.data, iris.target)
result_standard = pipe(pipeline_standard, iris.data, iris.target)
result_robust = pipe(pipeline_robust, iris.data, iris.target)
result_normalize = pipe(pipeline_normalize, iris.data, iris.target)
result_log = pipe(pipeline_log, np.log1p(iris.data), iris.target)

score_minmax, score_maxabs, score_standard, score_robust, score_normalize, score_log = [], [], [], [], [], []
for i in range(0,10,1):
    score_minmax.append(normalized_mutual_info_score(iris.target[0], result_minmax[i]))
    score_maxabs.append(normalized_mutual_info_score(iris.target[0], result_maxabs[i]))
    score_standard.append(normalized_mutual_info_score(iris.target[0], result_standard[i]))
    score_robust.append(normalized_mutual_info_score(iris.target[0], result_robust[i]))
    score_normalize.append(normalized_mutual_info_score(iris.target[0], result_normalize[i]))
    score_log.append(normalized_mutual_info_score(iris.target[0], result_log[i]))

print('minmax_TSNE:',np.mean(score_minmax))
print('maxabs_TSNE:',np.mean(score_maxabs))
print('standard_TSNE:',np.mean(score_standard))
print('robust_TSNE:',np.mean(score_robust))
print('normalize_TSNE:',np.mean(score_normalize))
print('log_scaled_TSNE:',np.mean(score_log))

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'TSNE(random_state=0)' (type <class 'sklearn.manifold._t_sne.TSNE'>) doesn't