In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import datasets

In [92]:
# dataSet 불러오기

cancer = datasets.load_breast_cancer()

target = pd.DataFrame(cancer.target)
target.groupby(target[0]).size()

0
0    212
1    357
dtype: int64

# 1. K-MEANS를 이용하여 Accuracy 측정

In [11]:
# K_MEANS
from sklearn.cluster import KMeans

model = KMeans(n_clusters=2,init='k-means++')
model.fit(cancer.data,cancer.target)
predict = pd.Series(model.predict(cancer.data)) # predict로 iris['class']와 accuracy 측정
predict

0      1
1      1
2      1
3      0
4      1
      ..
564    1
565    1
566    0
567    1
568    0
Length: 569, dtype: int32

In [12]:
# K_MEANS crosstab -> 데이터를 재구조화 시키고 싶은데 하는 방법을 모르겠음
# 따라서 crosstab으로 나타낸 뒤 가장 많이 겹치는 쪽으로 매핑하는 방향으로

le = preprocessing.LabelEncoder() # 먼저 iris['class']를 형변환
target = pd.DataFrame(le.fit_transform(cancer.target))

def mapping_func(target, predict): # 가장 정확도가 높은 mapping function
    predict_dict={}
    ct = pd.crosstab(target, predict)
    for i in range(0,len(ct),1):
        predict_dict[i] = ct.iloc[:,i].idxmax()
    return predict_dict

In [13]:
# 가장 정확도가 높은 쪽으로 매핑

predict_dict = mapping_func(target[0], predict)
predict = predict.map(predict_dict).astype('category')
predict

0      0
1      0
2      0
3      1
4      0
      ..
564    0
565    0
566    1
567    0
568    1
Length: 569, dtype: category
Categories (2, int64): [0, 1]

In [14]:
# K_MEANS Accuracy Measurement
from sklearn.metrics import accuracy_score

accuracy_score(target, predict) # K_MEANS의 정확도는 약 70.22%

0.8541300527240774

# 2. DBSCAN을 이용하여 Accuracy 측정

In [78]:
# DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN

# create model and prediction
model = DBSCAN(eps=34) # 주변 밀집도 3으로 설정
model = model.fit(cancer.data,cancer.target)
predict = pd.Series(model.fit_predict(cancer.data))

predict_dict = mapping_func(target[0], predict)
predict = predict.map(int,predict_dict) # 매핑함수를 적용하려 했지만, 하나의 값에 몰릴경우 어떻게 처리할지를 상정하지 않아서 값이 제대로 나오지 않음

pd.crosstab(target[0],predict)



col_0,-1,0
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,188,24
1,15,342


# 3. SVM을 이용하여 Accuracy 측정

In [97]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = \
    train_test_split(cancer.data, cancer.target, test_size=0.2)

svm = SVC(kernel='linear')
svm.fit(trainX, trainY)

svm.score(trainX,trainY) # (약 99.29%)

0.9692307692307692