In [136]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [137]:
# dataSet 불러오기

feature = ['sepal_length', 'sepal_width', 'petal length', 'petal width', 'class']
iris = pd.read_csv("dataset/iris.csv", names=feature)
iris_v = iris.iloc[:,:4]
iris_v

Unnamed: 0,sepal_length,sepal_width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


# 1. K-MEANS를 이용하여 Accuracy 측정

In [138]:
# K_MEANS
from sklearn.cluster import KMeans

model = KMeans(n_clusters=3,init='k-means++')
model.fit(iris_v)
predict = pd.Series(model.predict(iris_v)) # predict로 iris['class']와 accuracy 측정
predict

0      0
1      0
2      0
3      0
4      0
      ..
145    1
146    2
147    1
148    1
149    2
Length: 150, dtype: int32

In [139]:
# K_MEANS crosstab -> 데이터를 재구조화 시키고 싶은데 하는 방법을 모르겠음
# 따라서 crosstab으로 나타낸 뒤 가장 많이 겹치는 쪽으로 매핑하는 방향으로

le = preprocessing.LabelEncoder() # 먼저 iris['class']를 형변환
target = pd.DataFrame(le.fit_transform(iris['class']))

def mapping_func(target, predict): # 가장 정확도가 높은 mapping function
    predict_dict={}
    ct = pd.crosstab(target, predict)
    for i in range(0,len(ct),1):
        predict_dict[i] = ct.iloc[:,i].idxmax()
    return predict_dict

In [140]:
# 가장 정확도가 높은 쪽으로 매핑

predict_dict = mapping_func(target[0], predict)
predict = predict.map(predict_dict).astype('category')
predict

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    1
147    2
148    2
149    1
Length: 150, dtype: category
Categories (3, int64): [0, 1, 2]

In [141]:
# K_MEANS Accuracy Measurement
from sklearn.metrics import accuracy_score

accuracy_score(target, predict) # K_MEANS의 정확도는 약 89.33%

0.8933333333333333

# 2. DBSCAN을 이용하여 Accuracy 측정

In [142]:
# DBSCAN
from sklearn.cluster import DBSCAN

# create model and prediction
model = DBSCAN(min_samples=6) # 주변 밀집도 6으로 설정
predict = pd.Series(model.fit_predict(iris_v))

# predict_dict = mapping_func(target[0], predict)
# predict = predict.map(int,predict_dict) # 매핑함수를 적용하려 했지만, 하나의 값에 몰릴경우 어떻게 처리할지를 상정하지 않아서 값이 제대로 나오지 않음
ct = pd.crosstab(target[0], predict)
ct # CT으로 최대 정확도를 산정해 보면 22/150의 오답률을 보인다. (약 85.3% 정도)

col_0,-1,0,1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,49,0
1,6,0,44
2,15,0,35


# 3. SVM을 이용하여 Accuracy 측정

In [166]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = \
    train_test_split(iris_v, target, test_size=0.2)

svm = SVC(kernel='linear')
svm.fit(trainX, trainY)

svm.score(trainX,trainY) # (약 99.16%)

  return f(**kwargs)


0.9916666666666667