## KNN 다중분류
- 아이리스 데이터를 이용한 다중분류

## 1. 데이터 준비

In [None]:
# !wget https://raw.githubusercontent.com/devdio/flyai_datasets/main/iris.csv

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed, passed as random_state when the rows are shuffled, so the
# train/validation/test split is reproducible between runs.
SEED = 42

In [None]:
# Read the iris dataset from the CSV in the working directory and
# show its (rows, columns) shape.
file_path = 'iris.csv'
iris = pd.read_csv(filepath_or_buffer=file_path)
iris.shape

In [None]:
# Map every original column name to its snake_case form:
# lowercase, with each space turned into an underscore.
col_dict = {name: '_'.join(name.lower().split(' ')) for name in iris.columns}
col_dict

In [None]:
# Apply the snake_case column names built in col_dict.
iris = iris.rename(columns=col_dict)

In [None]:
# A notebook cell only displays its last expression, so the missing-value
# summary has to be printed explicitly or it is silently discarded.

# Missing values per column
print(iris.isna().sum(axis=0))

# Number of fully duplicated rows
iris.duplicated().sum()

In [None]:
# Train / test split, step 1: shuffle every row (frac=1) with the fixed SEED
# so the positional slices taken afterwards are a reproducible random split.
iris = iris.sample(frac=1, random_state=SEED)
iris.head()

In [None]:
# Row positions where the 60% train part ends and the 80% mark
# (end of the validation part) falls.
n_rows = len(iris)
idx_train, idx_val = int(n_rows * 0.6), int(n_rows * 0.8)
idx_train, idx_val

In [None]:
# Positional slices of the shuffled frame:
# [0, idx_train) -> train, [idx_train, idx_val) -> val, [idx_val, end) -> test.
train, val, test = (
    iris.iloc[:idx_train],
    iris.iloc[idx_train:idx_val],
    iris.iloc[idx_val:],
)

train.shape, val.shape, test.shape

In [None]:
# Separate the features (everything except 'species') from the target column
# for the training and validation splits.
X_train, y_train = train.drop(columns='species'), train['species']
X_val, y_val = val.drop(columns='species'), val['species']

# Class balance of each split.
y_train.value_counts(), y_val.value_counts()

In [None]:
# Per-column mean and standard deviation, computed on the training split only.
u, std = X_train.mean(), X_train.std()

# z-score: subtract the column mean, then divide by the column std.
X_train_s = X_train.sub(u).div(std)
X_train_s.head()

In [None]:
# Scale the validation features with the statistics taken from the training split.
X_val_s = X_val.sub(u).div(std)
X_val_s.head()

In [None]:
# Keep the scaling statistics together so they can be reused on other splits.
ss_dic = dict(mean=u, std=std)
ss_dic

In [None]:
# Species name -> integer id.
# The mapping is built from the whole dataset rather than the validation split
# alone: Series.map() turns any label missing from the dict into NaN, so a class
# absent from one split must still have an id. Sorting makes the ids independent
# of the shuffle order.
label_dict = {specie: i for i, specie in enumerate(sorted(iris['species'].unique()))}
label_dict

In [None]:
# Replace the species names with their integer ids from label_dict.
y_train_e, y_val_e = (labels.map(label_dict) for labels in (y_train, y_val))

y_train_e, y_val_e

In [None]:
# Convert the scaled features and encoded labels from pandas objects to NumPy arrays.
X_train_s, y_train_e = X_train_s.to_numpy(), y_train_e.to_numpy()
X_val_s, y_val_e = X_val_s.to_numpy(), y_val_e.to_numpy()

# Sanity check: shapes first, then container types, train before validation.
splits = ((X_train_s, y_train_e), (X_val_s, y_val_e))
for feats, labels in splits:
    print(feats.shape, labels.shape)
for feats, labels in splits:
    print(type(feats), type(labels))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: 5 nearest neighbours, fitted on the scaled training data
# (fit() returns the estimator itself, so it can be chained).
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train_s, y_train_e)

# Predicted class ids for the validation split.
y_pred = clf.predict(X_val_s)
y_pred


In [None]:
# Validation accuracy: the fraction of predictions that equal the true label.
(y_pred == y_val_e).mean()

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy for every candidate k; scores[i] belongs to k_values[i].
k_values = range(3, 30)
scores = []
for k in k_values:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train_s, y_train_e)
    y_pred = clf.predict(X_val_s)
    acc = accuracy_score(y_val_e, y_pred)
    scores.append(acc)

# When the loop ends, `clf` is simply the last model tried (k=29). Refit with
# the k that scored best on validation so that any later use of `clf` evaluates
# the selected model. Ties go to the smallest k, because index() returns the
# first maximum.
best_k = k_values[scores.index(max(scores))]
clf = KNeighborsClassifier(n_neighbors=best_k)
clf.fit(X_train_s, y_train_e)
y_pred = clf.predict(X_val_s)
print('best k:', best_k, 'validation accuracy:', max(scores))

In [None]:
# scores[i] was measured with k = 3 + i (the search loop starts at k=3), so
# plot against the real k values instead of the bare list index 0..len-1.
plt.plot(range(3, 3 + len(scores)), scores)
plt.xlabel('k (n_neighbors)')
plt.ylabel('validation accuracy')
plt.show()

In [None]:
# Features / target of the held-out test split.
X_test, y_test = test.drop(columns='species'), test['species']

# Reuse the stored training statistics and the label mapping,
# then convert both to NumPy arrays.
X_test_s = X_test.sub(ss_dic['mean']).div(ss_dic['std']).to_numpy()
y_test_e = y_test.map(label_dict).to_numpy()

# Predict with whatever model `clf` currently refers to.
y_pred = clf.predict(X_test_s)

In [None]:
# Test accuracy: the fraction of test predictions that equal the true label.
(y_test_e == y_pred).mean()

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true class ids, columns are the predicted class ids.
cfm = confusion_matrix(y_true=y_test_e, y_pred=y_pred)
cfm

In [None]:
# Draw the confusion matrix as an annotated heatmap with integer cell labels.
s = sns.heatmap(cfm, annot=True, fmt='d', cmap='Blues', cbar=False)
s.set_xlabel('Prediction')
s.set_ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score

# Overall accuracy, followed by the macro-averaged (unweighted mean over classes)
# recall, precision and F1 on the test split.
print('accuracy:', accuracy_score(y_test_e, y_pred))
macro_metrics = (
    ('recall:', recall_score),
    ('precision:', precision_score),
    ('f1 :', f1_score),
)
for label, metric in macro_metrics:
    print(label, metric(y_test_e, y_pred, average='macro'))

