In [None]:
import pandas as pd
import os

In [None]:
# Load the iris CSV from the local data directory into a DataFrame.
csv_path = './data/iris.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Print a concise summary of the frame: column names, non-null counts, dtypes.
df.info()

In [None]:
# Count how many rows belong to each value of the Species column.
species_counts = df['Species'].value_counts()
species_counts

In [None]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
iris = df.copy(deep=True)

In [None]:
# Draw 100 rows without replacement as the training set (seeded so the draw is
# repeatable) and renumber the index from 0.
train = (
    iris
    .sample(n=100, replace=False, random_state=7)
    .reset_index(drop=True)
)
train

In [None]:
# The test set is every row whose Id was not drawn into `train`;
# the index is renumbered from 0 and the old index is discarded.
in_train = iris['Id'].isin(train['Id'])
test = iris.loc[~in_train].reset_index(drop=True)

### KNN 학습 (K=3일 때)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Define the model: a k-nearest-neighbours classifier with k = 3.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train the model. Features are every column except the first and the last;
# the target is the last column.
# NOTE(review): assumes the first column is Id and the last is Species - confirm.
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]
knn.fit(X_train, y_train)

In [None]:
# Predict a label for every test row from the same feature columns used in
# training (all columns except the first and the last).
X_test = test.iloc[:, 1:-1]
predictions = knn.predict(X_test)

In [None]:
# Accuracy on the held-out rows.
# Option 1: compare the predictions with the true labels and average the matches
# (pd.Series(predictions) == test['Species']).mean()

# Option 2: let the estimator predict and score in a single call
held_out_X = test.iloc[:, 1:-1]
held_out_y = test.iloc[:, -1]
knn.score(held_out_X, held_out_y)

### 최적 K 찾기

- train & test 데이터 사용
- K값 1 ~ 30 까지 순차적 분석

In [None]:
# Sweep k over 1..30 inclusive, fitting on `train` and scoring on `test`.
# range() excludes its stop value, so the stop must be 31 for k = 30 to run.
for k in range(1, 31):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train.iloc[:, 1:-1], train['Species'])
    # Print k next to its accuracy so each output line can be read on its own.
    print(k, knn.score(test.iloc[:, 1:-1], test.iloc[:, -1]))

In [None]:
from sklearn import model_selection
import matplotlib.pyplot as plt

# Hold-out accuracy as a function of k, for k = 1..30 inclusive.
# range() excludes its stop value, hence the stop of 31.
k_range = range(1, 31)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train.iloc[:, 1:-1], train['Species'])
    k_scores.append(knn.score(test.iloc[:, 1:-1], test.iloc[:, -1]))

# One point per k; x and y have the same length because both come from k_range.
plt.plot(k_range, k_scores, marker='o', color='green', linestyle='dashed', markersize=5)
plt.xlabel('Value of K for KNN')
plt.ylabel('Accuracy')
plt.show()

### Cross Validation 적용
- 신뢰성 문제 고려 Train, Test 셋에 따라 결과 다르기 때문에
- cross validation을 이용해 정확도를 구해볼 수 있다.

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
# 5-fold cross-validated accuracy on the full `iris` frame for k = 1..30
# inclusive. range() excludes its stop value, hence the stop of 31.
for k in range(1, 31):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, iris.iloc[:, 1:-1], iris['Species'], cv=5)
    # Report the mean of the per-fold scores for this k.
    print(f"{k} : " ,np.mean(scores))

In [None]:
# Mean 5-fold cross-validated accuracy as a function of k, for k = 1..30
# inclusive. range() excludes its stop value, hence the stop of 31.
k_range = range(1, 31)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, iris.iloc[:, 1:-1], iris['Species'], cv=5)
    # Keep one number per k: the mean of the per-fold scores.
    k_scores.append(np.mean(scores))

plt.plot(k_range, k_scores, marker='o', color='green', linestyle='dashed', markersize=5)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

### Scaling 적용시 비교

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler , RobustScaler


# Fit the standardiser on the training features (all columns except the first
# and the last) and transform those same features in one step. `scaler` keeps
# the fitted statistics for later use.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train.iloc[:, 1:-1])

In [None]:
# Wrap the scaled array in a DataFrame, reusing the training feature column names.
feature_names = train.columns[1:-1]
train_X = pd.DataFrame(data=X_scaled, columns=feature_names)

In [None]:
# Display the scaled training features.
train_X

In [None]:
# Fit a k = 6 classifier on the scaled training features.
knn = KNeighborsClassifier(n_neighbors=6)
train_labels = train['Species']
knn.fit(train_X, train_labels)

In [None]:
# Accuracy of the current `knn` on the scaled training features.
train_accuracy = knn.score(train_X, train['Species'])
print(train_accuracy)

In [None]:
# Scale the test features with the scaler already fitted on the training
# features. The scaler must NOT be re-fitted here: fitting on the test rows
# would standardise them with their own mean/std, so train and test would no
# longer be expressed in the same feature space.
X_test_scaled = scaler.transform(test.iloc[:, 1:-1])
test_X = pd.DataFrame(X_test_scaled, columns=test.columns[1:-1])

In [None]:
# Evaluate on the held-out rows: fit on the scaled TRAINING features and score
# on the scaled TEST features. Fitting on the test set and scoring on that same
# set would only measure how well the model fits data it has already seen.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(train_X, train['Species'])
print(knn.score(test_X, test['Species']))

In [None]:
# Predict a label for every row of the scaled test features with the most
# recently fitted `knn`.
y_pred = knn.predict(test_X)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 of the predictions against the true labels.
print("✅ Classification Report:")
report_text = classification_report(test['Species'], y_pred)
print(report_text)

In [None]:
# Confusion matrix: true labels as the first argument, predictions as the second.
print("🧩 Confusion Matrix:")
matrix = confusion_matrix(test['Species'], y_pred)
print(matrix)

In [None]:
# 10. Visualisation (confusion-matrix heatmap)
import seaborn as sns

# Derive the class names from the data instead of hard-coding them, and pass
# the same list to confusion_matrix so the row/column order is guaranteed to
# match the tick labels drawn on the axes.
class_labels = sorted(set(test['Species']) | set(y_pred))
cm = confusion_matrix(test['Species'], y_pred, labels=class_labels)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()