In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import pickle
import  sklearn.metrics as metrics

In [2]:
# Remove the cap on displayed columns so wide frames are shown in full.
pd.options.display.max_columns = None

In [3]:
# Print the format_version string exposed by the pickle module, as a quick
# environment check before the pickled datasets are loaded.
print(pickle.format_version)

4.0


# 데이터

In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, instead of being left open for the kernel's lifetime.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load pickles that come from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Training features/labels (the *_samples objects) and the held-out test split.
X_samples = _load_pickle('./datasets/X_samples.pickle')
X_test = _load_pickle('./datasets/X_test.pickle')
y_samples = _load_pickle('./datasets/y_samples.pickle')
y_test = _load_pickle('./datasets/y_test.pickle')

In [5]:
# pd.__version__

'1.1.5'

In [6]:
# pip install pandas==1.1.5

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Author's note: the *_samples objects are dicts, the *_test objects are DataFrames.
# Show the 'Raw' training features, then the keys available in each dict.
raw_features = X_samples.get('Raw')
print(raw_features)
print('X_samples key : ', X_samples.keys())
print('y_samples key : ', y_samples.keys())

            Time        V1        V2        V3        V4        V5        V6  \
143144  0.005428 -0.540939  0.637584  2.439590  1.316115  0.742650  1.671102   
258914  0.871545  1.923123 -1.555096  0.211476 -0.174906 -1.760650  0.456333   
51111  -0.468709 -1.939810 -1.039497  0.429346 -0.198014  2.693946 -2.792994   
146949  0.038652 -0.801246  1.067120  0.506310 -2.533708  0.597024 -1.446026   
135606 -0.039556 -0.280807  1.109719  0.944761 -0.132693  0.423860 -0.509289   
...          ...       ...       ...       ...       ...       ...       ...   
221661  0.680894 -1.196213  1.679891 -0.939175 -1.133975  0.521005 -0.211571   
129871 -0.064169  1.175454 -0.001898  1.235163  1.395858 -0.990710 -0.272728   
185352  0.492710  0.465324 -3.487401 -4.042365 -0.107532  0.022031 -0.114303   
61180  -0.411119 -0.383532  0.981614  1.235659  0.025899  0.081234 -0.545057   
88583  -0.264218  1.108338 -0.152494  0.987418  1.017217 -0.395630  0.992727   

              V7        V8        V9   

In [8]:
# Names of the training-set variants to evaluate; used as keys into
# X_samples / y_samples.
data_list = [
    'Raw',
    'SMOTE',
    'ADASYN',
    'CNN',
    'SMOTE + ENN',
    'ADASYN + ENN',
]

# Row labels of the result tables, in the order the metrics are stored.
cols = [
    'Accuracy',
    'Precision',
    'Recall',
    'F1-Score',
    'ROC_AUC',
]

# KNN

각 분류 모델의 성능을 평가하는 방법(model selection)
1. Accuracy
2. Confusion matrix
3. Precision, Recall, F-measure (F1-score, F-beta score)
4. Receiver operating characteristic (ROC-AUC)

In [None]:
%%time

clf =  KNeighborsClassifier(n_neighbors=5)

data_dict = {}
for i in data_list:
    print(i)
    clf.fit(X_samples.get(i), y_samples.get(i))
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)

    acc = accuracy_score(y_pred, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])

    data_dict[i] = [acc, precision, recall, f1, roc_auc]

df_knn = pd.DataFrame(data_dict, index = cols)
print(df_knn)



Raw


In [None]:
# Collect the per-dataset scores into a table: one column per dataset,
# one row per metric.
df_knn = pd.DataFrame(data=data_dict, index=cols)
print(df_knn)

In [None]:
# Grouped bar chart: one group per metric (row), one bar per dataset (column).
ax = df_knn.plot.bar(figsize=(10, 5), rot=0)  # rot=0 keeps x tick labels horizontal
ax.legend(loc=(1.01, 0.0))  # anchor the legend just outside the right edge
plt.show()


# Balanced

In [None]:
# No `KNNClassifier` is imported in this notebook, and scikit-learn's
# KNeighborsClassifier has no `class_weight` option, so the original line
# could not run. As the closest built-in alternative, weight each neighbour's
# vote by the inverse of its distance instead of counting all votes equally.
# NOTE(review): this is not true class balancing — confirm it matches the
# intent of the "Balanced" experiment.
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')


In [None]:
%%time

knn.fit(X_samples['Raw'], y_samples['Raw'], eval_set=(X_test, y_test))
y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)

acc = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba[:, 1])

data_dict['Balanced'] = [acc, precision, recall, f1, roc_auc]

In [None]:
# Rebuild the summary table so it picks up the 'Balanced' entry added to
# data_dict: one column per dataset, one row per metric.
df_knn = pd.DataFrame(data=data_dict, index=cols)
print(df_knn)

In [None]:
# Grouped bar chart: one group per metric (row), one bar per dataset (column).
ax = df_knn.plot.bar(figsize=(10, 5), rot=0)  # rot=0 keeps x tick labels horizontal
ax.legend(loc=(1.01, 0.0))  # anchor the legend just outside the right edge
plt.show()


# goss : Gradient-based One-Side Sampling

In [None]:
# NOTE(review): `KNNClassifier` is not imported or defined in the visible cells,
# so this line raises NameError as written. `boosting_type='goss'` looks like a
# gradient-boosting (LightGBM-style) option rather than a k-NN one — TODO confirm
# which estimator and parameter were intended for this section.
knn = KNNClassifier(boosting_type = 'goss')

In [None]:
# Refit the current `knn` estimator on each training-set variant and score
# every fit on the same held-out test set.
# dataset name -> [Accuracy, Precision, Recall, F1-Score, ROC_AUC] (same order as `cols`)
data_dict_param = {}
for name in data_list:
    print(name)  # progress marker
    knn.fit(X_samples[name], y_samples[name])
    y_pred = knn.predict(X_test)
    y_proba = knn.predict_proba(X_test)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    acc = accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    # Column 1 is the probability of the second class in knn.classes_
    # (assumed to be the positive class of a binary 0/1 target).
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])

    data_dict_param[name] = [acc, precision, recall, f1, roc_auc]

In [None]:
# Tabulate the scores from data_dict_param: one column per dataset,
# one row per metric.
df_goss = pd.DataFrame(data=data_dict_param, index=cols)
print(df_goss)

In [None]:
# Grouped bar chart: one group per metric (row), one bar per dataset (column).
ax = df_goss.plot.bar(figsize=(10, 5), rot=0)  # rot=0 keeps x tick labels horizontal
ax.legend(loc=(1.01, 0.0))  # anchor the legend just outside the right edge
plt.show()