In [None]:
%matplotlib                    # Pyplot 그래프 새창에서 열기 옵션

In [None]:
%matplotlib inline             # Pyplot 그래프 새창에서 열기 '취소' 옵션

In [None]:
pd.set_option('display.max_rows',    100)      # DataFrame 데이터 확인 시 최대 표시 행 수
pd.set_option('display.max_columns', 100)      # DataFrame 데이터 확인 시 최대 표시 열 수

## 라이브러리 import

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import pickle
from sklearn.neighbors import KNeighborsClassifier
import joblib
from sklearn.model_selection import GridSearchCV

## 데이터 불러오기

In [2]:
Fold = 5

# k-fold 학습/검증 데이터
for i in range(Fold):
    
    path1 = './K_FoldData/Training_Fold%d'%(i+1)
    path2 = './K_FoldData/Validation_Fold%d'%(i+1)
    c1 = 'Training_Fold%d   = np.array(pd.read_csv(path1, sep=",", header=None))'%(i+1)
    c2 = 'Validation_Fold%d = np.array(pd.read_csv(path2, sep=",", header=None))'%(i+1)
    exec(c1)
    exec(c2)

# K-fold 학습/검증 레이블
TrainingFold_Label   = np.array(pd.read_csv('./K_FoldData/TrainingFold_Label'  , sep=",", header=None).T.squeeze())
ValidationFold_Label = np.array(pd.read_csv('./K_FoldData/ValidationFold_Label', sep=",", header=None).T.squeeze())
    
    
# 전체 학습용 데이터
Training_All       = np.array(pd.read_csv('./K_FoldData/Training_All', sep = ",", header = None))
Training_All_Label = np.array(pd.read_csv('./K_FoldData/Training_All_Label', sep = ",", header = None).T.squeeze())


# 평가 데이터
Test_Data  = np.array(pd.read_csv('./K_FoldData/Test_Data' , sep=",", header=None))
Test_Label = np.array(pd.read_csv('./K_FoldData/Test_Label', sep=",", header=None))

## Grid Search 기반 Hyperparameter 별 성능 확인

#### 1. 반복문 활용

In [3]:
Fold = 5

# 비교할 하이퍼파라미터들 리스트 형태로 만들기
param_weight      = ['uniform', 'distance']                 # 데이터 간 거리에 따른 가중치 유무
param_n_neighbors = list(range(3, 20, 4))                   # 계산할 주변(이웃) 데이터 개수
param_metric      = ['euclidean', 'manhattan', 'minkowski'] # 거리 계산 방법

# 파라미터별 정확도 기록할 변수 설정
Accuracy_df = pd.DataFrame(np.zeros(shape=(len(param_n_neighbors)*len(param_weight)*len(param_metric),4)), #2*5*3
                           columns=['weight', 'n_neighbors', 'metric', 'Accuracy'])
cnt = 0 #폴드 들어갈때 카운트

# Grid Search 코드
for Weight in param_weight: #리스트에 있는 원소 하나하나다 넣어주면서 감

    for Nnb in param_n_neighbors: #변수들이 할당해서 for문 안에서 돌아간다

        for Metric in param_metric:
            
            ValidScore = 0 # 모델 바뀔 때마다 검증정확도 변수 초기화
            
            for i in range(Fold):
                c1 = 'Training_CurrentFold = Training_Fold%d'%(i+1)
                exec(c1)
                c2 = 'Validation_CurrentFold = Validation_Fold%d'%(i+1)
                exec(c2)
                
                tempsvmModel   = KNeighborsClassifier(n_neighbors=Nnb,
                                                      weights=Weight,
                                                      metric=Metric).fit(Training_CurrentFold, TrainingFold_Label) # 학습
                tempValidScore = tempsvmModel.score(Validation_CurrentFold, ValidationFold_Label)                  # 검증
                
                ValidScore += tempValidScore # Fold 별 검증 정확도 합산 
                
            Acc = ValidScore/Fold # 평균 검증 정확도
            Accuracy_df.iloc[cnt, :] = [Weight, Nnb, Metric, Acc] # 모델의 하이퍼파라미터, 검증정확도 대입
            cnt += 1

In [4]:
Accuracy_df

Unnamed: 0,weight,n_neighbors,metric,Accuracy
0,uniform,3.0,euclidean,0.99375
1,uniform,3.0,manhattan,0.99375
2,uniform,3.0,minkowski,0.99375
3,uniform,7.0,euclidean,0.9875
4,uniform,7.0,manhattan,0.99375
5,uniform,7.0,minkowski,0.9875
6,uniform,11.0,euclidean,0.984375
7,uniform,11.0,manhattan,0.99375
8,uniform,11.0,minkowski,0.984375
9,uniform,15.0,euclidean,0.984375


In [5]:
Accuracy_df_sorted = Accuracy_df.sort_values(by = ['Accuracy'], ascending = False) # 성능 확인
Accuracy_df_sorted

Unnamed: 0,weight,n_neighbors,metric,Accuracy
0,uniform,3.0,euclidean,0.99375
10,uniform,15.0,manhattan,0.99375
28,distance,19.0,manhattan,0.99375
25,distance,15.0,manhattan,0.99375
22,distance,11.0,manhattan,0.99375
19,distance,7.0,manhattan,0.99375
17,distance,3.0,minkowski,0.99375
16,distance,3.0,manhattan,0.99375
1,uniform,3.0,manhattan,0.99375
15,distance,3.0,euclidean,0.99375


#### 2. sklearn.model_selection.GridSearchCV 함수 활용 (K-fold Cross Validation 내장)

In [6]:
# 비교할 하이퍼파라미터 사전(Dictionary) 형태로 만들기 gridcvv때문에 특정 키가 있고 거기에 들어간다
GridParams = {
    'weights' : ['uniform', 'distance'],
    'n_neighbors' : list(range(3, 20, 4)),
    'metric' : ['euclidean', 'manhattan', 'minkowski']
}

In [7]:
tempknnModel = KNeighborsClassifier() #위의 인자는 고정
GS_model = GridSearchCV(tempknnModel, GridParams, cv = 5)

GS_model.fit(Training_All, Training_All_Label) #test 데이터 제외 . tab키 사용하면 함수 사용가능한거 나옴

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski'],
                         'n_neighbors': [3, 7, 11, 15, 19],
                         'weights': ['uniform', 'distance']})

In [8]:
print('Total Combination of Hyperparameters :', len(GS_model.cv_results_.get('params')))

Total Combination of Hyperparameters : 30


In [9]:
pd.DataFrame(GS_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001643,0.000549,0.00993,0.003402,euclidean,3,uniform,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.96875,1.0,1.0,1.0,1.0,0.99375,0.0125,1
1,0.000743,0.000415,0.002571,0.000474,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.96875,1.0,1.0,1.0,1.0,0.99375,0.0125,1
2,0.001122,0.000369,0.006546,0.000826,euclidean,7,uniform,"{'metric': 'euclidean', 'n_neighbors': 7, 'wei...",0.9375,1.0,1.0,1.0,1.0,0.9875,0.025,15
3,0.001042,0.000196,0.002038,6.8e-05,euclidean,7,distance,"{'metric': 'euclidean', 'n_neighbors': 7, 'wei...",0.9375,1.0,1.0,1.0,1.0,0.9875,0.025,15
4,0.001015,0.000183,0.005805,0.0002,euclidean,11,uniform,"{'metric': 'euclidean', 'n_neighbors': 11, 'we...",0.9375,0.984375,1.0,1.0,1.0,0.984375,0.024206,25
5,0.000995,2.1e-05,0.002603,0.000478,euclidean,11,distance,"{'metric': 'euclidean', 'n_neighbors': 11, 'we...",0.9375,1.0,1.0,1.0,1.0,0.9875,0.025,15
6,0.000997,2.8e-05,0.006416,0.000785,euclidean,15,uniform,"{'metric': 'euclidean', 'n_neighbors': 15, 'we...",0.9375,0.984375,1.0,1.0,1.0,0.984375,0.024206,25
7,0.000991,1.3e-05,0.002248,0.000485,euclidean,15,distance,"{'metric': 'euclidean', 'n_neighbors': 15, 'we...",0.9375,1.0,1.0,1.0,1.0,0.9875,0.025,15
8,0.001016,5.5e-05,0.006986,0.000663,euclidean,19,uniform,"{'metric': 'euclidean', 'n_neighbors': 19, 'we...",0.9375,0.984375,1.0,1.0,1.0,0.984375,0.024206,25
9,0.001369,0.000454,0.003245,0.000769,euclidean,19,distance,"{'metric': 'euclidean', 'n_neighbors': 19, 'we...",0.9375,1.0,1.0,1.0,1.0,0.9875,0.025,15


In [10]:
print('Best Parameters : ', GS_model.best_params_)
print('Best Score : {}%'.format(round(GS_model.best_score_*100, 2)) )

# 모델 평가
# Test_Score = GS_model.score(Test_Data, Test_Label)
# print('Best Test Score : {}%'.format(round(Test_Score*100, 2)) )

Best Parameters :  {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best Score : 99.38%
