In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Data csv파일 받아오기

In [14]:
N100_df = pd.read_csv('N100-NA제거+random.csv')
N100_df.head()

Unnamed: 0,subject,trial,condition,group,gender,age,education,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,1,1,1,0,M,44,16.0,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,1,2,1,0,M,44,16.0,8.717519,11.462619,7.97911,9.429738,6.912862,6.242343,4.6901,2.7952,1.938529
2,1,3,1,0,M,44,16.0,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,1,4,1,0,M,44,16.0,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.5045,2.945,-1.082224
4,1,5,1,0,M,44,16.0,-9.36809,-12.029829,-10.2801,-7.846471,-8.170986,-10.192229,-6.907095,-14.041,-9.66671


<칼럼명 해석>

1. Subject: 1~81명의 피험자 (49명의 환자, 32명의 건강인)

2. Trial: 한 실험 조건 당 100번의 반복 실험

3. Condition: 실험 조건

     (1) pressed a button to immediately generated a tone

     (2) passively listened to the same tone

     (3) pressed a button without generating a tone

4. Group: 1- patient / 0- Healthy

5. Gender: Male / Female

6. Age: 나이, Education: 12- highschool / 16- college or university

7. Channel: Fz, FCz, Cz: center channel / FC3, C3, CP3: left channel / FC4, C4, CP4: right channel

머신 러닝 5가지 모델 학습

In [15]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

-> 데이터 분리: X-channels / Y-group(조현병, 건강인) 칼럼 전체

In [16]:
col_names = N100_df.columns.values
X = N100_df[col_names[7:]]
Y = N100_df[col_names[3]]

In [17]:
X.head(5)

Unnamed: 0,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,8.717519,11.462619,7.97911,9.429738,6.912862,6.242343,4.6901,2.7952,1.938529
2,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.5045,2.945,-1.082224
4,-9.36809,-12.029829,-10.2801,-7.846471,-8.170986,-10.192229,-6.907095,-14.041,-9.66671


In [18]:
Y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: group, dtype: int64

->train, validation, test data set 분리

In [19]:
def train_val_test_split(X, Y, test_size=0.3, random_state=123):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
   
    return X_train, X_test, Y_train, Y_test

In [20]:
X_train, X_test, Y_train, Y_test = train_val_test_split(X, Y,
                                                                      test_size=0.3,
                                                                      random_state=123)

In [21]:
print(X_train.shape)
print(X_test.shape)

(32481, 9)
(13921, 9)


1. Logistic Regression을 위한 다양한 가중치 (cross validation적용 전)

In [22]:
penalty_set = ['l1', 'l2']
C_set = [0.0001, 0.001, 0.01, 0.1, 1, 10, 1e2, 1e3, 1e4, 1e5, 1e6]

In [23]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_train, Y_train)
#         Y_val_score = model.decision_function(X_val)
        Y_test_score = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(Y_test, Y_test_score)
        result.append((model, penalty, C, auc(fpr, tpr)))



-> 회귀 분석 결과를 정렬 후, 최상의 정확도를 갖는 모델 찾기

In [24]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [25]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'l1', 0.01, 0.5370834776793121)


-> eeg channel training

In [26]:
X_eeg = N100_df[col_names[7:]]
Y_eeg = N100_df[col_names[3]]

In [27]:
X_eeg.head(5)

Unnamed: 0,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,8.717519,11.462619,7.97911,9.429738,6.912862,6.242343,4.6901,2.7952,1.938529
2,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.5045,2.945,-1.082224
4,-9.36809,-12.029829,-10.2801,-7.846471,-8.170986,-10.192229,-6.907095,-14.041,-9.66671


In [28]:
X_eeg_train, X_eeg_test, Y_eeg_train, Y_eeg_test = train_val_test_split(X_eeg, Y_eeg,
                                                                      
                                                                      test_size=0.2,
                                                                      random_state=123)

In [29]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_eeg_train, Y_eeg_train)
#         Y_val_score = model.decision_function(X_val)
        Y_test_score = model.predict_proba(X_eeg_test)[:, 1]
        fpr, tpr, _ = roc_curve(Y_eeg_test, Y_test_score)
        result.append((model, penalty, C, auc(fpr, tpr)))



In [30]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [31]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'l1', 0.01, 0.5393480266305981)


-> condition별 데이터 분리

In [32]:
X_condition = N100_df[col_names[0:]]
Y_condition = N100_df[col_names[0:4]]

In [33]:
X_condition.head()

Unnamed: 0,subject,trial,condition,group,gender,age,education,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,1,1,1,0,M,44,16.0,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,1,2,1,0,M,44,16.0,8.717519,11.462619,7.97911,9.429738,6.912862,6.242343,4.6901,2.7952,1.938529
2,1,3,1,0,M,44,16.0,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,1,4,1,0,M,44,16.0,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.5045,2.945,-1.082224
4,1,5,1,0,M,44,16.0,-9.36809,-12.029829,-10.2801,-7.846471,-8.170986,-10.192229,-6.907095,-14.041,-9.66671


In [34]:
Y_condition.head()

Unnamed: 0,subject,trial,condition,group
0,1,1,1,0
1,1,2,1,0
2,1,3,1,0
3,1,4,1,0
4,1,5,1,0


-> Y_condition의 필요 없는 column 제거 후, condition별 분리

In [35]:
Y_condition=Y_condition.drop(['subject','trial'],axis=1)
Y_condition.head()

Unnamed: 0,condition,group
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [36]:
Y_condition1=Y_condition[Y_condition['condition']==1]
Y_condition1.head(101)

Unnamed: 0,condition,group
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


In [37]:
Y_condition1=Y_condition1.drop('condition', axis=1)
Y_condition1.head()

Unnamed: 0,group
0,0
1,0
2,0
3,0
4,0


In [38]:
Y_condition2=Y_condition[Y_condition['condition']==2]
Y_condition2.head(101)

Unnamed: 0,condition,group
100,2,0
101,2,0
102,2,0
103,2,0
104,2,0
105,2,0
106,2,0
107,2,0
108,2,0
109,2,0


In [39]:
Y_condition2=Y_condition2.drop('condition', axis=1)
Y_condition2.head()

Unnamed: 0,group
100,0
101,0
102,0
103,0
104,0


In [40]:
Y_condition3=Y_condition[Y_condition['condition']==3]
Y_condition3.head(101)

Unnamed: 0,condition,group
194,3,0
195,3,0
196,3,0
197,3,0
198,3,0
199,3,0
200,3,0
201,3,0
202,3,0
203,3,0


In [41]:
Y_condition3=Y_condition3.drop('condition', axis=1)
Y_condition3.head()

Unnamed: 0,group
194,0
195,0
196,0
197,0
198,0


-> X_condition의 결과값 칼럼 제거

In [42]:
X_condition=X_condition.drop('group', axis=1)
X_condition.head()

Unnamed: 0,subject,trial,condition,gender,age,education,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,1,1,1,M,44,16.0,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,1,2,1,M,44,16.0,8.717519,11.462619,7.97911,9.429738,6.912862,6.242343,4.6901,2.7952,1.938529
2,1,3,1,M,44,16.0,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,1,4,1,M,44,16.0,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.5045,2.945,-1.082224
4,1,5,1,M,44,16.0,-9.36809,-12.029829,-10.2801,-7.846471,-8.170986,-10.192229,-6.907095,-14.041,-9.66671


-> 연관성 없는 칼럼 제거

In [43]:
X_condition=X_condition.drop(['gender','age','education','subject','trial'],axis=1)
X_condition.head()

Unnamed: 0,condition,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,1,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,1,8.717519,11.462619,7.97911,9.429738,6.912862,6.242343,4.6901,2.7952,1.938529
2,1,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,1,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.5045,2.945,-1.082224
4,1,-9.36809,-12.029829,-10.2801,-7.846471,-8.170986,-10.192229,-6.907095,-14.041,-9.66671


In [44]:
X_condition1=X_condition[X_condition['condition']==1]
X_condition1.head(101)

Unnamed: 0,condition,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
0,1,-9.761338,-11.545219,-16.775214,-3.576595,-11.745167,-0.895419,-16.679205,-5.854429,-14.473143
1,1,8.717519,11.462619,7.979110,9.429738,6.912862,6.242343,4.690100,2.795200,1.938529
2,1,-4.027286,-3.992038,-1.104119,-3.319471,0.321314,0.419538,0.030619,2.828195,2.731533
3,1,0.394095,-0.663186,-1.372871,3.974643,-1.613667,2.376024,-1.504500,2.945000,-1.082224
4,1,-9.368090,-12.029829,-10.280100,-7.846471,-8.170986,-10.192229,-6.907095,-14.041000,-9.666710
5,1,-10.790257,-10.572367,-14.636533,-9.880271,-15.229224,-14.060162,-14.101090,-15.782033,-12.055395
6,1,5.616595,9.261605,11.472643,5.892076,3.369305,13.736567,5.326267,11.242681,6.140795
7,1,12.262348,14.245062,16.989962,10.314271,9.561067,9.339833,12.025652,7.889762,11.902900
8,1,-0.596852,-1.241681,-3.727729,8.685595,-4.913276,6.423724,-3.755538,2.639981,-4.314110
9,1,-6.015471,-7.863443,-9.716157,-6.245338,-5.074048,-1.253329,-4.245752,-0.591100,-5.665014


In [45]:
X_condition2=X_condition[X_condition['condition']==2]
X_condition2.head(101)

Unnamed: 0,condition,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
100,2,-16.716552,-20.971295,-20.478290,-10.182510,-19.729390,-17.714986,-17.048510,-10.823119,-18.070057
101,2,-3.722819,0.513381,5.512324,-6.093929,-3.810562,-0.303200,-2.394881,0.987876,1.415952
102,2,-8.247662,-9.755552,-10.301010,-4.977605,-2.362848,-12.136810,-2.110810,-11.478605,-4.584295
103,2,-8.027805,-7.270267,-6.318910,-13.784110,-3.382119,-10.864186,-2.816748,-14.083590,-1.270076
104,2,-6.368943,-8.376862,-10.965705,-16.191648,-9.046586,-12.507938,-7.621748,-7.771948,-6.955390
105,2,-23.887971,-25.746367,-26.506876,-17.793667,-18.992681,-23.638824,-19.264057,-19.958895,-19.158962
106,2,-11.405690,-11.201986,-12.185310,-11.642905,-15.096833,-14.883738,-14.107133,-12.014000,-10.282324
107,2,-2.690552,0.353181,-0.090357,0.199086,-5.209033,-0.987038,-3.627448,-2.303571,-2.124890
108,2,11.067590,11.486171,7.105162,9.595990,7.694167,8.186790,2.874810,7.473910,5.128400
109,2,2.553186,-0.695638,-7.052710,3.339743,-0.394033,2.220914,-0.928914,1.195905,-3.296943


In [46]:
X_condition3=X_condition[X_condition['condition']==3]
X_condition3.head(101)

Unnamed: 0,condition,Fz_N100,FCz_N100,Cz_N100,FC3_N100,FC4_N100,C3_N100,C4_N100,CP3_N100,CP4_N100
194,3,7.927462,10.191724,14.238219,6.325543,9.125038,6.551529,8.492952,3.189105,6.345057
195,3,1.525724,3.350157,4.831110,5.704062,1.612810,4.315371,2.633157,-3.105952,0.145895
196,3,8.472619,12.202405,15.034671,6.784305,12.998590,6.405124,12.207514,7.003910,10.438271
197,3,6.251105,11.988548,18.587343,13.116338,8.820376,12.936419,13.451738,15.863624,12.711000
198,3,4.445529,1.646410,-2.357400,6.164052,-2.160619,1.599310,-5.571371,1.012348,-6.457757
199,3,1.720248,0.753138,2.764776,-0.042848,3.739200,-0.610029,4.536124,1.817062,3.933505
200,3,-4.061529,-5.716195,-8.323124,-2.957110,-7.120652,-4.070543,-6.590886,-1.491000,-5.716700
201,3,5.132795,5.039795,8.241124,6.452662,7.013976,3.528300,7.192176,4.883643,7.636348
202,3,0.083867,0.035476,0.798010,-2.412929,-2.835867,1.826395,-2.489719,-3.747086,-3.580700
203,3,11.660967,12.732457,13.322486,9.178481,14.236833,10.541295,13.371943,9.941881,12.263457


-> condition1에 따른 dataset 분리및 학습

In [47]:
X_condition1_train, X_condition1_test, Y_condition1_train, Y_condition1_test = train_val_test_split(X_condition1, Y_condition1,
                                                                      
                                                                      test_size=0.2,
                                                                      random_state=123)

In [48]:
print(X_condition1_train.shape)
print(X_condition1_test.shape)
print(Y_condition1_train.shape)
print(Y_condition1_test.shape)

(12537, 10)
(3135, 10)
(12537, 1)
(3135, 1)


In [49]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_condition1_train, Y_condition1_train)
#         Y_val_score = model.decision_function(X_val)
        Y_test_score = model.predict_proba(X_condition1_test)[:, 1]
        fpr, tpr, _ = roc_curve(Y_condition1_test, Y_test_score)
        result.append((model, penalty, C, auc(fpr, tpr)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [50]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [51]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'l1', 0.01, 0.538479269702342)


-> condition2에 따른 dataset 분리및 학습

In [52]:
X_condition2_train, X_condition2_test, Y_condition2_train, Y_condition2_test = train_val_test_split(X_condition2, Y_condition2,
                                                                      
                                                                      test_size=0.2,
                                                                      random_state=123)

In [53]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_condition2_train, Y_condition2_train)
#         Y_val_score = model.decision_function(X_val)
        Y_test_score = model.predict_proba(X_condition2_test)[:, 1]
        fpr, tpr, _ = roc_curve(Y_condition2_test, Y_test_score)
        result.append((model, penalty, C, auc(fpr, tpr)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [54]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [55]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'l2', 0.0001, 0.5336620151719893)


-> condition3에 따른 dataset 분리및 학습

In [56]:
X_condition3_train, X_condition3_test, Y_condition3_train, Y_condition3_test = train_val_test_split(X_condition3, Y_condition3,
                                                                      
                                                                      test_size=0.2,
                                                                      random_state=123)

In [57]:
result = []
for penalty in penalty_set:
    for C in C_set:
        model = LogisticRegression(penalty=penalty, C=C, class_weight='balanced')
        model = model.fit(X_condition3_train, Y_condition3_train)
#         Y_val_score = model.decision_function(X_val)
        Y_test_score = model.predict_proba(X_condition3_test)[:, 1]
        fpr, tpr, _ = roc_curve(Y_condition3_test, Y_test_score)
        result.append((model, penalty, C, auc(fpr, tpr)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [58]:
logreg_result = sorted(result, key=lambda x: x[3], reverse=True)

In [59]:
best_logreg_result = logreg_result[0]
print(best_logreg_result)

(LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'l2', 0.01, 0.5475997219084036)


-> 임의로 분리한 train, test 데이터 셋에 대한 최상의 모델 찾기는 그 신뢰성을 보장하기 어렵기 때문에, 반복적으로 train, test로 데이터를 나누어 검증하는 교차 검증을 도입하여 정확도가 높은 최상의 모델을 찾아야 한다.

In [60]:
from sklearn.model_selection import cross_val_score

-> 30번의 교차 검증을 진행하여 모델에 대한 정확도 측정

In [61]:
cross_val_score(LogisticRegression(), X_condition3, Y_condition3, scoring='accuracy',cv=30)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.59574468, 0.58994197, 0.59574468, 0.58800774, 0.5860735 ,
       0.59767892, 0.59689922, 0.60077519, 0.59108527, 0.59883721,
       0.60271318, 0.5872093 , 0.58640777, 0.60776699, 0.59029126,
       0.5961165 , 0.59029126, 0.5961165 , 0.59029126, 0.58834951,
       0.59805825, 0.59805825, 0.6       , 0.59223301, 0.6       ,
       0.6038835 , 0.58834951, 0.58834951, 0.60776699, 0.58834951])

In [62]:
cross_val_score(LogisticRegression(), X_condition1, Y_condition1, scoring='accuracy',cv=30)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.59923664, 0.59160305, 0.60687023, 0.60114504, 0.60496183,
       0.60305344, 0.60727969, 0.59770115, 0.60344828, 0.60344828,
       0.6091954 , 0.59578544, 0.59003831, 0.58812261, 0.6091954 ,
       0.59961686, 0.59386973, 0.60536398, 0.60153257, 0.60536398,
       0.60344828, 0.60727969, 0.59770115, 0.60344828, 0.60344828,
       0.6091954 , 0.59770115, 0.58812261, 0.58812261, 0.6091954 ])

In [63]:
cross_val_score(LogisticRegression(), X_condition2, Y_condition2, scoring='accuracy',cv=30)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.59215686, 0.59215686, 0.6       , 0.60196078, 0.61176471,
       0.60196078, 0.60980392, 0.60588235, 0.60707269, 0.60117878,
       0.60903733, 0.60117878, 0.55795678, 0.60314342, 0.62401575,
       0.59448819, 0.59251969, 0.6023622 , 0.60433071, 0.61220472,
       0.6023622 , 0.61023622, 0.60629921, 0.60629921, 0.6023622 ,
       0.61023622, 0.6023622 , 0.55905512, 0.60433071, 0.62795276])

In [64]:
cross_val_score(LogisticRegression(), X_eeg, Y_eeg, scoring='accuracy',cv=30)



array([0.60206718, 0.59431525, 0.60051713, 0.60245637, 0.6043956 ,
       0.60180995, 0.60180995, 0.59987072, 0.60310278, 0.60245637,
       0.60310278, 0.59857789, 0.58306399, 0.60310278, 0.60568843,
       0.60245637, 0.59469942, 0.60051713, 0.60245637, 0.60504202,
       0.60155239, 0.60155239, 0.60025873, 0.60284605, 0.60219922,
       0.60284605, 0.59831824, 0.58344114, 0.60349288, 0.60543338])

-> 정확도 약 60% 나옴

2. K-nearest neighbor classifier = KNN

-> weights:‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.

In [65]:
weights_set = ['uniform', 'distance']
n_neighbors_set = [1, 3, 5, 7, 9, 11, 13, 15]

In [66]:
result = []
for weights in weights_set:
    for n_neighbors in n_neighbors_set:
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        model = model.fit(X_condition1_train, Y_condition1_train)
        Y_test_score = model.predict_proba(X_condition1_test)[:, 1]
        fpr, tpr, _ = roc_curve(Y_condition1_test, Y_test_score)
        result.append((model, weights, n_neighbors, auc(fpr, tpr)))        

  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """


In [67]:
knn_result = sorted(result, key=lambda x: x[3], reverse=True)

In [68]:
best_knn_result = knn_result[0]
print(best_knn_result)

(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance'), 'distance', 5, 0.9226702779293771)


In [69]:
cross_val_score(KNeighborsClassifier(), X_condition1, Y_condition1, scoring='accuracy',cv=30)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


array([0.67748092, 0.69274809, 0.70229008, 0.68320611, 0.69083969,
       0.70801527, 0.72796935, 0.71264368, 0.69348659, 0.69923372,
       0.69923372, 0.72796935, 0.68007663, 0.69731801, 0.71072797,
       0.69731801, 0.70114943, 0.71455939, 0.69923372, 0.67241379,
       0.69348659, 0.70306513, 0.7394636 , 0.69348659, 0.70114943,
       0.70114943, 0.73371648, 0.67624521, 0.70498084, 0.69731801])

In [70]:
cross_val_score(KNeighborsClassifier(), X_eeg, Y_eeg, scoring='accuracy',cv=30)

array([0.71963824, 0.69896641, 0.72333549, 0.70071105, 0.68778281,
       0.70265029, 0.7136393 , 0.71493213, 0.71040724, 0.69424693,
       0.70588235, 0.71557854, 0.70652877, 0.68196509, 0.69683258,
       0.72010343, 0.68778281, 0.71622495, 0.7032967 , 0.69489334,
       0.69793014, 0.70892626, 0.72445019, 0.71216041, 0.69016818,
       0.71021992, 0.72574386, 0.70504528, 0.69534282, 0.70310479])

In [71]:
cross_val_score(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='distance'), X_condition1, Y_condition1, scoring='accuracy',cv=30)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


array([0.92557252, 0.92557252, 0.94274809, 0.94656489, 0.95610687,
       0.9389313 , 0.95785441, 0.96360153, 0.96168582, 0.9559387 ,
       0.97318008, 0.95977011, 0.97126437, 0.95402299, 0.97318008,
       0.93103448, 0.9348659 , 0.94061303, 0.94636015, 0.96743295,
       0.94444444, 0.97318008, 0.96360153, 0.9559387 , 0.95785441,
       0.97701149, 0.96360153, 0.97509579, 0.96934866, 0.97126437])

-> 정확도 약 95.5% 나옴

3. Naive Bayes clasifier

In [72]:
cross_val_score(GaussianNB(), X_condition1, Y_condition1, scoring='accuracy',cv=30)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.54580153, 0.55725191, 0.48282443, 0.54198473, 0.44274809,
       0.51145038, 0.44252874, 0.51532567, 0.55938697, 0.44444444,
       0.48467433, 0.45977011, 0.53448276, 0.35632184, 0.41762452,
       0.54597701, 0.55555556, 0.48467433, 0.5440613 , 0.44444444,
       0.51724138, 0.44636015, 0.51532567, 0.55172414, 0.44636015,
       0.47701149, 0.46551724, 0.53831418, 0.35823755, 0.40996169])

-> 다른 모델들에 비해 정확도 낮고 편차가 심함

4. Decision Tree

In [73]:
cross_val_score(DecisionTreeClassifier(), X_condition1, Y_condition1, scoring='accuracy',cv=30)

array([0.73854962, 0.77099237, 0.77671756, 0.79961832, 0.80534351,
       0.76335878, 0.80268199, 0.80268199, 0.79310345, 0.77777778,
       0.80076628, 0.81992337, 0.83716475, 0.7835249 , 0.81609195,
       0.79118774, 0.79693487, 0.77969349, 0.75862069, 0.80842912,
       0.77011494, 0.81417625, 0.78735632, 0.80268199, 0.79118774,
       0.80842912, 0.79501916, 0.82183908, 0.81417625, 0.82183908])

-> 정확도 약 79.4% 나옴

5. RandomForest Classifier

In [74]:
cross_val_score(RandomForestClassifier(), X_condition1, Y_condition1, scoring='accuracy',cv=30)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


array([0.83969466, 0.84923664, 0.82251908, 0.83778626, 0.84160305,
       0.81870229, 0.86398467, 0.8697318 , 0.82758621, 0.84291188,
       0.87739464, 0.85823755, 0.88505747, 0.83908046, 0.87164751,
       0.86206897, 0.86590038, 0.84099617, 0.80842912, 0.85249042,
       0.82758621, 0.87356322, 0.86015326, 0.84291188, 0.86781609,
       0.87356322, 0.87164751, 0.89272031, 0.85440613, 0.87547893])

-> 정확도 약 86.6%으로 Decision Tree보다 정확도 높음

<반복 교차 검증을 통한 모델의 최상 파라미터 찾기>

In [63]:
from sklearn.model_selection import RepeatedKFold

In [64]:
rkfold = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
cross_val_score(LogisticRegression(), X_condition1, Y_condition1, scoring='accuracy',cv=rkfold)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.60714286, 0.60395408, 0.59668156, 0.60689215, 0.56604978,
       0.60370134, 0.60370134, 0.61072112, 0.60880664, 0.62029355,
       0.62882653, 0.60076531, 0.59668156, 0.61135929, 0.60051053,
       0.60561583, 0.60242502, 0.59731972, 0.58710913, 0.59795788,
       0.58609694, 0.58864796, 0.60753031, 0.6094448 , 0.60178685,
       0.59668156, 0.60242502, 0.61327377, 0.61518826, 0.60880664,
       0.62244898, 0.60204082, 0.59476707, 0.61263561, 0.61199745,
       0.61199745, 0.60306318, 0.59476707, 0.58647096, 0.59412891,
       0.59630102, 0.60140306, 0.59285258, 0.61391193, 0.60753031,
       0.62029355, 0.58902361, 0.59476707, 0.61263561, 0.6043395 ,
       0.60012755, 0.60522959, 0.60816847, 0.59731972, 0.6228462 ,
       0.61710274, 0.58455648, 0.5807275 , 0.62858966, 0.58838545,
       0.58418367, 0.5880102 , 0.58455648, 0.60689215, 0.60242502,
       0.60816847, 0.61710274, 0.61072112, 0.61901723, 0.61199745,
       0.6122449 , 0.60395408, 0.57562221, 0.6043395 , 0.59668

-> condition1에 대해서 최고 62.8%까지 보임