In [1]:
import pandas as pd
import numpy as np

from SearchCV import GridSearchCV_test

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

데이터 불러오기

In [2]:
# Load the normalized cancer table and drop the `id` column.
df = pd.read_csv("CancerDataRm_normalized.csv").drop(columns="id")

# Encode the label as integers ("M" -> 1, "B" -> 0).
# Series.map yields a numeric column; writing ints into the string column
# through .loc would leave it as object dtype, which later turns
# df.to_numpy() into an object array.
diagnosis_codes = df["diagnosis"].map({"M": 1, "B": 0})
if diagnosis_codes.isna().any():
    # Fail here with a clear message instead of at a later integer cast.
    raise ValueError("diagnosis column contains values other than 'M' and 'B'")
df["diagnosis"] = diagnosis_codes.astype(int)
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave.points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave.points_worst,symmetry_worst,fractal_dimension_worst
0,1,4.950186,2.240531,7.047561,14.828151,-1.988686,-1.623612,-1.349690,-1.402936,-3.184384,...,2.924031,4.568259,3.040733,8.189596,-1.671248,-1.446771,-1.154277,-0.819086,-2.109168,-6.407820
1,1,4.841597,2.344816,7.003579,14.471514,-1.820551,-1.316248,-1.076700,-1.236188,-2.787962,...,2.875812,4.744650,3.027547,8.032483,-1.572818,-0.793435,-0.708545,-0.761191,-1.490779,-6.498541
2,1,4.915992,2.110926,7.080380,14.746585,-1.879396,-1.404931,-1.075447,-1.299064,-3.189561,...,2.838866,3.909182,3.026903,7.937658,-1.604964,-1.376870,-0.800080,-0.843005,-2.506857,-7.270229
3,1,3.785711,2.166301,6.128541,11.380394,-1.716326,-1.285584,-1.163687,-1.368232,-2.763271,...,2.523655,4.597363,2.894755,7.070293,-1.429326,-0.608213,-0.569012,-0.831194,-1.296296,-4.759665
4,1,4.656367,2.309265,6.838591,13.949758,-1.917501,-1.492561,-1.276008,-1.390277,-3.215665,...,2.851257,4.910747,3.029045,7.960280,-1.573720,-1.202044,-0.841782,-0.811765,-1.850597,-6.757553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,0,8.218924,1.546142,35.206121,26.026255,0.072305,0.053520,0.056635,0.022632,0.126231,...,9.387879,2.643778,28.588769,33.122511,0.103318,0.129697,0.176854,0.059186,0.179736,0.065249
483,0,6.851817,1.539042,29.547008,21.309710,0.079392,0.068603,0.054993,0.026295,0.127970,...,7.627115,2.631887,24.115608,26.229290,0.114184,0.135367,0.152778,0.080276,0.171275,0.073568
484,0,9.085956,1.507062,39.660641,28.844704,0.075061,0.110498,0.088947,0.035366,0.118885,...,10.274744,2.504390,32.286489,36.391921,0.088723,0.211774,0.232119,0.094748,0.173152,0.071362
485,0,7.504673,1.516090,32.562762,23.485513,0.081168,0.088350,0.095065,0.038654,0.114455,...,8.584468,2.663404,27.105644,29.855804,0.108288,0.180869,0.230860,0.084309,0.164480,0.077083


데이터 분해

In [3]:
# Full table as an array; kept because other cells may still refer to `dataset`.
dataset = df.to_numpy()

# Select features and label by column name rather than by position, so the
# split does not silently depend on `diagnosis` being the first column.
# X is built as float64 explicitly instead of inheriting an object dtype.
X = df.drop(columns="diagnosis").to_numpy(dtype=float)
y = df["diagnosis"].to_numpy().astype('int')

In [4]:
# Show how many samples fall into each label value (counts are listed in
# ascending label order, as returned by np.unique).
class_labels, class_counts = np.unique(y, return_counts=True)
print(class_counts)

[297 190]


In [5]:
# Scaler instance that the later cells pass to nested_k_fold.
scaler = StandardScaler()

# Notebook-level stratified 10-fold split of the whole data set
# (shuffled with a fixed seed so the fold membership is reproducible).
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)
idx_list = [*skf.split(X, y)]

서포트 벡터 머신 분류

In [6]:
def nested_k_fold(model, scaler, fold, params):
    """Run a nested, stratified k-fold evaluation of ``model`` and report averages.

    The module-level arrays ``X`` and ``y`` are split into ``fold`` stratified
    outer folds (shuffled, ``random_state=42``). For every outer fold a fresh
    ``GridSearchCV_test(model, scaler, fold - 1, params, nested=True)`` is
    fitted on the outer-training rows and then scored on the held-out rows.

    Parameters
    ----------
    model
        Estimator instance forwarded to ``GridSearchCV_test``; its class name
        is used in the printed banner and in the output file name.
    scaler
        Scaler instance forwarded to ``GridSearchCV_test``.
    fold : int
        Number of outer folds. ``fold - 1`` is forwarded to
        ``GridSearchCV_test`` (presumably the inner fold count -- confirm in
        ``SearchCV``).
    params : dict
        Hyper-parameter grid forwarded to ``GridSearchCV_test``.

    Returns
    -------
    None
        The per-fold metrics are written to
        ``<ModelClass>_<fold>_Fold_CV_Result.csv`` in the working directory and
        their ``CV.list_mean`` aggregates are printed.
    """
    print(f'{model.__class__.__name__} Nested K-Fold CV')

    # The outer splitter must produce exactly `fold` splits. A hard-coded
    # n_splits=10 would raise IndexError for fold > 10 and silently skip outer
    # folds for fold < 10.
    outer_cv = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
    outer_splits = list(outer_cv.split(X, y))

    result = {'acc': [], 'pre': [], 'rec': [], 'f1': [], 'auc': []}

    for train_idx, test_idx in outer_splits:
        # A new search object per outer fold so no fitted state carries over.
        CV = GridSearchCV_test(model, scaler, fold-1, params, nested=True)

        CV.fit(X[train_idx], y[train_idx])
        # predict() is expected to return exactly these five scores, in this order.
        acc, pre, rec, f1, auc = CV.predict(X[test_idx], y[test_idx])

        result['acc'].append(acc)
        result['pre'].append(pre)
        result['rec'].append(rec)
        result['f1'].append(f1)
        result['auc'].append(auc)

    # list_mean comes from the last search object; it is assumed to aggregate
    # a list of scores into one number -- TODO confirm in SearchCV.
    print(f'{fold}-Fold Nested Cross-Validation Result')
    print("accuracy :", CV.list_mean(result['acc']))
    print("precision :", CV.list_mean(result['pre']))
    print("recall :", CV.list_mean(result['rec']))
    print("f1 score :", CV.list_mean(result['f1']))
    print("auroc :", CV.list_mean(result['auc']))

    # One row per outer fold, one column per metric. Named `result_df` so it
    # does not shadow the notebook-level `df`.
    result_df = pd.DataFrame(result)
    result_df.to_csv(f'{model.__class__.__name__}_{fold}_Fold_CV_Result.csv')

In [7]:
# SVC search space: C and gamma each take seven log-spaced values from
# 1e-3 to 1e3, combined with three kernel types.
params = dict(
    C=np.logspace(-3, 3, num=7),
    gamma=np.logspace(-3, 3, num=7),
    kernel=["linear", "rbf", "poly"],
)

# Nested 10-fold evaluation of the support vector classifier.
nested_k_fold(SVC(random_state=42), scaler, 10, params)

SVC Nested K-Fold CV
best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}
best accuracy : 1.0
best f1 score : 1.0
best 

로지스틱 회귀 분류

In [8]:
# Logistic-regression search space: seven log-spaced values of C
# (1e-3 .. 1e3) crossed with five solver choices.
params = dict(
    C=np.logspace(-3, 3, num=7),
    solver=['lbfgs', 'sag', 'saga', 'liblinear', 'newton-cg'],
)

# Nested 10-fold evaluation of the logistic-regression classifier.
nested_k_fold(LogisticRegression(random_state=42), scaler, 10, params)

LogisticRegression Nested K-Fold CV




best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0





best estimator : {'C': 0.001, 'solver': 'lbfgs'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

10-Fold Nested Cross-Validation Result
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 score : 1.0
auroc : 1.0


K-이웃 알고리즘 분류

In [10]:
# k-nearest-neighbours search space: odd neighbour counts 3, 5, 7, 9, 11.
params = dict(n_neighbors=list(range(3, 12, 2)))

# Nested 10-fold evaluation of the k-NN classifier.
nested_k_fold(KNeighborsClassifier(), scaler, 10, params)

KNeighborsClassifier Nested K-Fold CV
best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'n_neighbors': 3}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

10-Fold Nested Cross-V

In [9]:
# Random-forest search space: three split criteria crossed with three
# max_features settings.
params = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    max_features=['sqrt', 'log2', None],
)

# Nested 10-fold evaluation of the random-forest classifier.
nested_k_fold(RandomForestClassifier(random_state=42), scaler, 10, params)

RandomForestClassifier Nested K-Fold CV
best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'criterion': 'gini', 'max_features': 'sqrt'}
best accuracy : 1.0
best f1 score : 1.0
best auroc

가우시안 나이브 베이즈 분류

In [12]:
# Gaussian naive Bayes search space: priors is fixed to None and three
# var_smoothing values are tried.
params = dict(
    priors=[None],
    var_smoothing=[1e-9, 1e-10, 1e-8],
)

# Nested 10-fold evaluation of the Gaussian naive Bayes classifier.
nested_k_fold(GaussianNB(), scaler, 10, params)

GaussianNB Nested K-Fold CV
best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoothing': 1e-09}
best accuracy : 1.0
best f1 score : 1.0
best auroc : 1.0

best estimator : {'priors': None, 'var_smoot

In [None]:
# Load the normalized table that still contains outliers and drop the `id` column.
# NOTE(review): this rebinds `df`; in the cells shown, X and y are not
# recomputed from this frame afterwards.
df = pd.read_csv("CancerData_normalized_outlier.csv").drop(columns="id")

# Encode the label as integers ("M" -> 1, "B" -> 0).
# Series.map yields a numeric column; writing ints into the string column
# through .loc would leave it as object dtype.
diagnosis_codes = df["diagnosis"].map({"M": 1, "B": 0})
if diagnosis_codes.isna().any():
    # Fail here with a clear message instead of at a later integer cast.
    raise ValueError("diagnosis column contains values other than 'M' and 'B'")
df["diagnosis"] = diagnosis_codes.astype(int)
df