<a href="https://colab.research.google.com/github/BAEK-SEUNGYUN/modu_deep/blob/main/modu_sonar_k_fold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# k-fold (k겹 교차 검증)
딥러닝, 머신러닝 작업을 할 떄 알고리즘을 충분히 테스트하였어도 데이터가 충분치 않으면 좋은 결과를 내기가 어렵다.

k-fold 란 데이터셋을 여러개로 나누어 하나씩 테스트셋으로 사용하고 나머지를 모두 합해서 학습셋으로 사용하는 방법





###사용이유
- 총 데이터 갯수가 적은 데이터 셋에 대하여 정확도를 향상시킬수 있음
- 이는 기존에 Training/Validation/Test 세 개의 집단으로 분류하는 것보다, Training과 Test로만 분류할 때 학습 데이터 셋이 더 많기 때문
- 데이터 수가 적은데 검증과 테스트에 데이터를 더 뺏기면 underfitting 등 성능이 미달되는 모델이 학습됨


In [1]:
from keras.models import Sequential   
from keras.layers.core import Dense
from sklearn.preprocessing import LabelEncoder

# 데이터를 원하는 숫자만큼 쪼개 각각 학습셋과 테스트셋으로 사용되게 만드는 함수
from sklearn.model_selection import StratifiedKFold

import numpy
import pandas as pd
import tensorflow as tf


In [2]:
# seed 값 설정
seed = 0
numpy.random.seed(seed)
tf.random.set_seed(3)

In [3]:
# sonar 데이터 가져오기
df = pd.read_csv('/content/drive/MyDrive/dataset/sonar.csv')

In [4]:
# 데이터 확인
df.head()

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.0660,0.2273,0.3100,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.5550,0.6711,0.6415,0.7104,0.8080,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.0510,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
0,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,0.4918,0.6552,0.6919,0.7797,0.7464,0.9444,1.0,0.8874,0.8024,0.7818,0.5212,0.4052,0.3957,0.3914,0.325,0.32,0.3271,0.2767,0.4423,0.2028,0.3788,0.2947,0.1984,0.2341,0.1306,0.4182,0.3835,0.1057,0.184,0.197,0.1674,0.0583,0.1401,0.1628,0.0621,0.0203,0.053,0.0742,0.0409,0.0061,0.0125,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
1,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,0.6333,0.706,0.5544,0.532,0.6479,0.6931,0.6759,0.7551,0.8929,0.8619,0.7974,0.6737,0.4293,0.3648,0.5331,0.2413,0.507,0.8533,0.6036,0.8514,0.8512,0.5045,0.1862,0.2709,0.4232,0.3043,0.6116,0.6756,0.5375,0.4719,0.4647,0.2587,0.2129,0.2222,0.2111,0.0176,0.1348,0.0744,0.013,0.0106,0.0033,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
2,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,0.0881,0.1992,0.0184,0.2261,0.1729,0.2131,0.0693,0.2281,0.406,0.3973,0.2741,0.369,0.5556,0.4846,0.314,0.5334,0.5256,0.252,0.209,0.3559,0.626,0.734,0.612,0.3497,0.3953,0.3012,0.5408,0.8814,0.9857,0.9167,0.6121,0.5006,0.321,0.3202,0.4295,0.3654,0.2655,0.1576,0.0681,0.0294,0.0241,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
3,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,0.4152,0.3952,0.4256,0.4135,0.4528,0.5326,0.7306,0.6193,0.2032,0.4636,0.4148,0.4292,0.573,0.5399,0.3161,0.2285,0.6995,1.0,0.7262,0.4724,0.5103,0.5459,0.2881,0.0981,0.1951,0.4181,0.4604,0.3217,0.2828,0.243,0.1979,0.2444,0.1847,0.0841,0.0692,0.0528,0.0357,0.0085,0.023,0.0046,0.0156,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R
4,0.0286,0.0453,0.0277,0.0174,0.0384,0.099,0.1201,0.1833,0.2105,0.3039,0.2988,0.425,0.6343,0.8198,1.0,0.9988,0.9508,0.9025,0.7234,0.5122,0.2074,0.3985,0.589,0.2872,0.2043,0.5782,0.5389,0.375,0.3411,0.5067,0.558,0.4778,0.3299,0.2198,0.1407,0.2856,0.3807,0.4158,0.4054,0.3296,0.2707,0.265,0.0723,0.1238,0.1192,0.1089,0.0623,0.0494,0.0264,0.0081,0.0104,0.0045,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062,R


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0.0200  207 non-null    float64
 1   0.0371  207 non-null    float64
 2   0.0428  207 non-null    float64
 3   0.0207  207 non-null    float64
 4   0.0954  207 non-null    float64
 5   0.0986  207 non-null    float64
 6   0.1539  207 non-null    float64
 7   0.1601  207 non-null    float64
 8   0.3109  207 non-null    float64
 9   0.2111  207 non-null    float64
 10  0.1609  207 non-null    float64
 11  0.1582  207 non-null    float64
 12  0.2238  207 non-null    float64
 13  0.0645  207 non-null    float64
 14  0.0660  207 non-null    float64
 15  0.2273  207 non-null    float64
 16  0.3100  207 non-null    float64
 17  0.2999  207 non-null    float64
 18  0.5078  207 non-null    float64
 19  0.4797  207 non-null    float64
 20  0.5783  207 non-null    float64
 21  0.5071  207 non-null    float64
 22  0.

In [6]:
df.describe()

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.0660,0.2273,0.3100,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.5550,0.6711,0.6415,0.7104,0.8080,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.0510,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032
count,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0
mean,0.029208,0.038443,0.043837,0.054053,0.075105,0.104599,0.121591,0.134677,0.177361,0.208245,0.236376,0.250666,0.273544,0.297689,0.321429,0.379217,0.416495,0.453055,0.504797,0.563449,0.609209,0.624841,0.64801,0.673223,0.675444,0.700148,0.702115,0.693473,0.641895,0.581871,0.506281,0.439903,0.416761,0.401535,0.390343,0.382597,0.362331,0.338353,0.324986,0.311385,0.290403,0.278269,0.246368,0.213053,0.196909,0.160738,0.122537,0.091217,0.051995,0.020366,0.016034,0.013472,0.010729,0.010917,0.0093,0.008181,0.007771,0.007947,0.007936,0.006523
std,0.023038,0.03304,0.038521,0.046583,0.055669,0.059247,0.061897,0.08534,0.118311,0.134741,0.132923,0.140264,0.141262,0.164075,0.205158,0.232975,0.264213,0.261947,0.258614,0.263225,0.258434,0.256373,0.250335,0.239555,0.24552,0.237768,0.246252,0.237631,0.240818,0.220864,0.212917,0.213389,0.206907,0.230499,0.257756,0.262755,0.239546,0.212655,0.19921,0.179076,0.170717,0.169137,0.139308,0.132795,0.151924,0.134254,0.087155,0.062496,0.036029,0.013673,0.012027,0.009628,0.007071,0.00731,0.007103,0.005719,0.005756,0.006485,0.006196,0.005038
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,0.0289,0.0236,0.0184,0.0273,0.0031,0.0162,0.0349,0.0375,0.0494,0.0656,0.0512,0.0219,0.0563,0.0239,0.024,0.0921,0.0481,0.0284,0.0144,0.0613,0.0482,0.0404,0.0477,0.0212,0.0223,0.008,0.0351,0.0383,0.0371,0.0117,0.036,0.0056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.0133,0.0164,0.0189,0.02445,0.0377,0.06695,0.0806,0.08035,0.09675,0.11115,0.1282,0.13345,0.16575,0.1761,0.1665,0.1959,0.2055,0.24195,0.29905,0.35045,0.39755,0.40635,0.45525,0.54045,0.5248,0.54355,0.5298,0.53395,0.4613,0.41425,0.3493,0.2841,0.25735,0.21755,0.17855,0.154,0.16,0.17425,0.17245,0.1859,0.1641,0.1587,0.1552,0.12685,0.09445,0.0684,0.0642,0.04495,0.0263,0.0115,0.00835,0.00735,0.00505,0.00535,0.0041,0.0044,0.0037,0.0036,0.00365,0.0031
50%,0.0228,0.0308,0.0342,0.0441,0.062,0.0921,0.1056,0.1119,0.1522,0.181,0.2251,0.2497,0.2655,0.2819,0.284,0.3075,0.3068,0.3709,0.4348,0.543,0.6275,0.667,0.7007,0.7012,0.7221,0.7545,0.7539,0.7317,0.6825,0.6074,0.4906,0.4303,0.3903,0.3497,0.3108,0.3195,0.3039,0.3104,0.2829,0.279,0.2605,0.2444,0.2211,0.1776,0.1473,0.1211,0.1015,0.0777,0.0449,0.0179,0.0138,0.0115,0.0096,0.0093,0.0075,0.0068,0.0059,0.0058,0.0063,0.0053
75%,0.0358,0.0481,0.0582,0.0657,0.10105,0.13415,0.15305,0.1698,0.2315,0.269,0.3018,0.3316,0.3515,0.38695,0.45305,0.53605,0.66005,0.6791,0.7319,0.80945,0.81805,0.83215,0.85225,0.87335,0.87455,0.8938,0.9174,0.90185,0.85235,0.73695,0.6432,0.5857,0.55675,0.5844,0.5914,0.55395,0.5102,0.43875,0.43055,0.4247,0.38915,0.3851,0.32505,0.26765,0.22975,0.20065,0.15475,0.1197,0.06895,0.0251,0.0207,0.01675,0.0149,0.01445,0.0121,0.01035,0.01035,0.0104,0.01035,0.00855
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,0.7342,0.706,0.7131,0.997,1.0,0.9988,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9657,0.9306,1.0,0.9647,1.0,1.0,0.9497,1.0,0.9857,0.9297,0.8995,0.8246,0.7733,0.7762,0.7034,0.7292,0.5522,0.3339,0.1981,0.0825,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [7]:
df.shape

(207, 61)

In [8]:
print(df.values)

dataset = df.values

[[0.0453 0.0523 0.0843 ... 0.0052 0.0044 'R']
 [0.0262 0.0582 0.1099 ... 0.0095 0.0078 'R']
 [0.01 0.0171 0.0623 ... 0.004 0.0117 'R']
 ...
 [0.0522 0.0437 0.018 ... 0.0077 0.0031 'M']
 [0.0303 0.0353 0.049 ... 0.0036 0.0048 'M']
 [0.026 0.0363 0.0136 ... 0.0061 0.0115 'M']]


In [18]:
# X = dataset[:,:60]     이 코드를 아래 코드처럼 바꾸어주어야한다.
# 그렇지 않으면 model.fit에서 ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).
# 이런 에러가 발생
X = dataset[:,:60].astype(float)
Y_obj = dataset[:,60]
print(X)


# https://devbull.xyz/failed-to-convert-a-numpy-array-to-a-tensor-unsupported-object-type-float/
#

[[0.0453 0.0523 0.0843 ... 0.0049 0.0052 0.0044]
 [0.0262 0.0582 0.1099 ... 0.0164 0.0095 0.0078]
 [0.01   0.0171 0.0623 ... 0.0044 0.004  0.0117]
 ...
 [0.0522 0.0437 0.018  ... 0.0138 0.0077 0.0031]
 [0.0303 0.0353 0.049  ... 0.0079 0.0036 0.0048]
 [0.026  0.0363 0.0136 ... 0.0036 0.0061 0.0115]]


In [19]:
print(Y_obj)

['R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'R' 'R' 'R' 'R' 'R' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M']


In [20]:
# 문자열 변환
e = LabelEncoder()
e.fit(Y_obj)
Y = e.transform(Y_obj)
print(Y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [21]:
# 데이터를 10개의 파일로 쪼갠다
n_fold = 10
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [22]:
skf

StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

In [23]:
# 빈 accuracy 배열
accuracy = []

In [24]:
# 모델의 설정, 컴파일, 실행
for train, test in skf.split(X, Y):
    model = Sequential()
    model.add(Dense(24, input_dim=60, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X[train], Y[train], epochs=100, batch_size=5)
    k_accuracy = "%.4f" % (model.evaluate(X[test], Y[test])[1])
    accuracy.append(k_accuracy)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [25]:
# 결과 출력
print("\n %.f fold accuracy:" % n_fold, accuracy)



 10 fold accuracy: ['0.9048', '0.8571', '0.9048', '0.7143', '0.8571', '0.6667', '0.7619', '0.9000', '0.7500', '0.8500']


10번의 테스트가 모두 정상 작동이 되었다.
