### 교차 검증
- 부족한 데이터셋 및 특정 데이터에 과대적합되는 문제 해결하기 위한 방안
- 학습 데이터셋을 일정 크기의 데이터로 n개 분리 후 1/n 검증용, 나머지는 학습용으로 사용 

In [33]:
import numpy as np
from sklearn.model_selection import KFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

In [34]:
# KFold 인스턴스 생성 => 데이터를 2개로 분할해주는 객체
k_fold = KFold(n_splits = 2)

In [35]:
# 데이터 분할
datasets = k_fold.split(X)

for dataset in datasets:
    print(dataset)

(array([2, 3]), array([0, 1]))
(array([0, 1]), array([2, 3]))


In [36]:
# perch.csv 파일 데이터 기본 5등분
import pandas as pd

perchDF = pd.read_csv('../DATA/perch3.csv')
perchDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Weight  56 non-null     float64
 1   Length  56 non-null     float64
 2   Height  56 non-null     float64
 3   Width   56 non-null     float64
dtypes: float64(4)
memory usage: 1.9 KB


In [37]:
# perchDF 5등분
fold_5 = KFold()
datasets = fold_5.split(perchDF)

for index, dataset in enumerate(datasets):
    print(f'{index} => {dataset}')

0 => (array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
       29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
       46, 47, 48, 49, 50, 51, 52, 53, 54, 55]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]))
1 => (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 23, 24, 25, 26, 27,
       28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
       45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]), array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]))
2 => (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
       45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]), array([23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]))
3 => (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]), array

In [38]:
datasets = fold_5.split(perchDF)
for index, (train, test) in enumerate(datasets):
    print(f'{index} => {train.shape} {test.shape}')

0 => (44,) (12,)
1 => (45,) (11,)
2 => (45,) (11,)
3 => (45,) (11,)
4 => (45,) (11,)


In [39]:
# perchDF 3등분
fold_3 = KFold(n_splits = 3)
datasets = fold_3.split(perchDF)

for index, dataset in enumerate(datasets):
    print(f'{index} => {dataset}')

0 => (array([19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18]))
1 => (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55]), array([19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37]))
2 => (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37]), array([38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
       55]))


In [40]:
datasets = fold_3.split(perchDF)
for index, (train, test) in enumerate(datasets):
    print(f'{index} => {train.shape} {test.shape}')

0 => (37,) (19,)
1 => (37,) (19,)
2 => (38,) (18,)


In [41]:
irisDF = pd.read_csv('../DATA/Iris.csv')
irisDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [42]:
k_fold = KFold(n_splits = 3)
ret = k_fold.split(irisDF[irisDF.columns[:-1]])

In [43]:
for index, dataset in enumerate(ret):
    print(f'{index} => {dataset}')

0 => (array([ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
        63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
        76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
        89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
       102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
       115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
       128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
       141, 142, 143, 144, 145, 146, 147, 148, 149]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]))
1 => (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  

In [44]:
ret = k_fold.split(irisDF[irisDF.columns[:-1]])
for index, (train, test) in enumerate(ret):
    print(f'{index} => {train.shape} {test.shape}')

0 => (100,) (50,)
1 => (100,) (50,)
2 => (100,) (50,)


In [53]:
from sklearn.linear_model import LogisticRegression
ret = k_fold.split(irisDF[irisDF.columns[:-1]])

train_score3 = []
for p_name in ['l1', 'l2', 'elasticnet']:
    log_model = LogisticRegression(max_iter = 1000, solver = 'liblinear', penalty = p_name)
    for idx, (train, test) in enumerate(ret):
        # 학습용, 테스트용 인덱스 반환
        train_idx = train.tolist()
        test_idx = test.tolist()
        
        # 인덱스에 해당하는 데이터셋 추출
        trainDF = irisDF.iloc[train_idx]
        testDF = irisDF.iloc[test_idx]
        
        print(trainDF[trainDF.columns[4]].value_counts()/trainDF.shape[0])
        print(testDF[testDF.columns[4]].value_counts()/testDF.shape[0])

        X_train = trainDF[trainDF.columns[:-1]]
        y_train = trainDF[trainDF.columns[-1]]

        X_test = testDF[testDF.columns[:-1]]
        y_test = testDF[testDF.columns[-1]]

        # 분류 모델 학습
        log_model.fit(X_train, y_train)

        # 훈련 및 검증용 성능
        train_score = log_model.score(X_train, y_train)
        test_score = log_model.score(X_test, y_test)

        train_score3.append(train_score)

species
versicolor    0.5
virginica     0.5
Name: count, dtype: float64
species
setosa    1.0
Name: count, dtype: float64
species
setosa       0.5
virginica    0.5
Name: count, dtype: float64
species
versicolor    1.0
Name: count, dtype: float64
species
setosa        0.5
versicolor    0.5
Name: count, dtype: float64
species
virginica    1.0
Name: count, dtype: float64


In [55]:
sum(train_score3)/3

0.9833333333333334

In [57]:
# 타겟 데이터를 적절하게 섞어서 교차검증
from sklearn.model_selection import StratifiedKFold
k_fold = StratifiedKFold(n_splits = 3)
ret = k_fold.split(irisDF[irisDF.columns[:-1]], irisDF[irisDF.columns[-1]])

train_score3 = []
for p_name in ['l1', 'l2', 'elasticnet']:
    log_model = LogisticRegression(max_iter = 1000, solver = 'liblinear', penalty = p_name)
    for idx, (train, test) in enumerate(ret):
        # 학습용, 테스트용 인덱스 반환
        train_idx = train.tolist()
        test_idx = test.tolist()
        
        # 인덱스에 해당하는 데이터셋 추출
        trainDF = irisDF.iloc[train_idx]
        testDF = irisDF.iloc[test_idx]
        
        print(trainDF[trainDF.columns[4]].value_counts()/trainDF.shape[0])
        print(testDF[testDF.columns[4]].value_counts()/testDF.shape[0])

        X_train = trainDF[trainDF.columns[:-1]]
        y_train = trainDF[trainDF.columns[-1]]

        X_test = testDF[testDF.columns[:-1]]
        y_test = testDF[testDF.columns[-1]]

        # 분류 모델 학습
        log_model.fit(X_train, y_train)

        # 훈련 및 검증용 성능
        train_score = log_model.score(X_train, y_train)
        test_score = log_model.score(X_test, y_test)

        train_score3.append(train_score)

species
virginica     0.34
setosa        0.33
versicolor    0.33
Name: count, dtype: float64
species
setosa        0.34
versicolor    0.34
virginica     0.32
Name: count, dtype: float64
species
versicolor    0.34
setosa        0.33
virginica     0.33
Name: count, dtype: float64
species
setosa        0.34
virginica     0.34
versicolor    0.32
Name: count, dtype: float64
species
setosa        0.34
versicolor    0.33
virginica     0.33
Name: count, dtype: float64
species
versicolor    0.34
virginica     0.34
setosa        0.32
Name: count, dtype: float64


In [58]:
sum(train_score3)/3

0.9633333333333333