## 作業目標：運用scikit-learn API 實現K-fold分割資料

---

### 讀入資料

In [2]:
import pandas as pd
dataset = pd.read_csv('./datasets/Social_Network_Ads.csv')

In [3]:
# 觀察到是一個分類問題, 且須要做類別特徵的轉換以及數值特徵的標準化
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0
...,...,...,...,...,...
395,15691863,Female,46.0,41000.0,1
396,15706071,Male,51.0,23000.0,1
397,15654296,Female,50.0,20000.0,1
398,15755018,Male,36.0,33000.0,0


In [4]:
# 觀察缺失值以及資料型態
# ---> 無缺失值
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   User ID          400 non-null    int64  
 1   Gender           400 non-null    object 
 2   Age              400 non-null    float64
 3   EstimatedSalary  400 non-null    float64
 4   Purchased        400 non-null    int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 15.8+ KB


In [5]:
# 看數值型簡單統計分布
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [8]:
# 標籤的分布
# ---> 分布看過去算合理, 沒有過度集中其中一個類別
dataset['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

### 取出訓練特徵與標註

In [9]:
X = dataset[['User ID', 'Gender', 'Age', 'EstimatedSalary']].values
Y = dataset['Purchased'].values

---

In [10]:
import numpy as np
from sklearn.model_selection import KFold

### 將訓練資料按照順序切割成10等分

In [11]:
kf = KFold(n_splits=10)
kf.get_n_splits(X)

print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


### 將訓練資料隨機切割成10等分

In [12]:
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X)

print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)


---

### 取出 切割資料對應位置

In [13]:
train_split = kf.split(X)
next(train_split)

(array([  0,   1,   2,   4,   5,   6,   7,   8,   9,  10,  11,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  67,  68,  69,  70,
         71,  72,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,
         87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
        100, 101, 102, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 144,
        145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
        158, 159, 162, 164, 165, 166, 167, 168, 170, 171, 172, 173, 174,
        175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 188,
        189, 190, 191, 192, 194, 195, 196, 197, 199

### Or

In [14]:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [  0   1   3   4   5   6   7   8   9  10  11  12  14  15  16  17  19  20
  21  22  23  24  25  26  27  28  29  30  31  32  33  35  36  37  38  40
  41  42  43  44  45  46  48  49  50  51  52  53  54  55  56  58  59  60
  61  62  63  64  65  66  67  68  69  70  71  72  74  75  76  77  78  79
  80  81  84  85  86  87  88  90  91  92  93  94  95  97  98  99 100 101
 102 103 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
 121 122 123 124 125 126 127 128 130 131 132 133 134 136 137 138 139 140
 141 142 143 144 145 146 147 148 149 150 151 152 153 155 156 157 158 159
 160 161 162 164 165 166 167 168 170 171 172 173 174 175 176 177 178 179
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 207 208 209 210 211 213 214 215 216 217 218
 220 221 222 223 224 225 227 228 229 230 232 233 234 235 236 238 239 240
 241 242 243 244 245 246 247 248 249 250 251 252 253 255 256 257 258 259
 260 261 263 264 265 266 267 268 269 270 271

### 取出切割資料：trainset / testset 特徵(x_train/x_test)/標註(y_train/y_test)

In [16]:
for train_index, test_index in kf.split(X):
    print('訓練資料')
    print(X[train_index])
    print(Y[train_index])
    print('=' * 50)
    print('測試資料')
    print(X[test_index])
    print(Y[test_index])
    print('-' * 50)

訓練資料
[[15624510 'Male' 19.0 19000.0]
 [15810944 'Male' 35.0 20000.0]
 [15668575 'Female' 26.0 43000.0]
 ...
 [15654296 'Female' 50.0 20000.0]
 [15755018 'Male' 36.0 33000.0]
 [15594041 'Female' 49.0 36000.0]]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 0 1 0 1 1 0 0
 1 0 1 0 1 1 0 1 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1
 0 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 1 1 0 0 1 1 0 1 1 0 0 0 1
 1 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1
 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1]
測試資料
[[15598044 'Female' 27.0 84000.0]
 [15704987 'Male' 32.