## Train with Cross Validation

### K-Fold Cross Validation
K-Fold Cross Validation is a common method to evaluate a model. The dataset is split into K folds. Each fold is used as the test set in turn while the rest are used as the training set. The final score is the average of the scores of all folds. The advantage of K-Fold Cross Validation is that all data are used for training and testing. The disadvantage is that it is computationally expensive.

In [16]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

In [17]:
df = pd.read_csv('dataset/pima_indians_diabetes.csv')
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [18]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X.shape, y.shape

((767, 8), (767,))

In [19]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((613, 8), (154, 8), (613,), (154,))

In [20]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [22]:
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([613, 8]),
 torch.Size([154, 8]),
 torch.Size([613]),
 torch.Size([154]))

In [23]:
# kfold.split() returns indices to split data into training and test set
for train_idx, test_idx in kfold.split(X_train, y_train):
    print(train_idx, test_idx)
    print(len(train_idx), len(test_idx))

[  0   1   2   3   4   5   7   9  10  11  12  13  14  15  16  17  19  20
  21  22  23  24  25  27  28  30  31  32  33  36  37  38  39  40  41  42
  44  45  46  47  48  49  50  51  52  53  54  55  58  59  61  62  63  64
  65  66  67  68  70  71  72  73  74  75  76  77  79  80  81  82  84  85
  86  87  88  89  90  91  93  94  95  96  97  98  99 100 101 103 104 108
 109 110 111 112 114 115 116 117 118 119 120 121 122 123 125 127 128 129
 130 131 132 133 135 136 137 138 141 142 144 145 147 148 149 150 152 154
 155 157 158 160 161 162 164 165 166 167 168 169 170 171 172 173 175 176
 178 179 180 181 182 183 185 187 188 189 191 192 193 194 195 196 197 199
 201 203 204 205 206 207 208 209 210 211 212 213 215 216 217 218 219 220
 221 223 224 225 226 227 228 231 232 233 234 235 236 237 238 239 241 242
 243 244 245 248 249 250 251 252 255 256 257 259 260 261 262 263 265 267
 268 270 272 273 275 276 277 278 279 281 282 283 284 285 286 287 288 289
 290 293 295 297 300 302 303 304 305 306 307 308 30

In [32]:
class PrimaClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, hidden_size2, output_size):
        super(PrimaClassifier, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size2),
            nn.ReLU(),
            nn.Linear(hidden_size2, hidden_size2),
            nn.ReLU(),
            nn.Linear(hidden_size2, output_size),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.layers(x)

In [35]:
# model parameters
input_size = 8
hidden_size = 16
hidden_size2 = 4
output_size = 1
learning_rate = 0.01
epochs = 100
batch_size = 16

In [36]:
from sklearn.model_selection import KFold

# Define the number of splits for cross-validation
n_splits = 5

# Initialize the KFold object
kfold = KFold(n_splits=n_splits, shuffle=True)

# Initialize a list to store the accuracy scores for each fold
accuracy_scores = []

# Perform cross-validation
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
    print(f"Fold {fold+1}/{n_splits}")
    
    # Get the training and validation data for this fold
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    # Convert the data to PyTorch tensors
    X_train_fold = torch.tensor(X_train_fold, dtype=torch.float32)
    X_val_fold = torch.tensor(X_val_fold, dtype=torch.float32)
    y_train_fold = torch.tensor(y_train_fold, dtype=torch.float32)
    y_val_fold = torch.tensor(y_val_fold, dtype=torch.float32)
    
    # Move the data to the device (e.g., GPU)
    X_train_fold = X_train_fold.to(device)
    X_val_fold = X_val_fold.to(device)
    y_train_fold = y_train_fold.to(device)
    y_val_fold = y_val_fold.to(device)
    
    # Initialize the model
    model = PrimaClassifier(input_size, hidden_size, hidden_size2, output_size).to(device)
    
    # Define the loss function and optimizer
    loss_fn = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Train the model
    for epoch in range(epochs):
        for i in range(0, len(X_train_fold), batch_size):
            X_batch = X_train_fold[i:i+batch_size]
            y_batch = y_train_fold[i:i+batch_size]
            
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch.view(-1, 1))
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    # Evaluate the model on the validation set
    with torch.no_grad():
        y_pred_val = model(X_val_fold)
        y_pred_val = y_pred_val.round()
        acc_val = y_pred_val.eq(y_val_fold.view_as(y_pred_val)).sum() / float(len(y_val_fold))
        accuracy_scores.append(acc_val.item())
        print(f"Validation accuracy: {acc_val:.4f}")

# train accuracy
with torch.no_grad():
    y_pred = model(X_train)
    y_pred = y_pred.round()
    acc = y_pred.eq(y_train.view_as(y_pred)).sum() / float(len(y_train))
    print(f'Train accuracy: {acc:.4f}')

# Calculate the average accuracy across all folds
avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average validation accuracy: {avg_accuracy:.4f}")

# test accuracy
with torch.no_grad():
    y_pred = model(X_test)
    y_pred = y_pred.round()
    acc = y_pred.eq(y_test.view_as(y_pred)).sum() / float(len(y_test))
    print(f'Test accuracy: {acc:.4f}')

Fold 1/5




Validation accuracy: 0.7967
Fold 2/5
Validation accuracy: 0.7073
Fold 3/5
Validation accuracy: 0.6179
Fold 4/5
Validation accuracy: 0.6967
Fold 5/5
Validation accuracy: 0.7295
Train accuracy: 0.8842
Average validation accuracy: 0.7096
Test accuracy: 0.7273
