# k-fold cross-validation

## Imports and settings

In [None]:
import os
import glob as gl
import shutil as sh
import numpy as np
import pandas as pd
import plotly.graph_objects as pg

from os import path
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
# Base directory; every other path in this cell is built relative to it.
# NOTE(review): 'root_dir' looks like a placeholder to be replaced -- confirm.
ROOT_DIR = 'root_dir'
# Source directories the fold-generating cells copy images and labels from.
TRAIN_IMAGE_DIR = path.join(ROOT_DIR, './train/images/')
TRAIN_LABEL_DIR = path.join(ROOT_DIR, './train/labels_thresh128/')
# Destination directory that receives one sub-directory per fold.
CV_DIR = path.join(ROOT_DIR, 'scv_train')

# File extensions (without the leading dot) of the image and label files.
IMAGE_TYPE = 'png'
LABEL_TYPE = 'png'

# Number of cross-validation folds (passed as n_splits to the splitters).
FOLDS = 5

# IPython shell escape: list the contents of ROOT_DIR as a quick sanity check.
!ls $ROOT_DIR

## Structure creation

In [None]:
# Create the cross-validation tree:
#   CV_DIR/<fold>/{train,val}/{images,labels}   for fold = 1..FOLDS
#
# An existing CV_DIR aborts the cell with an explicit message: left-over files
# from an earlier run would otherwise be mixed into the newly generated folds.
try:
    os.mkdir(CV_DIR)
except FileExistsError as err:
    raise FileExistsError(
        f'Structure already exists, clean up and run again. ({CV_DIR})'
    ) from err

# CV_DIR was created just now, so none of the sub-directories can exist yet.
for i in range(FOLDS):
    fold = f'{i+1}'
    print(f'Creating {fold} and sub-directories.')
    for subset in ('train', 'val'):
        for kind in ('images', 'labels'):
            os.makedirs(path.join(CV_DIR, fold, subset, kind))

## Random folds
URL: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold

In [None]:
# Collect the image files in sorted (deterministic) order and reduce each path
# to its stem, i.e. the file name without directory and extension.
# basename() handles any path separator, and the suffix is stripped by its
# actual length so IMAGE_TYPE may have any number of characters.
image_suffix = f'.{IMAGE_TYPE}'
image_paths = sorted(gl.glob(path.join(TRAIN_IMAGE_DIR, f'*{image_suffix}')))
stems = np.array([path.basename(p)[:-len(image_suffix)] for p in image_paths])

# generate k splits, shuffle with random state (reproducible)
X = np.arange(len(stems))
kf = KFold(n_splits=FOLDS, random_state=9001, shuffle=True)

# Folds are numbered from 1 to match the directory names below CV_DIR.
for k, (train_index, val_index) in enumerate(kf.split(X), start=1):
    train_stems, val_stems = stems[train_index], stems[val_index]

    # first n_samples % n_splits folds have size n_samples // n_splits + 1
    # See: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
    print(f'Fold {k}: Training {len(train_stems)}; Validation {len(val_stems)}')

    # Copy every image together with its equally named label into
    # CV_DIR/<k>/<subset>/{images,labels}.
    for subset, subset_stems in (('train', train_stems), ('val', val_stems)):
        subset_dir = path.join(CV_DIR, str(k), subset)
        for stem in subset_stems:
            sh.copy(path.join(TRAIN_IMAGE_DIR, f'{stem}.{IMAGE_TYPE}'), path.join(subset_dir, 'images'))
            sh.copy(path.join(TRAIN_LABEL_DIR, f'{stem}.{LABEL_TYPE}'), path.join(subset_dir, 'labels'))

## Stratified folds

URL: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

TODOs:
* Clean up and generalize for differing image/label formats
* Work on stems, not full file names

In [None]:
# Path to the class labels CSV. The cells below read its 'file' column
# (file name of a sample) and its 'class' column (label used for stratification).
PATH_CLASSES = 'path_to/clean128_classes_decile.csv'

In [None]:
# Load the class labels table.
df = pd.read_csv(PATH_CLASSES)
# Cell output: number of samples per class, to inspect the class balance.
df['class'].value_counts()

In [None]:
# Stratified k-fold split: samples are split on the 'class' column of df, and
# the files of every fold are copied into CV_DIR/<fold>/{train,val}/{images,labels}.
# NOTE: the destination is the same CV_DIR the random-fold cell writes to;
# running both cells against one tree mixes the two splits.
X, y = df['file'], df['class']
skf = StratifiedKFold(n_splits=FOLDS, random_state=9001, shuffle=True)

# One frame per (fold, set); concatenated once after the loop.
fold_frames = []
# Folds are numbered from 1 to match the directory names below CV_DIR.
for i, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
    stems_by_set = {}
    for subset, index in (('train', train_index), ('val', val_index)):
        # split() yields positional indices, hence .iloc instead of label lookup.
        df_subset = pd.DataFrame({
            'file': X.iloc[index],
            'class': y.iloc[index],
            'fold': i,
            'set': subset,
        })
        fold_frames.append(df_subset)
        # Stem = file name without its extension, whatever the extension length.
        stems_by_set[subset] = [path.splitext(name)[0] for name in df_subset['file']]

    print(f"fold {i}: train {len(stems_by_set['train'])}; val {len(stems_by_set['val'])}")

    # Copy every image together with its equally named label.
    for subset, subset_stems in stems_by_set.items():
        subset_dir = path.join(CV_DIR, str(i), subset)
        for stem in subset_stems:
            sh.copy(path.join(TRAIN_IMAGE_DIR, f'{stem}.{IMAGE_TYPE}'), path.join(subset_dir, 'images'))
            sh.copy(path.join(TRAIN_LABEL_DIR, f'{stem}.{LABEL_TYPE}'), path.join(subset_dir, 'labels'))

# Overview of all folds: one row per (file, fold) with its class and set.
if fold_frames:
    df_skf = pd.concat(fold_frames)
else:
    df_skf = pd.DataFrame(columns=['file', 'class', 'fold', 'set'])

# save overview
df_skf.to_csv(path.join(CV_DIR, 'overview.csv'), index=False)
df_skf