# Startified KFold for the OM dataset

This code is provided to split a dataset in COCO (+ YOLO later) format given labels distributions across the dataset. This algorithms is indended for preserving the samples percentages for each class in order to provide a smoother generalization and adress class imbalance. 

## Setup

In [1]:
import json
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os

## Load the dataset

In [2]:
dataset_path = './roboflow_datasets/xmm_om_artefacts_512-7-COCO/'
json_file_path = dataset_path+'train/_annotations.coco.json'
dest_train_path = dataset_path+'train/'
dest_valid_path = dataset_path+'valid/'

with open(json_file_path) as f:
    data_in = json.load(f)
data_in['categories']

[{'id': 0, 'name': 'artefacts', 'supercategory': 'none'},
 {'id': 1, 'name': 'central-ring', 'supercategory': 'artefacts'},
 {'id': 2, 'name': 'smoke-ring', 'supercategory': 'artefacts'},
 {'id': 3, 'name': 'star-loop', 'supercategory': 'artefacts'}]

## Stratified KFold

In [3]:
images, labels = [], []

for k in range(len(data_in['images'])):
    img_id = data_in['images'][k]['id']
    annotations = [data_in['annotations'][j] for j in range(len(data_in['annotations'])) if data_in['annotations'][j]['image_id'] == img_id]
    categories = set(str(annot['category_id']) for annot in annotations)
    images.append([img_id])
    labels.append(''.join(list(categories)))
        
images, labels = np.array(images), np.array(labels)

In [4]:
labels

array(['32', '312', '312', '3', '32', '31', '1', '32', '312', '32', '32',
       '312', '12', '3', '312', '31', '3', '31', '12', '1', '1', '31',
       '32', '312', '3', '312', '3', '312', '312', '312', '32', '32',
       '312', '3', '3', '32', '312', '12', '32', '32', '312', '31', '31',
       '32', '3', '312', '32', '312', '3', '312', '312', '1', '1', '31',
       '31', '3', '3', '312', '2', '1', '32', '31', '312', '312', '31',
       '32', '3', '312', '312', '31', '31', '1', '312', '3', '31', '3',
       '', '312', '312', '312', '312', '3', '312', '312', '312', '31',
       '31', '312', '31', '32', '1', '2', '312', '3', '12', '312', '2',
       '1', '32', '312', '31', '1', '31', '31', '3', '12', '312', '', '3',
       '32', '31', '31', '32', '32', '32', '32', '3', '32', '32', '31',
       '3', '3', '3', '31', '3', '32', '31', '2', '31', '1', '32', '31',
       '2', '312', '312', '32', '31', '3', '312', '3', '31', '3', '12',
       '32', '1', '32', '32', '312', '312', '32', '312', '3

The resulting arrays' size equals to the number of annotations because image ids are repeated for each label associated with them.

In [5]:
images.shape, labels.shape

((687, 1), (687,))

Run the Stratified KFold split and generate train and valid datasets given the number of splits. 

The split percentage is calculated depending on the `n_splits` parameter:

> train_percentage = 100 * 1/n_splits
>
> valid_percentage = 100 - train_percentage

In [6]:
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
skf_image_ids, skf_labels = {}, {}

for i, (train_index, valid_index) in enumerate(skf.split(images, labels)):
    print(f"Fold {i}:")
    print(f"  Train: Image index={images[train_index]}")
    print(f"  valid:  Image index={images[valid_index]}")
    skf_image_ids[i] = {'train': images[train_index], 'valid': images[valid_index]}
    skf_labels[i] = {'train': labels[train_index], 'valid': labels[valid_index]}

Fold 0:
  Train: Image index=[[  0]
 [  1]
 [  2]
 [  3]
 [  5]
 [  7]
 [  8]
 [  9]
 [ 11]
 [ 12]
 [ 13]
 [ 15]
 [ 20]
 [ 22]
 [ 23]
 [ 24]
 [ 26]
 [ 28]
 [ 29]
 [ 30]
 [ 31]
 [ 32]
 [ 33]
 [ 34]
 [ 35]
 [ 39]
 [ 41]
 [ 42]
 [ 45]
 [ 46]
 [ 48]
 [ 49]
 [ 51]
 [ 53]
 [ 58]
 [ 59]
 [ 60]
 [ 61]
 [ 62]
 [ 63]
 [ 66]
 [ 67]
 [ 68]
 [ 69]
 [ 70]
 [ 74]
 [ 75]
 [ 76]
 [ 79]
 [ 83]
 [ 84]
 [ 85]
 [ 86]
 [ 87]
 [ 90]
 [ 91]
 [ 95]
 [ 97]
 [ 98]
 [ 99]
 [101]
 [105]
 [106]
 [107]
 [108]
 [110]
 [111]
 [112]
 [113]
 [114]
 [115]
 [116]
 [120]
 [124]
 [125]
 [127]
 [129]
 [130]
 [131]
 [132]
 [134]
 [135]
 [136]
 [137]
 [140]
 [141]
 [142]
 [144]
 [145]
 [146]
 [148]
 [149]
 [150]
 [151]
 [153]
 [156]
 [157]
 [158]
 [159]
 [161]
 [162]
 [165]
 [166]
 [167]
 [169]
 [171]
 [172]
 [173]
 [174]
 [175]
 [177]
 [178]
 [179]
 [181]
 [184]
 [185]
 [187]
 [188]
 [189]
 [191]
 [192]
 [193]
 [194]
 [195]
 [196]
 [199]
 [205]
 [207]
 [209]
 [210]
 [212]
 [214]
 [216]
 [217]
 [220]
 [221]
 [222]
 [223]
 [224



In [7]:
len(train_index), len(valid_index)

(458, 229)

In [8]:
len(skf_image_ids), 'splits'

(3, 'splits')

**Ensure that there are no image ids present in both splits.**

In [9]:
for i in range(n_splits):
    print("intersection", len(np.intersect1d(skf_image_ids[i]['train'], skf_image_ids[i]['valid'])))

intersection 0
intersection 0
intersection 0


**Ensure that the labels distribution is roughly the same between splits.**

In [10]:
labels_percentages = {}

for i in range(n_splits):
        
    train_labels_counts = {'0':0, '1':0, '2':0, '3':0}
    valid_labels_counts = {'0':0, '1':0, '2':0, '3':0}
    
    for j in range(len(skf_image_ids[0]['train'])):
        for cat in list(skf_labels[0]['train'][j]):
            train_labels_counts[cat] += 1
    
    for j in range(len(skf_image_ids[0]['valid'])):
        for cat in list(skf_labels[0]['valid'][j]):
            valid_labels_counts[cat] += 1
            
    train_labels_counts = {cat:counts * 1.0/len(train_index) for cat, counts in train_labels_counts.items()}
    valid_labels_counts = {cat:counts * 1.0/len(valid_index) for cat, counts in valid_labels_counts.items()}
            
    labels_percentages[i] = {'train':train_labels_counts, 'valid':  valid_labels_counts}

In [11]:
labels_percentages

{0: {'train': {'0': 0.0,
   '1': 0.6703056768558951,
   '2': 0.5611353711790393,
   '3': 0.7336244541484717},
  'valid': {'0': 0.0,
   '1': 0.6724890829694323,
   '2': 0.5633187772925764,
   '3': 0.7379912663755459}},
 1: {'train': {'0': 0.0,
   '1': 0.6703056768558951,
   '2': 0.5611353711790393,
   '3': 0.7336244541484717},
  'valid': {'0': 0.0,
   '1': 0.6724890829694323,
   '2': 0.5633187772925764,
   '3': 0.7379912663755459}},
 2: {'train': {'0': 0.0,
   '1': 0.6703056768558951,
   '2': 0.5611353711790393,
   '3': 0.7336244541484717},
  'valid': {'0': 0.0,
   '1': 0.6724890829694323,
   '2': 0.5633187772925764,
   '3': 0.7379912663755459}}}

## Update the dataset and save new annotations files

In [12]:
data_in_train = data_in.copy()
data_in_valid = data_in.copy()

data_in_train['images'] = [data_in_train['images'][i] for i in range(len(train_index))]
data_in_valid['images'] = [data_in_train['images'][i] for i in range(len(valid_index))]
train_annot_ids, valid_annot_ids = [], []

for img_i in data_in_train['images']:
    annotation_ids = [annot['id'] for annot in data_in_train['annotations'] if annot['image_id'] == img_i['id']]
    train_annot_ids +=annotation_ids
    
for img_i in data_in_valid['images']:
    annotation_ids = [annot['id'] for annot in data_in_valid['annotations'] if annot['image_id'] == img_i['id']]
    valid_annot_ids +=annotation_ids
    
# data_in_train['annotations'] = [data_in_train['annotations'][i] for i in range(len(data_in_train['annotations'])) if data_in_train['annotations'][i] in ]

len(data_in_train['images']), len(data_in_valid['images'])

(458, 229)

**extract annotations given skf indices**

In [13]:
data_in_train['annotations'] = [data_in_train['annotations'][i] for i in range(len(train_annot_ids))]
data_in_valid['annotations'] = [data_in_valid['annotations'][i] for i in range(len(valid_annot_ids))]

In [14]:
len(train_annot_ids), len(valid_annot_ids), len(data_in_train['images']), len(data_in_valid['images'])

(1972, 971, 458, 229)

**save the new json data**

In [15]:
if not os.path.exists(dest_train_path):
    os.mkdir(dest_train_path)
if not os.path.exists(dest_valid_path):
    os.mkdir(dest_valid_path)

In [16]:
new_train_json_path = dest_train_path+'skf_train_annotations.coco.json'
new_valid_json_path = dest_valid_path+'skf_valid_annotations.coco.json'

with open(new_train_json_path, 'w') as f1, open(new_valid_json_path, 'w') as f2:
    json.dump(data_in_train, f1, indent=4)
    json.dump(data_in_valid, f2, indent=4)

**Move the filenames to the corresponding split directories**

In [23]:
import shutil
import os

filenames = [image['file_name'] for image in data_in_valid['images']]
filenames = list(set(filenames))
print(len(filenames), 'files')

# Iterate over the filenames and copy each one
for filename in filenames:
    source_path = os.path.join(dataset_path+'train/', filename)
    dest_path = os.path.join(dest_valid_path, filename)
    
    # Copy the file from source to destination
    shutil.move(source_path, dest_path)

print("Files moved successfully.")


229 files


FileNotFoundError: [Errno 2] No such file or directory: './roboflow_datasets/xmm_om_artefacts_512-7-COCO/train/S0135742601_V_png.rf.fee1724f0742b9b6eccdb6ea62ac0b78.jpg'