In [130]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [131]:
CIFAR10_DIR = Path('../resources/data/cifar10/')

## Training Data [Multiclass]
- preparing training csv 
- preparing validation csv
- adding stratified folds 

In [132]:
TRAIN_PATH = CIFAR10_DIR / 'train'
id = 0
class_id = 0
class_dict = {}
train_data = []

In [133]:
subdir_list = [ subdir for subdir in TRAIN_PATH.iterdir() if subdir.is_dir() ]

In [134]:
for subdir in subdir_list:
    if subdir.name not in class_dict.keys():
        class_dict[subdir.name] = class_id
        class_id += 1
    
    for file_path in subdir.iterdir():
        if file_path.is_file():
            train_data.append([ id, class_dict[subdir.name], file_path.relative_to( CIFAR10_DIR ), None ])
            id += 1

In [135]:
train_df = pd.DataFrame( train_data, columns=["id", "class", "path", "fold"] )

In [136]:
train_df.head()

Unnamed: 0,id,class,path,fold
0,0,0,train/airplane/3773.png,
1,1,0,train/airplane/0571.png,
2,2,0,train/airplane/2421.png,
3,3,0,train/airplane/1091.png,
4,4,0,train/airplane/0327.png,


In [137]:
x = train_df.to_numpy()
y = train_df["class"].to_numpy()

In [138]:
# adding fold entry in csv
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(x, y)
fold_idx = 0

for _, val_idx in skf.split(x, y):
    train_df.at[val_idx, "fold"] = fold_idx
    fold_idx += 1

In [139]:
train_df

Unnamed: 0,id,class,path,fold
0,0,0,train/airplane/3773.png,0
1,1,0,train/airplane/0571.png,0
2,2,0,train/airplane/2421.png,0
3,3,0,train/airplane/1091.png,0
4,4,0,train/airplane/0327.png,0
...,...,...,...,...
49995,49995,9,train/automobile/2523.png,4
49996,49996,9,train/automobile/4172.png,4
49997,49997,9,train/automobile/3678.png,4
49998,49998,9,train/automobile/4613.png,4


In [140]:
# shuffle
train_df = train_df.sample(frac = 1)
train_df.head()

Unnamed: 0,id,class,path,fold
22373,22373,4,train/ship/0384.png,2
29905,29905,5,train/horse/3164.png,4
2680,2680,0,train/airplane/2166.png,2
36776,36776,7,train/bird/4994.png,1
19235,19235,3,train/deer/3233.png,4


In [141]:
# saving to csv
train_df.to_csv(CIFAR10_DIR / "train_m.csv", index=False)

In [142]:
# sanity check
train_df["fold"].value_counts()

4    10000
3    10000
2    10000
1    10000
0    10000
Name: fold, dtype: int64

## Test Data [Multiclass]
- preparing test csv 

In [143]:
TEST_PATH = CIFAR10_DIR / 'test'
id = 0
test_data = []

In [145]:
subdir_list = [ subdir for subdir in TEST_PATH.iterdir() if subdir.is_dir() ]

In [146]:
for subdir in subdir_list:    
    for file_path in subdir.iterdir():
        if file_path.is_file():
            test_data.append([ id, class_dict[subdir.name], file_path.relative_to( CIFAR10_DIR ) ])
            id += 1

In [147]:
test_df = pd.DataFrame( test_data, columns=["id", "class", "path"] )

In [148]:
test_df

Unnamed: 0,id,class,path
0,0,0,test/airplane/0571.png
1,1,0,test/airplane/0327.png
2,2,0,test/airplane/0028.png
3,3,0,test/airplane/0007.png
4,4,0,test/airplane/0125.png
...,...,...,...
9995,9995,9,test/automobile/0305.png
9996,9996,9,test/automobile/0689.png
9997,9997,9,test/automobile/0274.png
9998,9998,9,test/automobile/0179.png


In [None]:
# saving to csv
test_df.to_csv(CIFAR10_DIR / "train_m.csv", index=False)