# Imports

In [30]:
import os
from glob import glob
from pprint import pprint

import pandas as pd
from tqdm.notebook import tqdm

<br>

# Explanation

This data splitter explores the spurious correlation that may arise between the eye side of an image (right or left) and its severity level. Suppose the model is trained only on right-eye images with non-referrable severity levels (0 and 1) and tested on right-eye images with referrable severity levels (2 to 4). There are two possible outcomes. If the model learns the spurious relationship between the side of the eye and the severity, it will classify the test samples with lower severity scores. If the model is robust to the spurious correlation, it will most probably classify the test images correctly, regardless of the eye-side bias introduced by the data split.

<br>

# Common Functions

In [55]:
def detect_image_extension(image_folder):
    """Detect the single image file extension used inside a folder.

    Args:
        image_folder: Path of the directory that holds the image files.

    Returns:
        The extension, leading dot included, spelled exactly as it appears
        in the file names (e.g. ``'.jpeg'`` or ``'.JPG'``).

    Raises:
        AssertionError: If the folder is empty, if the extension is not one
            of the supported image formats, or if the entries do not all
            share the same extension.
    """
    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')
    # os.listdir returns entries in arbitrary order; sorting makes the
    # reported extension independent of the file system
    image_paths = sorted(os.listdir(image_folder))
    assert len(image_paths), 'The images folder is empty! Please correct the folder path.'
    extension = os.path.splitext(image_paths[0])[-1]
    # Compare case-insensitively so that e.g. '.JPG' is accepted, but return
    # the original spelling because callers append it to file names
    assert extension.lower() in IMG_EXTENSIONS, 'The images folder must only contain images files with consistent format.'
    # Callers append this one extension to every image name, so every entry
    # must carry exactly the same extension
    found_extensions = {os.path.splitext(path)[-1] for path in image_paths}
    assert len(found_extensions) == 1, 'The images folder must only contain images files with consistent format.'
    return extension

<br>

# Definitions

In [158]:
# Seed passed as random_state to every split below
SEED = 100
# CSV file holding the label of each image
IMG_LABELS_FILE = '../datasets/reduced_eyepacs/trainLabels_cropped.csv'
# Folder holding the image files
IMG_FOLDER = '../datasets/reduced_eyepacs/resized_train_cropped/resized_train_cropped'
# File extension of the images, detected from the contents of IMG_FOLDER
IMAGE_EXT = detect_image_extension(IMG_FOLDER)

<br>

# Splitting data based on eye side, severity and frequency

## Preparing useful information

In [8]:
# Load the labels file and keep only the image name and its severity level
df_labels = pd.read_csv(IMG_LABELS_FILE).loc[:, ['image', 'level']]

In [25]:
# The image info will gather the useful information for splitting.
# Work on a copy: a plain assignment would only bind a second name to the
# same frame, so any in-place edit of df_img_info would also alter df_labels.
df_img_info = df_labels.copy()

In [26]:
# Separates the patient ID from the eye-side information.
# Splitting only once, starting from the right, yields at most two columns
# (patient ID, side) even when the part before the side contains underscores.
image_name_info = df_img_info['image'].str.rsplit('_', n=1, expand=True)

In [27]:
# Name the two pieces produced by the split: patient identifier and eye side
image_name_info.columns = pd.Index(['patient_id', 'side'])

In [28]:
# Put the patient ID / side columns next to the label columns.
# Concatenating onto df_labels rather than onto df_img_info itself keeps the
# cell idempotent: a re-run does not stack a second pair of
# patient_id/side columns.
df_img_info = pd.concat([image_name_info, df_labels], axis=1)

In [58]:
# Replace the bare image names with full file paths.
# The names are read from df_labels, which the cells above leave unmodified
# when run in order, so re-running this cell does not prepend the folder
# (or append the extension) a second time.
df_img_info['image'] = df_labels['image'].map(
    lambda file_name: os.path.join(IMG_FOLDER, file_name + IMAGE_EXT))

In [91]:
# Levels up to 1 are labelled 'nref' (non-referrable); higher levels are labelled 'ref'
df_img_info['level_group'] = df_img_info['level'].le(1).map({True: 'nref', False: 'ref'})

In [97]:
# Group key combining eye side and level group, e.g. 'left_nref'
df_img_info['groups'] = df_img_info['side'].str.cat(df_img_info['level_group'], sep='_')

In [121]:
# Relative frequency of each side/level group in the whole dataset
df_img_info['groups'].value_counts(normalize=True)

right_nref    0.403555
left_nref     0.400820
left_ref      0.099009
right_ref     0.096616
Name: groups, dtype: float64

## Splitting Images

First, we need to define two important sets: the environment set $\mathcal{E}$ and the label set $\mathcal{Y}$.

$$
\large \mathcal{E} = \{left,\;right\}
$$

$$
\large \mathcal{Y} = \{nref,\; ref\}\; 
$$

where:
\begin{equation*}    
    \begin{aligned}
        &nref = \{0,1\}\\
        &ref = \{2,3,4\}
    \end{aligned}
\end{equation*}

In [157]:
from sklearn.model_selection import train_test_split

In [186]:
# Fraction of the training pool held out as the validation split
VAL_PCT_FROM_TRAIN = 0.1
# Used twice below: the fraction of majority-group rows sent to training, and
# the target share of those rows inside the training pool
MAJORITY_PROPORTION = 0.9
# Groups that make up the bulk of the training pool
MAJORITY_TRAIN_GROUPS = ['left_nref', 'right_ref']

In [149]:
# Rows whose group is one of the training-majority groups
df_major_groups_train = df_img_info[df_img_info['groups'].isin(MAJORITY_TRAIN_GROUPS)]

In [154]:
# Every remaining row, i.e. those whose index is not among the majority-group rows
df_minor_groups_train = df_img_info.loc[~df_img_info.index.isin(df_major_groups_train.index)]

In [159]:
# Stratified split of the majority-group rows: MAJORITY_PROPORTION of them go
# to training, the remainder goes to the test split
majority_train, minority_test = train_test_split(
    df_major_groups_train,
    train_size=MAJORITY_PROPORTION,
    stratify=df_major_groups_train['groups'],
    random_state=SEED,
)

In [172]:
# Size the training pool would have if majority_train were MAJORITY_PROPORTION of it
total_train_size = len(majority_train) / MAJORITY_PROPORTION
# Number of minority-group rows needed to fill the remaining share of that pool
minority_train_size = round(total_train_size * (1 - MAJORITY_PROPORTION))

In [178]:
# Draw minority_train_size rows of the minority groups for training (returned
# as the "test" part of this call); all other rows form the bulk of the test
# split. An integer test_size is taken as an absolute row count, so the size
# is exact; a float fraction has to be converted back to a count by the
# library and can land one row off.
majority_test, minority_train = train_test_split(
    df_minor_groups_train,
    stratify=df_minor_groups_train['groups'],
    test_size=minority_train_size,
    random_state=SEED)

In [185]:
# Training pool: the majority-group rows plus the sampled minority-group rows
train_images = pd.concat((majority_train, minority_train), axis='index', sort=True)

In [187]:
# Hold out VAL_PCT_FROM_TRAIN of the training pool as the validation split,
# stratified by group.
# NOTE: train_images is overwritten here, so re-running this cell on its own
# splits the already-reduced training set again.
train_images, val_images = train_test_split(
    train_images,
    test_size=VAL_PCT_FROM_TRAIN,
    stratify=train_images['groups'],
    random_state=SEED,
)

In [184]:
# Test split: the remaining minority-group rows plus the held-back majority-group rows
test_images = pd.concat((majority_test, minority_test), axis='index', sort=True)

In [211]:
# Group distribution
# Share of every side/level group inside each split, rounded to two decimals.
# pprint comes from the standard-library pprint module (imported at the top).
for split_name, split_images in (
        ('Train', train_images),
        ('Validation', val_images),
        ('Test', test_images)):
    print(f'{split_name}:')
    pprint(split_images['groups'].value_counts(normalize=True).round(decimals=2).to_dict())

Train:
{'left_nref': 0.73, 'left_ref': 0.02, 'right_nref': 0.08, 'right_ref': 0.17}
Validation:
{'left_nref': 0.73, 'left_ref': 0.02, 'right_nref': 0.08, 'right_ref': 0.18}
Test:
{'left_nref': 0.08, 'left_ref': 0.18, 'right_nref': 0.72, 'right_ref': 0.02}
