## Kaggle Diabetic Retinopathy Classification

### Train labels

 Loading the dataset

In [None]:
from sklearn.model_selection import train_test_split
from shutil import copy

import matplotlib.pyplot as plt
import pandas as pd
import progressbar
import os

In [None]:
def reset_path(directory:str):
    """
    Deletes an existing directory and re-creates it

    - directory: path to directory

    Anything already located at the path (a directory tree, a single file
    or a symbolic link) is removed first, so the function always leaves an
    empty directory behind.
    """
    # Local import: the notebook only imports `copy` from shutil at the top.
    import shutil

    if os.path.isdir(directory) and not os.path.islink(directory):
        # Pure-Python replacement for the `!rm -r` shell escape: works outside
        # IPython, on every platform and with paths that contain spaces.
        shutil.rmtree(directory)
    elif os.path.lexists(directory):
        # Plain file or symlink (including a dangling one): unlink it.
        os.remove(directory)

    # makedirs also creates missing parent directories.
    os.makedirs(directory)

In [None]:
# Input data lives under DS_BASE_PATH; everything this notebook generates
# goes under DB_PATH, which is wiped and re-created on every run.
DS_BASE_PATH = './Data'
DB_PATH = './DB'
reset_path(DB_PATH)

# Source images and the label file (read below into columns used later as
# 'image' and 'level').
IMG_PATH = os.path.join(DS_BASE_PATH, 'resized_train')
csv_path = os.path.join(DS_BASE_PATH, 'trainLabels.csv')

orig_ds = pd.read_csv(csv_path)
# Last expression of the cell: rendered as the cell output.
orig_ds

In [None]:
# Number of images per level in the original dataset.
display(orig_ds.groupby('level')['image'].count())

# Same distribution as a bar chart.
dfp = orig_ds.pivot_table(
    index='level',
    aggfunc='size'
)
dfp.plot(
    kind='bar',
    figsize=(5,3),
    rot=0
)
plt.show()

# Levels are the integers 0 .. classes-1; `classes` is reused by later cells.
classes = 5

# Rows per level, counted on the first column. `.iloc[0]` is used because
# `count()` returns a Series indexed by column name, and integer `[0]` lookups
# on such a Series are deprecated positional access in pandas.
level_counts = [
    orig_ds.loc[orig_ds['level'] == n].count().iloc[0]
    for n in range(classes)
]
plt.pie(level_counts, labels=range(classes), autopct='%.2f %%')
plt.title("Levels of DR in the original dataset")
img_path = os.path.sep.join([DB_PATH, 'distribution.png'])
# Save before show(): show() may leave an empty current figure behind.
plt.savefig(img_path)
plt.show()

In [None]:
# Balance the classes by drawing the same number of images from every level.
# The seed makes the draw reproducible, in line with the seeded
# train/val/test splits further down.
dataset = orig_ds.groupby(['level']).sample(n=700, random_state=42)
display(dataset.groupby('level')['image'].count())

# `.iloc[0]` instead of `[0]`: integer lookups on the label-indexed Series
# returned by `count()` are deprecated positional access in pandas.
balanced_counts = [
    dataset.loc[dataset['level'] == n].count().iloc[0]
    for n in range(classes)
]
plt.pie(balanced_counts, labels=range(classes), autopct='%.2f %%')
plt.title("Levels of DR in the redistributed dataset")
plt.show()

# Every image that was not drawn into the balanced dataset.
rest_df = orig_ds.loc[~orig_ds['image'].isin(dataset['image'])].dropna()

display(rest_df.groupby('level')['image'].count())

In [None]:
# 70 % of the balanced dataset goes to training, the remaining 30 % is split
# again later. Stratifying on the level keeps the per-class balance that the
# sampling step established in both parts of the split.
x_train, x_remain, y_train, y_remain = train_test_split(
    dataset['image'],
    dataset['level'],
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=dataset['level']
)

In [None]:
# Re-assemble the training images and their levels into a single frame.
train_df = pd.DataFrame({'image': x_train, 'level': y_train})

In [None]:
# Held-out rows (everything not used for training) as a single frame.
remain_df = pd.DataFrame({'image': x_remain, 'level': y_remain})

In [None]:
# Halve the held-out rows into a validation and a test set. Stratifying on
# the level keeps the class proportions identical in both halves.
x_val, x_test, y_val, y_test = train_test_split(
    remain_df['image'],
    remain_df['level'],
    train_size=0.5,
    test_size=0.5,
    random_state=42,
    stratify=remain_df['level']
)

In [None]:
# Validation images and their levels as a single frame.
val_df = pd.DataFrame({'image': x_val, 'level': y_val})

In [None]:
# Test images and their levels as a single frame.
test_df = pd.DataFrame({'image': x_test, 'level': y_test})

In [None]:
# Images per level in the training split.
train_level_counts = train_df.groupby(['level'])['image'].count()
display(train_level_counts)

# First rows of the training split, rendered as the cell output.
train_df.head()

In [None]:
# Images per level in the validation split.
val_level_counts = val_df.groupby(['level'])['image'].count()
display(val_level_counts)

# First rows of the validation split, rendered as the cell output.
val_df.head()

In [None]:
# Images per level in the test split.
test_level_counts = test_df.groupby(['level'])['image'].count()
display(test_level_counts)

# First rows of the test split, rendered as the cell output.
test_df.head()

In [None]:
def create_path(base_dir:str, dir_name:str):
    """
    Creates a new directory if it doesn't exist already

    - base_dir: path of the parent directory
    - dir_name: name of the directory to create inside base_dir

    Returns the path of the directory (base_dir joined with dir_name).
    Missing parent directories are created as well.
    """
    directory = os.path.join(base_dir, dir_name)

    try:
        # Attempt the creation directly instead of checking first, so there
        # is no gap between the existence check and the mkdir call.
        os.makedirs(directory)
    except FileExistsError:
        print(f'[INFO] {directory} already exists')

    return directory

def display_df_dist(df: pd.DataFrame, df_name:str):
    """
    Plots the number of rows per level as a bar chart, saves the chart as
    '<df_name>.png' inside DB_PATH and shows it

    - df: dataframe with a 'level' column
    - df_name: name of the subset, used in the chart title and file name
    """
    level_sizes = df.pivot_table(index='level', aggfunc='size')
    level_sizes.plot(kind='bar', figsize=(5,3), rot=0)

    plt.title(f"Levels of DR in the {df_name} dataset")
    # Write next to the other generated artefacts (DB_PATH) rather than to a
    # separately hard-coded folder name. Save before show(), which may leave
    # an empty current figure behind.
    plt.savefig(os.path.join(DB_PATH, f'{df_name}.png'))
    plt.show()

def organize_set_imgs(df: pd.DataFrame, set_path:str):
    """
    Copies the images of a subset into one sub-directory per class

    - df: dataframe with an 'image' column (file name without extension)
      and a 'level' column (class label)
    - set_path: directory of the subset; one sub-directory per class in
      range(classes) is created inside it

    Images are read from IMG_PATH as '<image>.jpeg'. A progress bar is shown
    for every class that has at least one image.
    """
    set_name = set_path.split(os.path.sep)[-1]

    for level in range(classes):
        k = str(level)
        imgs = list(df['image'].loc[df['level'] == level])

        # The class directory is created even when the class has no images.
        class_dir = create_path(set_path, k)
        if not imgs:
            # Nothing to copy; skip the zero-length progress bar.
            continue

        widgets = [
            f'Building {set_name}/{k}:',
            ' ',
            progressbar.Percentage(),
            ' ',
            progressbar.Bar(),
            ' ',
            progressbar.Counter()
        ]

        pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(imgs))
        pbar.start()

        # Count from 1 so the bar reports the number of images copied so far
        # and reaches maxval on the last one.
        for done, img in enumerate(imgs, start=1):
            im_origin = os.path.sep.join([IMG_PATH, f'{img}.jpeg'])
            copy(im_origin, class_dir)
            pbar.update(done)

        # Finish each class's bar inside the class loop; finishing only after
        # the loop would leave every bar but the last one unterminated.
        pbar.finish()


In [None]:
# Each subset paired with the folder / chart name it is exported under.
data = [(train_df, 'Train'), (val_df, 'Val'), (test_df, 'Test'), (rest_df, 'Rest')]

# Per subset: create its folder under DB_PATH, plot and save its level
# distribution, then copy its images into per-class sub-folders.
for df, subset in data:
    ds_path = create_path(DB_PATH, subset)
    display_df_dist(df, subset)
    organize_set_imgs(df, ds_path)