In [1]:
import h5py
import pandas as pd
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
# Load the per-eye clinical metadata tables stored under the data directory.
path = "/media/yesindeed/WD5T/data/PAPILA/"

# "OD" files describe the right eye, "OS" files the left eye.
od_meta, os_meta = (
    pd.read_csv(os.path.join(path, relative_csv))
    for relative_csv in (
        "ClinicalData/patient_data_od.csv",
        "ClinicalData/patient_data_os.csv",
    )
)
od_meta.head()

Unnamed: 0,ID,Age,Gender,Diagnosis,dioptre_1,dioptre_2,astigmatism,Phakic/Pseudophakic,Pneumatic,Perkins,Pachymetry,Axial_Length,VF_MD
0,#002,47,0,2,0.75,-1.75,90.0,0.0,21.0,,586.0,23.64,-0.07
1,#004,58,1,1,1.5,-1.75,85.0,0.0,,19.0,501.0,23.06,-3.26
2,#005,89,1,1,-0.75,-1.25,101.0,1.0,13.0,14.0,565.0,23.81,-14.98
3,#006,69,0,2,1.0,-1.5,95.0,0.0,22.0,,612.0,26.25,-2.07
4,#007,22,1,2,-0.25,0.0,0.0,0.0,14.0,,,23.39,-2.3


In [4]:
# Preview the first five rows of the left-eye (OS) metadata table.
os_meta.head(5)

Unnamed: 0,ID,Age,Gender,Diagnosis,dioptre_1,dioptre_2,astigmatism,Phakic/Pseudophakic,Pneumatic,Perkins,Pachymetry,Axial_Length,VF_MD
0,#002,47,0,2,-0.5,-1.5,88.0,0.0,20.0,,603.0,23.77,0.17
1,#004,58,1,1,1.5,-2.5,85.0,1.0,,19.0,511.0,22.96,-6.77
2,#005,89,1,1,-0.5,-2.0,100.0,1.0,24.0,10.0,575.0,24.33,-7.44
3,#006,69,0,2,1.0,-1.5,85.0,0.0,22.0,,593.0,26.21,-3.31
4,#007,22,1,2,-0.25,-0.5,0.0,0.0,13.0,,,23.35,-2.61


In [5]:
# Build the left-eye (OS) image file name for each patient by dropping the
# first character of the ID, e.g. "#002" -> "RET002OS.jpg".
ids = os_meta["ID"].values
os_path = []
for patient_id in ids:
    os_path.append("".join(("RET", patient_id[1:], "OS.jpg")))
os_meta["Path"] = os_path
os_path

['RET002OS.jpg',
 'RET004OS.jpg',
 'RET005OS.jpg',
 'RET006OS.jpg',
 'RET007OS.jpg',
 'RET008OS.jpg',
 'RET009OS.jpg',
 'RET010OS.jpg',
 'RET013OS.jpg',
 'RET014OS.jpg',
 'RET015OS.jpg',
 'RET016OS.jpg',
 'RET018OS.jpg',
 'RET019OS.jpg',
 'RET020OS.jpg',
 'RET021OS.jpg',
 'RET023OS.jpg',
 'RET024OS.jpg',
 'RET025OS.jpg',
 'RET026OS.jpg',
 'RET027OS.jpg',
 'RET028OS.jpg',
 'RET029OS.jpg',
 'RET030OS.jpg',
 'RET031OS.jpg',
 'RET032OS.jpg',
 'RET033OS.jpg',
 'RET034OS.jpg',
 'RET035OS.jpg',
 'RET036OS.jpg',
 'RET037OS.jpg',
 'RET038OS.jpg',
 'RET039OS.jpg',
 'RET041OS.jpg',
 'RET042OS.jpg',
 'RET044OS.jpg',
 'RET045OS.jpg',
 'RET046OS.jpg',
 'RET047OS.jpg',
 'RET048OS.jpg',
 'RET050OS.jpg',
 'RET051OS.jpg',
 'RET053OS.jpg',
 'RET055OS.jpg',
 'RET056OS.jpg',
 'RET057OS.jpg',
 'RET062OS.jpg',
 'RET064OS.jpg',
 'RET065OS.jpg',
 'RET066OS.jpg',
 'RET067OS.jpg',
 'RET068OS.jpg',
 'RET069OS.jpg',
 'RET072OS.jpg',
 'RET073OS.jpg',
 'RET074OS.jpg',
 'RET077OS.jpg',
 'RET079OS.jpg',
 'RET081OS.jpg

In [6]:
# Build the right-eye (OD) image file name for each patient by dropping the
# first character of the ID, e.g. "#002" -> "RET002OD.jpg".
ids = od_meta["ID"].values
od_path = []
for patient_id in ids:
    od_path.append("".join(("RET", patient_id[1:], "OD.jpg")))
od_meta["Path"] = od_path

In [7]:
# Stack the right-eye rows on top of the left-eye rows and keep only the
# columns listed in `subcolumns`. The original row labels are retained, so
# index values repeat across the two halves.
subcolumns = ["ID", "Age", "Gender", "Diagnosis", "Path"]
meta_all = pd.concat([od_meta, os_meta]).loc[:, subcolumns]
meta_all

Unnamed: 0,ID,Age,Gender,Diagnosis,Path
0,#002,47,0,2,RET002OD.jpg
1,#004,58,1,1,RET004OD.jpg
2,#005,89,1,1,RET005OD.jpg
3,#006,69,0,2,RET006OD.jpg
4,#007,22,1,2,RET007OD.jpg
...,...,...,...,...,...
239,#289,64,0,0,RET289OS.jpg
240,#290,75,1,0,RET290OS.jpg
241,#291,55,0,0,RET291OS.jpg
242,#292,56,1,0,RET292OS.jpg


In [8]:
# Keep only the rows whose Diagnosis code equals 0 or 1; rows with any other
# code are dropped.
is_binary_label = np.isin(meta_all["Diagnosis"].values, (0.0, 1.0))
meta_binary = meta_all[is_binary_label]
meta_binary

Unnamed: 0,ID,Age,Gender,Diagnosis,Path
1,#004,58,1,1,RET004OD.jpg
2,#005,89,1,1,RET005OD.jpg
7,#010,72,1,1,RET010OD.jpg
8,#013,70,1,1,RET013OD.jpg
9,#014,60,1,1,RET014OD.jpg
...,...,...,...,...,...
239,#289,64,0,0,RET289OS.jpg
240,#290,75,1,0,RET290OS.jpg
241,#291,55,0,0,RET291OS.jpg
242,#292,56,1,0,RET292OS.jpg


In [9]:
# Number of rows per Gender code in the binary-label subset.
gender_counts = meta_binary["Gender"].value_counts()
gender_counts

Gender
1    274
0    146
Name: count, dtype: int64

In [22]:
# Persist the binary-label metadata as "all.csv".
# The split cell that follows reads "all.csv" from this directory and then
# writes its own "test.csv"; saving here under "test.csv" left "all.csv"
# unproduced on a fresh run and was overwritten immediately afterwards.
# NOTE(review): confirm "all.csv" is the intended target file name.
meta_binary.to_csv(os.path.join(path, "all.csv"), index=False)

In [23]:
# Reuse an existing train/test split: the reference CSVs supply the image
# paths that belong to each side of the split.
tem_train = pd.read_csv(
    "/home/yesindeed/Documents/Dropbox/PAPILA-split/new_train.csv")
tem_test = pd.read_csv(
    "/home/yesindeed/Documents/Dropbox/PAPILA-split/new_test.csv")

# Full metadata table; rows are assigned to a split by matching the "Path" column.
df = pd.read_csv(os.path.join(path, "all.csv"))
in_train = df["Path"].isin(tem_train["Path"])
in_test = df["Path"].isin(tem_test["Path"])
df_train = df[in_train].reset_index(drop=True)
df_test = df[in_test].reset_index(drop=True)

# Write both splits next to the source data, without the row index.
for split_frame, split_file in ((df_train, "train.csv"), (df_test, "test.csv")):
    split_frame.to_csv(os.path.join(path, split_file), index=False)

In [3]:
# age
# Build a test subset with the same number of rows in each of two age groups
# and save it as "test_age.csv".

AGE_THRESHOLD = 60  # ages up to and including this value are labelled 0
SEED = 42  # fixed seed so the balanced subsample is identical on every run

df_test = pd.read_csv(os.path.join(path, "test.csv"))

# Drop rows without an age. The explicit copy makes the column assignments
# below operate on an independent frame rather than on a slice.
df_test = df_test[~df_test["Age"].isnull()].copy()

df_test["age_binary"] = df_test["Age"].values.astype("float")
# `between` is inclusive on both ends, so Age == AGE_THRESHOLD gets label 0.
df_test["age_binary"] = np.where(
    df_test["age_binary"].between(-1, AGE_THRESHOLD), 0, df_test["age_binary"])
# Everything strictly above the threshold gets label 1.
df_test["age_binary"] = np.where(
    df_test["age_binary"] > AGE_THRESHOLD, 1, df_test["age_binary"])

class_counts = df_test["age_binary"].value_counts()
print(class_counts)
min_count = int(class_counts.min())

# Downsample every age group to the size of the smallest one.
# GroupBy.sample keeps the grouping column in the result and avoids the
# deprecated behaviour of GroupBy.apply operating on the grouping columns.
balanced_test_meta = (
    df_test.groupby("age_binary")
    .sample(n=min_count, random_state=SEED)
    .reset_index(drop=True)
)


balanced_test_meta.to_csv(os.path.join(path, "test_age.csv"), index=False)

balanced_test_meta["age_binary"].value_counts()

age_binary
1.0    32
0.0    24
Name: count, dtype: int64


  balanced_test_meta = df_test.groupby("age_binary").apply(


age_binary
0.0    24
1.0    24
Name: count, dtype: int64