# 1. importing libraries


In [None]:
import pandas as pd     # csv file handling
import os   # file manipulation
import shutil   # file manipulation
import matplotlib.pyplot as plt # labelling plots

# 2. reading csv files

In [None]:
# load the label table of each dataset split into its own dataframe
csv_train, csv_test, csv_eval = (
    pd.read_csv(labels_path)
    for labels_path in (
        "./Image data/Training_Set/RFMid_Training_labels.csv",
        "./Image data/Test_Set/RFMid_Testing_labels.csv",
        "./Image data/Evaluation_Set/RFMid_Validation_labels.csv",
    )
)


# 3. creating subdirectories to place healthy and unhealthy retinal images


In [None]:
# root folder that holds the dataset splits; the trailing "" makes
# os.path.join end the path with the platform's separator, so later cells
# can keep joining further components onto `directory`
directory = os.path.join(os.getcwd(), "Image data", "")
# give every split folder a "healthy" and an "unhealthy" subfolder
for entry in os.listdir(directory):
    split_dir = os.path.join(directory, entry)
    # plain files sitting next to the split folders are ignored
    if not os.path.isdir(split_dir):
        continue
    for label in ("healthy", "unhealthy"):
        # exist_ok=True lets this cell be re-run without FileExistsError
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)


# 4. extracting disease risk data into new dataframes


In [None]:
# creating new dataframes to contain only ID and Disease Risk
train_risk = csv_train[['ID', 'Disease_Risk']].copy()
test_risk = csv_test[['ID', 'Disease_Risk']].copy()
eval_risk = csv_eval[['ID', 'Disease_Risk']].copy()

# conversion from binary representation to string for easier classification
# NOTE(review): per the RFMiD dataset description, Disease_Risk is 1 when an
# abnormality is present and 0 for a normal retina, so 1 maps to "unhealthy".
# Confirm against the label CSVs.
risk_labels = {0: "healthy", 1: "unhealthy"}
for risk_df in (train_risk, test_risk, eval_risk):
    # a single dict-based replace maps both flags at once; any other value
    # is passed through and stringified by astype(str)
    risk_df['Disease_Risk'] = (
        risk_df['Disease_Risk'].replace(risk_labels).astype(str)
    )


# 5. plotting of graphs according to disease to compare


## 5.1. training set


In [None]:
# per-column totals of the training labels, leaving out the identifier
# and the overall risk flag
train_plot = csv_train.drop(columns=["ID", "Disease_Risk"]).sum()
# one bar per remaining column, each annotated with its total
ax = train_plot.plot.bar(figsize=(30, 30))
ax.bar_label(ax.containers[0])
ax.set_xlabel('Diseases')
ax.set_ylabel('Frequency')


## 5.2. testing set


In [None]:
# per-column totals of the testing labels, leaving out the identifier
# and the overall risk flag
test_plot = csv_test.drop(columns=["ID", "Disease_Risk"]).sum()
# one bar per remaining column, each annotated with its total
ax = test_plot.plot.bar(figsize=(30, 30))
ax.bar_label(ax.containers[0])
ax.set_xlabel('Diseases')
ax.set_ylabel('Frequency')

## 5.3. evaluation set


In [None]:
# per-column totals of the evaluation labels, leaving out the identifier
# and the overall risk flag
eval_plot = csv_eval.drop(columns=["ID", "Disease_Risk"]).sum()
# one bar per remaining column, each annotated with its total
ax = eval_plot.plot.bar(figsize=(30, 30))
ax.bar_label(ax.containers[0])
ax.set_xlabel('Diseases')
ax.set_ylabel('Frequency')


# 6. plotting healthy and unhealthy retinal statistics


## 6.1. training set


In [None]:
# number of training rows carrying each condition label
train_counts = train_risk['Disease_Risk'].value_counts()
# bar chart of those counts, with the count written on each bar
ax = train_counts.plot.bar()
ax.bar_label(ax.containers[0])
ax.set_xlabel('Condition')
ax.set_ylabel('Frequency')


## 6.2. testing set


In [None]:
# number of testing rows carrying each condition label
test_counts = test_risk['Disease_Risk'].value_counts()
# bar chart of those counts, with the count written on each bar
ax = test_counts.plot.bar()
ax.bar_label(ax.containers[0])
ax.set_xlabel('Condition')
ax.set_ylabel('Frequency')


## 6.3. evaluation set


In [None]:
# number of evaluation rows carrying each condition label
eval_counts = eval_risk['Disease_Risk'].value_counts()
# bar chart of those counts, with the count written on each bar
ax = eval_counts.plot.bar()
ax.bar_label(ax.containers[0])
ax.set_xlabel('Condition')
ax.set_ylabel('Frequency')


# 7. segregating images into healthy and unhealthy subdirectories


## 7.1. training set


In [None]:
# move every training image into the subfolder named after its label
split_dir = os.path.join(directory, "Training_Set")
for image_id, label in zip(train_risk['ID'], train_risk['Disease_Risk']):
    # the file name comes from the ID column rather than the row position,
    # so the pairing of image and label does not depend on the index
    filename = f"{int(image_id)}.png"
    src = os.path.join(split_dir, "Training", filename)
    # anything not labelled "healthy" is filed under "unhealthy"
    dest_folder = "healthy" if label == "healthy" else "unhealthy"
    # moving of files
    shutil.move(src, os.path.join(split_dir, dest_folder, filename))


## 7.2. testing set


In [None]:
# move every testing image into the subfolder named after its label
split_dir = os.path.join(directory, "Test_Set")
for image_id, label in zip(test_risk['ID'], test_risk['Disease_Risk']):
    # the file name comes from the ID column rather than the row position,
    # so the pairing of image and label does not depend on the index
    filename = f"{int(image_id)}.png"
    src = os.path.join(split_dir, "Test", filename)
    # anything not labelled "healthy" is filed under "unhealthy"
    dest_folder = "healthy" if label == "healthy" else "unhealthy"
    # moving of files
    shutil.move(src, os.path.join(split_dir, dest_folder, filename))


## 7.3. evaluation set


In [None]:
# move every evaluation image into the subfolder named after its label
split_dir = os.path.join(directory, "Evaluation_Set")
for image_id, label in zip(eval_risk['ID'], eval_risk['Disease_Risk']):
    # the file name comes from the ID column rather than the row position,
    # so the pairing of image and label does not depend on the index
    filename = f"{int(image_id)}.png"
    src = os.path.join(split_dir, "Validation", filename)
    # anything not labelled "healthy" is filed under "unhealthy"
    dest_folder = "healthy" if label == "healthy" else "unhealthy"
    # moving of files
    shutil.move(src, os.path.join(split_dir, dest_folder, filename))
