## Processing the data

In [37]:
import glob
import os
import random
import math
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import torch
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2

warnings.filterwarnings('ignore')


In [38]:
class DatasetParser():
    def __init__(self, root_dir, images_dir, labels_csv):
        self.image_paths = sorted(glob.glob(os.path.join(root_dir, images_dir,"*.png")))
        self.labels_df = self._labels_by_task(root_dir=root_dir, labels_csv=labels_csv)
        
        self.labels = ['Cardiomegaly','Emphysema','Effusion',
                           'Hernia','Nodule','Pneumothorax','Atelectasis',
                           'Pleural_Thickening','Mass','Edema','Consolidation',
                           'Infiltration','Fibrosis','Pneumonia', 'No Finding']
    
    def visualize_random_images(self, num_images=1, label=None, display_label=False):
        fig = plt.figure(figsize=(20,20))
        fig.tight_layout(pad=10.0)
        if label is None:
            idxs = random.sample(range(len(self.image_paths)), num_images)
        else:
            idxs = [idx for idx in range(len(self.labels_df['Label'])) if label in self.labels_df['Label'][idx]]
            if len(idxs) < num_images:
                num_images = len(idxs)
            else:
                idxs = random
                (idxs, num_images)
                
        num_rows = math.ceil(np.sqrt(num_images))
        num_cols = math.ceil(num_images/num_rows)
        
        for i in range(num_images):
            img = cv2.imread(self.image_paths[idxs[i]])
            plt.subplot(num_rows, num_cols, i+1)
            if display_label:
                plt.gca().set_title(self.labels_df['Label'][idxs[i]],wrap=True)
            plt.axis('off')
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    
    def _labels_by_task(self, root_dir=None, labels_csv=None):

        labels_df = pd.read_csv(os.path.join(root_dir, labels_csv))
        image_path = {os.path.basename(x): x for x in glob.glob(os.path.join(root_dir, 'images', '*.png'))}
        
        labels_df = labels_df[labels_df['Image Index'].map(os.path.basename).isin(image_path)]

        new_labels_df = pd.DataFrame()
        new_labels_df['Id'] = labels_df['Image Index'].copy()
        
        new_labels_df['Label'] = labels_df['Finding Labels'].apply(lambda val: val.split('|'))
        
        del labels_df
        
        return new_labels_df
        
    def get_labels_df(self):
        new_labels_df = self.labels_df.copy()
        
        for i in range(len(new_labels_df)):
                one_hot = [0 for element in self.labels]
                for element in new_labels_df['Label'][i]:
                    one_hot[self.labels.index(element)] = 1
                new_labels_df['Label'][i] = one_hot
                
        return new_labels_df
    
    def sample(self, num_samples, is_weighted=False):
        if not is_weighted:
            return self.labels_df.sample(num_samples)
        else:
            sample_weights = self.labels_df['Label'].map(lambda x: len(x)).values + 4e-2
            sample_weights /= sample_weights.sum()
            return self.labels_df.sample(num_samples, weights=sample_weights)



In [39]:
parser = DatasetParser(root_dir="/Users/ananyajain/Desktop/CSC413/CSC413-Final-Project/archive/sample",
                       images_dir="sample/images",
                       labels_csv="sample_labels.csv")
print("Total Trainable Data: ", parser.labels_df.shape[0])

Total Trainable Data:  5606


In [40]:
df = parser.sample(100, is_weighted=True)
df.head()

Unnamed: 0,Id,Label
5012,00026524_002.png,[No Finding]
1205,00006352_000.png,[No Finding]
5173,00027630_001.png,[No Finding]
1841,00009717_001.png,[No Finding]
735,00003867_010.png,"[Atelectasis, Consolidation, Effusion]"


In [41]:
train_val, test = train_test_split(df, test_size=0.2, random_state=42)  # Split into train+val (80%) and test (20%)
train, val = train_test_split(train_val, test_size=0.25, random_state=42)  # Split remaining data into train (60%) and val (20%)

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

print("Training set size: ", len(train))
print("Validation set size: ", len(val))
print("Test set size: ", len(test))

Training set size:  60
Validation set size:  20
Test set size:  20


In [42]:
train_datagen = ImageDataGenerator(rescale=1./255,
                                   horizontal_flip = True, 
                                   vertical_flip = False, 
                                   height_shift_range= 0.05, 
                                   width_shift_range=0.1, 
                                   rotation_range=5, 
                                   shear_range = 0.1,
                                   fill_mode = 'reflect',
                                   zoom_range=0.15)

val_datagen = ImageDataGenerator(rescale=1./255)    
train_generator = train_datagen.flow_from_dataframe(
        dataframe=train,
        directory='/Users/ananyajain/Desktop/CSC413/CSC413-Final-Project/archive/sample/sample/images',
        x_col="Id",
        y_col="Label",
        batch_size=32,
        target_size=(224,224),
        classes = parser.labels)

validation_generator = val_datagen.flow_from_dataframe(
        dataframe=val,
        directory='/Users/ananyajain/Desktop/CSC413/CSC413-Final-Project/archive/sample/sample/images',
        x_col="Id",
        y_col="Label",
        batch_size=32,
        target_size=(224,224),
        classes = parser.labels)

test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test,
    directory='/Users/ananyajain/Desktop/CSC413/CSC413-Final-Project/archive/sample/sample/images',
    x_col="Id",
    y_col="Label",
    class_mode = "categorical",
    batch_size=32,
    target_size=(224, 224),
    classes = parser.labels,
    shuffle = False)


Found 45 validated image filenames belonging to 15 classes.
Found 13 validated image filenames belonging to 15 classes.
Found 12 validated image filenames belonging to 15 classes.
