In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.image as mpimg
from tensorflow.keras.preprocessing.image import ImageDataGenerator 
from tensorflow.keras.applications.densenet import DenseNet121 
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D 
from tensorflow.keras.models import Model 
from tensorflow.keras import backend as K 
from tensorflow.keras.models import load_model
import os, sys
import cv2
import random
%matplotlib inline

In [None]:
#ROOT_DIR = 
# Base directory of the dataset; the CSV and image paths used in the cells below are all built from it.
DATA_SETS = '../input/ranzcr-clip-catheter-line-classification'
#SCRIPTS = 

In [None]:
# List the entries of the dataset directory (order is whatever os.listdir returns).
datasets_list = os.listdir(DATA_SETS)
datasets_list

In [None]:
# Image folders for the two splits, plus the sample submission CSV.
train, test = (os.path.join(DATA_SETS, split) for split in ('train', 'test'))
submission = os.path.join(DATA_SETS, 'sample_submission.csv')

In [None]:
# Load the training labels; the path is built with os.path.join like the other dataset paths.
train_df = pd.read_csv(os.path.join(DATA_SETS, 'train.csv'))
print(f'Train Data CSV: {train_df.shape[0]}')
train_df.head()

In [None]:
# Load the sample submission; its columns after the first are later used as the label names.
sub_df = pd.read_csv(submission)
print(f'Sub CSV: {sub_df.shape[0]}')
sub_df.head()

In [None]:
# Label names: every column of the submission frame after the first one.
label_cols = sub_df.columns[1:].tolist()
# Label matrix of the training frame; will be used for train validation splitting.
image_labels = train_df[label_cols].to_numpy()

In [None]:
# Count up the number of instances of each class.
# The two id columns are dropped *before* summing: summing them first would
# concatenate every id string and leave the result with object dtype.
class_counts = train_df.drop(columns=['StudyInstanceUID', 'PatientID']).sum()

In [None]:
# we plot the distribution of patients to check if there is any class imbalance in the dataset
def plot_class_distributions(values, index):
    """Bar plot of per-class counts.

    Parameters
    ----------
    values : sequence
        Count for each class, plotted on the x axis.
    index : sequence
        Class names, plotted on the y axis.
    """
    # sns.barplot returns the Axes it drew on, so labels can be set on it directly.
    ax = sns.barplot(x=values, y=index)
    ax.set_title('Distribution of classes for the patients')
    ax.set_xlabel('Patient Count', fontsize=15)
    ax.set_ylabel('Catheter Position', fontsize=15)
    plt.show()

In [None]:
# Counts go on the x axis, class names on the y axis.
plot_class_distributions(values=class_counts.values, index=class_counts.index)

In [None]:
# Element-wise string concatenation: one image path per StudyInstanceUID in each frame.
train_images = DATA_SETS + "/train/" + train_df['StudyInstanceUID'] + '.jpg'   
test_images = DATA_SETS + "/test/" + sub_df['StudyInstanceUID'] + '.jpg'
len(test_images)

In [None]:
# Read the annotations CSV straight into a frame (no intermediate path variable).
data_image = pd.read_csv(os.path.join(DATA_SETS, "train_annotations.csv"))
data_image.head()

In [None]:
# display images randomly on a grid
def display_images(image_ids, labels):
    """Show training images with their labels on a three-column grid.

    Parameters
    ----------
    image_ids : iterable of str
        Image ids without extension; each is read from
        ``<DATA_SETS>/train/<id>.jpg``.
    labels : iterable
        One title per image, paired with ``image_ids`` by position.

    Up to nine images use the original 3x3 layout; more images add rows
    instead of raising from ``plt.subplot``.
    """
    pairs = list(zip(image_ids, labels))
    n_cols = 3
    # Ceiling division without importing math; never fewer than 3 rows so
    # the layout for <= 9 images is unchanged.
    n_rows = max(3, -(-len(pairs) // n_cols))
    # 4 inches per row reproduces the original (16, 12) figure for 3 rows.
    fig = plt.figure(figsize=(16, 4 * n_rows))

    for index, (image_id, label) in enumerate(pairs):
        plt.subplot(n_rows, n_cols, index + 1)
        image = mpimg.imread(os.path.join(DATA_SETS, "train", image_id + '.jpg'))
        plt.title(label, fontsize=12)
        # NOTE(review): 'Greys' maps low values to white; confirm that this
        # inverted rendering (rather than 'gray') is intended.
        plt.imshow(image, cmap='Greys')

    fig.tight_layout()
    plt.show()
        

In [None]:
# Draw nine random annotation rows (no random_state, so the selection changes
# between runs) -- nine fills the 3x3 grid used by display_images.
tmp_train = data_image.sample(9)
image_ids = tmp_train["StudyInstanceUID"].values
labels = tmp_train['label'].values
display_images(image_ids, labels)

In [None]:
# now we check for any kinds of data leakage between any two data sets, train-test, train-valid, valid-test
def check_for_leakage(df1, df2, patient_col):
    """Report whether two frames share any value in ``patient_col``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare.
    patient_col : str
        Column holding the identifier to compare on.

    Returns
    -------
    bool
        True if at least one identifier occurs in both frames, else False.
    """
    ids_first = set(df1[patient_col])
    ids_second = set(df2[patient_col])
    # "Not disjoint" is the same as "non-empty intersection".
    return not ids_first.isdisjoint(ids_second)
    
    

In [None]:
# True would mean some StudyInstanceUID appears in both the training CSV and the submission CSV.
check_for_leakage(train_df, sub_df, 'StudyInstanceUID')

In [None]:
# for making validation set from given train set
def make_val_set(data, val_size, seed=None):
    """Split ``data`` into train/validation frames with no patient in both.

    Patients are sampled from ``data`` itself (previously the global
    ``train_df`` was used, so the function only worked for that frame).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``PatientID`` column.
    val_size : float
        Fraction of the *unique patients* (not rows) to hold out; the
        resulting count is truncated with ``int``.
    seed : int, optional
        When given, patients are drawn with a private ``random.Random(seed)``
        so the split is reproducible. The default ``None`` uses the
        module-level ``random`` state, as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val)``: rows of the remaining and of the held-out patients.
    """
    patient_ids = list(data['PatientID'].unique())
    n_val = int(val_size * len(patient_ids))
    sampler = random if seed is None else random.Random(seed)
    val_patientid = sampler.sample(patient_ids, n_val)

    in_val = data['PatientID'].isin(val_patientid)
    df_train = data[~in_val]
    df_val = data[in_val]
    return df_train, df_val

In [None]:
# Hold out 1% of the unique patients for validation.
df_train, df_val = make_val_set(train_df, val_size=0.01)

In [None]:
# Sanity check on the split: True would mean a PatientID is present in both frames.
check_for_leakage(df_train, df_val, 'PatientID')

In [None]:
# Row counts (images, not patients) of the two splits.
print(f'Train Data Size: {df_train.shape[0]}')
print(f'Validation Data Size: {df_val.shape[0]}')

In [None]:
# Image directories passed as `directory` to the generator helpers below.
TRAIN_IMAGE_DIR = DATA_SETS + '/train/'
TEST_IMAGE_DIR = DATA_SETS + '/test/'

In [None]:
# without augmentation
def get_train_generator(df, image_dir, image_id, label_names, shuffle=True, batch_size=32, seed=1, target_width=256, target_height=256):
    """Build the training batch iterator (per-sample normalization, no augmentation).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the file-name column and the label columns.
    image_dir : str
        Directory the file names in ``image_id`` are relative to.
    image_id : str
        Name of the column containing image file names.
    label_names : list of str
        Label columns, returned as raw targets.
    shuffle : bool
        Whether batches are drawn in shuffled order.
    batch_size : int
        Images per batch.
    seed : int
        Seed forwarded to the iterator (previously ignored in favour of a
        hard-coded 1, which is still the default).
    target_width, target_height : int
        Size every image is resized to.

    Returns
    -------
    The iterator returned by ``ImageDataGenerator.flow_from_dataframe``.
    """
    print('Train Generator Preparation: ')

    # Each image is normalized by its own mean and standard deviation.
    image_generator = ImageDataGenerator(
        samplewise_center=True,
        samplewise_std_normalization=True
    )

    # Flow images from the directory in batches, driven by the data frame.
    generator = image_generator.flow_from_dataframe(
        dataframe=df,
        directory=image_dir,
        x_col=image_id,
        y_col=label_names,
        class_mode="raw",
        classes=label_names,
        batch_size=batch_size,
        shuffle=shuffle,
        seed=seed,
        # Keras expects (height, width); the two were passed in swapped order.
        target_size=(target_height, target_width)
    )

    return generator
    

In [None]:
# create image generator
def get_image_generator(train_df, image_dir, image_id, label_names, shuffle=True, batch_size=32, seed=1, target_width=256, target_height=256, sample_size=100):
    """Return an ImageDataGenerator whose feature-wise mean/std are fitted on a training sample.

    A single batch of ``sample_size`` un-normalized training images is drawn
    and used to fit the statistics, so that validation/test images can be
    normalized with training-set statistics.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame with the file-name column and label columns.
    image_dir : str
        Directory the file names are relative to.
    image_id : str
        Name of the column containing image file names.
    label_names : list of str
        Label columns (needed to build the iterator; the labels of the
        sample are not used).
    shuffle : bool
        Whether the sample is drawn in shuffled order.
    batch_size : int
        Unused here; kept for signature compatibility.
    seed : int
        Seed forwarded to the sampling iterator, so the sample (and hence
        the fitted statistics) is reproducible. It was previously not forwarded.
    target_width, target_height : int
        Size every sampled image is resized to.
    sample_size : int
        Number of images used to fit the statistics.

    Returns
    -------
    ImageDataGenerator
        Generator with ``featurewise_center`` / ``featurewise_std_normalization``
        enabled and fitted on the sample.
    """
    # Iterator over raw (un-normalized) images, one batch = the whole sample.
    raw_train_generator = ImageDataGenerator().flow_from_dataframe(
        dataframe=train_df,
        directory=image_dir,
        x_col=image_id,
        y_col=label_names,
        class_mode='raw',
        batch_size=sample_size,
        shuffle=shuffle,
        seed=seed,
        # Keras expects (height, width); the two were passed in swapped order.
        target_size=(target_height, target_width)
    )

    # The builtin next() works on old and new Keras iterators; the .next()
    # method is not available on every version.
    batch = next(raw_train_generator)
    data_sample = batch[0]  # images only; batch[1] holds the labels

    # Generator that normalizes with statistics fitted on the sample.
    image_generator = ImageDataGenerator(
        featurewise_center=True,
        featurewise_std_normalization=True)

    # Fit mean and std on the training sample.
    image_generator.fit(data_sample)

    return image_generator

In [None]:
def get_generator(df, image_dir, x_col, y_cols, image_generator, batch_size=32, seed=1, target_width = 256, target_height = 256):
    """Build an unshuffled batch iterator (validation / test) from a prepared generator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the file-name column and the target columns.
    image_dir : str
        Directory the file names are relative to.
    x_col : str
        Name of the column containing image file names.
    y_cols : list of str
        Columns returned as raw targets.
    image_generator : ImageDataGenerator
        Generator whose ``flow_from_dataframe`` is used to build the iterator.
    batch_size : int
        Images per batch.
    seed : int
        Seed forwarded to the iterator.
    target_width, target_height : int
        Size every image is resized to.

    Returns
    -------
    The iterator returned by ``flow_from_dataframe``; order follows ``df``
    because shuffling is disabled.
    """
    generator = image_generator.flow_from_dataframe(
        dataframe=df,
        directory=image_dir,
        x_col=x_col, y_col=y_cols,
        class_mode="raw",
        batch_size=batch_size,
        shuffle=False,
        seed=seed,
        # Keras expects (height, width); the two were passed in swapped order.
        target_size=(target_height, target_width)
    )

    return generator

In [None]:
def append_ext(fn, ext='.jpg'):
    """Return ``fn`` with ``ext`` appended, unless it already ends with ``ext``.

    The guard makes the helper idempotent, so applying it to a column more
    than once (e.g. re-running a cell) does not produce ``name.jpg.jpg``.
    """
    return fn if fn.endswith(ext) else fn + ext

# The generators look images up by file name, so the id column must carry the extension.
# train_df_splitted / valid_df are derived from the patient-level split (df_train / df_val);
# .assign returns new frames, so the slices of train_df are not written to.
train_df_splitted = df_train.assign(StudyInstanceUID=df_train['StudyInstanceUID'].apply(append_ext))
valid_df = df_val.assign(StudyInstanceUID=df_val['StudyInstanceUID'].apply(append_ext))
sub_df['StudyInstanceUID'] = sub_df['StudyInstanceUID'].apply(append_ext)

In [None]:
# Training batches use per-sample normalization.
train_generator = get_train_generator(train_df_splitted, TRAIN_IMAGE_DIR, "StudyInstanceUID", label_cols)
# Generator with feature-wise statistics fitted on a sample of the training images ...
raw_image_generator = get_image_generator(train_df_splitted, TRAIN_IMAGE_DIR, "StudyInstanceUID", label_cols)
# ... reused for both the validation and the test iterators (neither is shuffled).
valid_generator = get_generator(valid_df, TRAIN_IMAGE_DIR, "StudyInstanceUID", label_cols, raw_image_generator)
test_generator= get_generator(sub_df, TEST_IMAGE_DIR, "StudyInstanceUID", label_cols, raw_image_generator)