In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

%matplotlib inline
 
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import numpy as np
from glob import glob 
from PIL import Image
import tensorflow as tf 
import cv2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping , ReduceLROnPlateau
import datetime
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Image-level metadata: one row per scan (columns used later include
# 'Image Index', 'Finding Labels' and 'Patient ID').
# NOTE(review): this path is relative while the next one is absolute - they only
# agree when the working directory is /kaggle/working; confirm.
all_xray = pd.read_csv('../input/data/Data_Entry_2017.csv')
# Bounding-box annotations (columns used later include 'Image Index',
# 'Finding Label', 'Bbox [x', 'y', 'w', 'h]').
bbox_list = pd.read_csv('/kaggle/input/data/BBox_List_2017.csv')


# Random preview of five metadata rows (unseeded, so it changes on every run).
all_xray.sample(5)

# Data Exploration

In [None]:
# Index every PNG found under the images* folders by its bare file name.
png_paths = glob(os.path.join('..', 'input', 'data',  'images*', '*', '*.png'))
all_image_paths = dict(zip(map(os.path.basename, png_paths), png_paths))
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray.shape[0])
# Attach the on-disk path to each metadata row by looking up its 'Image Index'.
all_xray['path'] = all_xray['Image Index'].map(all_image_paths.get)

In [None]:
# Number of distinct values in the 'Finding Labels' column.
all_xray["Finding Labels"].nunique()

In [None]:
# Bar chart of the first 15 entries of the 'Finding Labels' value counts.
label_counts = all_xray['Finding Labels'].value_counts()[:15]
bar_centres = np.arange(len(label_counts)) + 0.5
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax1.bar(bar_centres, label_counts)
ax1.set_xticks(bar_centres)
# Assigning to `_` keeps the list of tick labels out of the cell output.
_ = ax1.set_xticklabels(label_counts.index, rotation=90)

In [None]:
# Distinct 'Finding Label' values that appear in the bounding-box annotations.
bbox_labels = list(bbox_list["Finding Label"].unique())
bbox_labels

In [None]:
# Random preview of five bounding-box rows (unseeded).
bbox_list.sample(5)

In [None]:
# Number of annotated boxes per 'Finding Label'.
bbox_list['Finding Label'].value_counts()


In [None]:
# Bar chart of how many annotated boxes exist for each 'Finding Label'.
label_counts = bbox_list['Finding Label'].value_counts()
tick_positions = 0.5 + np.arange(len(label_counts))
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.bar(tick_positions, label_counts)
ax1.set_xticks(tick_positions)
# Assigning to `_` keeps the list of tick labels out of the cell output.
_ = ax1.set_xticklabels(label_counts.index, rotation=90)

In [None]:
# Re-index both tables by 'Image Index' so they can be joined on it.
# A: image-level metadata, B: bounding-box annotations.
A = all_xray.set_index('Image Index')
B = bbox_list.set_index('Image Index')


In [None]:
# Keep only images that appear in both tables, turn 'Image Index' back into a
# regular column, and discard the unnamed filler columns from the CSVs.
data = (
    B.join(A, how="inner")
    .reset_index(drop=False)
    .drop(columns=['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 11'])
)
data.head(10)

In [None]:
# Summary statistics of the numeric columns of the joined table.
data.describe()

In [None]:
# Column dtypes of the joined table.
data.dtypes

In [None]:
# Label-based column slice: every column from 'Bbox [x' through 'h]' inclusive.
data.loc[:, 'Bbox [x':'h]']

In [None]:
# Preview the first 8 annotated scans (2 x 4 grid) with their ground-truth box.
fig, axes = plt.subplots(nrows = 2, ncols = 4, figsize = (15, 10), subplot_kw = {'xticks':[], 'yticks':[]})
# Read the box by column name: positional integer lookups on a label-indexed
# Series (the old `data.iloc[i, 2:6][0]`) are deprecated in recent pandas.
bbox_columns = ['Bbox [x', 'y', 'w', 'h]']
for i, ax in enumerate(axes.flat):
    image_path = data.loc[i, 'path']
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError(f'Could not read image: {image_path!r}')
    box_x, box_y, box_w, box_h = (float(value) for value in data.loc[i, bbox_columns])
    # The box is drawn on the full-size image before it is shrunk for display.
    cv2.rectangle(img, (int(box_x), int(box_y)), (int(box_x + box_w), int(box_y + box_h)), (255, 0, 0), 10)
    img = cv2.resize(img, (80, 80))
    ax.imshow(img)
    ax.set_title(data.loc[i, 'Finding Label'])
fig.tight_layout()

plt.show()

# Data Preprocessing

**Because we are about to split the data, we must check for patient overlap, i.e. whether any Patient ID appears in both the training set and the validation set.**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score

# Row-wise 80/20 split with a fixed seed. Rows are split independently of
# 'Patient ID', so one patient can land on both sides; later cells detect
# this and move the overlapping rows.
train_data, val_data = train_test_split(data, test_size=0.2 , random_state=42)

In [None]:
# The second frame is the validation split, so it is labelled as such.
print(f" Train data shape : {train_data.shape} , Validation data shape : {val_data.shape} ")


Extract and compare the `Patient ID` column from the train and validation sets:

1. **Extract patient IDs from the train and validation sets**
2. **Convert these arrays of numbers into set() datatypes for easy comparison**
3. **Identify patient overlap in the intersection of the two sets**

In [None]:
# Patient IDs on each side of the split, as NumPy arrays (training first,
# validation second).
ids_train, ids_valid = (
    split['Patient ID'].values for split in (train_data, val_data)
)

In [None]:
# Collapse each ID array into a set: duplicates disappear and the two sides
# can be compared with set operations.
ids_train_set, ids_valid_set = set(ids_train), set(ids_valid)
print(f'There are {len(ids_train_set)} unique Patient IDs in the training set')
print(f'There are {len(ids_valid_set)} unique Patient IDs in the validation set')

In [None]:
# Patients that ended up on both sides of the split.
patient_overlap = list(ids_train_set & ids_valid_set)
n_overlap = len(patient_overlap)

print(f'There are {n_overlap} Patient IDs in both the training and validation sets')
print('')
print(f'These patients are in both the training and validation datasets:')
print(f'{patient_overlap}')

In [None]:
# Collect, patient by patient, the row labels of every overlapping patient in
# each split.
train_overlap_idxs = []
valid_overlap_idxs = []
for patient_id in patient_overlap:
    in_train = train_data['Patient ID'] == patient_id
    in_valid = val_data['Patient ID'] == patient_id
    train_overlap_idxs += train_data.index[in_train].tolist()
    valid_overlap_idxs += val_data.index[in_valid].tolist()

print(f'These are the indices of overlapping patients in the training set: ')
print(f'{train_overlap_idxs}')
print(f'These are the indices of overlapping patients in the validation set: ')
print(f'{valid_overlap_idxs}')

In [None]:
# Move every validation row that belongs to an overlapping patient into the
# training set, so that no patient is left on both sides of the split.

# Pull the rows out of the validation set first ...
overlapping_rows = val_data.loc[valid_overlap_idxs]

# ... then remove them from it in place.
val_data.drop(index=valid_overlap_idxs, inplace=True)


# Append the moved rows to the training set and renumber its index from 0.
train_data = pd.concat([train_data, overlapping_rows], ignore_index=True)

Check that everything worked as planned by rerunning the patient ID comparison between train and validation sets.

In [None]:
# Rebuild the validation-side ID set now that rows have been moved out of it.
ids_valid = val_data['Patient ID'].values
ids_valid_set = set(ids_valid)
print(f'There are {len(ids_valid_set)} unique Patient IDs in the validation set')

# ids_train_set is reused as-is: the moved rows belong to patients that were
# already in the training set, so they add no new IDs to it.
patient_overlap = list(ids_train_set & ids_valid_set)
n_overlap = len(patient_overlap)
print(f'There are {n_overlap} Patient IDs in both the training and validation sets')

**Because the images are resized to 320 x 320, the bounding-box coordinates (x, y, w, h) must be rescaled by the same factor.**

In [None]:


IMAGE_SIZE = 320

# (source column, rescaled column) pairs, applied identically to both splits.
# The divisor 1024 is presumably the side length of the original scans -
# TODO confirm against the image files.
bbox_column_map = (('Bbox [x', 'x0'), ('y', 'y0'), ('w', 'w0'), ('h]', 'h0'))
for split_frame in (train_data, val_data):
    for source_col, scaled_col in bbox_column_map:
        split_frame[scaled_col] = split_frame[source_col] * IMAGE_SIZE / 1024

In [None]:
# Inspect the image path next to the rescaled box columns.
train_data[['path','x0','y0','h0','w0']]

# Models Implementation

In this work we mainly use **Intersection over Union (IoU)** as the evaluation metric, but we also look at accuracy and at the training and validation loss.
In object detection, IoU measures the degree of overlap between the ground-truth box (gt) and the predicted box (pd). Both the ground truth and the prediction are rectangular boxes.
IoU is defined as the area of the intersection divided by the area of the union of the ground-truth and predicted boxes: $\mathrm{IoU} = \dfrac{|gt \cap pd|}{|gt \cup pd|}$.



In [None]:
from keras.utils import Sequence
from keras.backend import epsilon

def loss(gt, pred, eps=None):
    """Aggregate Intersection-over-Union between two batches of boxes.

    Despite its name this returns an IoU *score* (higher is better), not a
    quantity to minimise; ``IoU`` wraps it for use as a Keras metric.

    Args:
        gt: Array-like (or eager tensor) of ground-truth boxes, one per row,
            laid out as ``(x, y, h, w)`` - the column order used to build
            ``y_train`` / ``y_val`` and unpacked by ``plot_predictions``.
        pred: Predicted boxes in the same layout and row order as ``gt``.
        eps: Small constant added to the denominator to avoid division by
            zero. Defaults to the Keras backend ``epsilon()``.

    Returns:
        numpy.float32: sum of intersection areas divided by sum of union
        areas over the box pairs that genuinely overlap, rounded to four
        decimals. ``0.0`` when no pair overlaps or the batch is empty.
    """
    if eps is None:
        eps = epsilon()

    # reshape(-1, 4) also accepts a single box passed as a flat 4-vector.
    gt = np.asarray(gt, dtype=np.float32).reshape(-1, 4)
    pred = np.asarray(pred, dtype=np.float32).reshape(-1, 4)

    # Column 2 is the height and column 3 the width (see the docstring).
    gt_x, gt_y, gt_h, gt_w = gt[:, 0], gt[:, 1], gt[:, 2], gt[:, 3]
    pd_x, pd_y, pd_h, pd_w = pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3]

    overlap_w = np.minimum(gt_x + gt_w, pd_x + pd_w) - np.maximum(gt_x, pd_x)
    overlap_h = np.minimum(gt_y + gt_h, pd_y + pd_h) - np.maximum(gt_y, pd_y)
    # Clamp at zero before multiplying: boxes that are disjoint on BOTH axes
    # have two negative extents whose product would otherwise look like a
    # positive intersection.
    intersection = np.clip(overlap_w, 0, None) * np.clip(overlap_h, 0, None)

    union = gt_w * gt_h + pd_w * pd_h - intersection

    # Keep only pairs with a real overlap and a geometrically sane union
    # (a prediction with a negative side can make the union too small).
    valid = (union > 0) & (intersection > 0) & (union >= intersection)
    intersections = intersection[valid].sum()
    unions = union[valid].sum()

    iou = np.round(intersections / (unions + eps), 4)
    return np.float32(iou)

def IoU(y_true, y_pred):
    """Keras-style metric wrapper around ``loss``.

    Runs ``loss`` on ``(y_true, y_pred)`` through ``tf.py_function`` and
    returns its result as a float32 tensor.
    """
    return tf.py_function(func=loss, inp=[y_true, y_pred], Tout=tf.float32)

## VGG19

In [None]:
from keras import Model

from keras.applications.mobilenet import MobileNet, preprocess_input

from keras.applications.vgg19 import VGG19
from keras import regularizers 
from keras.regularizers import l2
from keras.layers import Conv2D, Reshape, Dropout , BatchNormalization

# VGG19 convolutional base without its classifier head (include_top=False),
# for 320x320x3 inputs, with weights loaded from a local file.
model = VGG19(include_top=False,input_shape=(320, 320, 3), weights='/kaggle/input/vggweight/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5') 



# Freeze every layer of the base; only the layers added below stay trainable.
for layer in model.layers:
    layer.trainable = False
    
from keras.layers import Conv2D , Reshape
# Regression head: a 4-filter convolution with a 10x10 kernel on top of the
# last base layer. Reshape((4,)) only works if that convolution yields exactly
# four values per image - presumably the base's final feature map is 10x10 for
# a 320x320 input; confirm in model.summary().
x = model.layers[-1].output
x = Conv2D(4, kernel_size=10,name="CV")(x)
#x = BatchNormalization()(x)
x = Reshape((4,))(x) 

# Rebind `model` to the full network: frozen base plus regression head.
model = Model(inputs=model.input, outputs=x)
model.summary()

In [None]:
from tqdm import tqdm 
def read_img(img_path, size=(320, 320)):
    """Load an image from disk with OpenCV and resize it.

    Args:
        img_path: Path of the image file, as a string.
        size: ``(width, height)`` passed to ``cv2.resize``. Defaults to
            ``(320, 320)``.

    Returns:
        The resized image array as returned by ``cv2.resize``.

    Raises:
        ValueError: If ``img_path`` is not a string (e.g. ``None`` for a row
            whose image file was not found).
        OSError: If OpenCV could not read the file.
    """
    if not isinstance(img_path, str):
        raise ValueError(f'img_path must be a string path, got {img_path!r}')
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # fail here with the offending path instead of inside cv2.resize.
        raise OSError(f'cv2.imread could not read {img_path!r}')
    return cv2.resize(img, size)

# Read and resize every image of each split; tqdm shows a progress bar.
train_img = [read_img(img_path) for img_path in tqdm(train_data['path'].values)]
val_img = [read_img(img_path) for img_path in tqdm(val_data['path'].values)]

# Stack into float32 arrays and scale pixel values by 1/255.
X_train = np.array(train_img, np.float32) / 255
X_val = np.array(val_img, np.float32) / 255

In [None]:
# Preview the regression targets; note the column order is x, y, h, w.
train_data[['x0','y0','h0','w0']]

In [None]:
# Regression targets in (x, y, height, width) order - the same order in which
# plot_predictions unpacks the model output.
y_train = train_data[['x0','y0','h0','w0']]
y_val = val_data[['x0','y0','h0','w0']]

# Keras documents `metrics` and `callbacks` as lists, so both are passed as
# lists rather than bare objects.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[IoU])

# Save the whole model whenever val_loss improves. The deprecated `period=1`
# argument is dropped: checking once per epoch is already the default.
checkpoint = ModelCheckpoint('VGG_model_1.h5', save_best_only=True, save_weights_only=False, monitor='val_loss', mode='min', verbose=1)
history_1 = model.fit(x=X_train, y=y_train, epochs=100, validation_data=(X_val, y_val), steps_per_epoch=None, batch_size=16, verbose=1, callbacks=[checkpoint])

In [None]:
# Training and validation loss per epoch, drawn on the same axes.
for history_key in ('loss', 'val_loss'):
    plt.plot(history_1.history[history_key])

plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training', 'validation'], loc='best')

plt.show()

In [None]:
"""
import numpy as np

iou_values = []
total_batches = len(validation_generator)

for i in range(total_batches):
    batch_images, batch_labels = validation_generator[i]
    predictions = model.predict(batch_images)
    
    iou = IoU(batch_labels, predictions)
    iou_values.append(iou.numpy())  # Collect IoU values for each batch

mean_iou = np.mean(iou_values)  # Calculate the mean IoU over all batches
print("Mean IoU:", mean_iou)
print("Max IoU:", np.max(iou_values))"""


# Model 2

# Model selection

# Results 

In [None]:
# You can use this function to test the best model.

import matplotlib.patches as patches

def plot_predictions(model , sample_df, X , Y , IMAGE_SIZE ):
    """Plot predicted versus ground-truth bounding boxes, one figure per sample.

    Args:
        model: Object whose ``predict`` returns, per image, four values in
            ``(x, y, h, w)`` order.
        sample_df: Only its number of rows is used; it sets how many samples
            are plotted.
        X: Indexable collection of images; ``X[i]`` matches row ``i`` of ``Y``.
        Y: DataFrame of ground-truth boxes whose first four columns are
            ``(x, y, h, w)``, addressed by position.
        IMAGE_SIZE: Side length each image is resized to before prediction.
            Boxes are drawn in this coordinate system.

    Returns:
        None. Shows one figure per sample and prints the target row and the
        raw prediction.
    """
    for i in range(sample_df.shape[0]):
        # Positional read of row i. Integer lookups on the label-indexed row
        # Series (the old `Y.iloc[i][0]`) are deprecated in recent pandas.
        org_x0, org_y0, org_h0, org_w0 = Y.iloc[i].to_numpy()[:4]

        image = cv2.resize(X[i], (IMAGE_SIZE, IMAGE_SIZE))
        region = model.predict(x=np.array([image]))[0]
        x0, y0, h0, w0 = region[0], region[1], region[2], region[3]

        # Show the resized image, because both boxes are expressed in
        # IMAGE_SIZE coordinates.
        fig, ax = plt.subplots(1)
        ax.imshow(image)

        # Rectangle takes (left, top), width, height.
        # Red is the prediction, blue is the ground truth.
        rect_pred = patches.Rectangle((x0, y0), w0, h0, linewidth=2, edgecolor='r', facecolor='none')
        rect_org = patches.Rectangle((org_x0, org_y0), org_w0, org_h0, linewidth=2, edgecolor='b', facecolor='none')
        ax.add_patch(rect_pred)
        ax.add_patch(rect_org)

        # Image coordinates: the top-left corner of the image is (0, 0).
        ax.plot(x0, y0, 'o', color='b') # top-left of the predicted box
        ax.plot(x0+w0, y0+h0, '*', color='c') # bottom-right of the predicted box
        ax.set_title("ok")
        plt.show()
        # Release the figure so a long loop does not accumulate open figures.
        plt.close(fig)
        print(Y.iloc[i])
        print(region)
        #fig.savefig('prediction'+sample_df['Image Index'][i])

In [None]:

# Plot predictions for the first 103 validation rows (or fewer, if the
# validation set is smaller).
sample_df = val_data.head(103)

plot_predictions(model=model , sample_df=sample_df, X=X_val , Y=y_val , IMAGE_SIZE=320 )

In [None]:
import os

# WARNING: this empties the whole output folder. Anything written there by
# earlier cells is deleted - presumably including the 'VGG_model_1.h5'
# checkpoint, if the working directory is /kaggle/working (confirm).
folder_path = '/kaggle/working'

if os.path.isdir(folder_path):
    # One bottom-up pass is enough: it removes the top-level files too, and
    # every directory is already empty by the time it is removed.
    for root, dirs, files in os.walk(folder_path, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            dir_path = os.path.join(root, name)
            if os.path.islink(dir_path):
                # A symlink to a directory is listed under `dirs`, but it must
                # be unlinked like a file; os.rmdir would fail on it.
                os.remove(dir_path)
            else:
                os.rmdir(dir_path)

