In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the path of every file under the input directory.
# The original loop built each path and threw it away, so it had no effect;
# the paths are now kept in `input_files` and only a count is printed to
# avoid flooding the cell output.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
print(f"Found {len(input_files)} files under /kaggle/input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import os, shutil
import cv2
import skimage
import tensorflow as tf
import matplotlib.pyplot as plt

## Pre-Processing

In [None]:
# Training metadata table
df_metadata = pd.read_csv("/kaggle/input/123-of-ai-presents-pneumonia-detection-from-xray/1. train_metadata.csv")

# Folder holding the training images, and the names of the entries inside it
dir_path = '/kaggle/input/123-of-ai-presents-pneumonia-detection-from-xray/processed_train_data'
img_files = os.listdir(dir_path)


In [None]:
# 1. Displaying the data
# Show up to nine training images in a 3x3 grid, each titled with its file name.
n_preview = min(9, len(img_files))  # avoids an IndexError when fewer than 9 files exist

plt.figure(figsize=(20,10))

for i in range(n_preview):
    plt.subplot(3,3,i+1)
    img_path = os.path.join(dir_path, img_files[i])
    img = plt.imread(img_path)
    plt.imshow(img, cmap='gray')
    plt.axis("off")
    plt.title(img_files[i])  # the file name identifies the image ("Title" was a placeholder)
plt.tight_layout()
plt.show()

# The images rendered by this cell come in different sizes, so they need resizing before modelling.

In [None]:
# 2. Explore distribution - understand the data
# A notebook cell only renders its last bare expression, so the tail() preview
# was silently dropped. Both summaries are passed to display() (available by
# default in an IPython/Jupyter kernel) so that each one is rendered.
display(df_metadata.tail(5))
display(df_metadata.nunique())

# We have 2 classes present -> pneumonia and healthy

In [None]:
# 2a. Plot the class distribution
# Count the rows per class and draw the counts as a bar chart.
class_counts = df_metadata['class'].value_counts()

fig, ax = plt.subplots(figsize=(20, 6))
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

# There is an imbalance in the dataset: we have more samples in the pneumonia class than in the healthy class.

In [None]:
# 3a. Pre-processing 
# Resizing, Color Space, Input construction (what to give as an input?)
# Helper Functions - If we make use of ImageDataGenerator and flow_from_directory, we have inbuilt capabilities

def resize_image(image, size=(224, 224)):
    """Return ``image`` resized to ``size`` by ``cv2.resize``."""
    resized = cv2.resize(image, size)
    return resized

def normalize_image(image):
    """Return ``image`` divided by 255.0."""
    max_pixel_value = 255.0
    return image / max_pixel_value

def convert_color_space(image, color_space='gray'):
    """Convert ``image`` with ``cv2.cvtColor`` to the requested colour space.

    Args:
        image: Image array handed to ``cv2.cvtColor`` (converted with the
            ``COLOR_BGR2*`` codes).
        color_space: ``'gray'`` (uses ``cv2.COLOR_BGR2GRAY``) or ``'rgb'``
            (uses ``cv2.COLOR_BGR2RGB``).

    Returns:
        The converted image.

    Raises:
        ValueError: If ``color_space`` is neither ``'gray'`` nor ``'rgb'``.
    """
    # Reject unknown names up front so the conversion below only has two cases.
    if color_space not in ('gray', 'rgb'):
        raise ValueError("Unsupported color space")

    conversion_code = cv2.COLOR_BGR2GRAY if color_space == 'gray' else cv2.COLOR_BGR2RGB
    return cv2.cvtColor(image, conversion_code)

def preprocess_image(image_path, size=(224, 224), color_space='gray'):
    """Load one image from disk and run it through the preprocessing helpers.

    This is the per-image helper that ``preprocess_images`` calls; it was
    referenced but never defined, so any non-empty call raised ``NameError``.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.
        size: Target size forwarded to ``resize_image``.
        color_space: Colour space name forwarded to ``convert_color_space``.

    Returns:
        The colour-converted, resized image divided by 255.0.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read the file.
        ValueError: If ``color_space`` is not supported.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable or missing file by returning None
    # instead of raising, so fail loudly here.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = convert_color_space(image, color_space)
    image = resize_image(image, size)
    return normalize_image(image)


def preprocess_images(image_paths, size=(224, 224), color_space='gray'):
    """Preprocess every image in ``image_paths`` and collect them in one array.

    Args:
        image_paths: Iterable of image file paths.
        size: Target size applied to every image.
        color_space: Colour space name applied to every image.

    Returns:
        ``np.array`` built from the preprocessed images, in input order
        (an empty array for an empty ``image_paths``).
    """
    return np.array([
        preprocess_image(image_path, size, color_space)
        for image_path in image_paths
    ])


In [None]:
# Stage the training images under /kaggle/working with one sub-folder per class
# (healthy / pneumonia).
base_dir = '/kaggle/working/processed_train_data'
healthy_dir = os.path.join(base_dir, 'healthy')
pneumonia_dir = os.path.join(base_dir, 'pneumonia')

# Make sure both class folders exist before copying into them
for class_dir in (healthy_dir, pneumonia_dir):
    os.makedirs(class_dir, exist_ok=True)

# Metadata table: the 'path' column is the image file name, 'class' is its label
metadata_file = '/kaggle/input/123-of-ai-presents-pneumonia-detection-from-xray/1. train_metadata.csv'
metadata = pd.read_csv(metadata_file)

# Folder the images are copied from
source_dir = '/kaggle/input/123-of-ai-presents-pneumonia-detection-from-xray/processed_train_data'

# Copy each image into the folder of its class; rows with any other class value are skipped
class_dirs = {'healthy': healthy_dir, 'pneumonia': pneumonia_dir}
for img_name, img_class in zip(metadata['path'], metadata['class']):
    target_dir = class_dirs.get(img_class)
    if target_dir is not None:
        shutil.copy(os.path.join(source_dir, img_name), os.path.join(target_dir, img_name))

print("Images have been copied successfully!")


In [None]:
# 3b. Synthetic Data Generation
# ImageDataGenerator helps in DataTransformation/Augmentation, Normalization/Scaling
# We can also make use of validation_split to generate train and validation sets
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation, 1/255 rescaling and the 80/20 train/validation split, gathered
# in one place and handed to ImageDataGenerator as keyword arguments.
augmentation_options = dict(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.25,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)


In [None]:
# 3c. Data composition into tf format 
#  The method expects a specific directory structure where images are organized into subdirectories, 
#  each representing a class. This is an important note. Also subset='training/validation'

train_dir ='/kaggle/working/processed_train_data'

# Training batches: the 'training' share of the split, read from the per-class
# sub-folders and shuffled with a fixed seed.
train_generator = datagen.flow_from_directory(
    train_dir,
    subset='training',
    class_mode='binary', # Possible values are categorical, binary, sparse, input, None
#     color_mode='grayscale', # rgb, rgba, grayscale
    target_size=(224, 224),  # Resize images to 224x224, Default is 256x256
    batch_size=32, #default is 32
    seed=42,
    shuffle=True,
)


# Validation images must not be augmented: building this generator from `datagen`
# applied random rotations/shifts/zooms/flips to the validation set, which makes
# val_loss noisy and non-repeatable (it drives checkpointing and early stopping).
# A rescale-only generator with the same validation_split is used instead.
# The split is taken by position from the per-class file listing, so it should
# select the same files as the 'validation' subset of `datagen`.
validation_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

validation_generator = validation_datagen.flow_from_directory(
    train_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
#     color_mode='grayscale',
    shuffle=False,  # keep file order so predictions line up with validation_generator.classes
    seed=42,
    subset='validation'
)

In [None]:
# Any other relevant pre-processing (upto your exploration)

## Model Development

In [None]:
# 1. Divide into train and test (can be done in the model.fit too)

In [None]:
# 2. Define Model 
# Define the CNN model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Baseline CNN: three Conv2D + MaxPooling2D stages (32, 64, 128 filters) followed
# by a dense classifier head with a single sigmoid output.
model = Sequential()

# First stage carries the input shape (224x224, 3 channels)
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Remaining convolution stages
for n_filters in (64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


In [None]:
# pip install tensorflow-addons

In [None]:
# 3. Define model compilation requirements - optimizer, loss, early stopping, etc. 

# We don't have any direct metric f1_score in tf.keras.metric, so need to get from add-on
#import tensorflow_addons as tfa
#f1_score = tfa.metrics.F1Score(num_classes=1, threshold=0.5)

# Compile the baseline CNN: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# 4. Enable tensorboard for tracking, plus checkpointing and early stopping
import datetime
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

# TensorBoard logs go to a folder named after the current timestamp
log_dir = "/kaggle/working/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

# Save the full model (not just weights) whenever val_loss improves
checkpoint_dir = "/kaggle/working/checkpoints"
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/model-{epoch:02d}-{val_loss:.2f}.keras",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False, #if True only weights will be saved, then it must be .h5 extn
    verbose=1
)

# Stop training after 3 epochs without a val_loss improvement
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1
)

In [None]:
# Adding class weights to counter the class imbalance during training
from sklearn.utils.class_weight import compute_class_weight

# Labels of the training subset, as assigned by the generator
y_train = train_generator.classes
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
# Map position in the weight array -> weight, the dict form passed to model.fit
class_weight_dict = {position: weight for position, weight in enumerate(class_weights)}

In [None]:
# 5. Fit Model; make sure to save checkpoints at intermediate points to avoid loss of information
### Experiment with different models and design


# Train the model
# steps_per_epoch / validation_steps are intentionally not set. The previous
# `samples // batch_size` values rounded down, so the last partial batch was
# never used; val_loss (monitored by the checkpoint and early-stopping
# callbacks) was therefore computed on a subset of the validation data and did
# not match model.evaluate(validation_generator). Leaving them unset lets Keras
# use the full length of each generator, as the later fit calls already do.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=[tensorboard_callback,checkpoint_callback,early_stopping_callback],
    class_weight=class_weight_dict
)



In [None]:
# Seems like there is an issue with Kaggle while loading tensorboard
# Search for "kkb-production.jupyter-proxy.kaggle.net took too long to respond"
# (Re)load the TensorBoard notebook extension, then open TensorBoard on the log
# root; the TensorBoard callback writes each run into a timestamped sub-folder of it.
%reload_ext tensorboard
%tensorboard --logdir /kaggle/working/logs/fit/

In [None]:
# 6. Evaluate with relevant metric for your problem. 

# Score the baseline CNN on the validation generator (returns loss, then accuracy)
validation_scores = model.evaluate(validation_generator)
loss, accuracy = validation_scores
print(f'Validation Test Loss: {loss}')
print(f'Validation Test Accuracy: {accuracy * 100:.2f}%')
#print(f'Validation Test F1 Score: {f1_score:.2f}')

In [None]:
# 7. After model choice is made, fine-tune model - hyperparameters!

## Transfer Learning Using Xception

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import Xception
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


# Load the pre-trained Xception model (ImageNet weights, no classification top)
# NOTE(review): the generators rescale pixels to [0, 1]; the ImageNet weights were
# presumably trained with tf.keras.applications.xception.preprocess_input -
# confirm whether that preprocessing should be used instead.
base_model_xception = Xception(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add custom layers on top of the base model:
# global pooling, then two Dense -> BatchNormalization -> Dropout(0.3) stages
x = base_model_xception.output
x = GlobalAveragePooling2D()(x)
for units in (1024, 512):
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
predictions = Dense(1, activation='sigmoid')(x)  # For binary classification

# Create the final model
model_tf_xception = Model(inputs=base_model_xception.input, outputs=predictions)

# Freeze the base model layers so only the new head is trained in this stage
for layer in base_model_xception.layers:
    layer.trainable = False

# Compile the model
model_tf_xception.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Use a checkpoint callback dedicated to this model. The shared
# `checkpoint_callback` keeps its best val_loss from earlier fit() calls, so with
# save_best_only=True this model would only be saved if it beat the previous
# model's score, and both models would write to the same "model-..." file names.
xception_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/xception-{epoch:02d}-{val_loss:.2f}.keras",
    save_weights_only=False,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Train the model
history_tf_xception = model_tf_xception.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=[xception_checkpoint_callback, early_stopping_callback]
)


In [None]:
from tensorflow.keras.optimizers import Adam

# Unfreeze the last 5 layers of the base model for fine-tuning.
# BatchNormalization layers are left frozen: the Keras transfer-learning guide
# recommends keeping them in inference mode while fine-tuning, otherwise their
# moving statistics are updated and can undo what the model has learned.
for layer in base_model_xception.layers[-5:]:
    if not isinstance(layer, BatchNormalization):
        layer.trainable = True

# Compile the model with a lower learning rate (recompiling applies the trainable changes)
model_tf_xception.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Dedicated checkpoint for this stage: the shared `checkpoint_callback` keeps its
# best val_loss from earlier fit() calls and restarts epoch numbering, so reusing
# it can suppress saves and collide with earlier file names.
xception_finetune_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/xception-finetune-{epoch:02d}-{val_loss:.2f}.keras",
    save_weights_only=False,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Continue training the model with fine-tuning
history_fine_exception = model_tf_xception.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=[xception_finetune_checkpoint_callback, early_stopping_callback]
)

In [None]:

# Score the Xception model on the validation generator (returns loss, then accuracy)
xception_scores = model_tf_xception.evaluate(validation_generator)
loss, accuracy = xception_scores
print(f'Validation Test Loss: {loss}')
print(f'Validation Test Accuracy: {accuracy * 100:.2f}%')

## Transfer Learning using ResNet50

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


# Load the pre-trained ResNet50 model (ImageNet weights, no classification top)
# NOTE(review): the generators rescale pixels to [0, 1]; the ImageNet weights were
# presumably trained with tf.keras.applications.resnet50.preprocess_input -
# confirm whether that preprocessing should be used instead.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add custom layers on top of the base model:
# global pooling, then three Dense -> BatchNormalization -> Dropout(0.2) stages
x = base_model.output
x = GlobalAveragePooling2D()(x)
for units in (2048, 1024, 512):
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

predictions = Dense(1, activation='sigmoid')(x)  # For binary classification

# Create the final model
model_tf = Model(inputs=base_model.input, outputs=predictions)

# Freeze the base model layers so only the new head is trained in this stage
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model_tf.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Use a checkpoint callback dedicated to this model. The shared
# `checkpoint_callback` keeps its best val_loss from earlier fit() calls, so with
# save_best_only=True this model would only be saved if it beat a previous
# model's score, and the models would write to the same "model-..." file names.
resnet50_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/resnet50-{epoch:02d}-{val_loss:.2f}.keras",
    save_weights_only=False,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Train the model
history_tf = model_tf.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=[resnet50_checkpoint_callback, early_stopping_callback]
)


In [None]:
from tensorflow.keras.optimizers import Adam

# Unfreeze the last 5 layers of the base model for fine-tuning.
# BatchNormalization layers are left frozen: the Keras transfer-learning guide
# recommends keeping them in inference mode while fine-tuning, otherwise their
# moving statistics are updated and can undo what the model has learned.
for layer in base_model.layers[-5:]:
    if not isinstance(layer, BatchNormalization):
        layer.trainable = True

# Compile the model with a lower learning rate (recompiling applies the trainable changes)
model_tf.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Dedicated checkpoint for this stage: the shared `checkpoint_callback` keeps its
# best val_loss from earlier fit() calls and restarts epoch numbering, so reusing
# it can suppress saves and collide with earlier file names.
resnet50_finetune_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/resnet50-finetune-{epoch:02d}-{val_loss:.2f}.keras",
    save_weights_only=False,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Continue training the model with fine-tuning
history_fine = model_tf.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=[resnet50_finetune_checkpoint_callback, early_stopping_callback]
)

## Transfer Learning using VGG16

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Nadam


# Load the pre-trained VGG16 model (ImageNet weights, no classification top)
# NOTE(review): the generators rescale pixels to [0, 1]; the ImageNet weights were
# presumably trained with tf.keras.applications.vgg16.preprocess_input -
# confirm whether that preprocessing should be used instead.
base_model_vgg16 = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add custom layers on top of the base model: global pooling and two Dense layers
x = base_model_vgg16.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = Dense(512, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)  # For binary classification

# Create the final model
model_tf_vgg16 = Model(inputs=base_model_vgg16.input, outputs=predictions)

# Freeze the base model layers so only the new head is trained in this stage
for layer in base_model_vgg16.layers:
    layer.trainable = False

# Compile the model
model_tf_vgg16.compile(optimizer=Nadam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Use a checkpoint callback dedicated to this model. The shared
# `checkpoint_callback` keeps its best val_loss from earlier fit() calls, so with
# save_best_only=True this model would only be saved if it beat a previous
# model's score, and the models would write to the same "model-..." file names.
vgg16_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/vgg16-{epoch:02d}-{val_loss:.2f}.keras",
    save_weights_only=False,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Train the model
history_tf_vgg16 = model_tf_vgg16.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=[vgg16_checkpoint_callback, early_stopping_callback]
)


In [None]:
from tensorflow.keras.optimizers import Nadam

# Unfreeze some layers of the base model
for layer in base_model_vgg16.layers[-5:]:  # Unfreeze the last 5 layers
    layer.trainable = True

# Compile the model with a lower learning rate (recompiling applies the trainable changes)
model_tf_vgg16.compile(optimizer=Nadam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Dedicated checkpoint for this stage: the shared `checkpoint_callback` keeps its
# best val_loss from earlier fit() calls and restarts epoch numbering, so reusing
# it can suppress saves and collide with earlier file names.
vgg16_finetune_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + "/vgg16-finetune-{epoch:02d}-{val_loss:.2f}.keras",
    save_weights_only=False,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Continue training the model with fine-tuning
history_fine_vgg16 = model_tf_vgg16.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=[vgg16_finetune_checkpoint_callback, early_stopping_callback]
)

In [None]:
# Score the VGG16 model on the validation generator (returns loss, then accuracy)
vgg16_scores = model_tf_vgg16.evaluate(validation_generator)
loss, accuracy = vgg16_scores
print(f'Validation Test Loss: {loss}')
print(f'Validation Test Accuracy: {accuracy * 100:.2f}%')
#print(f'Validation Test F1 Score: {f1_score:.2f}')

## Ensemble Learning

In [None]:
def ensemble_predictions(models, data):
    """Average the predictions of several models on the same data.

    Args:
        models: Iterable of objects exposing ``predict(data)``.
        data: Input passed unchanged to every model's ``predict``.

    Returns:
        Element-wise mean of the per-model prediction arrays (mean over the
        model axis).
    """
    per_model_outputs = []
    for member in models:
        per_model_outputs.append(member.predict(data))
    # Stack to shape (n_models, ...) and average across the models
    return np.array(per_model_outputs).mean(axis=0)

# Ensemble members: fine-tuned VGG16, fine-tuned ResNet50 and the baseline CNN
models = [model_tf_vgg16, model_tf, model]

# Averaged predictions of all members on the validation generator
ensemble_preds = ensemble_predictions(data=validation_generator, models=models)


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Ground-truth labels as recorded by the validation generator
true_labels = validation_generator.classes

# Threshold the averaged probabilities at 0.5 to get 0/1 labels
ensemble_labels = (ensemble_preds > 0.5).astype(int)

# Accuracy and F1 of the ensemble against the true labels
accuracy = accuracy_score(true_labels, ensemble_labels)
f1 = f1_score(true_labels, ensemble_labels)

print(f'Ensemble Model Accuracy: {accuracy}')
print(f'Ensemble Model F1 Score: {f1}')


## Testing and Creating Output

In [None]:
import os
import shutil

# Prepare the test directory in kaggle/working: the images are placed in an
# 'images' sub-folder of /kaggle/working/testdata
testdata = '/kaggle/working/testdata/images'
os.makedirs(testdata, exist_ok=True)

# Where the test images come from and where they go
source_dir = '/kaggle/input/123-of-ai-presents-pneumonia-detection-from-xray/processed_test_set'
destination_dir = testdata

# Copy every entry of the source folder into the destination folder
for filename in os.listdir(source_dir):
    shutil.copy(
        os.path.join(source_dir, filename),
        os.path.join(destination_dir, filename),
    )


In [None]:
# Creating output file for submission - Template Code
test_pd = pd.read_csv('/kaggle/input/123-of-ai-presents-pneumonia-detection-from-xray/2. test_files.csv')

# Test images get the same 1/255 rescaling as the training set, with no augmentation
test_data_dir = '/kaggle/working/testdata/'
test_datagen = ImageDataGenerator(rescale=1./255)

# One image per batch, no labels, no shuffling (so predictions keep the generator's file order)
test_generator = test_datagen.flow_from_directory(
    test_data_dir,
    class_mode=None,
    shuffle=False,
    batch_size=1,
    target_size=(224, 224),
)

# Fail early when no test image was found
if not test_generator.samples:
    raise ValueError("The test_generator is empty. Please check the directory structure and paths.")

# Evaluate trained model on test set - Load your trained model
#model = tf.keras.models.load_model('/kaggle/working/checkpoints/model-05-0.30.keras')
#predictions = model.predict(test_generator)
#y_pred = np.where(predictions > 0.5, 1, 0).flatten()

# Predict with the transfer-learned + fine-tuned ResNet50 model and threshold at 0.5
#model_tf_i = tf.keras.models.load_model('/kaggle/working/checkpoints/model-02-0.23.keras')
predictions = model_tf.predict(test_generator)
y_pred = (predictions > 0.5).astype(int).flatten()

#Ensemble
#vgg16_preds = model_tf_vgg16.predict(test_generator)
#resnet50_preds = model_tf.predict(test_generator)
#cnn_preds = model.predict(test_generator)

# Ensemble the predictions
#ensemble_preds = (vgg16_preds + resnet50_preds + cnn_preds) / 3
#ensemble_labels = (ensemble_preds > 0.5).astype(int)
#y_pred = ensemble_labels.flatten()

# Save results to CSV: label 1 -> 'pneumonia', anything else -> 'healthy'
# NOTE(review): IDs come from test_pd's row index, so this assumes the generator
# yields the files in the same order as the rows of the test CSV - confirm.
predicted_classes = ['pneumonia' if pred == 1 else 'healthy' for pred in y_pred]
submission = pd.DataFrame({'ID': test_pd.index, 'class' : predicted_classes})
submission.to_csv('output_submission_eval.csv', index=False)

In [None]:
# If you would like to save the model from working dir to your local machine
model.save('/kaggle/working/checkpoints/model-14-0.07.keras')
!zip -r /kaggle/working/model-14-0.07.zip /kaggle/working/checkpoints/model-14-0.07.keras