# Loading Dataset

You can download the dataset from <https://darwin.v7labs.com/v7-labs/covid-19-chest-x-ray-dataset?sort=priority%3Adesc>.
The data obtained with the command `darwin dataset pull v7-labs/covid-19-chest-x-ray-dataset:all-images` will be used in this assignment. The full dataset consists of 6504 images from 702 classes. We will extract the images of 4 classes (Bacterial Pneumonia, Viral Pneumonia, No Pneumonia (healthy), Covid-19) and save them as `.npy` files with the following code:

In [None]:
# Imports
import warnings
warnings.simplefilter(action='ignore')
import keras
import json
import numpy as np
import glob
import pandas as pd

import urllib.request
from PIL import Image
from keras import layers, models
import tensorflow
from sklearn.model_selection import train_test_split as tts
from sklearn import preprocessing



In [8]:
###################################
# > DISABLED - ALREADY EXECUTED < #
###################################

'''
Continue working in next cell
'''

# Summary of the disabled code below:
#   * every file matched by "all-images/*" (relative to the working directory)
#     is opened and parsed as a JSON annotation file;
#   * for the first annotation whose name contains 'COVID-19',
#     'Viral Pneumonia', 'Bacterial Pneumonia' or 'No Pneumonia (healthy)',
#     the image at data['image']['url'] is downloaded to "img.png", converted
#     to grayscale, resized to 156x156 and appended to `dataset`; the full
#     annotation name is appended to `labels`;
#   * the images are stacked, expanded to 3 channels and written to
#     "data.npy" / "labels.npy".
#
# NOTE(review): the disabled code calls `tf.image.grayscale_to_rgb` and
# `tf.expand_dims`, but the imports cell only binds the name `tensorflow`
# (no `tf` alias). Add `import tensorflow as tf` before re-enabling it.
# NOTE(review): annotation names are matched by substring but stored in full;
# presumably they are exactly the four class names used later for label
# encoding - verify before relying on that.

# # all-images file should be uploaded to the same file
# imageNames = glob.glob("all-images/*")
# 
# dataset = []
# labels = []
# 
# for i, imName in enumerate(imageNames):
# 
#     # Opening JSON file
#     f = open(imName)
#     data = json.load(f)
#     for j in range(len(data['annotations'])):
# 
#         if 'COVID-19' in (data['annotations'][j]['name']):
#           #load images from url    
#             urllib.request.urlretrieve(data['image']['url'],"img.png")    
#             img = Image.open("img.png")
#             #convert images to grayscale
#             imgGray = img.convert('L')
#             #resize the image (156x156)
#             im = imgGray.resize((156,156), Image.LANCZOS)           
#             label = data['annotations'][j]['name']
#             dataset.append(np.array(im))
#             labels.append(label)
#             print(label)
#             break
# 
#         if 'Viral Pneumonia' in (data['annotations'][j]['name']) \
#             or 'Bacterial Pneumonia' in (data['annotations'][j]['name']) \
#             or 'No Pneumonia (healthy)' in (data['annotations'][j]['name']):
#             #load images from url    
#             urllib.request.urlretrieve(data['image']['url'],"img.png")    
#             img = Image.open("img.png")
#             #convert images to grayscale
#             imgGray = img.convert('L')
#             #resize the image (156x156)
#             im = imgGray.resize((156,156), Image.LANCZOS)           
#             label = data['annotations'][j]['name']
#             dataset.append(np.array(im))
#             labels.append(label)
#             break
# 
# #Convert data shape of (n_of_samples, width, height, 1)
# dataset = np.dstack(dataset)    
# dataset = np.rollaxis(dataset,-1)
# labels = np.array(labels)
# 
# #convert images gray scale to rgb
# data = np.array(layers.Lambda(tf.image.grayscale_to_rgb)(tf.expand_dims(dataset, -1)))
# 
# # save data and labels into a folder
# np.save("data.npy", data)
# np.save("labels.npy", labels)

'\nContinue working in next cell\n'

Once you save your data, you can load it from your directory.

In [9]:
# Read the saved label array and image array back from the working directory.
labels = np.load('labels.npy')
data = np.load('data.npy')

# Preprocessing Steps



## Splitting Data

In [10]:
# Both splits are stratified on the labels so every subset keeps the class
# proportions of the full dataset; train_test_split shuffles by default and
# the fixed random_state makes the partition repeatable.
from sklearn.model_selection import train_test_split

# Images and their class names, as written by the extraction step.
features = np.load('data.npy')
labels = np.load('labels.npy')

split_seed = 42

# Step 1: hold out 20% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=split_seed,
)

# Step 2: take 25% of the remaining 80% as the validation set,
# which gives a 60% / 20% / 20% train / validation / test partition overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=split_seed,
)


## Normalize Data

In [11]:
# Cast to float32 and rescale the 8-bit pixel values to the [0, 1] range.
# NOTE: this overwrites X_train / X_val / X_test, so re-running this cell
# without re-running the split cell first would rescale the data a second time.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
X_val = X_val.astype('float32') / 255.0

# Per-position mean and standard deviation over the sample axis, computed on
# the training set only so no validation/test information leaks into training.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)

# A position that has the same value in every training image has std == 0;
# dividing by it would fill the inputs with NaN/inf and break training.
# Using 1 instead leaves such positions simply mean-centred.
train_std[train_std == 0] = 1.0

# Standardise every subset with the training-set statistics.
X_train_norm = (X_train - train_mean) / train_std
X_val_norm = (X_val - train_mean) / train_std
X_test_norm = (X_test - train_mean) / train_std


In [12]:
# Class name -> integer class index.
label_map = {
    "Bacterial Pneumonia": 0,
    "Viral Pneumonia": 1,
    "No Pneumonia (healthy)": 2,
    "COVID-19": 3,
}
num_classes = 4

# Element-wise dictionary lookup turning arrays of class names into arrays of indices.
encode_labels = np.vectorize(label_map.get)
y_train_encoded = encode_labels(y_train)
y_val_encoded = encode_labels(y_val)
y_test_encoded = encode_labels(y_test)

# One-hot encode the integer indices (one column per class).
y_train_onehot, y_val_onehot, y_test_onehot = (
    keras.utils.to_categorical(encoded, num_classes=num_classes)
    for encoded in (y_train_encoded, y_val_encoded, y_test_encoded)
)



# Create Baseline Model

In [13]:
from tensorflow import keras

# Fix TensorFlow's global random seed so that weight initialisation and batch
# shuffling are repeatable from run to run, matching the fixed random_state
# used for the data split. (Some GPU kernels may still introduce small
# run-to-run differences.)
tensorflow.random.set_seed(42)

# Baseline CNN: three blocks of (Conv 64 -> Conv 32 -> 2x2 max-pool) followed
# by two small dense layers and a 4-way softmax, one output per class.
# The input shape matches the 156x156 three-channel images in data.npy.
model = keras.Sequential([
    # Convolutional layers
    keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu', input_shape=(156, 156, 3)),
    keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),

    # Dense layers
    keras.layers.Flatten(),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(4, activation='softmax')
])

# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 10 epochs with a batch size of 32, evaluating on the validation
# set after every epoch; `history` keeps the per-epoch metrics for plotting.
history = model.fit(
    X_train_norm,
    y_train_onehot,
    batch_size=32,
    epochs=10,
    validation_data=(X_val_norm, y_val_onehot)
)




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Analyze the performance of the baseline model

In [None]:
import matplotlib.pyplot as plt


def plot_history_curves(train_values, val_values, quantity, title):
    """Plot one training-history quantity for the training and validation sets.

    Parameters
    ----------
    train_values, val_values : sequence of float
        Per-epoch values, e.g. ``history.history['loss']`` and
        ``history.history['val_loss']``.
    quantity : str
        Name of the plotted quantity ('Accuracy' or 'Loss'); used as the
        y-axis label and, lower-cased, in the legend entries.
    title : str
        Figure title.
    """
    plt.plot(train_values, label='training_' + quantity.lower())
    plt.plot(val_values, label='validation_' + quantity.lower())
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.legend()
    plt.show()


## Accuracy of the baseline model per epoch
accuracy_train = history.history['accuracy']
accuracy_val = history.history['val_accuracy']
plot_history_curves(accuracy_train, accuracy_val, 'Accuracy', 'ACCURACY OF THE MODEL')

## Loss of the baseline model per epoch
# The legend entries and y-axis are labelled as loss here (they were
# previously mislabelled as accuracy).
loss_train = history.history['loss']
loss_val = history.history['val_loss']
plot_history_curves(loss_train, loss_val, 'Loss', 'LOSS OF MODEL')

## ROC curves on the test set
from sklearn.metrics import auc, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Class probabilities predicted for the standardised test images.
y_pred = model.predict(X_test_norm)

# One-vs-rest ROC curve and its area for every class column.
fpr, tpr, roc_auc = {}, {}, {}
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_onehot[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) pair into one binary problem.
y_true_flat = y_test_onehot.ravel()
y_score_flat = y_pred.ravel()
fpr_micro, tpr_micro, _ = roc_curve(y_true_flat, y_score_flat)
roc_auc_micro = roc_auc_score(y_true_flat, y_score_flat)

# Macro-average: interpolate each class curve onto the union of all false
# positive rates, then take the unweighted mean of the true positive rates.
fpr_macro = np.unique(np.concatenate(list(fpr.values())))
tpr_macro = sum(np.interp(fpr_macro, fpr[i], tpr[i]) for i in range(num_classes)) / num_classes
roc_auc_macro = auc(fpr_macro, tpr_macro)

# Draw the per-class curves, the chance diagonal and both averaged curves.
lw = 2
fig, ax = plt.subplots(figsize=(10, 5))
for i in range(num_classes):
    ax.plot(fpr[i], tpr[i], lw=lw, label='ROC curve of class %d (area = %0.2f)' % (i, roc_auc[i]))
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.plot(fpr_micro, tpr_micro, lw=lw, linestyle='--', label='micro-average ROC curve (area = %0.2f)' % (roc_auc_micro))
ax.plot(fpr_macro, tpr_macro, lw=lw, linestyle='--', label='macro-average ROC curve (area = %0.2f)' % (roc_auc_macro))
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic of Multiclass')
ax.legend(loc="lower right")
plt.show()


# Turn the predicted probabilities back into class names:
# class index -> class name is the inverse of label_map.
inverse_label_map = dict(zip(label_map.values(), label_map.keys()))
# Most probable class index for every test sample.
y_pred_decoded_numerical = y_pred.argmax(axis=1)
y_pred_decoded_categorical = np.vectorize(inverse_label_map.get)(y_pred_decoded_numerical)



# Confusion matrix of true vs. predicted class names on the test set
from sklearn.metrics import confusion_matrix

# Use one explicit, sorted class list for both the matrix and the tick labels,
# so row/column k of `cm` is guaranteed to be labelled with classes[k] even if
# a class occurs only among the predictions or only among the true labels.
classes = np.unique(np.concatenate([y_test, y_pred_decoded_categorical]))
cm = confusion_matrix(y_test, y_pred_decoded_categorical, labels=classes)

# Plot the matrix as a heat map: rows are true labels, columns are predictions.
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap='Reds')
ax.figure.colorbar(im, ax=ax)
ax.set(xticks=np.arange(cm.shape[1]), yticks=np.arange(cm.shape[0]), xticklabels=classes, yticklabels=classes, ylabel='True label', xlabel='Predicted label')

# Rotate the x tick labels so the long class names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=20, ha="right", rotation_mode="anchor")

# Write the count inside every cell; white text on dark cells, black on light.
thresh = cm.max() / 2.
for i, j in np.ndindex(cm.shape):
    ax.text(j, i, format(cm[i, j], 'd'), ha="center", va="center", color="white" if cm[i, j] > thresh else "black")
plt.show()

# Adapting/fine-tuning the network

# Transfer Learning