In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread

In [2]:
import zipfile
import os

In [3]:
data_dir = '../input/chest-xray-covid19-pneumonia/Data'

In [4]:
os.listdir(data_dir)

In [5]:
train_dir = data_dir+"/train"
os.listdir(train_dir)

In [6]:
test_dir = data_dir+"/test"
os.listdir(test_dir)

#### Let's evaluate the directories
- Checking image counts for each class
- It appears the data is somewhat imbalanced
- There are more covid images than Normal and Pneumonia
- This could work in our favor, allowing the model to differentiate covid images better

In [7]:
len(os.listdir(train_dir+"/COVID19"))

In [8]:
len(os.listdir(train_dir+"/NORMAL"))

In [9]:
len(os.listdir(train_dir+"/PNEUMONIA"))

#### Testing Set

In [10]:
len(os.listdir(test_dir+"/COVID19"))

In [11]:
len(os.listdir(test_dir+"/NORMAL"))

In [12]:
len(os.listdir(test_dir+"/PNEUMONIA"))

### Observing a sample image of each X-ray
- We will grab an image from the training set from each class

In [13]:
os.listdir(train_dir+"/COVID19")[0]

In [14]:
pneumonia_sample = imread(train_dir+"/PNEUMONIA"+"/PNEUMONIA(3189).jpg")

In [15]:
plt.imshow(pneumonia_sample)

In [16]:
normal_sample = imread(train_dir+"/NORMAL"+"/NORMAL(342).jpg")

In [17]:
plt.imshow(normal_sample)

In [18]:
covid_sample = imread(train_dir+"/COVID19"+"/COVID19(189).jpg")

In [19]:
plt.imshow(covid_sample, cmap= "gray")

In [20]:
covid_sample.shape

In [21]:
pneumonia_sample.shape

In [22]:
normal_sample.shape

### Finding the average size of the images in the data
- We need the average size of the images to decide the input size fed into the model
- We will estimate it from the Pneumonia images in the training set

In [23]:
# Record the first two axes of every training PNEUMONIA image's shape:
# dim1 collects shape[0] and dim2 collects shape[1].
dim1 = []
dim2 = []

pneumonia_dir = train_dir+"/PNEUMONIA"
for image_name in os.listdir(pneumonia_dir):
    img = imread(pneumonia_dir+"/"+image_name)
    # Slice instead of unpacking three values: a single-channel image has a
    # 2-D shape, and `d1,d2,c = img.shape` would raise ValueError on it.
    d1, d2 = img.shape[:2]
    dim1.append(d1)
    dim2.append(d2)

In [24]:
plt.figure(figsize=(16,6))
# Pass x/y by keyword: seaborn >= 0.12 made them keyword-only, so the
# positional form fails there (and only warns on 0.11).
sns.scatterplot(x=dim1, y=dim2)

In [25]:
np.mean(dim1)

In [26]:
np.mean(dim2)

### Average image size
- 400 (width) x 400 (height) x 3 (channels)
- Lowering the dimensions to decrease training time

In [27]:
average_image_size = (400,400,3)

### Creating an image generator
- This will allow us to manipulate and transform the images in the directory as they are fed to the model for accurate training

In [28]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Generator
- We will manipulate the images below
- We will not flip the images because there will never be a point where we are scanning a patient upside down

In [29]:
# Options for the image generator, grouped by purpose.
augmentation_options = dict(
    # Pixel scaling.
    rescale=1/255,
    # Geometric jitter; rotation is disabled (range of 0).
    rotation_range=0,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    # How pixels exposed by a shift/shear/zoom are filled in.
    fill_mode='nearest',
    # Flips are switched off in both directions.
    horizontal_flip=False,
    vertical_flip=False,
)
generator = ImageDataGenerator(**augmentation_options)

## Testing generator
- We will test the generator on a sample image
- Let's use the Pneumonia sample image we loaded earlier

In [30]:
plt.imshow(generator.random_transform(pneumonia_sample))

### Generating test and training generators
- These are the generators that will be passed into the model for fitting
- First we will check the class instances for each image in the directories

In [31]:
generator.flow_from_directory(test_dir)

In [32]:
generator.flow_from_directory(train_dir)

In [33]:
train_generator = generator.flow_from_directory (
    train_dir,
    target_size=average_image_size[:2],
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
)

## We do not shuffle images on the test generator
- Shuffling would change the order in which images are yielded, so the predictions would no longer line up with `test_generator.classes` and the labeling would be inaccurate

In [34]:
# Evaluation images must only be rescaled. Reusing the augmenting `generator`
# would apply random shifts/shear/zoom to the test set, making validation
# metrics and predictions noisy and different on every pass.
test_generator = ImageDataGenerator(rescale=1/255).flow_from_directory (
    test_dir,
    target_size=average_image_size[:2],
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    # Keep directory order so predictions align with `test_generator.classes`.
    shuffle=False,
)

## Creating the model
- Let's import the required libraries from tensorflow and keras

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

### Early Stopping
- This will set a limit on the training of our model to prevent overtraining

In [36]:
# Stop once val_loss has not improved for 6 epochs, and roll the model back to
# the best epoch's weights; otherwise it keeps the weights of the final epoch.
stop = EarlyStopping(monitor="val_loss", mode="min", patience=6, restore_best_weights=True)

In [37]:
# Three Conv2D + MaxPool2D stages followed by a dense classifier head with a
# 3-way softmax output.
model = Sequential()

# `input_shape` is declared on the first layer only; on later layers of a
# Sequential model the argument is redundant, so it is omitted there.
model.add(Conv2D(filters = 32, padding = "same", kernel_size = (2,2), strides = (2,2), activation = "relu", input_shape = average_image_size))
model.add(MaxPool2D(pool_size = (2,2)))
model.add(Conv2D(filters = 32, padding = "same", kernel_size = (2,2), strides = (2,2), activation = "relu"))
model.add(MaxPool2D(pool_size = (2,2)))
model.add(Conv2D(filters = 64, padding = "same", kernel_size = (2,2), strides = (2,2), activation = "relu"))
model.add(MaxPool2D(pool_size = (2,2)))

model.add(Flatten())
model.add(Dense(units = 132, activation = "relu"))
model.add(Dense(units = 60, activation = "relu"))
model.add(Dense(units = 3, activation = "softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [38]:
# `fit_generator` is deprecated and has been removed from recent TensorFlow
# releases; `fit` accepts generators directly with the same arguments.
model.fit(train_generator, validation_data=test_generator, epochs=1, callbacks=[stop])

In [39]:
model.summary()

In [40]:
model.metrics_names

## Model History
- We will take a look at how the model trained on the training data and the validation data
- The diagrams below let us know that the model trained very well on the data and provided over 90% accuracy on the validation data
- Below we will make predictions to test our model
- We will also need to find a way to test the model on random data

In [41]:
pd.DataFrame(model.history.history)[["loss", "val_loss"]].plot(figsize =(16,6), marker = "o", mfc = "g")

In [42]:
pd.DataFrame(model.history.history)[["accuracy", "val_accuracy"]].plot(figsize =(16,6), marker = "o", mfc = "g")

In [43]:
predictions = model.predict(test_generator)

## Predictions
- Below, our predictions are presented as probabilities
- We will need to convert these to the proper labels so we can compare them to our test generator

In [44]:
predictions

In [45]:
pred_labels = np.argmax(predictions, axis = 1)

## Predicted classes
- The number of predicted classes should match the length of the test generator classes
- Let's test this below

In [46]:
len(test_generator.classes)

In [47]:
len(pred_labels)

### Report
- We can see that the model is 92% accurate
- It also seems that the model has some trouble with label 1
- We will take a look below at which label this is
- Overall we can be happy with a 92% accuracy in predictions

### Random Image testing
- Since we do not have new covid, pneumonia, or normal x-rays, we will test the model on the data we already have and get a prediction on the label
- Let's pass the model random images from each classification and see if it is accurate in its predictions
- We have to consider the image size that the model was trained on, so we will address this as well

In [49]:
# `classification_report` is otherwise only imported in a later cell, so a
# top-to-bottom run would raise NameError here; import it where it is used.
from sklearn.metrics import classification_report

print(classification_report(test_generator.classes, pred_labels))

In [50]:
from tensorflow.keras.preprocessing import image

### Testing on a covid image
- Let's see if the model can predict on a covid image

In [51]:
from random import randint

# List the directory once so the index and the lookup use the same listing.
covid_image_names = os.listdir(train_dir+"/COVID19")
# randint is inclusive on both ends: valid indices are 0 .. len-1. The old
# range (1 .. len) could raise IndexError and never selected the first file.
random_index = randint(0, len(covid_image_names) - 1)
random_covid_image_name = covid_image_names[random_index]

In [52]:
random_covid_image_name

In [53]:
covid_img_path = train_dir+"/COVID19/"+random_covid_image_name

In [54]:
# `target_size` expects (height, width); drop the channel entry of
# `average_image_size`, matching how the generators are configured.
random_covid_img = image.load_img(covid_img_path, target_size=average_image_size[:2])

In [55]:
random_covid_img

In [56]:
# The generators feed the model images rescaled by 1/255, so apply the same
# scaling here; raw 0-255 pixel values would be far outside the training range.
random_covid_img_array = image.img_to_array(random_covid_img) / 255

### Reshaping
- The image has the proper height, width and channels, but the array is missing the batch dimension
- We need to tell the model that we are passing in 1 image, so the first dimension of the array needs to be 1
- numpy's `expand_dims` function will allow us to do this easily

In [57]:
random_covid_img_array.shape

In [58]:
covid_img_array = np.expand_dims(random_covid_img_array, axis=0)

In [59]:
covid_img_array.shape

### Prediction of new image (Covid 19 image)
- The model predicts that the image is classified as label 0
- To check if this is correct we need to look at the class indices of our test_generator
- Keep in mind we are intentionally passing in a covid image for testing

In [60]:
np.argmax(model.predict(covid_img_array), axis =1)

In [61]:
test_generator.class_indices

### Metrics
- Using Sklearn we will use classification report to see the accuracy of the predictions per class

In [48]:
from sklearn.metrics import confusion_matrix, classification_report

In [69]:
classes = ["COVID19", "NORMAL", "PNEUMONIA"]

# Derive the labels into a new name instead of overwriting `predictions`:
# the raw probabilities stay available and re-running this cell gives the
# same result (the old in-place conversion broke on a second run).
y_pred = np.argmax(predictions, axis=1)

y_true=test_generator.classes

CMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred), columns=classes, index =classes)

plt.figure(figsize=(12, 6))
ax = sns.heatmap(CMatrix, annot = True, fmt = 'g' ,vmin = 0, vmax = 250,cmap = 'Blues')
ax.set_xlabel('Predicted',fontsize = 14,weight = 'bold')
ax.set_xticklabels(ax.get_xticklabels(),rotation =0);

ax.set_ylabel('Actual',fontsize = 14,weight = 'bold') 
ax.set_yticklabels(ax.get_yticklabels(),rotation =0);
ax.set_title('Confusion Matrix - Test Set',fontsize = 16,weight = 'bold',pad=20);

# Visualization

In [63]:
from skimage.segmentation import mark_boundaries
import lime
from lime import lime_image
import cv2

In [64]:
def visualize(file_path):
    """Plot a LIME explanation of the model's top prediction for one image.

    Parameters
    ----------
    file_path : str
        Image path relative to `data_dir`, including the leading slash
        (e.g. '/train/COVID19/COVID19(25).jpg').

    Raises
    ------
    FileNotFoundError
        If the image cannot be read from disk.
    """
    full_path = data_dir + file_path
    test_image = cv2.imread(full_path)
    # cv2.imread reports failure by returning None rather than raising.
    if test_image is None:
        raise FileNotFoundError(f"Could not read image: {full_path}")

    # OpenCV loads channels as BGR; the model's generators were built with
    # color_mode='rgb', so convert before predicting.
    test_image = cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB)
    test_image = cv2.resize(test_image,(400,400),interpolation=cv2.INTER_NEAREST)
    # Apply the same 1/255 rescaling the generators use, so LIME's perturbed
    # samples reach `model.predict` on the scale the model expects.
    test_image = test_image / 255.0

    explainer = lime_image.LimeImageExplainer()
    explanation = explainer.explain_instance(test_image, model.predict, top_labels=5, hide_color=0, num_samples=1000)
    # Highlight the superpixels that support the highest-ranked label.
    temp, mask = explanation.get_image_and_mask(explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=False)

    plt.imshow(mark_boundaries(temp, mask))

In [66]:
visualize('/train/COVID19/COVID19(25).jpg')