In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from sklearn.model_selection import train_test_split



In [43]:
# Folders the validation and training images are read from.
val_data_dir = 'chest_xray/val/'
train_data_dir = 'chest_xray/train/'

In [44]:
# Read the validation images from val_data_dir, resized to 64x64.
# batch_size=16 matches the 16 images found in this run, so one batch holds all of them.
val_generator = ImageDataGenerator().flow_from_directory(
        val_data_dir, 
        target_size=(64, 64), batch_size=16)

# Read the training images from train_data_dir, resized to 64x64.
# batch_size=5216 matches the 5216 images found in this run, so one batch is the
# whole training set.
# NOTE(review): both batch sizes are hard-coded to the current folder contents;
# they must be updated if the number of images changes.
train_generator = ImageDataGenerator().flow_from_directory(
        train_data_dir, 
        target_size=(64, 64), batch_size=5216)

# Pull a single batch from each generator to get in-memory image/label arrays.
train_images, train_labels = next(train_generator)
val_images, val_labels = next(val_generator)

Found 16 images belonging to 2 classes.
Found 5216 images belonging to 2 classes.


In [45]:
# Report the shape of every image/label array, one per line.
print(*(np.shape(arr) for arr in (train_images, train_labels, val_images, val_labels)),
      sep='\n')

(5216, 64, 64, 3)
(5216, 2)
(16, 64, 64, 3)
(16, 2)


In [46]:
# Flatten each training image into one column:
# (n_samples, 64, 64, 3) -> (n_samples, 12288) -> transposed to (12288, n_samples).
# The sample count is read from the array instead of being hard-coded, so this
# cell keeps working when the number of training images changes.
train_img_unrow = train_images.reshape(train_images.shape[0], -1).T
np.shape(train_img_unrow)

(12288, 5216)

In [48]:
# Flatten each validation image into one column:
# (n_samples, 64, 64, 3) -> (n_samples, 12288) -> transposed to (12288, n_samples).
# The sample count is read from the array instead of being hard-coded, so this
# cell keeps working when the number of validation images changes.
val_img_unrow = val_images.reshape(val_images.shape[0], -1).T
np.shape(val_img_unrow)

(12288, 16)

In [49]:
# Display the class-name -> label-index mapping used by the training generator.
train_generator.class_indices

{'NORMAL': 0, 'PNEUMONIA': 1}

In [50]:
# Keep only label column 1 (PNEUMONIA according to class_indices) and lay it
# out as a (1, n_samples) row vector.
train_labels_final = train_labels[:, [1]].T
np.shape(train_labels_final)

(1, 5216)

In [51]:
# Keep only label column 1 (PNEUMONIA according to class_indices) and lay it
# out as a (1, n_samples) row vector.
val_labels_final = val_labels[:, [1]].T
np.shape(val_labels_final)

(1, 16)

In [52]:
# Divide every pixel value by 255 in both flattened image matrices.
val_img_final = np.divide(val_img_unrow, 255)
train_img_final = np.divide(train_img_unrow, 255)

# Show the container type of the flattened validation images.
type(val_img_unrow)

numpy.ndarray

# Baseline - Logistic Regression

In [53]:
# Initial bias (intercept) of the logistic regression.
# Note: model() below creates its own local b, so this global is not what it trains.
b = 0

In [54]:
def init_w(n):
    """Return an all-zero weight column vector of shape (n, 1)."""
    return np.zeros(shape=(n, 1))

In [55]:
# Zero-initialised weights, one per flattened pixel value (64 * 64 * 3 = 12288).
w = init_w(64*64*3)

In [56]:
def propagation(w, b, x, y):
    """Run one forward and backward pass of logistic regression.

    Parameters
    ----------
    w : ndarray of shape (n_features, 1)
        Weight column vector.
    b : scalar
        Bias term.
    x : ndarray of shape (n_features, n_samples)
        One sample per column.
    y : ndarray of shape (1, n_samples)
        Binary labels (0 or 1).

    Returns
    -------
    dw : ndarray of shape (n_features, 1)
        Gradient of the cost with respect to w.
    db : scalar
        Gradient of the cost with respect to b.
    cost : scalar
        Mean binary cross-entropy over the samples.
    """
    n_samples = x.shape[1]
    y_hat = 1/(1 + np.exp(-(np.dot(w.T, x) + b)))

    # When the sigmoid saturates to exactly 0 or 1, log(0) = -inf and the
    # products 0 * -inf turn the whole cost into nan (or inf). Clip the
    # probabilities for the loss only; the gradients use the raw y_hat.
    eps = np.finfo(y_hat.dtype).eps
    y_hat_safe = np.clip(y_hat, eps, 1 - eps)
    cost = -(1/n_samples) * np.sum(y * np.log(y_hat_safe) + (1 - y) * np.log(1 - y_hat_safe))

    dw = (1/n_samples) * np.dot(x, (y_hat - y).T)
    db = (1/n_samples) * np.sum(y_hat - y)
    return dw, db, cost

In [57]:
def optimization(w, b, x, y, num_iterations, learning_rate, print_cost = False):
    """Fit w and b with full-batch gradient descent.

    Every iteration calls propagation(w, b, x, y) and moves w and b against
    their gradients, scaled by learning_rate. On every 50th iteration
    (starting at 0) the cost returned by propagation, i.e. the cost *before*
    that iteration's update, is stored, and also printed when print_cost is
    true.

    Returns
    -------
    (w, b, costs) : the updated weights, the updated bias, and the list of
        costs recorded every 50 iterations.
    """
    cost_history = []

    for step in range(num_iterations):
        grad_w, grad_b, current_cost = propagation(w, b, x, y)
        w = w - learning_rate*grad_w
        b = b - learning_rate*grad_b

        # Only every 50th iteration is logged.
        if step % 50 != 0:
            continue
        cost_history.append(current_cost)
        if print_cost:
            print ("Cost after iteration %i: %f" %(step, current_cost))

    return w, b, cost_history

In [58]:
def prediction(w, b, x):
    """Predict binary labels with a fitted logistic regression.

    Parameters
    ----------
    w : ndarray with x.shape[0] elements
        Weights; reshaped to a (n_features, 1) column internally.
    b : scalar
        Bias term.
    x : ndarray of shape (n_features, n_samples)
        One sample per column.

    Returns
    -------
    ndarray of shape (1, n_samples), dtype float
        1.0 where the predicted probability is strictly greater than 0.5,
        otherwise 0.0.
    """
    w = w.reshape(x.shape[0], 1)
    y_hat = 1/(1 + np.exp(-(np.dot(w.T, x) + b)))
    # Vectorised threshold instead of a per-element Python loop.
    return (y_hat > 0.5).astype(float)

In [59]:
def model(x_train, y_train, x_test, y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """Train the logistic-regression baseline and report its accuracy.

    Starts from zero weights (one per row of x_train) and a zero bias, fits
    them with optimization(), predicts on both data sets with prediction(),
    and prints the train and test accuracy as percentages.

    Returns
    -------
    dict with keys 'costs', 'y_pred_test', 'y_pred_train', 'w', 'b',
    'learning_rate' and 'num_iterations'.
    """
    initial_b = 0
    initial_w = init_w(np.shape(x_train)[0])

    # Fit the parameters with gradient descent.
    w, b, costs = optimization(
        initial_w, initial_b, x_train, y_train,
        num_iterations, learning_rate, print_cost,
    )

    y_pred_test = prediction(w, b, x_test)
    y_pred_train = prediction(w, b, x_train)

    # Accuracy is 100 minus the mean absolute prediction error, in percent.
    train_accuracy = 100 - np.mean(np.abs(y_pred_train - y_train)) * 100
    print('train accuracy: {} %'.format(train_accuracy))
    test_accuracy = 100 - np.mean(np.abs(y_pred_test - y_test)) * 100
    print('test accuracy: {} %'.format(test_accuracy))

    return {
        'costs': costs,
        'y_pred_test': y_pred_test,
        'y_pred_train': y_pred_train,
        'w': w,
        'b': b,
        'learning_rate': learning_rate,
        'num_iterations': num_iterations,
    }

In [60]:
# Experiment: 2000 iterations at learning rate 0.005, logging the cost every 50 iterations.
# The validation set used as "test" data here holds only 16 images, so the
# reported test accuracy can only move in steps of 6.25 percentage points.
output = model(train_img_final, train_labels_final, val_img_final, val_labels_final, 
               num_iterations=2000, learning_rate=0.005, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 50: 0.497045
Cost after iteration 100: 0.326489
Cost after iteration 150: 0.242337
Cost after iteration 200: 0.223689
Cost after iteration 250: 0.209860
Cost after iteration 300: 0.199025
Cost after iteration 350: 0.190259
Cost after iteration 400: 0.182993
Cost after iteration 450: 0.176855
Cost after iteration 500: 0.171591
Cost after iteration 550: 0.167017
Cost after iteration 600: 0.163000
Cost after iteration 650: 0.159440
Cost after iteration 700: 0.156258
Cost after iteration 750: 0.153394
Cost after iteration 800: 0.150800
Cost after iteration 850: 0.148436
Cost after iteration 900: 0.146272
Cost after iteration 950: 0.144281
Cost after iteration 1000: 0.142441
Cost after iteration 1050: 0.140735
Cost after iteration 1100: 0.139147
Cost after iteration 1150: 0.137663
Cost after iteration 1200: 0.136274
Cost after iteration 1250: 0.134968
Cost after iteration 1300: 0.133739
Cost after iteration 1350: 0.132578
Cost after iter

In [61]:
# Experiment: 500 iterations at learning rate 0.01, logging the cost every 50 iterations.
# Overwrites the previous `output`.
output = model(train_img_final, train_labels_final, val_img_final, val_labels_final, 
               num_iterations=500, learning_rate=0.01, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 50: 1.524937
Cost after iteration 100: 0.457551
Cost after iteration 150: 0.315320
Cost after iteration 200: 0.188661
Cost after iteration 250: 0.163206
Cost after iteration 300: 0.152922
Cost after iteration 350: 0.146684
Cost after iteration 400: 0.142256
Cost after iteration 450: 0.138775
train accuracy: 95.0536809815951 %
test accuracy: 87.5 %


In [62]:
# Experiment: 300 iterations at learning rate 0.01, logging the cost every 50 iterations.
# Overwrites the previous `output`.
output = model(train_img_final, train_labels_final, val_img_final, val_labels_final, 
               num_iterations=300, learning_rate=0.01, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 50: 1.524937
Cost after iteration 100: 0.457551
Cost after iteration 150: 0.315320
Cost after iteration 200: 0.188661
Cost after iteration 250: 0.163206
train accuracy: 94.1909509202454 %
test accuracy: 87.5 %


In [63]:
# Experiment: 200 iterations at learning rate 0.005, logging the cost every 50 iterations.
# Overwrites the previous `output`.
output = model(train_img_final, train_labels_final, val_img_final, val_labels_final, 
               num_iterations=200, learning_rate=0.005, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 50: 0.497045
Cost after iteration 100: 0.326489
Cost after iteration 150: 0.242337
train accuracy: 92.4463190184049 %
test accuracy: 81.25 %


In [64]:
# Experiment: 200 iterations at learning rate 0.01, logging the cost every 50 iterations.
# Overwrites the previous `output`.
output = model(train_img_final, train_labels_final, val_img_final, val_labels_final, 
               num_iterations=200, learning_rate=0.01, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 50: 1.524937
Cost after iteration 100: 0.457551
Cost after iteration 150: 0.315320
train accuracy: 92.48466257668711 %
test accuracy: 87.5 %
