# MA3832 Assignment 4 - Capstone (Data Wrangling & Proposed Model)
Done By: Josiah Teh

## AWS Setup

In [1]:
%%sh
ls -l

total 868
drwxrwxr-x 6 ec2-user ec2-user   4096 Jan 31 07:03 COVID-19_Radiography_Dataset
drwxrwxr-x 2 ec2-user ec2-user   4096 Jan 31 07:10 data
-rw-rw-r-- 1 ec2-user ec2-user   5267 Jan 31 14:36 MA3832_Capstone_Model.py
-rw-rw-r-- 1 ec2-user ec2-user 868841 Jan 31 14:35 MA3832_ProposedModel.ipynb


In [2]:
%%sh
sudo chown ec2-user lost+found
sudo chgrp ec2-user lost+found

chown: cannot access ‘lost+found’: No such file or directory
chgrp: cannot access ‘lost+found’: No such file or directory


CalledProcessError: Command 'b'sudo chown ec2-user lost+found\nsudo chgrp ec2-user lost+found\n'' returned non-zero exit status 1.

In [None]:
%%sh
ls -l

In [None]:
# Setup environment
import sagemaker #import sagemaker
print(sagemaker.__version__) 
sess = sagemaker.Session()                  
role = sagemaker.get_execution_role() 

## Data Loading

In [None]:
# Kaggle login
import os
os.environ["KAGGLE_USERNAME"] = "wdaable"
os.environ["KAGGLE_KEY"] = "8c751c49490a2cd7beb3c9f912893fb6"

In [None]:
# ! pip install kaggle

In [None]:
# ! kaggle datasets download tawsifurrahman/covid19-radiography-database

In [None]:
# ! unzip covid19-radiography-database.zip

## Pre-processing

In [None]:
# Necessary libraries
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, GlobalAveragePooling2D, Input
from tensorflow import keras

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import seaborn as sns 

In [None]:
# View first image of dataset
normal_img = cv2.imread("COVID-19_Radiography_Dataset/Normal/images/Normal-1.png")
covid_img = cv2.imread("COVID-19_Radiography_Dataset/COVID/images/COVID-1.png")
pneu_img = cv2.imread("COVID-19_Radiography_Dataset/Viral Pneumonia/images/Viral Pneumonia-1.png")

In [None]:
f = plt.figure(figsize = (20,5))
f.add_subplot(1, 3, 1)
plt.imshow(normal_img)
plt.title("Normal")

f.add_subplot(1, 3, 2)
plt.imshow(covid_img)
plt.title("COVID")

f.add_subplot(1, 3, 3)
plt.imshow(pneu_img)
plt.title("Viral Pneumonia")

plt.tight_layout()
plt.show()

In [None]:
# Check shapes
print(f"Shape of Normal Image: {normal_img.shape}")
print(f"Shape of COVID Image: {covid_img.shape}")
print(f"Shape of pneumonia Image: {pneu_img.shape}")

In [None]:
path = os.listdir('COVID-19_Radiography_Dataset/COVID/images')

print(path[:10])
print(len(path))

In [None]:
print(f"COVID-19_Radiography_Dataset/COVID/images/{path[0]}")

In [None]:
# Create function to return images and labels
def load_images(path, label):
    images = []
    labels = []
    urls = os.listdir(path)
    for i in range(len(urls)):
        img_path = f"{path}/{urls[i]}"
        img = cv2.imread(img_path)
        # Resize the images
        img = cv2.resize(img, (224, 224))
        images.append(img)
        labels.append(label)
    images = np.asarray(images)
    return images, labels

In [None]:
path = "COVID-19_Radiography_Dataset/Normal/images"
normal_images, normal_labels = load_images(path, 0)

# Check length of images to os directory
print(len(os.listdir(path)))
print(len(normal_images))
print(normal_images.shape)

In [None]:
path = "COVID-19_Radiography_Dataset/COVID/images"
COVID_images, COVID_labels = load_images(path, 1)

# Check length of images to os directory
print(len(os.listdir(path)))
print(len(COVID_images))
print(COVID_images.shape)

In [None]:
path = "COVID-19_Radiography_Dataset/Viral Pneumonia/images"
pneu_images, pneu_labels = load_images(path, 2)

# Check length of images to os directory
print(len(os.listdir(path)))
print(len(pneu_images))
print(pneu_images.shape)

In [None]:
# Join all images and labels together
x = np.r_[normal_images, COVID_images, pneu_images]
y = np.r_[normal_labels, COVID_labels, pneu_labels]

# Check
print(x.shape)
print(y.shape)

In [None]:
# Split data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Check
print(f"Shape of X Train: {x_train.shape}")
print(f"Shape of Y Train: {y_train.shape}")
print(f"Shape of X Test: {x_test.shape}")
print(f"Shape of Y Test: {y_test.shape}")

In [None]:
y_test

In [None]:
# Store the data to the notebook, then to AWS bucket
os.makedirs("./data", exist_ok=True)
np.savez('./data/training', image=x_train, label=y_train)
np.savez('./data/test', image=x_test, label=y_test)

In [None]:
# Save to to AWS bucket
prefix = "Capstone" 

training_input_path = sess.upload_data('data/training.npz', key_prefix = prefix+'/training')
test_input_path = sess.upload_data('data/test.npz', key_prefix = prefix+'/validation')

# Check paths
print(training_input_path)
print(test_input_path)

In [None]:
# training_input_path = "s3://sagemaker-ap-southeast-1-875555675952/Capstone/training/training.npz"
# test_input_path = "s3://sagemaker-ap-southeast-1-875555675952/Capstone/validation/test.npz"

## Initialise the Model

In [None]:
from sagemaker.tensorflow import TensorFlow
# location to store models
model_location = 's3://sagemaker-ap-southeast-1-875555675952/models'

cnn_model = TensorFlow(entry_point='MA3832_Capstone_Model.py',  #Python script
                          base_job_name='',
                          source_dir='.',
                          role=role,
                          instance_count=1, 
                          instance_type='ml.m4.xlarge', # instance type
                          framework_version='2.1.0', # Tensorflow version
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={'epochs': 10},
                           output_path=model_location,
                          ## after this line, everything is optional for managed spot instance
                          use_spot_instances=True,        # Use spot instance
                          max_run=3600,                    # Max training time
                          max_wait=7200,                  # Max training time + spot waiting time
                         )

In [None]:
cnn_model.fit({'training': training_input_path, 'validation': test_input_path})

In [None]:
# Deploy model as an endpoint 
proposed_predictor = cnn_model.deploy(initial_instance_count=1, # The initial number of instances to run in the Endpoint created from this Model.
                                      instance_type='ml.t2.medium', # The EC2 instance type to deploy this Model to.
                                      endpoint_name="proposed-CNN-model") # The name of the endpoint to create  



## Evaluation

In [None]:
# import io
# import json
# import boto3
# client = boto3.client('sagemaker-runtime') 
# test_file = io.StringIO()
# x_test[:1000].to_txt(test_file, header=None, index=None)
# ioc_predictor_endpoint_name = 'proposed-CNN-model'
# content_type = 'application/json' 
# ioc_response = client.invoke_endpoint(
# EndpointName=ioc_predictor_endpoint_name,
# Body=x_test,
# ContentType=content_type
# )

In [None]:
# proposed_predictor.delete_endpoint()