#Technical Project:Brain Tumor Detection

##Goals
1.  Predicting from the brain MRI scan images if the image has a tumor or not.
2.  Which model has a higher accuracy in detecting tumor  which in turn will help medical institutes to devise a cure quicker and help save a     lot of lives.
3.  Which gender gets diagnosed most with brain tumor?
4.  Number of death caused due to brain tumor.

In [0]:
#installing libraries for reading and processing images
!pip install opencv-python
!pip install scikit-image

In [0]:
#importing the required libraries
import mlflow
import numpy as np 
import pandas as pd
import os
import tensorflow as tf
import cv2
from tensorflow import keras
from tensorflow.keras import layers, Input
from keras.layers import InputLayer, MaxPooling2D, Flatten, Dense, Conv2D, Dropout
from keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions, ResNet50
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.optimizers import Adam, SGD

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from PIL.Image import open

from  matplotlib import pyplot as plt
import matplotlib.image as mpimg
import random
%matplotlib inline

In [0]:
#creating spark dataframe

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType

brain_df = (sqlContext.read.format("csv").
  option("header", "true").
  option("nullValue", "NA").
  option("inferSchema", True).
  option("encoding", "UTF-8").  
  option("ignoreLeadingWhiteSpace", True).
  option("ignoreTrailingWhiteSpace", True).
  option("multiLine", True).
  load("/FileStore/tables/Brain_Tumor/Brain_Tumor__2_.csv", usecols=[0,1]))

In [0]:
#projecting the required features for image pre processing
brain_df=brain_df[['Image','Class']]
brain_df.show()

In [0]:
#loading images to spark dataframe
brainImages_df=spark.read.format("image").load("dbfs:/FileStore/tables/Images")
display(brainImages_df)

In [0]:
#converting spark dataframe to pandas dataframe for further image processing
brain_df=brain_df.toPandas()
brain_df[['Image','Class']].head()

In [0]:
path_list = []
base_path = '/dbfs/FileStore/tables/Images'
for entry in os.listdir(base_path):
    path_list.append( os.path.join(base_path,entry))

In [0]:
# image_df = spark.read.format("binaryFile").load("dbfs:/FileStore/tables/Images")
# image_df.show()

In [0]:
#retrieving the path for the entire image dataset
paths_dict={os.path.splitext(os.path.basename(x))[0]: x for x in path_list}
brain_df['paths'] = brain_df['Image'].map(paths_dict.get)
brain_df[['Image','Class','paths']].head()

In [0]:
#from pyspark.sql import SparkSession
#Create PySpark SparkSession
#spark = SparkSession.builder \
#      .master("local[1]") \
#      .appName("SparkByExamples.com") \
#      .getOrCreate()
#Create PySpark DataFrame from Pandas
# Brain_df=spark.createDataFrame(brain_df[['Image','Class','paths',]]) 
# Brain_df.printSchema()
# Brain_df[['Image','Class','paths']].show()

In [0]:
# Brain_df=Brain_df.toPandas()
# Brain_df.head()

In [0]:
#plotting some samples from the dataset
for x in range (0 ,9):
    img = cv2.imread(brain_df['paths'][x])
    plt.imshow(img)
    plt.subplot(3,3,x+1)

In [0]:
#Brain_df[['pixels']]=Brain_df[['paths']].map(lambda x:np.asarray(open(x).resize((224,224))))

#RDD2 = sc.parallelize(Brain_df)
#Brain_df2=RDD2.map(lambda x:np.asarray(open(x).resize((224,224))))
#Brain_df2=RDD2.toDF(["Image","Class","paths","pixels"])
#Brain_df2.show()

In [0]:
#converting the images to pixels to get the input shape for the models
Brain_df['pixels']=Brain_df['paths'].map(lambda x:np.asarray(open(x).resize((224,224))))
Brain_df.head()

In [0]:
# from pyspark.sql import SparkSession
# #Create PySpark SparkSession
# spark = SparkSession.builder \
#      .master("local[1]") \
#      .appName("SparkByExamples.com") \
#      .getOrCreate()
# #Create PySpark DataFrame from Pandas
# sparkBrain_df=spark.createDataFrame(Brain_df[['Image','Class','paths','pixels']]) 
# sparkBrain_df.printSchema()
# sparkBrain_df.show()

In [0]:
image_list = []
for i in range(len(Brain_df)):
    brain_image = Brain_df["pixels"][i].astype(np.float32)
    brain_image /= 255
    image_list.append(brain_image)
X = np.array(image_list)
print(X.shape)

In [0]:
y = np.array(Brain_df.Class)
y.shape

In [0]:
print(y.shape)

In [0]:
#train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('The shape of the X_train :'+' '+str(X_train.shape))
print('The size of the X_train :'+' '+str(X_train.shape[0]))
print('The shape of the X_test :'+' '+str(X_test.shape))
print('The size of the X_test:'+' '+str(X_test.shape[0]))

In [0]:
#building ResNet50 model

def model(input_shape):
    #res_conv = ResNet50(include_top=False, weights="imagenet", input_tensor=None, input_shape=input_shape, pooling=None)
    model = Sequential()
    
    model.add(Input(shape=input_shape))
    
    model.add(Conv2D(16, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
    model.add(Conv2D(16, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_last", padding='same'))
            
    model.add(Conv2D(32, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
    model.add(Conv2D(32, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_last", padding='same'))
    
    model.add(Conv2D(64, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
    model.add(Conv2D(64, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_last", padding='same'))
    
#     model.add(Conv2D(128, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
#     model.add(Conv2D(128, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
#     model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_last", padding='same'))
    
#     model.add(Conv2D(256, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
#     model.add(Conv2D(256, kernel_size=3, strides=(2, 2), padding="same", activation="relu", kernel_initializer="he_normal"))
#     model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_last", padding='same'))
    
    model.add(Flatten())
    model.add(Dense(256, activation="relu"))
    model.add(Dense(128, activation="relu"))
#     model.add(Dropout(0.4))
    model.add(Dense(1, activation="sigmoid"))    # Never use sigmoid for binary classification
    
    return model

In [0]:
model = model(input_shape = (224,224, 3)) #loading the model with the input

In [0]:
model.summary() #working of the layers of the model

In [0]:
# optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name="Adam",)
optimizer = SGD(learning_rate=0.01)
loss_fn = BinaryCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [0]:
# Training the model
history = model.fit(x=X_train, y=y_train, epochs=50, batch_size=10)

In [0]:
loss = history.history["loss"]
acc = history.history["accuracy"]

In [0]:
#plotting the training loss
epoch = np.arange(50)
plt.plot(epoch, loss)
# plt.plot(epoch, val_loss)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend(['train'])

In [0]:
#plotting training accuracy
epoch = np.arange(50)
plt.plot(epoch, acc)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training Accuracy')
plt.legend(['train'])

In [0]:
#evaluating performance of model
eval_score = model.evaluate(X_test, y_test)
print("Test loss:", eval_score[0])
print("Test accuracy:", eval_score[1])

In [0]:
y_pred = model.predict(X_test) # prediction with the model

In [0]:
#converting the continous data to binary data for corelation matrix
pred_y = [1 if x>0.5 else 0 for x in y_pred]
print(pred_y)

In [0]:
#prediction of classes of the images and the precision of the prediction
from sklearn.metrics import confusion_matrix, classification_report
target_classes = ['No Tumor','Tumor']
classification_report(y_test, pred_y, output_dict = True, target_names=target_classes)

In [0]:
from sklearn.metrics import confusion_matrix
cf_matrix=confusion_matrix(y_test , pred_y )
print(cf_matrix)

In [0]:
#plotting confusion matrix
import seaborn as sns
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['0-No','1-Yes'])
ax.yaxis.set_ticklabels(['0-No','1-Yes'])

## Display the visualization of the Confusion Matrix.
plt.show()

In [0]:
num_classes = 1
model = Sequential()
model.add(MobileNetV2(input_shape=(224, 224, 3),weights="imagenet"
                             ,include_top=False))
model.add(keras.layers.GlobalAveragePooling2D())
model.add( Dense(num_classes, activation='sigmoid',name='preds'))
model.layers[0].trainable= False
# show model summary
model.summary()

In [0]:
#building MobileV2Net
model.compile(
    # set the loss as binary_crossentropy
    loss=keras.losses.binary_crossentropy,
    # set the optimizer as stochastic gradient descent
    optimizer=keras.optimizers.SGD(lr=0.001),
    # set the metric as accuracy
    metrics=['accuracy']
)


# train the model 
model.fit(
    X_train[:,:,:,:],
    y_train[:],
    epochs=50,
    verbose=1,
    validation_data=(X_test[:,:,:,:], y_test[:])
)

In [0]:
# model.save(os.path('dbfs:/FileStore/tables/model_brain.h5'))

#pretrained_cnn = keras.models.load_model(os.path('/dbfs/tmp/hive/root/model_brain.h5'))

# modelpath = "/dbfs/FileStore/model/aaa"
# mlflow.sklearn.save_model(model, modelpath)
# print("Saved model to DBFS")

In [0]:
%fs ls /FileStore/tables/

In [0]:
# evaluate model on holdout set
eval_score = model.evaluate(X_test,y_test)
# print loss score
print('Eval loss:',eval_score[0])
# print accuracy score
print('Eval accuracy:',eval_score[1] )

Which model has a higher accuracy in detecting tumor which in turn will help medical institutes to devise a cure quicker and help save a lot of lives?

So from above results we can see that ResNet50 has a higher accuracy than MobileV2Net

In [0]:
#plotting the training loss
epoch = np.arange(50)
plt.plot(epoch, loss)
# plt.plot(epoch, val_loss)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend(['train'])

In [0]:
#plotting the test loss
epoch = np.arange(50)
plt.plot(epoch, acc)
# plt.plot(epoch, val_accuracy)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training Accuracy')
plt.legend(['train'])

In [0]:
y_pred = model.predict(X_test) # prediction with the model

In [0]:
#converting the continous data to binary data for corelation matrix
pred_y = [1 if x>0.5 else 0 for x in y_pred]
print(pred_y)

In [0]:
# def predict_classes(self, X_test, batch_size=32, verbose=1):
#   proba = self.predict(X_test, batch_size=batch_size, verbose=verbose)
#     if self.class_mode=='categorical':
#       return proba.argmax(axis=-1)
#     else:
#       return (proba > 0.5).astype('int32')

In [0]:
from sklearn.metrics import confusion_matrix, classification_report
target_classes = ['No Tumor','Tumor']
classification_report(y_test, pred_y, output_dict = True, target_names=target_classes)

In [0]:
from sklearn.metrics import confusion_matrix
cf_matrix=confusion_matrix(y_test , pred_y )
print(cf_matrix)

In [0]:
#plotting confusion matrix
import seaborn as sns
ax = sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['0-No','1-Yes'])
ax.yaxis.set_ticklabels(['0-No','1-Yes'])

## Display the visualization of the Confusion Matrix.
plt.show()

Predicting from the brain MRI scan images if the image has a tumor or not.

From the above confusion matrix we can see that the ResNet50 model correctly predicts for 424 images to not have a tumor and 310 images to have tumor.

The confusion matrix for MobileV2Net correctly predicts 393 images to not have tumor and 282 images to have a tumor.

In [0]:
#creating spark dataframe for EDA to answer the third and fourth hypoithesis
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType

detailedData_df = (sqlContext.read.format("csv").
  option("header", "true").
  option("nullValue", "NA").
  option("inferSchema", True).
  option("encoding", "UTF-8").  
  option("ignoreLeadingWhiteSpace", True).
  option("ignoreTrailingWhiteSpace", True).
  option("multiLine", True).
  load("dbfs:/FileStore/tables/detailed_data/detailed_data.csv",))

In [0]:
detailedData_df.show() # diplaying dataframe

In [0]:
detailedData_df.select("Patient").distinct().show() #displaying only patient column

In [0]:
print("Number of patients:", detailedData_df[['Patient']].distinct().count()) # displaying the count of patients

In [0]:
summary_df=detailedData_df.describe()
summary_df.show()

In [0]:
detailedData_df.printSchema() #printing schema of the dataframe

In [0]:
print("Number of patients:", detailedData_df[['Patient']].distinct().count())

In [0]:
Patient_df=detailedData_df.na.drop() #dropping columns rows with null values

In [0]:
Patient_df.show()

In [0]:
#checking if there is any null values left
from pyspark.sql.functions import isnan, when, count, col
nacounts=Patient_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in Patient_df.columns]).toPandas()
nacounts

In [0]:
print("Number of patients:", Patient_df[['Patient']].distinct().count()) #checking the count of patients after dropping null values

In [0]:
#dropping unwanted columns
Patient_df= Patient_df.drop("RNASeqCluster", "MethylationCluster", "miRNACluster", "CNCluster", "RPPACluster", "OncosignCluster", "COCCluster", "histological_type", "neoplasm_histologic_grade", "tumor_tissue_site", "laterality", "tumor_location", "age_at_initial_pathologic")
Patient_df.show()

In [0]:
#ordering the dataframe by gender and death
Patient_df=Patient_df.orderBy('death01','gender', ascending=True)
Patient_df.show()

In [0]:
Patient_df.printSchema() #printing schema after extracting only required features

In [0]:
#converting to pandas for bar plot to show the no of deaths and which gender is mostly diagnosed with brain tumor
Patient_df=Patient_df.toPandas()
Patient_df.head(78)

In [0]:
Patient_df['gender'].value_counts() #gives the count of which gender is mostly diagnosed with brain tumor

Which gender gets diagnosed most with brain tumor?

As per the abover results 44 males from the dataset gets diagnosed with brain tumor and 34 females gets diagnosed with brain tumor.

In [0]:
Patient_df['gender'].value_counts().plot(kind = 'bar')

In [0]:
Patient_df['death01'].value_counts() #gives the count of patients that have died having brain tumor

Number of death caused due to brain tumor.

As per above results 60 patients have died who has brain tumor and 18 have survived with brain tumor.

In [0]:
Patient_df['death01'].value_counts().plot(kind = 'bar')