<a href="https://colab.research.google.com/github/Aswani-ReddyKV/Melanoma_Detection/blob/main/aswani_reddy_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Problem statement: To build a CNN based model which can accurately detect melanoma. Melanoma is a type of cancer that can be deadly if not detected early. It accounts for 75% of skin cancer deaths. A solution which can evaluate images and alert the dermatologists about the presence of melanoma has the potential to reduce a lot of manual effort needed in diagnosis.

In [None]:
# import pathlib
# import tensorflow as tf
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# import os
# from glob import glob
# import PIL
# from tensorflow import keras
# from tensorflow.keras import layers
# from tensorflow.keras.models import Sequential
# from tensorflow.python.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPool2D
# from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPooling2D
# from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img,img_to_array

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Activation, Dropout, BatchNormalization, Rescaling
from tensorflow.keras.regularizers import l2
from tensorflow.keras import layers
import numpy as np
import os
import matplotlib.pyplot as plt
import os
import pathlib
import pandas as pd
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Dataset root on the mounted Drive, with the Train and Test folders below it.
rootfolder = '/content/drive/MyDrive/Colab Notebooks/SkinCancer_Data'
dataset_root = pathlib.Path(rootfolder)
train_dir = dataset_root / 'Train'
test_dir = dataset_root / 'Test'

In [None]:
# Count the .jpg files located one folder level below the Train directory.
train_images = list(train_dir.glob('*/*.jpg'))
train_img_count = len(train_images)
print("Total Images(Train):", train_img_count)

In [None]:
# Count the .jpg files located one folder level below the Test directory.
test_images = list(test_dir.glob('*/*.jpg'))
test_img_count = len(test_images)
print("Total Images(Test):", test_img_count)

Define parameters for loader

In [None]:
# Loader parameters shared by every dataset created below:
# number of images per batch, and the height/width passed as `image_size`.
batch_size = 32
img_height, img_width = 180, 180

Use 80% of the images for training and 20% for validation.
Creating two separate sets for Train and Validation.

In [None]:
# Training subset: the "training" side of a seeded 80/20 split of the Train directory.
# The validation cell uses the same seed and validation_split.
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    image_size=(img_height, img_width),
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=123,
)

In [None]:
# Validation subset: the "validation" side of the same seeded 80/20 split of the
# Train directory that the training cell uses.
val_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    image_size=(img_height, img_width),
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=123,
)

In [None]:
# Read the class names from the dataset returned by image_dataset_from_directory.
# The list is kept in its own variable because later cells reuse it.
class_names = train_ds.class_names
print(class_names)

In [None]:
# Display the number of classes.
len(class_names)

In [None]:
# Per-class image counts for the Train directory, collected into a DataFrame.
# Only *.jpg files are counted, matching the total computed from the
# '*/*.jpg' glob earlier. os.listdir would also count non-image entries such as
# the `output` sub-folder that Augmentor creates inside each class folder.
datalist = [
    [c, sum(1 for _ in (train_dir / c).glob('*.jpg'))]
    for c in class_names
]
df = pd.DataFrame(datalist, columns=['Class', 'Count'])

In [None]:
# Horizontal bar chart of the number of images in each class.
import seaborn as sns
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=df, x="Count", y="Class", palette='copper_r', ax=ax)

In [None]:
# Load the Test directory with the same image size and batch size as the
# training data. This uses tf.keras.utils.image_dataset_from_directory, the same
# entry point as the train/validation cells; the tf.keras.preprocessing alias
# is deprecated.
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

The `image_batch` is a tensor of the shape `(32, 180, 180, 3)`. This is a batch of 32 images of shape `180x180x3` (the last dimension refers to color channels RGB). The `label_batch` is a tensor of the shape `(32,)`, these are corresponding labels to the 32 images.

`Dataset.cache()` keeps the images in memory after they're loaded off disk during the first epoch.

`Dataset.prefetch()` overlaps data preprocessing and model execution while training.

In [None]:
# Input-pipeline tuning: cache the loaded images, shuffle the training batches,
# and prefetch so data loading overlaps with model execution.
# tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE alias.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

### Create the model
#### Create a CNN model which can accurately detect the 9 classes present in the dataset. Use `layers.Rescaling` (formerly `layers.experimental.preprocessing.Rescaling`) to normalize pixel values to the `[0, 1]` range. The RGB channel values are in the `[0, 255]` range, which is not ideal for a neural network, so it is good practice to standardize them to the `[0, 1]` range.

In [None]:

%time
# Baseline CNN.
# The datasets yield RGB images with channel values in [0, 255]; the Rescaling
# layer maps them to [0, 1] inside the model, so the same scaling is applied
# during training, evaluation and prediction.
input_shape = (img_height, img_width, 3)
num_classes = len(class_names)  # one softmax unit per class

model = Sequential([
    tf.keras.Input(shape=input_shape),
    Rescaling(1.0 / 255),

    # Convolutional feature extractor.
    Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    Conv2D(256, kernel_size=(3, 3), activation='relu'),

    # Dense classifier head with dropout and batch normalization.
    Flatten(),
    Dropout(0.2),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dense(num_classes, activation='softmax'),
])

### Compile the model
Choose an appropriate optimiser and loss function for model training.

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
optimizer = 'adam'
loss_fn = "sparse_categorical_crossentropy"
model.compile(
    loss=loss_fn,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

### Train the model

In [None]:
%%time
# Train for `epochs` passes over train_ds, validating on val_ds after each epoch.
# `batch_size` is deliberately not passed: train_ds was already batched when it
# was created, and Keras expects the argument to be left unset for dataset inputs.
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

### Visualizing training results

In [None]:
# Plot training vs. validation accuracy (left) and loss (right) per epoch.
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Evaluate the fitted model on the training and validation datasets and print
# the resulting loss/accuracy pairs.
loss, accuracy = model.evaluate(train_ds, verbose=1)
loss_v, accuracy_v = model.evaluate(val_ds, verbose=1)

report = [
    ("Accuracy: ", accuracy),
    ("Validation Accuracy: ", accuracy_v),
    ("Loss: ", loss),
    ("Validation Loss", loss_v),
]
for caption, value in report:
    print(caption, value)

### Compile the model
Choose an appropriate optimiser and loss function for model training.

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
# NOTE(review): this repeats the earlier compile cell on the same `model`
# object - confirm the repetition is intended.
optimizer = 'adam'
loss_fn = "sparse_categorical_crossentropy"
model.compile(
    loss=loss_fn,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

### Train the model

In [None]:
%%time
# Train for `epochs` passes over train_ds, validating on val_ds after each epoch.
# `batch_size` is deliberately not passed: train_ds was already batched when it
# was created, and Keras expects the argument to be left unset for dataset inputs.
# NOTE(review): `model` has not been rebuilt since the previous fit cell, so this
# presumably continues training the already-trained weights - confirm intended.
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

### Visualizing training results

In [None]:
# Plot training vs. validation accuracy (left) and loss (right) per epoch.
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Evaluate the fitted model on the training and validation datasets and print
# the resulting loss/accuracy pairs.
loss, accuracy = model.evaluate(train_ds, verbose=1)
loss_v, accuracy_v = model.evaluate(val_ds, verbose=1)

report = [
    ("Accuracy: ", accuracy),
    ("Validation Accuracy: ", accuracy_v),
    ("Loss: ", loss),
    ("Validation Loss", loss_v),
]
for caption, value in report:
    print(caption, value)

#### Todo: Write your findings after the model fit, see if there is an evidence of model overfit or underfit

### Findings from the above data
There is a large gap between the training accuracy and the validation accuracy.
Training accuracy is at 68% whereas validation accuracy is at 49%. This means the model performs well on the training data but poorly on the validation data.
This observation indicates that our model is overfitting (the model has memorized the training data).

In [None]:
# Data augmentation strategy chosen after analysing the model fit history:
# random horizontal/vertical flips, rotations, zooms and contrast changes.
from tensorflow.keras import layers

augmentation_layers = [
    layers.RandomFlip(
        "horizontal_and_vertical",
        input_shape=(img_height, img_width, 3),
    ),
    layers.RandomRotation(0.2),
    layers.RandomZoom(0.2),
    layers.RandomContrast(0.1),
]
data_augmentation = tf.keras.Sequential(augmentation_layers)


In [None]:
# Visualize the augmentation strategy: run one training batch through
# data_augmentation nine times and show the first image of each result in a 3x3 grid.
plt.figure(figsize=(10, 10))
for images, _ in train_ds.take(1):
    for i in range(9):
        augmented_images = data_augmentation(images)
        first_image = augmented_images[0].numpy().astype("uint8")
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(first_image)
        plt.axis("off")


### Create the model, compile and train the model

In [None]:
%time
# CNN with data augmentation.
# Same architecture as the baseline model, but the `data_augmentation` pipeline
# defined earlier is now actually part of the model (previously it was defined
# and visualized but never applied). The Rescaling layer then maps the
# [0, 255] channel values to [0, 1] before the first convolution.
input_shape = (img_height, img_width, 3)
num_classes = len(class_names)  # one softmax unit per class

model = Sequential([
    tf.keras.Input(shape=input_shape),
    data_augmentation,
    Rescaling(1.0 / 255),

    # Convolutional feature extractor.
    Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    Conv2D(256, kernel_size=(3, 3), activation='relu'),

    # Dense classifier head with dropout and batch normalization.
    Flatten(),
    Dropout(0.2),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
optimizer = 'adam'
loss_fn = "sparse_categorical_crossentropy"
model.compile(
    loss=loss_fn,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

### Train the model

In [None]:
%%time
# Train for `epochs` passes over train_ds, validating on val_ds after each epoch.
# `batch_size` is deliberately not passed: train_ds was already batched when it
# was created, and Keras expects the argument to be left unset for dataset inputs.
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

### Visualizing training results

In [None]:
# Plot training vs. validation accuracy (left) and loss (right) per epoch.
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Evaluate the fitted model on the training and validation datasets and print
# the resulting loss/accuracy pairs.
loss, accuracy = model.evaluate(train_ds, verbose=1)
loss_v, accuracy_v = model.evaluate(val_ds, verbose=1)

report = [
    ("Accuracy: ", accuracy),
    ("Validation Accuracy: ", accuracy_v),
    ("Loss: ", loss),
    ("Validation Loss", loss_v),
]
for caption, value in report:
    print(caption, value)

#### Todo: Write your findings after the model fit, see if there is an evidence of model overfit or underfit. Do you think there is some improvement now as compared to the previous model run?

#### **Todo:** Find the distribution of classes in the training dataset.
#### **Context:** Many times real life datasets can have class imbalance, one class can have proportionately higher number of samples compared to the others. Class imbalance can have a detrimental effect on the final model quality. Hence as a sanity check it becomes important to check what is the distribution of classes in the data.

In [None]:
# Per-class image counts for the Train directory, collected into a DataFrame.
# Only *.jpg files are counted, matching the total computed from the
# '*/*.jpg' glob earlier. os.listdir would also count non-image entries such as
# the `output` sub-folder that Augmentor creates inside each class folder.
datalist = [
    [c, sum(1 for _ in (train_dir / c).glob('*.jpg'))]
    for c in class_names
]
df = pd.DataFrame(datalist, columns=['Class', 'Count'])

In [None]:
# Display one row per class (class name and image count).
df.head(len(class_names))

In [None]:
# Horizontal bar chart of the number of images in each class.
import seaborn as sns
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=df, x="Count", y="Class", palette='copper_r', ax=ax)

#### Write your findings here:
#### - Which class has the least number of samples?
The "seborrheic keratosis" class has the fewest samples (77).
#### - Which classes dominate the data in terms proportionate number of samples?
The "pigmented benign keratosis" class dominates the data, with the highest number of samples.

#### **Todo:** Rectify the class imbalance
#### **Context:** You can use a python package known as `Augmentor` (https://augmentor.readthedocs.io/en/master/) to add more samples across all classes so that none of the classes have very few samples.

In [None]:
!pip install Augmentor

To use `Augmentor`, the following general procedure is followed:

1. Instantiate a `Pipeline` object pointing to a directory containing your initial image data set.<br>
2. Define a number of operations to perform on this data set using your `Pipeline` object.<br>
3. Execute these operations by calling the `Pipeline’s` `sample()` method.

In [None]:
# Oversample every class with Augmentor so that none of the classes are sparse.
# The class folders are derived from train_dir instead of repeating the Drive
# path, and only the shortfall up to `samples_per_class` is generated, so
# re-running this cell does not keep adding images to the `output` folders.
import Augmentor

path_to_training_dataset = str(train_dir) + "/"  # same value as before, kept for later cells
samples_per_class = 500

for i in class_names:
    class_dir = train_dir / i
    # Augmented images from an earlier run of this cell, if any.
    already_generated = sum(1 for _ in (class_dir / 'output').glob('*.jpg'))
    missing = samples_per_class - already_generated
    if missing <= 0:
        print(f'{i}: {already_generated} augmented images already present, skipping.')
        continue
    p = Augmentor.Pipeline(str(class_dir))
    print(p)
    p.rotate(probability=0.7, max_left_rotation=10, max_right_rotation=10)
    p.sample(missing)

Augmentor has stored the augmented images in the `output` sub-directory of each skin cancer type's directory. Let's take a look at the total count of augmented images.

In [None]:
# Count the .jpg files in the `output` sub-folder of every class folder under Train.
augmented_images_found = list(train_dir.glob('*/output/*.jpg'))
image_count_train = len(augmented_images_found)
print(image_count_train)

In [None]:
from glob import glob

# Paths of all .jpg files in the per-class `output` folders; show the first five.
augmented_pattern = os.path.join(train_dir, '*', 'output', '*.jpg')
path_list_new = glob(augmented_pattern)
path_list_new[:5]

In [None]:
import os # imports the os module
from glob import glob # imports the glob function from the glob module

# Class (lesion) name for every path in path_list_new: the folder two levels
# above the file, i.e. <class>/output/<file>.jpg -> <class>.
# The names are derived from path_list_new rather than from a second glob scan,
# so both lists line up element by element when they are zipped together.
lesion_list_new = [
    os.path.basename(os.path.dirname(os.path.dirname(y)))
    for y in path_list_new
]
lesion_list_new[:5]

In [None]:
# Map each augmented image path to its class (lesion) name.
dataframe_dict_new = dict(zip(path_list_new, lesion_list_new))

In [None]:
# For every class, report the number of .jpg files directly in the class folder
# plus the number of .jpg files in its `output` sub-folder.
for i in class_names:
    class_dir = train_dir / i
    length = len(list(class_dir.glob('*.jpg')))
    length_out = len(list((class_dir / 'output').glob('*.jpg')))
    length_tot = length + length_out
    print(f'{i} has {length_tot} samples.')