# Aksacarma Machine Learning
This notebook runs on Google Colab with Python 3.10.12, TensorFlow 2.12.0, and Keras 2.12.0.

In [None]:
# Print the Python version of the runtime (this notebook was written against Python 3.10.12)
!python --version

In [None]:
# Print the installed TensorFlow version (this notebook was written against TensorFlow 2.12.0)
import tensorflow as tf
print(tf.__version__)

In [None]:
# Print the installed Keras version (this notebook was written against Keras 2.12.0)
import keras
print(keras.__version__)

## Get The Data
The data to be used is **Dermnet**, obtained from https://www.kaggle.com/datasets/shubhamgoel27/dermnet.

This section only needs to be run once.
This section will download the data from kaggle and standardize the dataset folder name.

In [None]:
# opendatasets is a Python library for downloading datasets from online sources like Kaggle.
# Details: https://pypi.org/project/opendatasets/
# NOTE(review): the version is not pinned, so a later release may behave differently — consider pinning.
%pip install opendatasets

### Import Required Libraries

In [None]:
# Import required libraries
import opendatasets as od
import os
import shutil
from shutil import copyfile
import random

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import RMSprop
from tensorflow import keras

import matplotlib.pyplot as plt # matplotlib 3.7.1

### Download Dataset

In [None]:
# Download the Dermnet dataset from Kaggle with opendatasets.
# NOTE(review): presumably this asks for Kaggle credentials and extracts into "./dermnet",
# the folder the following cell points at — confirm.
dataset_url = "https://www.kaggle.com/datasets/shubhamgoel27/dermnet"
od.download(dataset_url)

In [None]:
# Paths of the downloaded Dermnet dataset and of its "train" and "test" sub-folders
source_dir = "./dermnet"
source_dir_train = os.path.join(source_dir, "train")
source_dir_test = os.path.join(source_dir, "test")

### Standardize The Dataset Folder Name

In [None]:
# Standardize a folder name
# Lower-cases the name, removes the word "photos", and joins the remaining words with "-"
def standerize_name(fn):
  """Return `fn` lower-cased, without "photos", with its words joined by "-".

  Any run of whitespace counts as a single word separator, and leading or
  trailing whitespace is dropped.
  """
  words = fn.lower().replace("photos", "").split()
  return "-".join(words)

In [None]:
# Rename every entry of a folder to its standardized name
def correct_name(main_folder_dir, rename_fn=None):
  """Rename each entry directly inside `main_folder_dir` to its standardized name.

  Args:
    main_folder_dir: Path of the folder whose entries are renamed.
    rename_fn: Optional callable mapping an existing entry name to its new
      name. Defaults to `standerize_name`.

  Raises:
    OSError: Propagated from `os.listdir` or `os.rename` (for example when
      `main_folder_dir` does not exist).
  """
  if rename_fn is None:
    rename_fn = standerize_name

  for old_entry in os.listdir(main_folder_dir):
    new_entry = rename_fn(old_entry)
    # Nothing to do for names that are already standardized
    # (e.g. when this cell is run a second time)
    if new_entry == old_entry:
      continue
    os.rename(os.path.join(main_folder_dir, old_entry),
              os.path.join(main_folder_dir, new_entry))


In [None]:
# Standardize the names of the entries inside "./dermnet/train" and "./dermnet/test"
correct_name(source_dir_train)
correct_name(source_dir_test)

After downloading and standardizing the folder names, the data is merged and sorted manually to suit model training. Irrelevant images in the Dermnet dataset, such as non-skin images and microscopy images, are deleted. The merged and sorted data can be accessed at the following [link](https://).

## Split Dataset
Split [merged dataset](https://) to train, validation, and test dataset. Data will be stored on Google Drive.

In [None]:
# Mount Google Drive at /content/drive so data stored there can be read (Google Colab only)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Define directories
# source_path is the Google Drive folder the merged and sorted source data is read from
source_path = "/content/drive/MyDrive/Colab Notebooks/merged-sorted-dermnet"
# Images from source_path are split into train/validation/test folders created under root_dir
root_dir = './dermnet-image-for-train'

In [None]:
# From the Dermnet dataset, 10 skin diseases were selected to build the machine learning model.
# Each entry is the name of one class folder; the following cells refer to this list as `skin_type`.
skin_type = [
    'acne-and-rosacea',
    'bullous-disease',
    'eczema',
    'melanoma-skin-cancer-nevi-and-moles',
    'scabies-lyme-disease-and-other-infestations-and-bites',
    'tinea-ringworm-candidiasis-and-other-fungal-infections',
    'urticaria-hives',
    'vascular-tumors',
    'vasculitis',
    'warts-molluscum-and-other-viral-infections',
    ]
# Alias kept so that code using the original variable name keeps working
skin_type_desease = skin_type

# n is the number of skin disease classes
n = len(skin_type)
print("Many of types of skin desease:", n)

In [None]:
# Delete every file whose name does not end in ".jpg"
# NOTE(review): this command targets "./merged-sorted-dermnet/", while the folders counted below
# are read from source_path — confirm the command points at the intended folder.
!find ./merged-sorted-dermnet/ -type f ! -name "*.jpg" -exec rm {} +

# Report how many entries each class folder under source_path contains
for fn in skin_type:
    source_path_skin = os.path.join(source_path, fn)
    print(f"There are {len(os.listdir(source_path_skin))} images of {fn}.")

In [None]:
# Remove root_dir (and everything in it) if it already exists, to prevent a
# FileExistsError if the folder-creation function is run several times
if os.path.exists(root_dir):
    shutil.rmtree(root_dir)

In [None]:
# Create the train, validation, and test folders
def create_train_val_dirs(root_path, type_distribution):
  """Create `<root_path>/<split>/<name>` for every split and every name.

  The splits are 'train', 'validation', and 'test', created in that order.

  Args:
    root_path: Folder under which the split folders are created.
    type_distribution: Names of the sub-folders created inside each split.

  Raises:
    FileExistsError: If one of the folders to create already exists.
  """
  for split_name in ('train', 'validation', 'test'):
    split_dir = os.path.join(root_path, split_name)
    for fn in type_distribution:
      os.makedirs(os.path.join(split_dir, fn))


In [None]:
# Try out create_train_val_dirs on root_dir
try:
  create_train_val_dirs(root_path=root_dir, type_distribution=skin_type)
except FileExistsError:
  print("You should not be seeing this since the upper directory is removed beforehand")

# Print the path of every folder found below root_dir
for parent_dir, child_dirs, _ in os.walk(root_dir):
    for child_name in child_dirs:
        print(os.path.join(parent_dir, child_name))

In [None]:
# Split source data into training, validation, and test data
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, TEST_DIR, SPLIT_SIZE, seed=None):
  """Randomly copy the files of SOURCE_DIR into three destination folders.

  Args:
    SOURCE_DIR: Folder holding the files to split.
    TRAINING_DIR: Existing folder receiving the training share.
    VALIDATION_DIR: Existing folder receiving the validation share.
    TEST_DIR: Existing folder receiving every remaining file.
    SPLIT_SIZE: Pair (training fraction, validation fraction). Each share is
      rounded down to a whole number of files; the remainder is the test share.
    seed: Optional seed for the shuffle. With the same seed and the same
      source files the split is identical on every run. When None, the
      module-level `random` generator is used.
  """
  files = []
  # Sorted so the outcome for a given seed does not depend on listing order
  for fn in sorted(os.listdir(SOURCE_DIR)):
    file_path = os.path.join(SOURCE_DIR, fn)

    if not os.path.isfile(file_path):
      # Sub-folders cannot be copied with copyfile, so they are left out
      print(f"{fn} is not a regular file, so ignoring.")
    elif os.path.getsize(file_path):
      files.append(fn)
    else:
      print(f"{fn} is zero length, so ignoring.")

  file_size = len(files)

  split_point_1 = int(file_size * SPLIT_SIZE[0])
  split_point_2 = int(file_size * SPLIT_SIZE[1]) + split_point_1

  rng = random if seed is None else random.Random(seed)
  shuffled = rng.sample(files, file_size)

  destinations = (
      (TRAINING_DIR, shuffled[:split_point_1]),
      (VALIDATION_DIR, shuffled[split_point_1:split_point_2]),
      (TEST_DIR, shuffled[split_point_2:]),
  )
  for target_dir, subset in destinations:
    for fn in subset:
      copyfile(os.path.join(SOURCE_DIR, fn), os.path.join(target_dir, fn))


In [None]:
# Process split skin disease data
# Define the root folders of the training, validation, and test splits under root_dir
TRAINING_DIR = os.path.join(root_dir, "train")
VALIDATION_DIR = os.path.join(root_dir, "validation")
TEST_DIR = os.path.join(root_dir, "test")

In [None]:
# Per-class folder lists, all in the same order as skin_type
# Source folder of every skin disease
SKIN_TYPE_DIR_SOURCE = [f"{source_path}/{fn}" for fn in skin_type]
# Training folder of every skin disease
SKIN_TYPE_DIR_TRAINING = [os.path.join(TRAINING_DIR, fn) for fn in skin_type]
# Validation folder of every skin disease
SKIN_TYPE_DIR_VALIDATION = [os.path.join(VALIDATION_DIR, fn) for fn in skin_type]
# Test folder of every skin disease
SKIN_TYPE_DIR_TEST = [os.path.join(TEST_DIR, fn) for fn in skin_type]

# Empty the destination folders in case this cell is run multiple times
for split_dirs in (SKIN_TYPE_DIR_TRAINING, SKIN_TYPE_DIR_VALIDATION, SKIN_TYPE_DIR_TEST):
  for class_dir in split_dirs:
    for entry in os.scandir(class_dir):
      os.remove(entry.path)

# Proportion of images used for training and validation; the remainder is the test set
split_size = (.8, .15) # 80% for training, 15% for validation, and rest 5% for test

# Split every class from its source folder into its training, validation, and test folders
for class_source, class_training, class_validation, class_test in zip(
    SKIN_TYPE_DIR_SOURCE, SKIN_TYPE_DIR_TRAINING, SKIN_TYPE_DIR_VALIDATION, SKIN_TYPE_DIR_TEST):
  split_data(class_source, class_training, class_validation, class_test, split_size)

# Check that the number of images matches the expected output
print("The number of images in folder Original -> Training, Validation, Test")
for class_index, class_name in enumerate(skin_type):
  count_source = len(os.listdir(SKIN_TYPE_DIR_SOURCE[class_index]))
  count_training = len(os.listdir(SKIN_TYPE_DIR_TRAINING[class_index]))
  count_validation = len(os.listdir(SKIN_TYPE_DIR_VALIDATION[class_index]))
  count_test = len(os.listdir(SKIN_TYPE_DIR_TEST[class_index]))
  print(f"{class_name} : {count_source} -> {count_training}, {count_validation}, {count_test}")

## Create Model

In [None]:
# Batch size passed to the training and validation data generators
BATCH_SIZE = 32

### Image Augmentation

In [None]:
# Create the training and validation data generators

# Training images: pixel values are multiplied by 1/255 and the images are randomly
# augmented (rotation, zoom, shifts, horizontal and vertical flips)
train_datagen = ImageDataGenerator(rescale = 1.0/255.,
                                   rotation_range = 40,
                                   zoom_range = 0.2,
                                   width_shift_range = 0.2,
                                   height_shift_range = 0.2,
                                   horizontal_flip = True,
                                   vertical_flip = True,
                                   )
train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
                                                      batch_size=BATCH_SIZE,
                                                      class_mode='categorical',
                                                      target_size=(150, 150)
                                                      )

# Validation images are only rescaled: random augmentation would make the validation
# metrics noisy and not comparable from one epoch to the next
validation_datagen = ImageDataGenerator(rescale = 1.0/255.)
# Read the validation split; the test split stays untouched for the final check
validation_generator = validation_datagen.flow_from_directory(directory=VALIDATION_DIR,
                                                              batch_size=BATCH_SIZE,
                                                              class_mode='categorical',
                                                              target_size=(150, 150)
                                                              )

In [None]:
# Define the number of steps (batches) per epoch for training and validation.
# Ceiling division is used so the final, partially filled batch is included and the
# step count is at least 1 whenever a generator holds at least one image.
STEP_SIZE_TRAIN = -(-train_generator.n // train_generator.batch_size)
STEP_SIZE_VALID = -(-validation_generator.n // validation_generator.batch_size)

### Import Transfer Learning InceptionV3



In [None]:
# Download the pre-trained InceptionV3 weights to /tmp. "notop" means the file excludes the fully connected layer used for classification.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for this download — confirm it is really needed.
!wget --no-check-certificate \
    https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5 \
    -O /tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5

In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import layers

# Path of the weights file downloaded in the previous cell
local_weights_file = '/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'

# Initialize the base model.
# The input shape matches the 150x150 target size of the data generators (3 channels),
# include_top=False leaves out the dense classification layers, and weights=None
# builds the network without loading weights (they are loaded from the file below).
pre_trained_model = InceptionV3(input_shape = (150, 150, 3),
                                include_top = False,
                                weights = None)

# Load the downloaded pre-trained weights into the base model.
pre_trained_model.load_weights(local_weights_file)

# Freeze every layer of the base model so its weights are not updated during training.
for layer in pre_trained_model.layers:
  layer.trainable = False

In [None]:
# Choose the layer named `mixed7` as the last layer of the base model;
# its output is the input of the classification head defined below
last_layer = pre_trained_model.get_layer('mixed7')
print('last layer output shape: ', last_layer.output_shape)
last_output = last_layer.output

### Define Model And Train Model

In [None]:
from tensorflow.keras import Model

# Classification head stacked on top of the chosen base-model layer
# Flatten the output layer to 1 dimension
x = layers.Flatten()(last_output)
# Three fully connected ReLU layers with 128, 256, and 512 units
x = layers.Dense(128, activation='relu')(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
# Dropout with rate 0.2 before the output layer
x = layers.Dropout(0.2)(x)
# Softmax output with one unit per skin disease class (n is the number of classes)
x = layers.Dense(n, activation='softmax')(x)

# Append the dense network to the base model
model = Model(pre_trained_model.input, x)

# Print the model summary. See dense network connected at the end.
model.summary()

In [None]:
# Configure the model with its optimizer, loss, and metrics.
# categorical_crossentropy pairs with the class_mode='categorical' generators, and the
# metric is registered under the name 'acc', which is the key used when plotting the history.
model.compile(
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001),
    loss = "categorical_crossentropy",
    metrics = ['acc']
    )

In [None]:
# Train the model for 15 epochs on the training generator, validating on the validation generator
# Result noted by the author: it can reach loss: 0.6811 - acc: 0.7690 - val_loss: 1.4539 - val_acc: 0.6322 in 8 × 15 epochs
# The returned history object holds the per-epoch metrics plotted below
history = model.fit(
    train_generator,
    validation_data = validation_generator,
    steps_per_epoch = STEP_SIZE_TRAIN,
    epochs = 15,
    validation_steps = STEP_SIZE_VALID,
    )

### Plot Training Accuracy and Loss

In [None]:
# Plot the training and validation curves recorded during training
def plot_acc_loss(history_model, metric):
    """Plot accuracy and loss per epoch for the training and validation data.

    Two figures are shown: accuracy first, then loss.

    Args:
        history_model: Object whose `history` attribute maps metric names to
            per-epoch value lists (such as the object returned by `model.fit`).
        metric: Name under which accuracy was recorded: "acc" or "accuracy".

    Raises:
        ValueError: If `metric` is neither "acc" nor "accuracy".
    """
    # Fail early with a clear message instead of hitting an unbound variable later
    if metric not in ("acc", "accuracy"):
        raise ValueError(f'metric must be "acc" or "accuracy", got {metric!r}')

    recorded = history_model.history
    acc = recorded[metric]
    val_acc = recorded[f"val_{metric}"]
    loss = recorded['loss']
    val_loss = recorded['val_loss']

    epochs = range(len(acc)) # Get number of epochs

    # Plot training and validation accuracy per epoch
    plt.plot(epochs, acc, 'r', label="Training Accuracy")
    plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    # Plot training and validation loss per epoch
    plt.plot(epochs, loss, 'r', label="Training Loss")
    plt.plot(epochs, val_loss, 'b', label="Validation Loss")
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Plot the curves of the training run; "acc" is the metric name given to model.compile
plot_acc_loss(history, "acc")

## Export Model
After the model is trained, save the model

In [None]:
# Export the trained model to the folder saved_model/ml-aksacarma
# NOTE(review): the path has no ".h5" extension, so this presumably writes the TensorFlow
# SavedModel format rather than HDF5 — confirm.
!mkdir -p saved_model
model.save('saved_model/ml-aksacarma')

In [None]:
# For Google Colab: zip the /content/saved_model folder into savedmodel.zip and download it
# NOTE(review): the absolute paths assume the notebook's working directory is /content — confirm.
from google.colab import files
!zip -r savedmodel.zip "/content/saved_model"
files.download("/content/savedmodel.zip")

## Load Saved Model

In [None]:
# Load the saved model from Google Drive and print its architecture
# NOTE(review): this path is on Google Drive, while the export cell writes to
# "saved_model/ml-aksacarma" — presumably the exported model is copied to Drive by hand; confirm.
model_path = "/content/drive/MyDrive/Colab Notebooks/saved_model/ml-aksacarma"
new_model = tf.keras.models.load_model(model_path)
new_model.summary()

In [None]:
from keras.preprocessing import image
# The next import rebinds the name `image`: from here on `image` refers to keras.utils
import keras.utils as image
import numpy as np

# Human-readable labels of the skin diseases: the same diseases, in the same order, as the
# class folder names defined at the beginning, with spaces instead of hyphens.
# NOTE(review): the label at index i is assumed to match output unit i of the model — confirm
# against the class indices of the training generator.
skin_type_desease = [
    'acne and rosacea',
    'bullous disease',
    'eczema',
    'melanoma skin cancer nevi and moles',
    'scabies lyme disease and other infestations and bites',
    'tinea ringworm candidiasis and other fungal infections',
    'urticaria hives',
    'vascular tumors',
    'vasculitis',
    'warts molluscum and other viral infections',
    ]

In [None]:
# Determine the predicted class
def what_class(arr_prob_class, type_skin_diseases, threshold=0.5):
  """Return the label of the most probable class, or None if it is not probable enough.

  Args:
    arr_prob_class: One-dimensional sequence of class probabilities.
    type_skin_diseases: Labels, where the label at index i belongs to the
      probability at index i.
    threshold: Minimum probability the best class must reach. Defaults to 0.5.

  Returns:
    The label of the class with the highest probability, or None when that
    probability is below `threshold` or no label exists for that index.
  """
  # Get the class with the highest probability
  index_highest_probability_class = int(np.argmax(arr_prob_class))
  if arr_prob_class[index_highest_probability_class] < threshold:
    return None

  # No label available for this index
  if index_highest_probability_class >= len(type_skin_diseases):
    return None

  return type_skin_diseases[index_highest_probability_class]

In [None]:
# Predict the skin disease shown in an image
def predict_skin_desease(img_path, new_model, skin_type_desease):
    """Show the image at `img_path` and return the label the model predicts for it.

    Args:
        img_path: Path of the image file to classify.
        new_model: Model whose `predict` method is called on the image.
        skin_type_desease: Labels handed to `what_class` together with the
            predicted probabilities.

    Returns:
        Whatever `what_class` returns for the first (only) prediction row.
    """
    # Load the image at 150x150 and display it
    img = image.load_img(img_path, target_size=(150, 150))
    plt.imshow(img)

    # Turn the image into a batch of one and normalize it by dividing by 255
    batch = np.expand_dims(image.img_to_array(img), axis=0) / 255.0

    # Predict and map the probabilities of the single image to a label
    predictions = new_model.predict(batch)
    return what_class(predictions[0], skin_type_desease)

In [None]:
# Path of the image to classify: here a file from the test split, stored on Google Drive
path = "/content/drive/MyDrive/Colab Notebooks/dermnet-image-for-train/test/warts-molluscum-and-other-viral-infections/corns-36.jpg"
# Print the predicted label (None is printed when no class reaches the probability threshold)
print(predict_skin_desease(path, new_model ,skin_type_desease))