In [1]:
# Import necessary libraries for image processing, data manipulation, model building, training, evaluation, and saving
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [2]:
# Set the path to the dataset and define the target image size
data_dir = 'Skin_Disease_Dataset'
img_size = 64

# Initialize empty lists to store image data and corresponding labels
images = []
labels = []

# Loop through each disease category folder in the dataset directory
for label in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, label)
    # Loop through each image file in the category folder
    for filename in os.listdir(folder_path):
        img_path = os.path.join(folder_path, filename)
        # Process only valid image file formats
        if filename.endswith(('.jpg', '.jpeg', '.png')):
            img = cv2.imread(img_path)  # Read the image
            if img is not None:
                img = cv2.resize(img, (img_size, img_size))  # Resize image to target size
                images.append(img)  # Append image data to list
                labels.append(label)  # Append corresponding label
            else:
                print(f"Warning: Could not read image {img_path}, skipping.")  # Handle unreadable images

In [3]:
# Convert the list of images to a NumPy array and normalize pixel values to range [0, 1]
images = np.array(images, dtype='float32') / 255.0

# Convert the list of labels to a NumPy array
labels = np.array(labels)

# Print the total number of images loaded
print(f"Total images loaded: {len(images)}")

Total images loaded: 26945


In [4]:
# Initialize the label encoder
le = LabelEncoder()

# Convert string labels (e.g., 'Acne', 'Eczema') to numeric form
labels_encoded = le.fit_transform(labels)

# Convert numeric labels to one-hot encoded format for training
labels_categorical = to_categorical(labels_encoded)

# Save the label encoder to a file for future use during prediction
with open("labels.pkl", "wb") as f:
    pickle.dump(le, f)

In [5]:
# Split the dataset into training and testing sets
# 80% of data will be used for training and 20% for testing
# random_state=42 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(images, labels_categorical, test_size=0.2, random_state=42)

In [6]:
# Build a Convolutional Neural Network (CNN) model using Keras Sequential API
model = Sequential([
    # First convolutional layer with 32 filters, 3x3 kernel, ReLU activation
    # Input shape corresponds to the image size and 3 color channels (RGB)
    Conv2D(32, (3,3), activation='relu', input_shape=(img_size, img_size, 3)),
    MaxPooling2D(2,2),  # Downsampling with 2x2 max pooling to reduce spatial dimensions

    # Second convolutional layer with 64 filters and ReLU activation
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(2,2),  # Another max pooling layer to further reduce size

    Flatten(),  # Flatten 3D feature maps to 1D feature vector for dense layers
    Dense(128, activation='relu'),  # Fully connected layer with 128 neurons and ReLU activation
    Dropout(0.5),  # Dropout layer with 50% rate to reduce overfitting

    # Output layer with neurons equal to number of classes
    # Softmax activation outputs class probabilities
    Dense(len(le.classes_), activation='softmax')
])

# Compile the model with Adam optimizer, categorical cross-entropy loss, and accuracy metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print a summary of the model architecture and parameters
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [7]:
# Train the CNN model on training data
# Validate the model on the test data after each epoch to monitor performance
# epochs=15 means the model will see the entire training set 15 times
# batch_size=32 means training will happen in mini-batches of 32 images for efficient learning
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=32)

Epoch 1/15
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 46ms/step - accuracy: 0.5488 - loss: 0.9756 - val_accuracy: 0.7669 - val_loss: 0.5106
Epoch 2/15
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 45ms/step - accuracy: 0.7486 - loss: 0.5614 - val_accuracy: 0.8135 - val_loss: 0.4489
Epoch 3/15
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 44ms/step - accuracy: 0.7946 - loss: 0.4483 - val_accuracy: 0.8406 - val_loss: 0.3705
Epoch 4/15
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 45ms/step - accuracy: 0.8335 - loss: 0.3897 - val_accuracy: 0.8605 - val_loss: 0.3125
Epoch 5/15
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 43ms/step - accuracy: 0.8533 - loss: 0.3547 - val_accuracy: 0.8901 - val_loss: 0.2613
Epoch 6/15
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 50ms/step - accuracy: 0.8737 - loss: 0.3037 - val_accuracy: 0.9011 - val_loss: 0.2441
Epoch 7/15
[1m6

In [8]:
from sklearn.metrics import classification_report

# Evaluate the trained model's loss and accuracy on the test dataset
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc * 100:.2f}%")  # Print test accuracy as percentage

# Predict class probabilities for all test images
y_pred_probs = model.predict(X_test)

# Convert predicted probabilities to class labels by selecting the index with highest probability
y_pred = np.argmax(y_pred_probs, axis=1)

# Get true class labels from one-hot encoded y_test by selecting the index of '1'
y_true = np.argmax(y_test, axis=1)

# Generate and print detailed classification report with precision, recall, f1-score for each class
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=le.classes_))

[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9144 - loss: 0.2135
Test Accuracy: 91.84%
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step

Classification Report:

              precision    recall  f1-score   support

        Acne       0.97      0.95      0.96      2036
      Eczema       0.64      0.60      0.62       332
    Melanoma       0.97      0.95      0.96       630
   Psoriasis       0.66      0.70      0.68       432
     Rosacea       0.95      0.97      0.96      1959

    accuracy                           0.92      5389
   macro avg       0.84      0.84      0.84      5389
weighted avg       0.92      0.92      0.92      5389



In [9]:
# Save the trained CNN model in Keras (.keras) format for later use or deployment
model.save("skin_disease_identification_cnn_model.keras")

In [10]:
def predict_image(img_path):
    # Read the image from the given path
    img = cv2.imread(img_path)
    if img is None:
        # Handle case when image path is invalid or file is missing
        print("Image not found.")
        return
    
    # Resize the image to match model input size and normalize pixel values
    img = cv2.resize(img, (img_size, img_size)) / 255.0
    
    # Add batch dimension since model expects input shape (1, img_size, img_size, 3)
    img = np.expand_dims(img, axis=0)

    # Get prediction probabilities from the model
    prediction = model.predict(img)
    
    # Find the index of the highest probability (predicted class)
    predicted_index = np.argmax(prediction)
    
    # Decode the label back to original class name
    predicted_class = le.inverse_transform([predicted_index])[0]

    # Print predicted class
    print(f"Predicted disease: {predicted_class}")
    
    # Print confidence scores for each class to understand model certainty
    print("Confidence scores:")
    for i, score in enumerate(prediction[0]):
        print(f"  {le.inverse_transform([i])[0]}: {score:.4f}")

In [11]:
# Call the prediction function on a sample image
predict_image('Skin_Disease_Dataset/Rosacea/aug_0_4.jpg')
# This will print the predicted skin disease class and confidence scores for each class

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Predicted disease: Rosacea
Confidence scores:
  Acne: 0.0216
  Eczema: 0.0000
  Melanoma: 0.0000
  Psoriasis: 0.0000
  Rosacea: 0.9784
