In [1]:
import os
import pandas as pd
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import random

In [2]:
random.seed(42)

## Preprocessing

In [3]:
import os
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define the path to your dataset
dataset_path = r"C:\Users\Alex\Desktop\AIPM\dataset"

# Define image size
image_size = (64, 64)

# Create an ImageDataGenerator for data augmentation
datagen = ImageDataGenerator(
    rescale=1./255,  # Normalize pixel values to [0, 1]
    rotation_range=30,  # Random rotation between -30 and 30 degrees
    horizontal_flip=True,  # Random horizontal flip
)

# This will store image data and labels
metadata = []

def preprocess_images_from_folder(folder_path, class_name):
    """
    Process all images from a folder and store the image data and labels into the metadata list.
    :param folder_path: The path to the folder containing images
    :param class_name: The class name (folder name) of the images
    """
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(('png', 'jpg', 'jpeg')):
                img_path = os.path.join(root, file)
                
                # Read the image using OpenCV
                img = cv2.imread(img_path)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR (OpenCV) to RGB

                # Resize the image to the target size
                img_resized = cv2.resize(img, image_size)

                # Normalize the image
                img_resized_norm = img_resized / 255.0

                # Add the image data (as a 3D array) and label to metadata
                metadata.append({
                    'image_data': img_resized_norm,
                    'class': class_name
                })

                # Apply augmentation if necessary (not saving augmented images for CSV)
                img_resized_norm = np.expand_dims(img_resized_norm, axis=0)  # Add batch dimension
                augmented_gen = datagen.flow(img_resized_norm, batch_size=1)

                for _ in range(1):  # Generate augmented images
                    augmented_img = next(augmented_gen)[0]

                    # Add augmented image data (as a 3D array) and label to metadata
                    metadata.append({
                        'image_data': augmented_img,
                        'class': class_name
                    })

def process_dataset(dataset_path):
    """
    Loop through the 'train' and 'test' directories and preprocess all images
    :param dataset_path: The root path of the dataset
    """
    # Process the 'train' folder
    train_dir = os.path.join(dataset_path, 'train')
    for class_name in os.listdir(train_dir):
        class_path = os.path.join(train_dir, class_name)
        preprocess_images_from_folder(class_path, class_name)

    # Process the 'test' folder
    test_dir = os.path.join(dataset_path, 'test')
    for class_name in os.listdir(test_dir):
        class_path = os.path.join(test_dir, class_name)
        preprocess_images_from_folder(class_path, class_name)

# Collect metadata and save to CSV
def save_metadata_to_csv(metadata):
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv('image_metadata.csv', index=False)
    print("Metadata saved to 'image_metadata.csv'")

# Save image data as a numpy file (.npz) containing arrays
def save_metadata_to_npz(metadata, filename='image_data.npz'):
    # Convert metadata to numpy arrays
    images = np.array([entry['image_data'] for entry in metadata])
    labels = np.array([entry['class'] for entry in metadata])

    # Save both images and labels in a compressed .npz file
    np.savez_compressed(filename, images=images, labels=labels)
    print(f"Metadata saved to {filename}")


# Run the preprocessing
#process_dataset(dataset_path)

# Usage
#save_metadata_to_npz(metadata)


In [3]:
# Loading data from the .npz file
data = np.load('image_data.npz')
X = data['images']  # This will have shape (num_samples, 64, 64, 3)
y = data['labels']

In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Convert text labels to numeric labels
y = label_encoder.fit_transform(y)
y_categorical = to_categorical(y, num_classes=23)
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
X_train = np.array(X_train, dtype=np.float32)


## Model creation

In [6]:
# Initialize the CNN model
model = Sequential()

# Add convolutional and pooling layers
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())

# Add fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(23, activation='softmax')) 

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 62, 62, 32)        896       
                                                                 
 max_pooling2d (MaxPooling2  (None, 31, 31, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 29, 29, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 14, 14, 64)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 12544)             0         
                                                                 
 dense (Dense)               (None, 128)             

In [20]:
# Split data manually (instead of using validation_split)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model with explicit validation data
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


# Save the trained model
model.save('skin_disease_model.h5')


Epoch 1/10
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 42ms/step - accuracy: 0.0949 - loss: 3.0128 - val_accuracy: 0.1150 - val_loss: 2.9394
Epoch 2/10
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 42ms/step - accuracy: 0.1204 - loss: 2.9257 - val_accuracy: 0.1305 - val_loss: 2.8834
Epoch 3/10
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 45ms/step - accuracy: 0.1354 - loss: 2.8716 - val_accuracy: 0.1361 - val_loss: 2.8622
Epoch 4/10
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 42ms/step - accuracy: 0.1471 - loss: 2.8236 - val_accuracy: 0.1443 - val_loss: 2.8507
Epoch 5/10
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 42ms/step - accuracy: 0.1669 - loss: 2.7698 - val_accuracy: 0.1559 - val_loss: 2.8259
Epoch 6/10
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 41ms/step - accuracy: 0.1762 - loss: 2.7126 - val_accuracy: 0.1540 - val_loss: 2.8200
Epoch 7/10
[1m7



In [21]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.1739 - loss: 2.8504
Test Accuracy: 16.99%


In [7]:
# Initialize the CNN model
model = Sequential()

# Add convolutional and pooling layers
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())

# Add fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(23, activation='softmax')) 

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 62, 62, 32)        896       
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 31, 31, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_3 (Conv2D)           (None, 29, 29, 64)        18496     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 14, 14, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_4 (Conv2D)           (None, 12, 12, 128)       73856     
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 6, 6, 128)        

In [23]:
# Split data manually (instead of using validation_split)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model with explicit validation data
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


# Save the trained model
model.save('skin_disease_model.h5')


Epoch 1/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 39ms/step - accuracy: 0.0842 - loss: 3.0218 - val_accuracy: 0.0985 - val_loss: 2.9340
Epoch 2/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.1060 - loss: 2.9567 - val_accuracy: 0.1148 - val_loss: 2.9110
Epoch 3/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.1235 - loss: 2.9143 - val_accuracy: 0.1308 - val_loss: 2.8849
Epoch 4/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.1267 - loss: 2.8916 - val_accuracy: 0.1390 - val_loss: 2.8654
Epoch 5/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 38ms/step - accuracy: 0.1458 - loss: 2.8517 - val_accuracy: 0.1450 - val_loss: 2.8341
Epoch 6/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.1473 - loss: 2.8140 - val_accuracy: 0.1496 - val_loss: 2.8170
Epoch 7/10
[1m6



In [24]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.2078 - loss: 2.6638
Test Accuracy: 20.30%


## Grid Search

In [6]:
from sklearn.model_selection import GridSearchCV
def create_model(neuron=32, optimizator='adam', dropout_rate=0.0):
    model = Sequential([
        Dense(neuron, input_dim=X_train.shape[1], activation='relu'),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dense(len(np.unique(y)), activation='softmax')
    ])
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizator, metrics=['accuracy'])
    return model


In [24]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scikeras.wrappers import KerasRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D

# Defining the models to be used
random_state = 42

models = {
    "DecisionTree": DecisionTreeRegressor(random_state=random_state),
    "RandomForest": RandomForestRegressor(random_state=random_state)
}

param_grids = {
    "DecisionTree": {
        "max_depth": [3, 5, 10],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    },
    "RandomForest": {
        "n_estimators": [50, 100],
        "max_depth": [5, 10]
    }
}

best_models = {}
X_train_2d = X_train.reshape(X_train.shape[0], -1)

# Grid Search for each model
for name, model in models.items():
    print(f"Optimizing {name}...")
    grid_search = GridSearchCV(
        model,
        param_grids[name],
        scoring="neg_mean_absolute_error",
        cv=3,  # Reduce the number of folds for faster execution
        n_jobs=-1
    )
    grid_search.fit(X_train_2d[:1000], y_train[:1000])
    
    best_models[name] = grid_search.best_estimator_
    print(f"Best hyperparameters for {name}: {grid_search.best_params_}\n")


Optimizing DecisionTree...
Best hyperparameters for DecisionTree: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}

Optimizing RandomForest...
Best hyperparameters for RandomForest: {'max_depth': 10, 'n_estimators': 50}

Optimizing GradientBoosting...


ValueError: 
All the 24 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\ensemble\_gb.py", line 672, in fit
    y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\utils\validation.py", line 1485, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (666, 23) instead.

--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\ensemble\_gb.py", line 672, in fit
    y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  File "C:\Users\Alex\anaconda3\envs\tf_env\lib\site-packages\sklearn\utils\validation.py", line 1485, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (667, 23) instead.


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X_train_2d = X_train.reshape(X_train.shape[0], -1)
X_test_2d = X_test.reshape(X_test.shape[0], -1)

# Best hyperparameters for DecisionTree and RandomForest from GridSearchCV
best_dt = DecisionTreeRegressor(max_depth=10, min_samples_leaf=2, min_samples_split=5)
best_rf = RandomForestRegressor(max_depth=10, n_estimators=50)

# Train DecisionTreeRegressor with the best parameters
best_dt.fit(X_train_2d, y_train)
y_pred_dt = best_dt.predict(X_test_2d)

# Train RandomForestRegressor with the best parameters
best_rf.fit(X_train_2d, y_train)
y_pred_rf = best_rf.predict(X_test_2d)

# Evaluate models
print("DecisionTreeRegressor Evaluation:")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_dt)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_dt)}")
print(f"R^2 Score: {r2_score(y_test, y_pred_dt)}\n")

print("RandomForestRegressor Evaluation:")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_rf)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_rf)}")
print(f"R^2 Score: {r2_score(y_test, y_pred_rf)}\n")
