In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
import os
from sklearn.metrics import accuracy_score

# Define dataset class
class ImageDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
    
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

# Define transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [6]:
import os
import shutil
from tqdm import tqdm

base_dir = "../Datasets/SIPaKMeD"
output_dir = "../Outputs/sipakmed_combine"
folders = ["im_Dyskeratotic", "im_Koilocytotic", "im_Metaplastic", "im_Parabasal", "im_Superficial-Intermediate"]

os.makedirs(output_dir, exist_ok=True)

label_mapping = {folder: i for i, folder in enumerate(folders)}

for folder in folders:
    class_dir = os.path.join(base_dir, folder, "CROPPED")
    
    if os.path.exists(class_dir):
        for file in tqdm(os.listdir(class_dir), desc=f"Processing {folder}"):
            if file.endswith(".bmp"): 
                old_path = os.path.join(class_dir, file)
                new_filename = f"{label_mapping[folder]}__{file.split('.')[0]}.bmp"
                new_path = os.path.join(output_dir, new_filename)
                shutil.copy(old_path, new_path)

print("Organisation et renommage des fichiers terminés ! ✅")


Processing im_Dyskeratotic: 100%|██████████| 2439/2439 [00:00<00:00, 17773.68it/s]
Processing im_Koilocytotic: 100%|██████████| 2475/2475 [00:00<00:00, 11059.10it/s]
Processing im_Metaplastic: 100%|██████████| 2379/2379 [00:00<00:00, 9142.08it/s]
Processing im_Parabasal: 100%|██████████| 2361/2361 [00:00<00:00, 14176.74it/s]
Processing im_Superficial-Intermediate: 100%|██████████| 2493/2493 [00:00<00:00, 5162.88it/s]

Organisation et renommage des fichiers terminés ! ✅





In [7]:
import os
from sklearn.model_selection import train_test_split

image_folder = "../Outputs/sipakmed_combine"

image_paths = [os.path.join(image_folder, img) for img in os.listdir(image_folder) if img.endswith('.bmp')]
labels = [int(img.split('__')[0]) for img in os.listdir(image_folder) if img.endswith('.bmp')]  # Extraction du label

print(f"Total images: {len(image_paths)}")
print(f"Total labels: {len(labels)}")

Total images: 4049
Total labels: 4049


In [9]:
split_ratios = {"train": 0.75, "cal": 0.1125, "val": 0.0375, "test": 0.10}

train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    image_paths, labels, test_size=1 - split_ratios["train"], stratify=labels, random_state=42
)

remaining_ratio = 1 - split_ratios["train"]
cal_ratio = split_ratios["cal"] / remaining_ratio
val_ratio = split_ratios["val"] / remaining_ratio
test_ratio = split_ratios["test"] / remaining_ratio

cal_paths, temp_paths, cal_labels, temp_labels = train_test_split(
    temp_paths, temp_labels, test_size=(val_ratio + test_ratio), stratify=temp_labels, random_state=42
)

val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths, temp_labels, test_size=(test_ratio / (val_ratio + test_ratio)), stratify=temp_labels, random_state=42
)

print(f"Train: {len(train_paths)}, Cal: {len(cal_paths)}, Val: {len(val_paths)}, Test: {len(test_paths)}")

Train: 3036, Cal: 455, Val: 152, Test: 406


In [10]:

dataset_train = ImageDataset(train_paths, train_labels, transform=transform)
dataset_val = ImageDataset(val_paths, val_labels, transform=transform)
dataset_cal = ImageDataset(cal_paths, cal_labels, transform=transform)
dataset_test = ImageDataset(test_paths, test_labels, transform=transform)

dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=32, shuffle=False)
dataloader_cal = DataLoader(dataset_cal, batch_size=32, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)


In [12]:

# Load ResNet50 model
resnet = models.resnet50(pretrained=True)
resnet.fc = nn.Identity() 
resnet.eval()
resnet.to('cuda' if torch.cuda.is_available() else 'cpu')

# Feature extraction function
def extract_features(dataloader):
    features, labels_list = [], []
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Extracting features"):
            images = images.to(device)
            feat = resnet(images)
            features.append(feat.cpu().numpy())
            labels_list.extend(labels.numpy())
    return np.vstack(features), np.array(labels_list)

# Extract features for train, val, and test
features_train, labels_train = extract_features(dataloader_train)
features_val, labels_val = extract_features(dataloader_val)
features_cal, labels_cal = extract_features(dataloader_cal)
features_test, labels_test = extract_features(dataloader_test)


Extracting features: 100%|██████████| 95/95 [00:30<00:00,  3.07it/s]
Extracting features: 100%|██████████| 5/5 [00:01<00:00,  3.19it/s]
Extracting features: 100%|██████████| 15/15 [00:04<00:00,  3.42it/s]
Extracting features: 100%|██████████| 13/13 [00:04<00:00,  2.94it/s]


In [37]:
np.savez("../Feature_Extraction/sipakmed_features.npz", train=features_train, train_labels=labels_train, 
         val=features_val, val_labels=labels_val,
         cal=features_cal, cal_labels=labels_cal,
         test=features_test, test_labels=labels_test)

In [62]:
def set_seed(seed=42):
    """
    Set the random seed for reproducibility across different libraries:
    numpy, torch, and cuda if available.
    
    Args:
        seed (int): The seed value for random number generators. Default is 42.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    
    
set_seed()

Epoch 1/25, Loss: 46.8372, Val Loss: 1.3598, Val Acc: 0.8816
Epoch 2/25, Loss: 21.4198, Val Loss: 1.2374, Val Acc: 0.9145
Epoch 3/25, Loss: 17.9827, Val Loss: 1.0701, Val Acc: 0.9408
Epoch 4/25, Loss: 17.7786, Val Loss: 0.9791, Val Acc: 0.9342
Epoch 5/25, Loss: 14.4647, Val Loss: 1.2757, Val Acc: 0.9013
Epoch 6/25, Loss: 11.7573, Val Loss: 1.3027, Val Acc: 0.8882
Epoch 7/25, Loss: 10.1886, Val Loss: 1.0554, Val Acc: 0.9342
Epoch 8/25, Loss: 9.1647, Val Loss: 0.9319, Val Acc: 0.9408
Epoch 9/25, Loss: 7.8848, Val Loss: 1.5678, Val Acc: 0.9276
Epoch 10/25, Loss: 8.9252, Val Loss: 1.3154, Val Acc: 0.9342
Epoch 11/25, Loss: 8.4252, Val Loss: 1.1965, Val Acc: 0.9539
Epoch 12/25, Loss: 8.7049, Val Loss: 0.9013, Val Acc: 0.9539
Epoch 13/25, Loss: 8.4871, Val Loss: 1.0519, Val Acc: 0.9408
Epoch 14/25, Loss: 7.7927, Val Loss: 1.0113, Val Acc: 0.9408
Epoch 15/25, Loss: 7.0666, Val Loss: 0.9268, Val Acc: 0.9408
Epoch 16/25, Loss: 6.5883, Val Loss: 1.2564, Val Acc: 0.9276
Epoch 17/25, Loss: 6.0760,

  mlp.load_state_dict(torch.load("best_mlp_classifier.pth"))


**Find the best hyperparameters for the MLP model using Optuna**

In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import itertools
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load extracted features
data = np.load("../Feature_Extraction/sipakmed_features.npz")
features_train, labels_train = data["train"], data["train_labels"]
features_val, labels_val = data["val"], data["val_labels"]
features_test, labels_test = data["test"], data["test_labels"]

# Define MLP classifier
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(1024, 128)
        self.fc3 = nn.Linear(128, num_classes)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Hyperparameters to test
BATCHS = [32, 64, 128]
LEARNING_RATE = 1e-4
NUM_EPOCHS = [25, 50, 100]
PATIENCE = 5

# Store results
results = []
best_model = None
best_config = None
lowest_val_loss = float("inf")

# Device setup
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Iterate over all combinations of batch_size and num_epochs
for batch_size, num_epochs in tqdm(itertools.product(BATCHS, NUM_EPOCHS), total=len(BATCHS) * len(NUM_EPOCHS), desc="Hyperparameter Tuning"):

    print(f"\n🚀 Training MLP with Batch: {batch_size}, Epochs: {num_epochs}")

    # Create model
    input_dim = features_train.shape[1]
    num_classes = len(set(labels_train))
    mlp = MLPClassifier(input_dim, num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(mlp.parameters(), lr=LEARNING_RATE)

    # Convert features to tensors
    X_train = torch.tensor(features_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(labels_train, dtype=torch.long).to(device)
    X_val = torch.tensor(features_val, dtype=torch.float32).to(device)
    y_val = torch.tensor(labels_val, dtype=torch.long).to(device)

    # Early stopping variables
    best_val_loss = float("inf")
    epochs_no_improve = 0
    best_model_state = None

    # Training with tqdm and patience
    for epoch in tqdm(range(num_epochs), desc=f"Training Epochs (Batch {batch_size})", leave=False):
        mlp.train()
        optimizer.zero_grad()
        outputs = mlp(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        # Validation loss
        mlp.eval()
        with torch.no_grad():
            val_outputs = mlp(X_val)
            val_loss = criterion(val_outputs, y_val).item()  # Compute loss
            val_preds = val_outputs.argmax(1).cpu().numpy()
            val_acc = accuracy_score(labels_val, val_preds)

        # Check if validation loss improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = mlp.state_dict()
            epochs_no_improve = 0  # Reset counter
        else:
            epochs_no_improve += 1  # Increase counter

        # Early stopping condition
        if epochs_no_improve >= PATIENCE:
            print(f"⏹️ Early stopping triggered after {epoch+1} epochs (Best Val Loss: {best_val_loss:.4f})")
            break

    # Store results
    results.append({
        "batch_size": batch_size,
        "epochs_trained": epoch+1,
        "best_val_loss": best_val_loss,
        "best_val_acc": val_acc
    })

    if best_val_loss < lowest_val_loss:
        lowest_val_loss = best_val_loss
        best_model = best_model_state
        best_config = {"batch_size": batch_size, "epochs": epoch+1, "val_loss": best_val_loss, "val_acc": val_acc}

    print(f"✅ Batch {batch_size}, Epochs Trained {epoch+1}, Best Validation Loss: {best_val_loss:.4f}")

# Save the best model
torch.save(best_model, "best_mlp_model.pth")

# Display results
df_results = pd.DataFrame(results)



Hyperparameter Tuning:   0%|          | 0/9 [00:00<?, ?it/s]


🚀 Training MLP with Batch: 32, Epochs: 25


Hyperparameter Tuning:  11%|█         | 1/9 [00:00<00:06,  1.31it/s]

✅ Batch 32, Epochs Trained 25, Best Validation Loss: 0.7894

🚀 Training MLP with Batch: 32, Epochs: 50


Hyperparameter Tuning:  22%|██▏       | 2/9 [00:02<00:07,  1.09s/it]

✅ Batch 32, Epochs Trained 50, Best Validation Loss: 0.3579

🚀 Training MLP with Batch: 32, Epochs: 100


Hyperparameter Tuning:  33%|███▎      | 3/9 [00:04<00:11,  1.88s/it]

✅ Batch 32, Epochs Trained 100, Best Validation Loss: 0.2401

🚀 Training MLP with Batch: 64, Epochs: 25


Hyperparameter Tuning:  44%|████▍     | 4/9 [00:05<00:06,  1.40s/it]

✅ Batch 64, Epochs Trained 25, Best Validation Loss: 0.7872

🚀 Training MLP with Batch: 64, Epochs: 50


Hyperparameter Tuning:  56%|█████▌    | 5/9 [00:07<00:05,  1.43s/it]

✅ Batch 64, Epochs Trained 50, Best Validation Loss: 0.3555

🚀 Training MLP with Batch: 64, Epochs: 100


Hyperparameter Tuning:  67%|██████▋   | 6/9 [00:09<00:05,  1.90s/it]

✅ Batch 64, Epochs Trained 100, Best Validation Loss: 0.2451

🚀 Training MLP with Batch: 128, Epochs: 25


Hyperparameter Tuning:  78%|███████▊  | 7/9 [00:10<00:03,  1.53s/it]

✅ Batch 128, Epochs Trained 25, Best Validation Loss: 0.7270

🚀 Training MLP with Batch: 128, Epochs: 50


Hyperparameter Tuning:  89%|████████▉ | 8/9 [00:12<00:01,  1.49s/it]

✅ Batch 128, Epochs Trained 50, Best Validation Loss: 0.3372

🚀 Training MLP with Batch: 128, Epochs: 100


Hyperparameter Tuning: 100%|██████████| 9/9 [00:15<00:00,  1.67s/it]

✅ Batch 128, Epochs Trained 100, Best Validation Loss: 0.2451





In [61]:

print("\n🏆 Best Model Configuration:")
print(best_config)


🏆 Best Model Configuration:
{'batch_size': 32, 'epochs': 100, 'val_loss': 0.2400825470685959, 'val_acc': 0.9144736842105263}


**Use best hyperparameters**

In [65]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Training the MLP
input_dim = features_train.shape[1]
num_classes = len(set(labels_train))
mlp = MLPClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp.parameters(), lr=0.0001, weight_decay=1e-4)

def get_batches(X, y, batch_size):
    for i in range(0, len(X), batch_size):
        yield torch.tensor(X[i:i+batch_size], dtype=torch.float32), torch.tensor(y[i:i+batch_size], dtype=torch.long)

# Training loop
num_epochs = 100
best_val_loss = float('inf')
patience = 10 
batch_size = 32 
best_val_loss = float('inf')
epochs_no_improve = 0

for epoch in range(num_epochs):
    mlp.train()
    running_loss = 0.0
    
    # Training phase
    for X_batch, y_batch in get_batches(features_train, labels_train, batch_size=batch_size):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = mlp(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    mlp.eval()
    val_loss = 0.0
    val_preds, val_targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in get_batches(features_val, labels_val, batch_size=batch_size):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = mlp(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())
    
    val_accuracy = accuracy_score(val_targets, val_preds)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")
    
    # Check if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        print(f"Validation loss decreased to {val_loss:.4f}. Saving model...")
        torch.save(mlp.state_dict(), "best_mlp_classifier.pth")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epochs")
    
    # Early stopping
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

# Load the best model for testing
print("Loading best model for testing...")
mlp.load_state_dict(torch.load("best_mlp_classifier.pth"))
mlp.eval()
test_preds, test_targets = [], []

with torch.no_grad():
    for X_batch, y_batch in get_batches(features_test, labels_test, batch_size=batch_size):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = mlp(X_batch)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        test_targets.extend(y_batch.cpu().numpy())



Epoch 1/100, Loss: 91.7326, Val Loss: 2.5975, Val Acc: 0.8224
Validation loss decreased to 2.5975. Saving model...
Epoch 2/100, Loss: 37.7243, Val Loss: 1.7129, Val Acc: 0.8816
Validation loss decreased to 1.7129. Saving model...
Epoch 3/100, Loss: 27.3474, Val Loss: 1.4699, Val Acc: 0.8947
Validation loss decreased to 1.4699. Saving model...
Epoch 4/100, Loss: 22.6385, Val Loss: 1.3914, Val Acc: 0.8947
Validation loss decreased to 1.3914. Saving model...
Epoch 5/100, Loss: 19.8084, Val Loss: 1.3564, Val Acc: 0.9013
Validation loss decreased to 1.3564. Saving model...
Epoch 6/100, Loss: 17.7172, Val Loss: 1.3313, Val Acc: 0.9079
Validation loss decreased to 1.3313. Saving model...
Epoch 7/100, Loss: 16.0306, Val Loss: 1.3138, Val Acc: 0.9079
Validation loss decreased to 1.3138. Saving model...
Epoch 8/100, Loss: 14.6591, Val Loss: 1.3052, Val Acc: 0.9079
Validation loss decreased to 1.3052. Saving model...
Epoch 9/100, Loss: 13.4559, Val Loss: 1.2996, Val Acc: 0.9145
Validation loss de

  mlp.load_state_dict(torch.load("best_mlp_classifier.pth"))


In [66]:
test_accuracy = accuracy_score(test_targets, test_preds)
print(f"Test Accuracy: {test_accuracy*100:.4f}")

Test Accuracy: 93.1034
