In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')


Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [2]:
# Team members:
# Duru KARACAN 202128022
# Dilara Çağla BANKO 202128201

# The average accuracy obtained over 5 splits is: 0.9714
# The execution time is approximately 3 minutes.

# We use seng_445_project_fe.py for template

# The most significant change is the implementation of Feature Selection using Recursive Feature Elimination (RFE).
# We selected the 100 most important features using an SVM with a linear kernel.
# This approach achieves an average accuracy above 95%.



# Mount Google Drive to access the dataset stored in Google Drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

"""### Feature Extraction Section"""

# Import necessary libraries for feature extraction
import torch
import torch.nn as nn
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader
import numpy as np

# Import libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from random import randint

# Load Pretrained EfficientNetV2 Model
# This model is used to extract deep features from images
model = models.efficientnet_v2_s(pretrained=True)
model.eval()  # Set the model to evaluation mode to avoid training updates

# Modify the model to use as a feature extractor
# Remove the last layer to extract only features
feature_extractor = nn.Sequential(*list(model.children())[:-1])

# Define transformations for input images
# Resize, normalize, and convert to tensors to make the images compatible with the pretrained model
transform = transforms.Compose([
    transforms.Resize(224),  # Resize images to 224x224
    transforms.ToTensor(),  # Convert images to tensors
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Normalize with ImageNet statistics
])

# Path to the dataset stored in Google Drive
data_dir = "/content/gdrive/MyDrive/seng445/project/dataset (1)"

# Load the dataset with ImageFolder
# This assumes the dataset directory structure is organized in class-labeled subfolders
dataset = datasets.ImageFolder(data_dir, transform=transform)

# Create a DataLoader for efficient batch processing of images
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Initialize empty arrays to store extracted features (X) and labels (y)
X, y = [], []

# Extract features using the pretrained model
with torch.no_grad():  # Disable gradient computation for faster processing
    for inputs, labels in dataloader:
        features = feature_extractor(inputs)  # Extract features from images
        features = features.view(features.size(0), -1)  # Flatten features to a 1D array
        X.extend(features.cpu().numpy())  # Append features to the list
        y.extend(labels.cpu().numpy())  # Append corresponding labels

# Convert feature and label lists to NumPy arrays
X = np.array(X)
y = np.array(y)

"""### Traditional Machine Learning with SVM"""

# Normalize the features using Min-Max Scaling
# Scales feature values to the range [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Initialize variables for cross-validation
n_tests = 5  # Number of train-test splits
total_accuracy = 0  # Accumulator for average accuracy calculation

# Feature Selection using Recursive Feature Elimination (RFE)
# Select 100 most important features using an SVM with a linear kernel
selector = RFE(SVC(kernel="linear"), n_features_to_select=100, step=10)
X_selected = selector.fit_transform(X_scaled, y)

# Perform 5-fold cross-validation
for i in range(n_tests):
    random_seed = randint(0, 100)  # Generate a random seed for splitting data
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.33, stratify=y, random_state=random_seed
    )  # Split data into train and test sets

    # Train SVM classifier
    clf = SVC(kernel="linear", C=1)  # Linear kernel and C=1 for regularization
    clf.fit(X_train, y_train)  # Train the classifier on the training set

    # Test the classifier and calculate accuracy
    y_pred = clf.predict(X_test)  # Predict labels for the test set
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    print(f"Split {i+1} (Seed {random_seed}): Accuracy = {accuracy:.4f}")
    total_accuracy += accuracy  # Accumulate accuracy for average calculation

# Calculate and display the average accuracy over all splits
avg_accuracy = total_accuracy / n_tests
print(f"\nAverage Accuracy over {n_tests} splits: {avg_accuracy:.4f}")


Mounted at /content/gdrive




Split 1 (Seed 62): Accuracy = 0.9429
Split 2 (Seed 59): Accuracy = 0.9857
Split 3 (Seed 5): Accuracy = 0.9857
Split 4 (Seed 66): Accuracy = 0.9571
Split 5 (Seed 69): Accuracy = 0.9857

Average Accuracy over 5 splits: 0.9714
