In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
base_dir = "dataset"

train_dir = os.path.join(base_dir, "train")
valid_dir = os.path.join(base_dir, "valid")
test_dir = os.path.join(base_dir, "test")

print("Number of classes:", len(os.listdir(train_dir)))
print("Number of classes:", len(os.listdir(valid_dir)))
print("Number of classes:", len(os.listdir(test_dir)))

Number of classes: 102
Number of classes: 102
Number of classes: 819


## Step 1: Data Preproccessing
The first step is data preprocessing, where clean and prepare the dataset ready to apply for machine learning model. This step is important to ensure data quality and reliable model performance.

### Image Size Inspection

Before preprocessing, several sample images were inspected to examine their original dimensions. The images were found to have different size which makes them unsuitable for direct input into machine learning models. Therefore, resizing is required to standardize the input dimensions.

In [18]:
limits = 5

sizes = []
for cls in os.listdir(train_dir):
    image_files = os.listdir(os.path.join(train_dir, cls))[:limits]

    for image_name in image_files:
        image_path = os.path.join(train_dir, cls, image_name)
        image = Image.open(image_path)
        sizes.append(image.size)

        if(len(sizes)) >= limits:
            break
    if(len(sizes)) >= limits:
        break

print("Image sizes (width, height):", sizes)


Image sizes (width, height): [(523, 500), (666, 500), (750, 500), (595, 500), (626, 500)]


### Image Cleaning and Resizing

Next, the preprocess_image function will:
1. Resized all images to 64*64 pixels to ensure a fixed length input for machine learning model. 
2. Converted images to RGB format.
3. Normalize pixel values to improve training stability.
4. Flattens each images into 1-dimensional array for input to machine learning model.

In [25]:
def preprocess_image(folder, IMAGE_SIZE = (64, 64)):
    x = []
    y = []

    class_folder = [f 
                    for f in os.listdir(folder) 
                        if os.path.isdir(os.path.join(folder, f)
                    )]

    if class_folder:
        for label in class_folder:
            label_path = os.path.join(folder, label)

            for image_name in os.listdir(label_path):
                image_path = os.path.join(label_path, image_name)
                try:
                    image = Image.open(image_path).convert('RGB')
                    image = image.resize(IMAGE_SIZE)
                    image = np.array(image) / 255.0
                    image = image.flatten()
                    x.append(image)
                    y.append(label)
                except:
                    continue
    else:
        for image_name in os.listdir(folder):
            image_path = os.path.join(folder, image_name)
            try:
                image = Image.open(image_path).convert('RGB')
                image = image.resize(IMAGE_SIZE)
                image = np.array(image) / 255.0
                image = image.flatten()
                x.append(image)
                y.append("unknown")
            except:
                continue
                
    return np.array(x), np.array(y)

x, y = preprocess_image(train_dir)
print("Preprocessed data shape:", x.shape)
print("Labels shape:", y.shape)

Preprocessed data shape: (6552, 12288)
Labels shape: (6552,)


### Preprocessing Train, Validation, and Test Sets
The training, validation, and test datasets were preprocessed using the preprocess_image function.

In [None]:
X_train, Y_train = preprocess_image(train_dir)
X_valid, Y_valid = preprocess_image(valid_dir)
X_test, Y_test = preprocess_image(test_dir)

print("Training data shape:", X_train.shape, Y_train.shape)
print("Validation data shape:", X_valid.shape, Y_valid.shape)
print("Test data shape:", X_test.shape, Y_test.shape)

Training data shape: (6552, 12288) (6552,)
Validation data shape: (818, 12288) (818,)
Test data shape: (819, 12288) (819,)


### Encode Labels
Class label were converted from string format to numeric labels for model compatibality.

In [None]:
le = LabelEncoder()
Y_train_enc = le.fit_transform(Y_train)
Y_valid_enc = le.transform(Y_valid)
Y_test_enc = le.transform(Y_test)

print("Classes (First 10): ", le.classes_[:10])

### Feature Scaling
All input features were standardized to have zero mean and unit variance to improve neural network performance during training and ensures that all input features contributes equally to the learning process.

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [21]:
def count_images(folder):
    total = 0
    for f in os.listdir(folder):
        path = os.path.join(folder, f)
        if os.path.isdir(path):
            total += len(os.listdir(path))
    return total

print("Train images:", count_images(train_dir))
print("Valid images:", count_images(valid_dir))
print("Test images:", count_images(test_dir))

Train images: 6552
Valid images: 818
Test images: 0


## Step 2: Model Selection
We select three complementary models to justify different biases/complexities:
- Logistic Regression: strong linear baseline for multiclass classification.
- Support Vector Classifier (RBF kernel): non-linear decision boundaries without deep nets.
- MLP (ANN): a shallow neural network over flattened pixels.
Each model will use `StandardScaler` to normalize features for more stable training.

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Define three models (scaling is done in Step 1 preprocessing)
models = {
    "LogisticRegression": Pipeline([
        ("clf", LogisticRegression(max_iter=1000, solver="lbfgs"))
    ]),
    "SVC_RBF": Pipeline([
        ("clf", SVC(kernel="rbf", C=1.0, gamma="scale"))
    ]),
    "MLP_ANN": Pipeline([
        ("clf", MLPClassifier(hidden_layer_sizes=(256, 128), activation="relu", max_iter=50, random_state=42))
    ])
}

print("Models ready:", list(models.keys()))

Models ready: ['LogisticRegression', 'SVC_RBF', 'MLP_ANN']
