In [3]:
import os
import shutil
from google.colab import drive

# Mount Google Drive so the project folder (and the dataset archive in it) is visible.
drive.mount('/content/drive')

# The project may live under either collaborator's Drive folder; the first one
# that exists is used.
path_MG = '/content/drive/MyDrive/00gerem00/DL-AI-Project'
path_LM = '/content/drive/MyDrive/lorenzomeroni02/DL-AI-Project'

if os.path.exists(path_MG):
    working_dir = path_MG
    print(f"Working in MG folder: {working_dir}")
elif os.path.exists(path_LM):
    working_dir = path_LM
    print(f"Working in LM folder: {working_dir}")
else:
    print("Error: Project folder not found in Drive. Check the paths.")
    working_dir = None

if working_dir:
    os.chdir(working_dir)
    zip_filename = 'leather-defect-classification.zip'
    fast_local_dir = '/content/fast_dataset'

    if os.path.exists(zip_filename):
        print(f"Great! {zip_filename} found on Drive.")

        if not os.path.exists(fast_local_dir):
            # Extract into a staging directory and rename it only once the
            # extraction has finished. If the extraction is interrupted or
            # fails, `fast_local_dir` never appears, so the next run retries
            # instead of silently reusing a half-extracted dataset.
            staging_dir = fast_local_dir + '.partial'
            shutil.rmtree(staging_dir, ignore_errors=True)  # drop leftovers of a failed run
            os.makedirs(staging_dir)
            # Pure-Python extraction: no shell interpolation of the paths, and
            # a corrupt archive raises an exception instead of failing quietly.
            shutil.unpack_archive(zip_filename, staging_dir, format='zip')
            os.rename(staging_dir, fast_local_dir)
    else:
        print(f"Error: {zip_filename} not found in your Drive folder. Please check if you uploaded it correctly to {working_dir}.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Working in MG folder: /content/drive/MyDrive/00gerem00/DL-AI-Project
Great! leather-defect-classification.zip found on Drive.


In [4]:
from torchvision.datasets import ImageFolder

# Directory the archive was extracted into; the class folders are expected to
# sit inside a single top-level folder of the archive.
base_dir = '/content/fast_dataset'

content = os.listdir(base_dir)

# os.listdir returns entries in arbitrary order and archives often carry
# artefact folders (e.g. '__MACOSX', hidden directories). Filter those out and
# sort so that the chosen dataset root is deterministic.
candidate_dirs = sorted(
    f for f in content
    if os.path.isdir(os.path.join(base_dir, f))
    and not f.startswith('.')
    and f != '__MACOSX'
)
if not candidate_dirs:
    raise FileNotFoundError(
        f"No dataset folder found inside {base_dir}. "
        "Re-run the extraction cell and check the archive contents."
    )

subfolders = candidate_dirs[0]
data_dir = os.path.join(base_dir, subfolders)

# No transform here: the samples stay PIL images so that each model can apply
# its own preprocessing later on.
dataset = ImageFolder(
    root = data_dir,
    transform = None)

print(f"Total number of images: {len(dataset)}")
print(f"Classes found: {dataset.classes}")
print(f"Index mapping: {dataset.class_to_idx}")

Total number of images: 3600
Classes found: ['Folding marks', 'Grain off', 'Growth marks', 'loose grains', 'non defective', 'pinhole']
Index mapping: {'Folding marks': 0, 'Grain off': 1, 'Growth marks': 2, 'loose grains': 3, 'non defective': 4, 'pinhole': 5}


In [5]:
from PIL import Image

def _has_expected_size(path, expected_size=(227, 227)):
    """Return True when the image stored at `path` has exactly `expected_size`.

    The file is opened through a context manager so its handle is released
    immediately; only the image header is needed to read `size`.

    Args:
        path: Filesystem path of the image to inspect.
        expected_size: (width, height) tuple the image must match.

    Returns:
        bool: Whether the image dimensions equal `expected_size`.
    """
    with Image.open(path) as img:
        return img.size == expected_size


# Drop every sample whose dimensions differ from 227x227, then keep the
# ImageFolder bookkeeping attributes (samples / imgs / targets) in sync.
dataset.samples = [s for s in dataset.samples if _has_expected_size(s[0])]
dataset.imgs = dataset.samples
dataset.targets = [s[1] for s in dataset.samples]
print(f"{len(dataset)} images")

3598 images


In [6]:
import torch
from torch.utils.data import random_split

# 70% of the samples go to training; the remainder is halved between
# validation and test, with the test set absorbing any odd leftover sample.
n_total = len(dataset)
train_size = int(0.7 * n_total)
holdout = n_total - train_size
val_size = holdout // 2
test_size = holdout - val_size

# Fixed seed so that the same partition is produced on every run.
split_rng = torch.Generator().manual_seed(42)
train_set, val_set, test_set = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_rng,
)

print(f"Completed split: {len(train_set)} training set, {len(val_set)} validation set, {len(test_set)} test set")

Completed split: 2518 training set, 540 validation set, 540 test set


# **Model 2: Custom Convolutional Autoencoder Classifier**

## **Data Preprocessing & DataLoader**

In this section, we set up the data transformations and the DataLoader instances. Unlike the ResNet-18 classifier, the main goal of our Autoencoder is to achieve a highly accurate reconstruction of the original images, so we had to take a completely different approach for the preprocessing pipeline:

* We decided to skip data augmentation entirely. This means we purposely left out transformations like `RandomCrop()`, `RandomRotation()`, and `RandomHorizontalFlip()` from our training set. The reason is simple: the autoencoder needs to learn the actual spatial representation of the leather textures and their defects, without having to struggle with reconstructing pixels that have been artificially shifted or distorted.
* For resizing, we went with a more direct approach. Instead of doing a `Resize(230)` followed by a `RandomCrop(224)`, which we previously used to introduce translation invariance, we just applied a straightforward `Resize((224, 224))` using standard bilinear interpolation. This lets us smoothly compress the original $227 \times 227$ images without throwing away any important information around the borders, where some defects might actually be located.
* Finally, we changed how we handle pixel scaling by avoiding zero-centered normalization. In the ResNet-18 model, we mapped pixels to a [-1.0, 1.0] range to help with gradient flow during classification. Here, though, we only rely on `ToTensor()`, which rescales the pixel values to the [0.0, 1.0] range. This was a strict architectural requirement because the final layer of our decoder uses a Sigmoid activation function, which naturally outputs values between 0 and 1. By making sure the input and output domains match perfectly, our Mean Squared Error (MSE) loss can compute the reconstruction error accurately.

In [8]:
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

class TransformSubset(Dataset):
    """Dataset wrapper that applies a transform to the inputs of another dataset.

    The wrapped object only needs to support integer indexing that yields an
    ``(input, label)`` pair, and ``len()``.
    """

    def __init__(self, subset, transform=None):
        """Store the wrapped dataset and the optional input transform.

        Args:
            subset: Indexable collection of ``(input, label)`` pairs.
            transform: Callable applied to the input of each pair; when it is
                falsy (e.g. ``None``) the input is returned unchanged.
        """
        self.transform = transform
        self.subset = subset

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.subset)

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair at `index`, transforming the input if configured."""
        sample, label = self.subset[index]
        if not self.transform:
            return sample, label
        return self.transform(sample), label

# Autoencoder preprocessing: a plain resize to 224x224 followed by conversion
# to a tensor. No augmentation and no normalization are applied.
ae_transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

# Wrap each split so that the same transform is applied when a sample is read.
train_final, val_final, test_final = (
    TransformSubset(split, transform=ae_transform)
    for split in (train_set, val_set, test_set)
)

batch_size = 32

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_final, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_split, batch_size=batch_size, shuffle=False)
    for eval_split in (val_final, test_final)
)