In [58]:
# import the main PyTorch package
import torch
# import PyTorch neural network module utilities (layers, losses, etc.)
import torch.nn as nn
# import PyTorch optimizers
import torch.optim as optim
# import torchvision transforms for common image preprocessing/augmentation
from torchvision import transforms
# import PIL.Image for opening images
from PIL import Image
# import os for filesystem path operations
import os
# import numpy for numeric utilities (not heavily used below but common)
import numpy as np
# import sklearn metrics for final classification reporting
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# import Colab-specific drive mount helper (only works in Google Colab)
from google.colab import drive
# mount the user's Google Drive at /content/drive (interactive permission step in Colab)
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
'''
ZIP_PATH = "/content/drive/MyDrive/data/archive.zip"
EXTRACT_TO = "/content/drive/MyDrive/data_extracted"# clean new folder
!mkdir -p "$EXTRACT_T
!unzip -q "$ZIP_PATH" -d "$EXTRACT_TO"
print("Extraction complete")
'''


'\nZIP_PATH = "/content/drive/MyDrive/data/archive.zip"\nEXTRACT_TO = "/content/drive/MyDrive/data_extracted"# clean new folder\n!mkdir -p "$EXTRACT_T\n!unzip -q "$ZIP_PATH" -d "$EXTRACT_TO"\nprint("Extraction complete")\n'

In [None]:
# import os (already imported above but repeated in the notebook)
import os

# print a short label
print("Top-level extracted folders/files:")
# list the top-level directory 'EXTRACT_TO' and print the returned list
print(os.listdir(EXTRACT_TO))

# loop through items at top level and if the item is a directory, print a preview of its contents
for item in os.listdir(EXTRACT_TO):
    # join path pieces to make a full path to the item
    path = os.path.join(EXTRACT_TO, item)
    # check if it is a directory
    if os.path.isdir(path):
        # print which folder we are showing contents for
        print(f"\nContents of {item}:")
        # print first 10 entries inside that folder to avoid very long output
        print(os.listdir(path)[:10])  # show only first


Top-level extracted folders/files:
['images', 'labels']

Contents of images:
['20160928-140314-0.jpg', '20160928-140337-0.jpg', '20160928-140731-0.jpg', '20160928-140747-0.jpg', '20160928-141107-0.jpg', '20160928-141135-0.jpg', '20160928-141355-0.jpg', '20160928-141421-0.jpg', '20160928-141437-0.jpg', '20160928-142056-0.jpg']

Contents of labels:
['labels.csv', 'test_subset0.csv', 'test_subset1.csv', 'test_subset2.csv', 'test_subset3.csv', 'test_subset4.csv', 'train_subset0.csv', 'train_subset1.csv', 'train_subset2.csv', 'train_subset3.csv']


In [None]:
# set the base path to the extracted dataset
main = "/content/drive/MyDrive/data_extracted"

# path to labels folder under main
labels_folder = os.path.join(main, "labels")
# path to images folder under main
images_folder = os.path.join(main, "images")
# print whether labels folder exists (True/False)
print("Labels folder exists:", os.path.exists(labels_folder))
# print whether images folder exists (True/False)
print("Images folder exists:", os.path.exists(images_folder))
# print a sorted slice of files found in the labels folder (first 20)
print("Labels files:", sorted(os.listdir(labels_folder))[:20])  # show first 20 label files


Labels folder exists: True
Images folder exists: True
Labels files: ['labels.csv', 'test_subset0.csv', 'test_subset1.csv', 'test_subset2.csv', 'test_subset3.csv', 'test_subset4.csv', 'train_subset0.csv', 'train_subset1.csv', 'train_subset2.csv', 'train_subset3.csv', 'train_subset4.csv', 'val_subset0.csv', 'val_subset1.csv', 'val_subset2.csv', 'val_subset3.csv', 'val_subset4.csv']


In [None]:
import pandas as pd

labels_csv = os.path.join(labels_folder, "labels.csv")
print("labels.csv path:", labels_csv)
df_labels = pd.read_csv(labels_csv)
print("columns:", df_labels.columns.tolist())
print("first 6 rows:\n", df_labels.head(6))

# peek one subset at  (test_subset0.csv)
test0 = pd.read_csv(os.path.join(labels_folder, "test_subset0.csv"))
print("\nTest subset example (first 6):\n", test0.head(6))


labels.csv path: /content/drive/MyDrive/data_extracted/labels/labels.csv
columns: ['Filename', 'Label', 'Species']
first 6 rows:
                 Filename  Label       Species
0  20160928-140314-0.jpg      0  Chinee apple
1  20160928-140337-0.jpg      0  Chinee apple
2  20160928-140731-0.jpg      0  Chinee apple
3  20160928-140747-0.jpg      0  Chinee apple
4  20160928-141107-0.jpg      0  Chinee apple
5  20160928-141135-0.jpg      0  Chinee apple

Test subset example (first 6):
                 Filename  Label
0  20160928-140747-0.jpg      0
1  20160928-141437-0.jpg      0
2  20160928-142110-0.jpg      0
3  20161207-110730-0.jpg      0
4  20161207-110753-0.jpg      0
5  20161207-110837-0.jpg      0


In [None]:
main = "/content/drive/MyDrive/data_extracted"
LABELS_DIR = os.path.join(main, "labels")
IMAGES_DIR = os.path.join(main, "images")

# it disappears bro
print("root exists:", os.path.exists(main))
print("labels dir exists:", os.path.exists(LABELS_DIR))
print("images dir exists:", os.path.exists(IMAGES_DIR))
print("files in labels dir (sample):", sorted(os.listdir(LABELS_DIR))[:20])


root exists: True
labels dir exists: True
images dir exists: True
files in labels dir (sample): ['labels.csv', 'test_subset0.csv', 'test_subset1.csv', 'test_subset2.csv', 'test_subset3.csv', 'test_subset4.csv', 'train_subset0.csv', 'train_subset1.csv', 'train_subset2.csv', 'train_subset3.csv', 'train_subset4.csv', 'val_subset0.csv', 'val_subset1.csv', 'val_subset2.csv', 'val_subset3.csv', 'val_subset4.csv']


In [None]:
master_csv = os.path.join(LABELS_DIR, "labels.csv")
print("Master CSV:", master_csv)
master_df = pd.read_csv(master_csv)   # contains Filename,Label,Species (as you showed)
print("master_df columns:", master_df.columns.tolist())
print(master_df.head(6))

subset0 = pd.read_csv(os.path.join(LABELS_DIR, "test_subset0.csv"))
print("\nSubset sample:", subset0.head(6))


Master CSV: /content/drive/MyDrive/data_extracted/labels/labels.csv
master_df columns: ['Filename', 'Label', 'Species']
                Filename  Label       Species
0  20160928-140314-0.jpg      0  Chinee apple
1  20160928-140337-0.jpg      0  Chinee apple
2  20160928-140731-0.jpg      0  Chinee apple
3  20160928-140747-0.jpg      0  Chinee apple
4  20160928-141107-0.jpg      0  Chinee apple
5  20160928-141135-0.jpg      0  Chinee apple

Subset sample:                 Filename  Label
0  20160928-140747-0.jpg      0
1  20160928-141437-0.jpg      0
2  20160928-142110-0.jpg      0
3  20161207-110730-0.jpg      0
4  20161207-110753-0.jpg      0
5  20161207-110837-0.jpg      0


In [None]:
import glob
def concat_subsets(labels_dir, pattern):
    """Concatenate CSV files matching pattern in labels_dir into one DataFrame."""
    paths = sorted(glob.glob(os.path.join(labels_dir, pattern)))
    if not paths:
        raise FileNotFoundError(f"No files for pattern {pattern} in {labels_dir}")
    dfs = [pd.read_csv(p) for p in paths]
    return pd.concat(dfs, ignore_index=True)

train_df = concat_subsets(LABELS_DIR, "train_subset*.csv")
val_df   = concat_subsets(LABELS_DIR, "val_subset*.csv")
test_df  = concat_subsets(LABELS_DIR, "test_subset*.csv")

print("train/val/test sizes:", len(train_df), len(val_df), len(test_df))
print("train sample:\n", train_df.head())


train/val/test sizes: 52525 17511 17509
train sample:
                 Filename  Label
0  20171109-175921-2.jpg      5
1  20170714-142019-3.jpg      1
2  20170718-101402-2.jpg      0
3  20170126-095456-0.jpg      1
4  20170913-110647-1.jpg      3


In [54]:
device = torch.device("cuda")
print("Using device:", device)


Using device: cuda


In [51]:
# ===== IMAGE TRANSFORMS =====
# IMG_SIZE is the square size models like EfficientNet / ResNet expect (commonly 224)
IMG_SIZE = 224

# Training transforms (randomized) — used only when training
train_transform = transforms.Compose([
    # RandomResizedCrop: pick a random region of the image and scale it to IMG_SIZE.
    # This both crops and resizes, which acts as a strong augmentation.
    transforms.RandomResizedCrop(IMG_SIZE),

    # Random horizontal flip: with probability 0.5 flips image left-right.
    transforms.RandomHorizontalFlip(),

    # Random small rotation: rotates the image by a random angle in ±15 degrees.
    # Helps the model learn rotation-invariant features.
    transforms.RandomRotation(15),

    # Convert PIL Image -> Tensor and scale from [0,255] ints to [0.0,1.0] floats
    transforms.ToTensor(),

    # Normalize channels using ImageNet mean/std.
    # Pretrained models (ImageNet) expect inputs normalized this way.
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Validation / Test transforms (deterministic) — used for evaluation
test_transform = transforms.Compose([
    # Resize the shorter side to 256 pixels (keeps aspect ratio)
    transforms.Resize(256),

    # CenterCrop to IMG_SIZE to get a deterministic view of the image
    transforms.CenterCrop(IMG_SIZE),

    # Convert to tensor and normalize with same mean/std
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


In [56]:
def load_sample(row, image_dir, transform):
    """
    row: a pandas Series containing at least the 'Filename' and 'Label' columns
    returns: (image tensor, label tensor) after applying transform
    """
    # build image path from image directory + filename in the CSV row
    img_path = os.path.join(image_dir, row["Filename"])
    # open the image file and convert to RGB (some images might be grayscale)
    image = Image.open(img_path).convert("RGB")
    # apply the supplied torchvision transform pipeline
    image = transform(image)
    # convert numeric label to a torch.long tensor (class index)
    label = torch.tensor(int(row["Label"]), dtype=torch.long)
    # return the processed image tensor and label tensor
    return image, label


def get_batch(df, image_dir, transform, batch_size, start_idx):
    # create lists to collect tensors
    images = []
    labels = []

    # compute the end index for this batch (clamped to length of df)
    end_idx = min(start_idx + batch_size, len(df))
    # iterate rows from start_idx up to end_idx (exclusive)
    for i in range(start_idx, end_idx):
        # load each sample (image tensor, label tensor)
        img, lab = load_sample(df.iloc[i], image_dir, transform)
        # append to lists
        images.append(img)
        labels.append(lab)

    # stack image tensors into a single batched tensor of shape (B, C, H, W)
    images = torch.stack(images)
    # stack label tensors into a single tensor of shape (B,)
    labels = torch.stack(labels)
    # move tensors to the device (GPU or CPU)
    return images.to(device), labels.to(device)


In [60]:
# import timm (Torch Image Models) which provides pretrained architectures
import timm
# compute number of distinct classes from the master labels DataFrame
num_classes = master_df["Label"].nunique()

# create an EfficientNet-b0 model from timm with pretrained ImageNet weights
model = timm.create_model("efficientnet_b0", pretrained=True)
# replace the classifier (final layer) with a new linear layer sized for our num_classes
model.classifier = nn.Linear(model.classifier.in_features, num_classes)
# send the model to the device (GPU)
model = model.to(device)
# define the loss function as CrossEntropyLoss (combines softmax + NLLLoss)
criterion = nn.CrossEntropyLoss()




def train_epoch(df, transform, optimizer, batch_size=32):
    # put the model into training mode (enables dropout, batchnorm running stats updates)
    model.train()
    # counters for accuracy
    correct = 0
    total = 0

    # iterate through the dataset in steps of batch_size (manual batching)
    for i in range(0, len(df), batch_size):
        # get a batch of images and labels starting at index i
        images, labels = get_batch(df, IMAGES_DIR, transform, batch_size, i)

        # zero gradients for optimizer
        optimizer.zero_grad()
        # forward pass: compute model outputs (logits)
        outputs = model(images)
        # compute cross-entropy loss between outputs and true labels
        loss = criterion(outputs, labels)
        # backward pass: compute gradients
        loss.backward()
        # optimizer step: apply gradients to update parameters
        optimizer.step()

        # get predicted class indices (argmax along class dim)
        preds = outputs.argmax(1)
        # accumulate correct predictions count
        correct += (preds == labels).sum().item()
        # accumulate total samples processed
        total += labels.size(0)

    # return training accuracy for the epoch
    return correct / total



In [61]:
def evaluate(df, transform, batch_size=32):
    # set the model to evaluation mode (disables dropout, uses batchnorm running stats)
    model.eval()
    # lists to collect predictions and labels from all batches
    all_preds = []
    all_labels = []

    # disable gradient computation for inference (saves memory and compute)
    with torch.no_grad(): 
        #Stops gradient tracking → faster + less memory during evaluation


        
        # iterate over dataset in batch_size steps
        for i in range(0, len(df), batch_size):
            # load batch to device
            images, labels = get_batch(df, IMAGES_DIR, transform, batch_size, i)
            # forward pass to get outputs
            outputs = model(images)
            # get predicted class indices for this batch
            preds = outputs.argmax(1)

            # append predictions and labels (moved to cpu) to lists
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    # concatenate all batch tensors into single numpy arrays
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()
    # return arrays (labels, preds) for external metric calculation
    return all_labels, all_preds


In [62]:
# freeze all parameters in the model (no gradient updates)
for p in model.parameters():
    p.requires_grad = False

# unfreeze the classifier head parameters only (we just replaced it earlier)
for p in model.classifier.parameters():
    p.requires_grad = True

# create optimizer that will only optimize parameters of the classifier head
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-3)

# run 5 epochs of training on only the classifier head (partial fine-tuning)
for epoch in range(5):
    acc = train_epoch(train_df, train_transform, optimizer)
    print(f"Partial FT Epoch {epoch+1}: Train Acc = {acc:.4f}")


Partial FT Epoch 1: Train Acc = 0.6812
Partial FT Epoch 2: Train Acc = 0.7261
Partial FT Epoch 3: Train Acc = 0.7362
Partial FT Epoch 4: Train Acc = 0.7447
Partial FT Epoch 5: Train Acc = 0.7474


In [63]:
# unfreeze all parameters so they can be trained now
for p in model.parameters():
    p.requires_grad = True

# create optimizer for the whole model parameters but at a lower learning rate
optimizer = optim.Adam(model.parameters(), lr=5e-5)

# run 5 epochs of full fine-tuning (now backbone + head will be updated)
for epoch in range(5):
    acc = train_epoch(train_df, train_transform, optimizer)
    print(f"Full FT Epoch {epoch+1}: Train Acc = {acc:.4f}")


Full FT Epoch 1: Train Acc = 0.8484
Full FT Epoch 2: Train Acc = 0.9111
Full FT Epoch 3: Train Acc = 0.9364
Full FT Epoch 4: Train Acc = 0.9487
Full FT Epoch 5: Train Acc = 0.9586
