
# Maize Fall Armyworm — Baseline (PyTorch, ResNet18, AUC)

Binary image classification to detect Fall Armyworm on maize leaves.  
This notebook is **Colab-ready** (works on Tesla K80) and conforms to the hackathon rules.


In [None]:
# =======================
# 0. Install dependencies
# =======================
!pip -q install -U tqdm pillow scikit-learn

In [None]:
# ===========
# 1. Imports
# ===========
import os, random
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [None]:
# ============
# 2. Settings
# ============
SEED = 1337


def seed_everything(seed=SEED):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with ``seed``, then switches cuDNN to deterministic mode with
    auto-tuning (``benchmark``) turned off.

    Args:
        seed: integer seed applied to every generator (defaults to ``SEED``).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything()

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device:", device)

In [None]:
# Fetch the hackathon repository; every path below points inside this clone.
!git clone https://github.com/Gaabshiine/pycon-2025-hackthon.git

# Locations of the image folder and the CSV files inside the clone
# (assumes the clone lands in /content, e.g. on Colab — TODO confirm elsewhere).
BASE_DIR   = Path('/content/pycon-2025-hackthon')
IMAGES_DIR = BASE_DIR / 'Images'
TRAIN_CSV  = BASE_DIR / 'Train.csv'
TEST_CSV   = BASE_DIR / 'Test.csv'
SAMPLE_SUB = BASE_DIR / 'SampleSubmission.csv'  # defined here but not read in this cell

# Load the train/test tables and preview the first training rows.
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)
print(train_df.head())

In [None]:
# ================
# 3. Quick EDA
# ================
# Count how many training rows carry each label and show them as a bar chart.
counts = train_df['Label'].value_counts()
counts.plot.bar(title="Class balance (0=Healthy, 1=Fall armyworm)")
plt.show()

In [None]:
# ==================
# 4. Dataset class
# ==================
class MaizeDataset(Dataset):
    """Image dataset backed by a DataFrame of file names (and labels).

    Args:
        df: table with an ``Image_id`` column holding the image file name;
            a ``Label`` column is also required unless ``mode == 'test'``.
        images_dir: directory that contains the image files.
        mode: ``'test'`` makes items ``(image, image_id)``; any other value
            makes items ``(image, label)``.
        transform: optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, images_dir, mode='train', transform=None):
        # Reset the index so positional lookups line up with 0..len-1
        # even when ``df`` is a shuffled split of a larger frame.
        self.df = df.reset_index(drop=True)
        self.images_dir = Path(images_dir)
        self.mode = mode
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(image, image_id)`` in test mode, otherwise ``(image, label)``
            where ``label`` is a scalar ``float32`` tensor.
        """
        row = self.df.iloc[idx]
        img_id = row['Image_id']  # full filename e.g. id_xxx.jpg
        img_path = self.images_dir / img_id

        # ``convert`` loads the pixels into a new in-memory image, so the
        # underlying file can be closed deterministically right after it.
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')

        if self.transform:
            img = self.transform(img)

        if self.mode == 'test':
            return img, img_id

        label = torch.tensor(float(row['Label']), dtype=torch.float32)
        return img, label

In [None]:
# ===========================
# 5. Data transforms & split
# ===========================
IMG_SIZE = 256  # T4-friendly
BATCH_SIZE = 64
NUM_WORKERS = 2

# Training pipeline: random crop / flip / colour jitter, then tensor + normalisation.
train_tfms = transforms.Compose([
    transforms.RandomResizedCrop(size=IMG_SIZE, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation/test pipeline: deterministic resize + centre crop, same normalisation.
valid_tfms = transforms.Compose([
    transforms.Resize(size=IMG_SIZE + 32),
    transforms.CenterCrop(size=IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Hold out 20% of the labelled rows for validation, keeping the class ratio.
train_split, valid_split = train_test_split(
    train_df,
    test_size=0.2,
    random_state=SEED,
    stratify=train_df['Label'],
)

# Both datasets yield (image, label); only the transform pipeline differs.
train_ds = MaizeDataset(train_split, IMAGES_DIR, mode='train', transform=train_tfms)
valid_ds = MaizeDataset(valid_split, IMAGES_DIR, mode='train', transform=valid_tfms)

train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS
)
valid_loader = DataLoader(
    valid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
)

In [None]:
# ================
# 6. Model
# ================
def build_model():
    """Return a ResNet18 with ImageNet weights and a single-output head.

    The stock classification layer is swapped for a ``Linear`` layer with
    one output, i.e. one logit per image.
    """
    net = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    net.fc = nn.Linear(net.fc.in_features, 1)
    return net

# Build the network, move it to the selected device and report its size.
model = build_model()
model = model.to(device)
print("Model params (M):", sum(param.numel() for param in model.parameters())/1e6)

In [None]:
# ========================
# 7. Training utilities
# ========================
class EarlyStopper:
    """Track a metric that should increase and signal when to stop training.

    Args:
        patience: number of consecutive non-improving ``step`` calls after
            which ``should_stop`` returns ``True``.
        min_delta: amount by which a value must exceed the best seen so far
            to count as an improvement (default ``1e-4``).
    """

    def __init__(self, patience=3, min_delta=1e-4):
        self.best = -np.inf  # best metric value seen so far
        self.wait = 0        # consecutive steps without improvement
        self.patience = patience
        self.min_delta = min_delta

    def step(self, val):
        """Record a new metric value.

        Returns:
            ``True`` if ``val`` beats the best value by more than
            ``min_delta`` (the wait counter is reset), else ``False``
            (the wait counter is incremented).
        """
        if val > self.best + self.min_delta:
            self.best = val
            self.wait = 0
            return True
        self.wait += 1
        return False

    def should_stop(self):
        """Return ``True`` once ``patience`` non-improving steps have accumulated."""
        return self.wait >= self.patience

def train_one_epoch(model, loader, optimizer, scaler):
    """Run one training pass over ``loader`` and return the mean loss.

    Args:
        model: network producing one logit per image.
        loader: iterable of ``(images, labels)`` batches; must expose
            ``loader.dataset`` so the loss can be averaged per sample.
        optimizer: optimizer updating ``model``'s parameters.
        scaler: gradient scaler used to scale the loss and step the optimizer.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    # Mixed precision is only enabled on CUDA. ``torch.autocast`` replaces the
    # deprecated ``torch.cuda.amp.autocast`` and stays silent when disabled.
    use_amp = torch.device(device).type == 'cuda'
    total_loss = 0.0
    for imgs, labels in tqdm(loader, leave=False):
        # Labels arrive as a flat batch; add a trailing dim to match the logits.
        imgs, labels = imgs.to(device), labels.to(device).unsqueeze(1)
        optimizer.zero_grad()
        with torch.autocast(device_type='cuda', enabled=use_amp):
            logits = model(imgs)
            loss = F.binary_cross_entropy_with_logits(logits, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Weight by batch size so a smaller final batch does not skew the mean.
        total_loss += loss.item() * imgs.size(0)
    return total_loss / len(loader.dataset)

@torch.no_grad()
def evaluate(model, loader):
    """Return the ROC AUC of ``model`` over every batch in ``loader``.

    The model is put in eval mode, sigmoid probabilities are collected
    batch by batch, and the flattened probabilities are scored against
    the flattened labels with ``roc_auc_score``.
    """
    model.eval()
    prob_chunks = []
    target_chunks = []
    for batch_imgs, batch_labels in loader:
        batch_imgs = batch_imgs.to(device)
        batch_labels = batch_labels.to(device).unsqueeze(1)
        batch_probs = torch.sigmoid(model(batch_imgs))
        prob_chunks.append(batch_probs.cpu().numpy())
        target_chunks.append(batch_labels.cpu().numpy())
    y_score = np.concatenate(prob_chunks).ravel()
    y_true = np.concatenate(target_chunks).ravel()
    return roc_auc_score(y_true, y_score)

In [None]:
# ==================
# 8. Train loop
# ==================
best_path = BASE_DIR / 'best_resnet18.pt'

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scaler = torch.cuda.amp.GradScaler()
early = EarlyStopper(patience=3)

# Train for up to 15 epochs, checkpointing whenever validation AUC improves
# and stopping once the early stopper runs out of patience.
for epoch in range(1, 16):
    loss = train_one_epoch(model, train_loader, optimizer, scaler)
    val_auc = evaluate(model, valid_loader)
    print(f"Epoch {epoch}: loss={loss:.4f}, val_auc={val_auc:.4f}")

    improved = early.step(val_auc)
    if improved:
        torch.save(model.state_dict(), best_path)
        print("  ✓ Saved best model")

    if early.should_stop():
        break

In [None]:
# ==================
# 9. Inference
# ==================
# Restore the best checkpoint written by the training loop.
best_state = torch.load(best_path, map_location=device)
model.load_state_dict(best_state)
model.eval()

test_ds = MaizeDataset(test_df, IMAGES_DIR, mode='test', transform=valid_tfms)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)


@torch.no_grad()
def predict_tta(imgs):
    """Return probabilities from the mean logit of a batch and its flip along dim 3."""
    plain_logits = model(imgs)
    flipped_logits = model(torch.flip(imgs, dims=[3]))
    return torch.sigmoid((plain_logits + flipped_logits) / 2)


probs, ids = [], []
for imgs, img_ids in tqdm(test_loader):
    batch_probs = predict_tta(imgs.to(device)).cpu().numpy().ravel()
    probs.extend(batch_probs)
    ids.extend(img_ids)

# One row per test image, in the order the loader produced them.
sub = pd.DataFrame({'Image_id': ids, 'Label': probs})
sub.to_csv(BASE_DIR / 'submission.csv', index=False)
print(sub.head())
print("Saved submission.csv")

In [None]:
# ==================
# 10. Download file
# ==================
# Colab-only helper: hand the submission CSV written under BASE_DIR to the
# browser for download. The import is kept in this cell, next to its one use.
from google.colab import files
files.download(str(BASE_DIR/'submission.csv'))