In [None]:
"""

Author: Annam.ai IIT Ropar
Team Name: Ice 'N' Dagger
Team Members: Barun Saha, Bibaswan Das
Leaderboard Rank: 70 

"""

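# Training notebook: fine-tunes a pretrained timm model on the soil-classification
# training images and saves the checkpoint with the best minimum per-class F1 score.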

In [None]:
# training.ipynb
!pip install -q torch torchvision timm pandas scikit-learn joblib

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import timm
import numpy as np
from sklearn.metrics import f1_score
from tqdm import tqdm
import joblib

from preprocessing import SoilDataset, train_transform, test_transform

# Run configuration
# Where the training images live and where the best checkpoint is written
TRAIN_DIR = "/kaggle/input/soil-classification/soil_classification-2025/train"
BEST_MODEL_PATH = "best_model.pth"

# Architecture name handed to timm.create_model
MODEL_NAME = "swin_base_patch4_window7_224"

# Training schedule
NUM_EPOCHS = 20
BATCH_SIZE = 32

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Load encoded data
# encoded_train.csv must contain a 'label' column; it drives the stratified split below.
df = pd.read_csv("encoded_train.csv")
# NOTE(review): joblib.load unpickles arbitrary objects. Only load a
# label_encoder.pkl that comes from a trusted source (presumably the
# preprocessing step of this project — confirm).
label_encoder = joblib.load("label_encoder.pkl")

# Data
# Fix the split seed so the train/validation partition, and therefore the
# validation metric used to pick the best checkpoint, is the same on every run.
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=SPLIT_SEED,
)
# Training samples use train_transform; validation samples use test_transform.
train_dataset = SoilDataset(train_df, TRAIN_DIR, transform=train_transform)
val_dataset = SoilDataset(val_df, TRAIN_DIR, transform=test_transform)
# Only the training loader shuffles; the validation metrics do not depend on order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Model
# Build the pretrained network with one output per class known to the label
# encoder and move it to the selected device in the same expression
# (Module.to returns the module itself).
model = timm.create_model(
    MODEL_NAME,
    pretrained=True,
    num_classes=len(label_encoder.classes_),
).to(DEVICE)

# Loss and optimizer
# Balanced class weights computed from the training split only. np.unique returns
# the labels sorted, so the weights are ordered by ascending label value.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_df['label']), y=train_df['label'])
# The loss needs exactly one weight per model output; fail early with a clear
# message instead of a shape error inside the first training step.
if len(class_weights) != len(label_encoder.classes_):
    raise ValueError(
        f"Expected {len(label_encoder.classes_)} class weights but computed {len(class_weights)}; "
        "some encoded classes are missing from the training split."
    )
# Pass the weights to the loss so under-represented classes are not ignored
# (previously they were computed but never used). compute_class_weight returns a
# float64 array, so convert to a float32 tensor on the same device as the logits.
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(class_weights, dtype=torch.float32, device=DEVICE)
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
# Each epoch: one optimisation pass over train_loader, then an evaluation pass over
# val_loader. The checkpoint with the highest minimum per-class F1 is kept.
#
# Start below any attainable F1 so the first epoch always writes a checkpoint.
# With an initial value of 0.0 and the strict `>` below, a run whose minimum
# per-class F1 stays at 0.0 would never save a model.
best_min_f1 = float("-inf")
# Loop-invariant: the full set of class indices, so f1_score reports a score for
# every class even when one is absent from the validation predictions/labels.
class_ids = list(range(len(label_encoder.classes_)))
for epoch in range(NUM_EPOCHS):
    model.train()
    correct = 0
    # The dataset yields triples; the third item is not needed for training.
    for images, labels, _ in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Running count of correct predictions, taken from the training-mode forward pass
        correct += (outputs.argmax(dim=1) == labels).sum().item()

    train_acc = correct / len(train_loader.dataset)

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for images, labels, _ in val_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # average=None returns one F1 per class; the worst class drives model selection
    f1_scores = f1_score(val_labels, val_preds, average=None, labels=class_ids)
    min_f1 = f1_scores.min()
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Acc: {train_acc:.4f}, Min F1 Score: {min_f1:.4f}")

    # Keep only the weights of the best epoch so far
    if min_f1 > best_min_f1:
        best_min_f1 = min_f1
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"Saved new best model with min F1: {best_min_f1:.4f}")