In [None]:
# Training notebook for Soil Classification

# ============================ Imports ============================
import os
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# ============================ Device Setup ============================
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ============================ Load and Prepare Data ============================

# Read the label table (uses the `image_id` and `soil_type` columns)
train_df = pd.read_csv('/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv')

# Turn every image id into the full path of its file in the train folder
train_df['image_path'] = '/kaggle/input/soil-classification/soil_classification-2025/train/' + train_df['image_id']

# Encode soil types as integers, numbered in order of first appearance in the
# CSV; keep both directions of the mapping so predictions can be decoded later
index_to_label = dict(enumerate(train_df['soil_type'].unique()))
label_to_index = {label: idx for idx, label in index_to_label.items()}
train_df['label_idx'] = train_df['soil_type'].map(label_to_index)

# Hold out 10% of the rows for validation; stratifying on the encoded label
# keeps class proportions equal in both splits, and the fixed seed makes the
# split reproducible
train_data, val_data = train_test_split(
    train_df,
    stratify=train_df['label_idx'],
    test_size=0.1,
    random_state=42,
)

# ============================ Data Transforms ============================
# NOTE(review): neither pipeline normalises with ImageNet mean/std; if the
# pretrained ResNet weights are used, confirm whether that is intended.

# Validation: deterministic resize + tensor conversion only
val_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
])

# Training: same resize, plus random flip / rotation / colour augmentation
train_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
])

# ============================ Dataset Class ============================

class SoilDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs from a dataframe.

    Each row of ``dataframe`` must provide an ``image_path`` column (path of
    the image file to open) and a ``label_idx`` column (the sample's label).

    Args:
        dataframe: Table with one row per sample; rows are addressed by
            position (``iloc``), so the index values themselves do not matter.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of samples, i.e. the number of dataframe rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A ``(image, label)`` tuple: the RGB image (after ``transform`` if
            one was given) and the row's ``label_idx`` value.
        """
        # Fetch the row once instead of re-indexing the frame per column.
        row = self.dataframe.iloc[idx]

        # Image.open is lazy and keeps the file open; convert() forces the
        # pixel data to be read and returns an independent image, so the
        # underlying file can be closed right away instead of leaking a
        # handle per sample until garbage collection.
        with Image.open(row['image_path']) as source:
            image = source.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
        return image, row['label_idx']

# ============================ DataLoaders ============================
batch_size = 32

# One dataset per split, each paired with its own transform pipeline
train_dataset = SoilDataset(dataframe=train_data, transform=train_transforms)
val_dataset = SoilDataset(dataframe=val_data, transform=val_transforms)

# Only the training batches are reshuffled; validation order stays fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# ============================ Model Setup ============================
# Start from the stock ResNet-18 with randomly initialised weights. The
# classifier head is deliberately left untouched until after the checkpoint
# is loaded (see below).
model = models.resnet18()

# Load pretrained weights if available
pretrained_path = "/kaggle/input/resnet18/resnet18-f37072fd.pth"
if os.path.exists(pretrained_path):
    # Load onto the CPU; the complete model is moved to `device` afterwards.
    state_dict = torch.load(pretrained_path, map_location="cpu")
    # strict=False only tolerates missing/unexpected keys, NOT tensors whose
    # shapes differ. Loading must therefore happen while `fc` still has its
    # original shape, otherwise load_state_dict raises a size-mismatch
    # RuntimeError for fc.weight / fc.bias.
    # (presumably this file is the torchvision ImageNet checkpoint - confirm)
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"Pretrained weights loaded with missing keys {load_result.missing_keys} "
              f"and unexpected keys {load_result.unexpected_keys}")
    else:
        print(f"Loaded pretrained weights from {pretrained_path}")
else:
    print(f"No pretrained weights found at {pretrained_path}; training from scratch")

# Replace the classifier head with one output per soil class, then move the
# whole network to the target device.
model.fc = nn.Linear(model.fc.in_features, len(label_to_index))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# Created after the head swap so the new `fc` parameters are optimised too.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# ============================ Training Function ============================

def train_model(epochs=5):
    """Train the module-level ``model`` and checkpoint its best epoch.

    Every epoch runs one optimisation pass over ``train_loader`` followed by
    a gradient-free accuracy pass over ``val_loader``, then prints a summary
    line. Whenever validation accuracy exceeds the best value seen so far,
    the model's ``state_dict`` is written to ``best_soil_model.pth``.

    Relies on the module-level ``model``, ``train_loader``, ``val_loader``,
    ``criterion``, ``optimizer`` and ``device``.

    Args:
        epochs: Number of full passes over the training data.
    """
    best_val_acc = 0.0

    for epoch in range(epochs):
        # ---------------- training pass ----------------
        model.train()
        loss_sum, train_hits, train_seen = 0.0, 0, 0

        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(images)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            # Weight the batch loss by its batch size so that dividing by the
            # sample count below gives a per-sample figure (assumes the
            # criterion returns a batch mean).
            loss_sum += batch_loss.item() * images.size(0)
            predicted = torch.max(logits, 1)[1]
            train_hits += (predicted == labels).sum().item()
            train_seen += labels.size(0)

        train_loss = loss_sum / train_seen
        train_acc = train_hits / train_seen

        # ---------------- validation pass ----------------
        model.eval()
        val_hits, val_seen = 0, 0

        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                predicted = torch.max(model(images), 1)[1]
                val_hits += (predicted == labels).sum().item()
                val_seen += labels.size(0)

        val_acc = val_hits / val_seen

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
              f"Val Acc: {val_acc:.4f}")

        # Keep only the weights of the best-scoring epoch on disk
        if val_acc <= best_val_acc:
            continue
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_soil_model.pth")
        print("✅ Saved best model")

# ============================ Start Training ============================
# Train for 5 epochs; train_model writes "best_soil_model.pth" each time the
# validation accuracy improves on the best value seen so far.
train_model(epochs=5)
