# Soil Image Classification Challenge 2025
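This notebook contains the full pipeline for the challenge: loading the data, augmenting images with Albumentations, fine-tuning a ResNet-18 binary classifier in PyTorch to identify soil images, running inference on the test set, and writing the submission file.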

## 1. Imports and Setup

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models
import albumentations as A
from albumentations.pytorch import ToTensorV2

## 2. Load and Prepare Dataset

In [None]:
# Root of the competition data. The CSV files and both image folders live
# under it, so it is spelled out once instead of in every path literal.
DATA_DIR = '/kaggle/input/soil-classification-part-2/soil_competition-2025'

# Load CSV files
train_csv = f'{DATA_DIR}/train_labels.csv'
test_csv = f'{DATA_DIR}/test_ids.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Add image paths: <DATA_DIR>/<split>/<image_id>
train_df['image_path'] = f'{DATA_DIR}/train/' + train_df['image_id']
test_df['image_path'] = f'{DATA_DIR}/test/' + test_df['image_id']

# Convert labels to int
train_df['label'] = train_df['label'].astype(int)

# Stratified split: hold out 10% of the labelled rows for validation while
# keeping the label ratio the same in both parts. The fixed seed makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df['label'],
    random_state=42,
)

## 3. Define Data Augmentations and Dataset

In [None]:
# Training pipeline: resize to the 224x224 network input, apply light random
# augmentation, then normalise (library-default statistics) and convert the
# image to a tensor.
train_transform = A.Compose(
    [
        A.Resize(height=224, width=224),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
        A.Rotate(limit=15, p=0.5),
        A.Normalize(),
        ToTensorV2(),
    ]
)

# Validation / test pipeline: the same resize + normalise + tensor steps, with
# no random augmentation so evaluation is deterministic.
val_test_transform = A.Compose(
    [
        A.Resize(height=224, width=224),
        A.Normalize(),
        ToTensorV2(),
    ]
)

class SoilDataset(Dataset):
    """Dataset that loads images listed in a DataFrame.

    Each row of ``df`` must provide an ``image_path`` column; unless
    ``is_test`` is set, it must also provide a ``label`` column.
    """

    def __init__(self, df, transform=None, is_test=False):
        """Store the frame and loading options.

        Args:
            df: DataFrame with one row per image.
            transform: Optional callable invoked as ``transform(image=array)``
                that returns a mapping with an ``'image'`` entry.
            is_test: When true, items are returned without a label.
        """
        self.df = df
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (images) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position ``idx``.

        Returns:
            The (optionally transformed) image when ``is_test`` is true,
            otherwise an ``(image, label)`` pair where ``label`` is a
            ``torch.long`` tensor.
        """
        # Positional lookup done once; the row is reused for the label below.
        row = self.df.iloc[idx]

        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the array.
        with Image.open(row['image_path']) as img:
            image = np.array(img.convert('RGB'))

        # Explicit None check: a transform object that happens to be falsy
        # (e.g. a zero-length pipeline) must still be applied.
        if self.transform is not None:
            image = self.transform(image=image)['image']

        if self.is_test:
            return image

        return image, torch.tensor(int(row['label']), dtype=torch.long)

## 4. Create DataLoaders

In [None]:
from torch.utils.data import DataLoader

# Wrap each split in a SoilDataset: augmentation only for training, and no
# labels for the test split.
train_dataset = SoilDataset(train_data, transform=train_transform)
val_dataset = SoilDataset(val_data, transform=val_test_transform)
test_dataset = SoilDataset(test_df, transform=val_test_transform, is_test=True)

# Only the training loader shuffles; validation and test keep row order, so
# test predictions line up with the rows of test_df.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## 5. Model Initialization

In [None]:
import torch.nn as nn
from torchvision import models

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ResNet-18 with the default pretrained weights; the final fully connected
# layer is replaced by a fresh two-output head.
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
fc_in_features = model.fc.in_features
model.fc = nn.Linear(fc_in_features, 2)  # Binary classification
model = model.to(device)

# Cross-entropy over the two logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

## 6. Training Function

In [None]:
from sklearn.metrics import f1_score

def train_model(epochs=5):
    """Train the module-level ``model`` and checkpoint the best epoch.

    Uses the module-level ``model``, ``train_loader``, ``val_loader``,
    ``criterion``, ``optimizer`` and ``device``. After every epoch the model
    is evaluated on ``val_loader``; whenever the validation F1 improves, the
    weights are written to ``best_soil_model.pth``.

    Args:
        epochs: Number of passes over ``train_loader``.

    Returns:
        The best validation F1 seen, or ``None`` if no epoch was run.
    """
    # None (rather than 0.0) so the first evaluated epoch is always saved,
    # even when its F1 is exactly 0.0. Otherwise a later load of the
    # checkpoint could find no file at all.
    best_f1 = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # loss.item() is the batch mean; weight it by the batch size so
            # the epoch figure below is a true per-sample average.
            total_loss += loss.item() * images.size(0)
            all_preds.extend(torch.argmax(outputs, 1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        train_f1 = f1_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}: Train Loss = {total_loss/len(train_loader.dataset):.4f}, Train F1 = {train_f1:.4f}")

        # Validation: eval mode and no gradient tracking.
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                preds = torch.argmax(outputs, 1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_f1 = f1_score(val_labels, val_preds)
        print(f"Validation F1: {val_f1:.4f}")

        if best_f1 is None or val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_soil_model.pth")
            print("✅ New best model saved")

    return best_f1

## 7. Train the Model

In [None]:
# Fine-tune for 5 epochs; train_model saves the best-validation-F1 weights to
# "best_soil_model.pth", which the inference cell loads afterwards.
train_model(epochs=5)

## 8. Inference on Test Set

In [None]:
# Restore the best checkpoint written during training. map_location remaps
# the stored tensors onto the current device, so a checkpoint saved on a GPU
# can still be loaded when only a CPU is available.
model.load_state_dict(torch.load("best_soil_model.pth", map_location=device))
model.eval()

# Predict a class index (argmax over the two logits) for every test image.
# test_loader does not shuffle, so predictions follow the order of test_df.
test_predictions = []
with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = model(images)
        preds = torch.argmax(outputs, 1)
        test_predictions.extend(preds.cpu().numpy())

## 9. Prepare Submission File

In [None]:
# Map predicted class indices to the string labels written to the file.
idx2label_bin = {0: "0", 1: "1"}

predicted_labels = []
for pred in test_predictions:
    predicted_labels.append(idx2label_bin[pred])
test_df['label'] = predicted_labels

# Keep only the two columns that go into the submission file.
submission_columns = ['image_id', 'label']
submission = test_df[submission_columns]
submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as submission.csv")