In [None]:
## 📦 Import Libraries

# Importing standard Python libraries
import os  # Provides functions to interact with the operating system
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations and working with arrays
from PIL import Image  # For image loading and preprocessing

# PyTorch libraries for deep learning
import torch  # Core PyTorch functionality
from torch.utils.data import Dataset, DataLoader  # For dataset management and batch loading
import torch.nn as nn  # For building neural network layers
import torch.optim as optim  # For optimization algorithms like SGD, Adam, etc.

# TorchVision libraries for pretrained models and image transformations
from torchvision import transforms, models  # `transforms` for image preprocessing, `models` for pretrained CNNs

# Scikit-learn for preprocessing and evaluation
from sklearn.model_selection import train_test_split  # For splitting the dataset into train and validation sets
from sklearn.preprocessing import LabelEncoder  # For converting class labels to integers
from sklearn.metrics import f1_score  # F1 score is the competition metric

# Set random seed for reproducibility
SEED = 42
torch.manual_seed(SEED)  # Set PyTorch seed
np.random.seed(SEED)  # Set NumPy seed

# Pick the compute device: use the GPU when PyTorch can see one, otherwise fall
# back to the CPU so the notebook still runs unchanged in CPU-only sessions.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Root directory of the competition dataset
DATA_DIR = '/kaggle/input/soil-classification-part-2/soil_competition-2025'

print("Setup done.")  # Confirm environment is initialized


In [None]:
## 📂 Load Data & Encode Labels

# CELL 2: Load data and fix labels to zero-based indexing

# Read the training labels CSV that sits at the root of the data directory
labels_csv_path = os.path.join(DATA_DIR, 'train_labels.csv')
train_df = pd.read_csv(labels_csv_path)

# Show the raw label values before any encoding is applied
print("Original labels:", train_df['label'].unique())

# Fit the encoder on the raw labels, then overwrite the column with the
# resulting integer codes (0 .. n_classes-1) used as classification targets.
# `le` is kept so predictions can be mapped back to the raw labels later.
le = LabelEncoder()
encoded_labels = le.fit_transform(train_df['label'])
train_df['label'] = encoded_labels

# Count the distinct encoded classes; this sizes the model's output layer
NUM_CLASSES = train_df['label'].nunique()
print(f"Number of classes after label encoding: {NUM_CLASSES}")

# Show the encoded label values as a sanity check
print("Labels after encoding:", train_df['label'].unique())


In [None]:
# CELL 3: Custom Dataset class for loading soil images and labels with transforms

from torch.utils.data import Dataset
from PIL import Image
import torch
import os

class SoilDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs for labelled images.

    Every row of the supplied DataFrame names one image file (``image_id``) and
    carries its class (``label``).
    """

    def __init__(self, df, root_dir, transform=None):
        """
        Args:
            df (pd.DataFrame): DataFrame containing 'image_id' and 'label' columns
            root_dir (str): Directory where images are stored
            transform (callable, optional): Callable applied to the loaded PIL image
        """
        # Rebuild a 0..n-1 index so the integer positions passed to
        # __getitem__ are valid `.loc` keys even for a shuffled/split frame.
        self.df = df.reset_index(drop=True)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the sample to retrieve

        Returns:
            image: The RGB image, passed through ``transform`` when one was given
            label (Tensor): Corresponding label as a 0-dim tensor of dtype ``torch.long``
        """
        img_name = self.df.loc[idx, 'image_id']
        label = self.df.loc[idx, 'label']

        img_path = os.path.join(self.root_dir, img_name)

        # Open inside a context manager so the underlying file handle is always
        # released; `convert` returns an independent image that outlives it.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # Explicit None check: a callable transform may still evaluate as falsy
        if self.transform is not None:
            image = self.transform(image)

        # Build the integer target directly with the required dtype
        label = torch.tensor(int(label), dtype=torch.long)

        return image, label


In [None]:
# CELL 4: Define data transformations and create train-validation split with DataLoaders

from torchvision import transforms
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Preprocessing steps shared by the training and validation pipelines
resize_step = transforms.Resize((224, 224))  # every image becomes 224x224
to_tensor_step = transforms.ToTensor()  # PIL image -> float tensor
normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],  # per-channel mean (ImageNet statistics)
    std=[0.229, 0.224, 0.225],   # per-channel std (ImageNet statistics)
)

# Training pipeline: shared preprocessing plus random horizontal/vertical flips
transform_train = transforms.Compose([
    resize_step,
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    to_tensor_step,
    normalize_step,
])

# Validation pipeline: deterministic, no augmentation
transform_val = transforms.Compose([resize_step, to_tensor_step, normalize_step])

# Stratified 80/20 split so both parts keep the label proportions of train_df
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=SEED,
    stratify=train_df['label'],
)

# Both splits read their images from the same 'train' folder
train_image_dir = os.path.join(DATA_DIR, 'train')
train_dataset = SoilDataset(train_data, train_image_dir, transform=transform_train)
val_dataset = SoilDataset(val_data, train_image_dir, transform=transform_val)

# Batch loaders: only the training loader shuffles its samples
loader_options = dict(batch_size=32, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Report the split sizes
print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")


In [None]:
# CELL 5: Define model architecture without pretrained weights

from torchvision import models
import torch.nn as nn

# Build a ResNet18 with randomly initialised weights (weights=None means no
# pretrained checkpoint is loaded).
model = models.resnet18(weights=None)

# Swap the classification head: keep its input width, emit one score per class
head_in_features = model.fc.in_features
model.fc = nn.Linear(head_in_features, NUM_CLASSES)

# Place the whole network on the configured device
model = model.to(DEVICE)

print("Model created.")


In [None]:
# CELL 6: Define loss function, optimizer, and evaluation metric

import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

# Loss for multi-class classification on raw model outputs and integer targets
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters with a step size of 1e-3
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def calculate_min_f1(y_true, y_pred, labels=None):
    """
    Calculate the minimum F1 score across all classes.
    This is the competition evaluation metric, ensuring balanced performance.

    Args:
        y_true (array-like): Ground truth labels
        y_pred (array-like): Predicted labels
        labels (array-like, optional): Classes to score. When omitted, only
            classes that occur in ``y_true`` or ``y_pred`` are scored, so a
            class missing from both is ignored; pass the full class list to
            make sure every class takes part in the minimum.

    Returns:
        float: Minimum class-wise F1 score

    Raises:
        ValueError: If ``y_true`` is empty (there is nothing to score).
    """
    if len(y_true) == 0:
        raise ValueError("calculate_min_f1 requires at least one sample")

    # average=None yields one F1 value per class. zero_division=0 scores a
    # class with no predicted (or no true) samples as 0 without emitting a
    # warning for it on every call.
    f1_scores = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0)

    # The worst-performing class determines the metric
    return float(f1_scores.min())


In [None]:
# CELL 7: Training loop with validation and minimum F1 evaluation

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10):
    """
    Train the model and evaluate on the validation set after every epoch.

    Args:
        model (nn.Module): The neural network model to train
        criterion (loss): Loss function (e.g., CrossEntropyLoss). The reported
            epoch loss assumes the criterion returns a per-batch mean.
        optimizer (optim): Optimizer (e.g., Adam)
        train_loader (DataLoader): Iterable of (images, labels) training batches
        val_loader (DataLoader): Iterable of (images, labels) validation batches
        epochs (int): Number of training epochs

    Returns:
        list[dict]: One entry per epoch with the keys 'epoch' (1-based),
        'loss' (average training loss per sample) and 'min_f1' (validation
        score returned by calculate_min_f1).

    Prints:
        Epoch number, average training loss, and minimum validation F1 score per epoch
    """
    history = []

    for epoch in range(epochs):
        model.train()  # Training mode for layers that behave differently (dropout, batchnorm)

        loss_sum = 0.0     # Sum of per-sample losses seen this epoch
        samples_seen = 0   # Number of training samples seen this epoch

        for images, labels in train_loader:
            # Move data to the computation device (CPU/GPU)
            images, labels = images.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()  # Clear gradients from previous step
            outputs = model(images)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            # Weight each batch mean by its size so a smaller final batch does
            # not count as much as a full one in the epoch average.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            samples_seen += batch_size

        # An empty loader yields NaN instead of raising ZeroDivisionError
        avg_loss = loss_sum / samples_seen if samples_seen else float('nan')

        # Validation phase
        model.eval()  # Evaluation mode (fixed batchnorm statistics, no dropout)

        all_preds = []   # Predicted class indices for all validation samples
        all_labels = []  # True class indices for all validation samples

        with torch.no_grad():  # No gradient tracking needed for evaluation
            for images, labels in val_loader:
                images = images.to(DEVICE)
                outputs = model(images)
                preds = torch.max(outputs, 1).indices  # Highest-scoring class per sample

                # Plain Python ints are enough for the metric computation
                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

        # Minimum class-wise F1 on the validation set
        min_f1 = calculate_min_f1(all_labels, all_preds)

        # Print training stats for the current epoch
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Min F1: {min_f1:.4f}")

        history.append({'epoch': epoch + 1, 'loss': avg_loss, 'min_f1': min_f1})

    return history

# Run 10 training epochs; the trailing semicolon keeps the notebook from
# echoing whatever the call returns.
train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
);


In [None]:
# Step 1: Load test images using a custom Dataset

from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class TestDataset(Dataset):
    """Map-style dataset over an unlabelled image folder.

    Yields ``(image, img_id)`` pairs, where ``img_id`` is the image's file name.
    """

    def __init__(self, image_dir, transform=None):
        """
        Args:
            image_dir (str): Path to directory containing test images
            transform (callable, optional): Transformations to apply to each image
        """
        self.image_dir = image_dir
        # Keep regular files only: sub-directories (for example notebook
        # checkpoint folders) cannot be opened as images. Sorting gives a
        # stable order across runs.
        self.image_ids = sorted(
            name for name in os.listdir(image_dir)
            if os.path.isfile(os.path.join(image_dir, name))
        )
        self.transform = transform

    def __len__(self):
        """Return the number of test images."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the image to retrieve

        Returns:
            image: The RGB image, passed through ``transform`` when one was given
            img_id (str): Corresponding image file name
        """
        img_id = self.image_ids[idx]
        img_path = os.path.join(self.image_dir, img_id)

        # Open inside a context manager so the underlying file handle is always
        # released; `convert` returns an independent image that outlives it.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # Explicit None check: a callable transform may still evaluate as falsy
        if self.transform is not None:
            image = self.transform(image)

        return image, img_id

# Test images live in the 'test' folder and get the same deterministic
# preprocessing as the validation split (no augmentation).
test_image_dir = os.path.join(DATA_DIR, 'test')
test_dataset = TestDataset(test_image_dir, transform=transform_val)

# Batches of 32 in dataset order (no shuffling), loaded by 2 worker processes
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=2,
)


In [None]:
# Step 2: Predict on test data using trained model

# Switch the network to evaluation mode before running inference
model.eval()

test_preds, image_ids = [], []  # predicted class indices / matching file names

# No gradients are needed for inference
with torch.no_grad():
    for batch_images, batch_ids in test_loader:
        logits = model(batch_images.to(DEVICE))  # forward pass on the device
        batch_preds = torch.max(logits, 1).indices  # highest-scoring class per image
        test_preds.extend(batch_preds.cpu().numpy())  # back to CPU for storage
        image_ids.extend(batch_ids)  # file names, in loader order

# Map the integer class indices back to the original label values
pred_labels = le.inverse_transform(test_preds)


In [None]:
# Step 3: Prepare submission CSV file

import pandas as pd

# Assemble the submission table: one row per test image
submission_columns = {
    'image_id': image_ids,  # test image file names
    'label': pred_labels,   # labels after inverse transform
}
submission_df = pd.DataFrame(submission_columns)

# Write the table without the index column
submission_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_path, index=False)
print("Submission file saved as /kaggle/working/submission.csv")

# Show the first rows as a quick visual check
submission_df.head()
