## 📘 Soil Classification - PyTorch Notebook Overview

### 🔹 Cell 1 – Importing Libraries
Imports all essential libraries including **PyTorch**, **torchvision**, **scikit-learn**, **PIL**, and **tqdm**. These are used for:
- Data handling and preprocessing  
- Model building and evaluation  
- Image transformations and training utilities

In [None]:

# Core Python and data science libraries
import os
import pandas as pd
import numpy as np

# Sklearn utilities for evaluation and splitting
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Torchvision utilities for pretrained models and transforms
from torchvision import datasets, transforms, models

# PyTorch core libraries
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image  # For image processing

# PyTorch modules for model building and optimization
import torch.nn as nn
import torch.optim as optim

# tqdm for progress visualization
from tqdm import tqdm


### 🔹 Cell 2 – Device Setup and Path Configuration
- Sets up the computation device (GPU if available, else CPU)  
- Configures paths for:
  - Image directories  
  - CSV files for training and inference

In [None]:

# Redundant imports removed by kernel (kept for clarity during execution)
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image

# Select GPU if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)

# Define base paths to the dataset folders and CSV files
BASE_PATH = '/kaggle/input/soil-classification/soil_classification-2025'
TRAIN_DIR = os.path.join(BASE_PATH, 'train')
TEST_DIR = os.path.join(BASE_PATH, 'test')
LABELS_CSV = os.path.join(BASE_PATH, 'train_labels.csv')
TEST_IDS_CSV = os.path.join(BASE_PATH, 'test_ids.csv')


Using: cpu


### 🔹 Cell 3 – Data Preparation and Transformations
- Loads and encodes label data  
- Splits the dataset into training and validation sets  
- Applies `torchvision.transforms` for:
  - Data augmentation (training)
  - Normalization (both training and validation)

In [None]:
# Read the CSV file containing image labels
df = pd.read_csv(LABELS_CSV)

# Duplicate the image_id column to an 'image' column for convenience
df['image'] = df['image_id']

# Encode string labels (soil types) as integers
label_mapping = {label: idx for idx, label in enumerate(df['soil_type'].unique())}
inv_label_mapping = {v: k for k, v in label_mapping.items()}  # Inverse mapping for decoding predictions
df['label'] = df['soil_type'].map(label_mapping)

# Split dataset into training and validation sets using stratification
train_df, val_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)

# Define image transformations for training, validation, and testing
image_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3)
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3)
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3)
    ])
}


### 🔹 Cell 4 – Dataset Class and Model Setup
- Defines a custom PyTorch `Dataset` for loading and transforming image-label pairs  
- Creates DataLoaders for efficient batching  
- Loads a pretrained **ResNet-18** model  
- Modifies the final layer to match the number of soil classes


In [None]:
# Custom PyTorch dataset class for loading soil images and labels
class SoilDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None, is_test=False):
        self.df = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_id = self.df.iloc[idx]['image']
        img_path = os.path.join(self.img_dir, image_id)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.is_test:
            return image, image_id  # Return image ID for test set
        else:
            label = self.df.iloc[idx]['label']
            return image, label  # Return image and label for training/validation

# Initialize datasets and dataloaders
train_dataset = SoilDataset(train_df, TRAIN_DIR, transform=image_transforms['train'])
val_dataset = SoilDataset(val_df, TRAIN_DIR, transform=image_transforms['val'])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Load pretrained ResNet-18 model and modify final layer to match number of soil classes
from torchvision.models import resnet18, ResNet18_Weights
weights = ResNet18_Weights.DEFAULT
model = resnet18(weights=weights)
model.fc = nn.Linear(model.fc.in_features, len(label_mapping))  # Replace final layer
model = model.to(device)  # Move model to GPU/CPU

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 157MB/s]


### 🔹 Cell 5 – Training Loop with F1 Evaluation
- Implements the training loop with loss tracking  
- Evaluates on validation data each epoch  
- Computes:
  - Per-class F1 scores  
  - Minimum F1 score (used for competition evaluation)

In [None]:
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    # Training phase
    for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}'):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation phase
    model.eval()
    val_preds, val_labels = [], []

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = model(images)
            preds = outputs.argmax(1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels.numpy())

    # Compute per-class F1 scores
    f1_scores = []
    for i in range(len(label_mapping)):
        f1 = f1_score(np.array(val_labels) == i, np.array(val_preds) == i)
        f1_scores.append(f1)

    print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f}, Min F1: {min(f1_scores):.4f}, F1s: {f1_scores}")


Epoch 1/10: 100%|██████████| 33/33 [02:36<00:00,  4.74s/it]


Epoch 1 - Train Loss: 16.5100, Min F1: 0.8788, F1s: [0.954248366013072, 0.8787878787878789, 1.0, 0.9565217391304348]


Epoch 2/10: 100%|██████████| 33/33 [02:27<00:00,  4.47s/it]


Epoch 2 - Train Loss: 4.1323, Min F1: 0.9355, F1s: [0.9681528662420382, 0.9354838709677419, 1.0, 0.9855072463768115]


Epoch 3/10: 100%|██████████| 33/33 [02:26<00:00,  4.45s/it]


Epoch 3 - Train Loss: 3.7286, Min F1: 0.9508, F1s: [0.9746835443037974, 0.9508196721311476, 1.0, 0.9855072463768115]


Epoch 4/10: 100%|██████████| 33/33 [02:29<00:00,  4.54s/it]


Epoch 4 - Train Loss: 2.1773, Min F1: 0.9836, F1s: [0.9873417721518988, 0.9836065573770492, 1.0, 0.9855072463768115]


Epoch 5/10: 100%|██████████| 33/33 [02:28<00:00,  4.49s/it]


Epoch 5 - Train Loss: 1.8781, Min F1: 0.9474, F1s: [0.975, 0.9473684210526316, 1.0, 0.9577464788732395]


Epoch 6/10: 100%|██████████| 33/33 [02:28<00:00,  4.49s/it]


Epoch 6 - Train Loss: 1.7770, Min F1: 0.9855, F1s: [0.9937106918238994, 1.0, 1.0, 0.9855072463768115]


Epoch 7/10: 100%|██████████| 33/33 [02:28<00:00,  4.50s/it]


Epoch 7 - Train Loss: 1.6376, Min F1: 0.9831, F1s: [0.9875, 0.983050847457627, 1.0, 0.9855072463768115]


Epoch 8/10: 100%|██████████| 33/33 [02:27<00:00,  4.48s/it]


Epoch 8 - Train Loss: 1.0764, Min F1: 0.9836, F1s: [0.9873417721518988, 0.9836065573770492, 1.0, 0.9855072463768115]


Epoch 9/10: 100%|██████████| 33/33 [02:28<00:00,  4.50s/it]


Epoch 9 - Train Loss: 0.9340, Min F1: 0.9831, F1s: [0.9937106918238994, 0.983050847457627, 1.0, 1.0]


Epoch 10/10: 100%|██████████| 33/33 [02:27<00:00,  4.46s/it]


Epoch 10 - Train Loss: 0.8765, Min F1: 0.9524, F1s: [0.9806451612903226, 0.9523809523809523, 1.0, 1.0]


### 🔹 Cell 6 – Prediction and Submission File
- Runs inference on test images  
- Maps predicted numeric labels back to original soil types  
- Creates `submission.csv` file in the required format


In [None]:
# Load test IDs and prepare test dataset
test_ids = pd.read_csv(TEST_IDS_CSV)
test_ids['image'] = test_ids['image_id']
test_dataset = SoilDataset(test_ids, TEST_DIR, transform=image_transforms['test'], is_test=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Generate predictions
model.eval()
test_preds = []
image_names = []

with torch.no_grad():
    for images, image_ids in test_loader:
        images = images.to(device)
        outputs = model(images)
        preds = outputs.argmax(1).cpu().numpy()
        test_preds.extend(preds)
        image_names.extend(image_ids)

# Map numeric predictions back to original labels
final_labels = [inv_label_mapping[p] for p in test_preds]
submission = pd.DataFrame({
    'image_id': image_names,
    'soil_type': final_labels
})

# Save submission file
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv saved!")


✅ submission.csv saved!


### 🔹 Cell 7 – Notebook Completion Marker
- Marks successful pipeline execution  
- Outputs a final message indicating completion

In [7]:
import pandas as pd

# Load and display the submission file
submission = pd.read_csv('submission.csv')
submission.head(77)  # Show the first 10 predictions (you can change the number)

Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
...,...,...
72,img_64d9cdbe.jpg,Clay soil
73,img_5e5ff453.jpg,Clay soil
74,img_2c4f84e3.jpg,Clay soil
75,img_0a40bbe2.jpg,Clay soil
