In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ===============================
# Soil Image Classification Challenge
# Author: Siddharth Chhoria
# Date: [24/05/25]
# ===============================

# --- 1. Library Imports ---
# Import all necessary libraries for data manipulation, model building, and evaluation.

import numpy as np                    # For numerical operations
import pandas as pd                   # For dataframes and data analysis
import torch                          # For PyTorch tensor operations and model building
import torch.nn as nn                 # For neural network layers and loss functions
from torch.utils.data import Dataset, DataLoader  # For custom dataset and batching
import torchvision.transforms as transforms       # For image augmentations and preprocessing
import torchvision.models as models              # For pretrained CNN architectures
from PIL import Image                 # For image loading and processing
from sklearn.metrics import f1_score  # For model evaluation using F1-score
from tqdm.notebook import tqdm        # For progress bars in notebook cells
import random                        # For reproducibility and randomization


In [None]:
# ============================================
# Utility Function: Set Random Seed for Reproducibility
# ============================================
# Setting a random seed ensures that your results are reproducible.
# This function sets the seed for Python's random module, NumPy, and PyTorch (both CPU and all GPUs).
# Using the same seed each run will produce the same random numbers, which is important for debugging and sharing results.

def seed_everything(seed=42):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator and PyTorch (CPU and every CUDA device).
    """
    random.seed(seed)                  # Python's built-in random module
    np.random.seed(seed)               # NumPy's global random number generator
    torch.manual_seed(seed)            # PyTorch (CPU)
    torch.cuda.manual_seed_all(seed)   # All CUDA devices; silently ignored without CUDA
    # Seeding alone does not make GPU runs repeatable: cuDNN may select
    # non-deterministic kernels and autotune between algorithms. Force the
    # deterministic implementations and turn the autotuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Call the function at the start of your notebook or script to ensure reproducibility
seed_everything()


In [None]:
import os
# Show the competition dataset layout: each directory on its own line,
# followed by the names of the files it contains, indented.
for folder, _subdirs, files in os.walk('/kaggle/input/soil-classification/soil_classification-2025'):
    print(folder)
    for fname in files:
        print("   ", fname)


In [None]:
# Root folder of the competition dataset. Every path below is derived from it,
# so pointing the notebook at a different copy of the data is a one-line change.
DATA_ROOT = '/kaggle/input/soil-classification/soil_classification-2025'

train_csv = f'{DATA_ROOT}/train_labels.csv'
test_csv = f'{DATA_ROOT}/test_ids.csv'
train_dir = f'{DATA_ROOT}/train'
test_dir = f'{DATA_ROOT}/test'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Fail fast with a readable message if the CSVs lack the columns that the
# label mapping ('soil_type') and the dataset class ('image_id') read later on.
_missing_train = {'image_id', 'soil_type'} - set(train_df.columns)
if _missing_train:
    raise ValueError(f"{train_csv} is missing required column(s): {sorted(_missing_train)}")
if 'image_id' not in test_df.columns:
    raise ValueError(f"{test_csv} is missing required column: 'image_id'")

print(train_df.head())
print(test_df.head())


In [None]:
# A missing soil_type would map to NaN below and turn the whole label column
# into floats, so stop here with a clear message instead.
if train_df['soil_type'].isna().any():
    raise ValueError("train_df['soil_type'] contains missing values; cannot build the label mapping")

# unique() returns values in order of first appearance, which would tie the
# class indices to the row order of the CSV. Sorting gives every class the
# same index no matter how the file is ordered.
soil_types = sorted(train_df['soil_type'].unique())
label2idx = {label: idx for idx, label in enumerate(soil_types)}
idx2label = {idx: label for label, idx in label2idx.items()}

# Integer class index for every training row.
train_df['label'] = train_df['soil_type'].map(label2idx)
print(label2idx)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the labelled rows for validation, keeping the class
# proportions of the 'label' column in both parts (fixed random_state).
train_df_, val_df = train_test_split(
    train_df,
    test_size=0.15,
    random_state=42,
    stratify=train_df['label'],
)
print(f"Train size: {len(train_df_)}, Validation size: {len(val_df)}")


In [None]:
import torchvision.transforms as transforms

IMG_SIZE = 224

# Per-channel mean/std applied by both pipelines (these are the usual
# ImageNet statistics used with torchvision's pretrained models).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]
_TARGET_SHAPE = (IMG_SIZE, IMG_SIZE)

# Training pipeline: resize, random flips and colour jitter, then tensor + normalize.
train_transform = transforms.Compose(
    [
        transforms.Resize(_TARGET_SHAPE),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Evaluation pipeline: same resize and normalization, no random augmentation.
val_transform = transforms.Compose(
    [
        transforms.Resize(_TARGET_SHAPE),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)


In [None]:
from torch.utils.data import Dataset
from PIL import Image
import os

class SoilDataset(Dataset):
    """Image dataset backed by a DataFrame of image ids.

    Args:
        df: DataFrame with an 'image_id' column and, unless ``is_test`` is
            True, a 'label' column. Rows are addressed by position.
        img_dir: Directory that 'image_id' values are joined onto to form
            the image file path.
        transform: Optional callable applied to the RGB PIL image.
        is_test: When True, items are ``(image, image_id)`` instead of
            ``(image, label)``.
    """

    def __init__(self, df, img_dir, transform=None, is_test=False):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return it with its label or id."""
        # Fetch the row once instead of building a Series per column access.
        row = self.df.iloc[idx]
        img_id = row['image_id']
        img_path = os.path.join(self.img_dir, img_id)
        # The context manager closes the underlying file deterministically;
        # convert() returns a separate image that remains valid afterwards.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        # Compare against None: a transform object that defines __len__ or
        # __bool__ could be falsy and would otherwise be skipped silently.
        if self.transform is not None:
            image = self.transform(image)
        if self.is_test:
            return image, img_id
        # Plain Python int so the default collate yields an integer tensor.
        return image, int(row['label'])


In [None]:
from torch.utils.data import DataLoader

# Both splits read their images from train_dir; only the transform differs.
train_dataset = SoilDataset(df=train_df_, img_dir=train_dir, transform=train_transform)
val_dataset = SoilDataset(df=val_df, img_dir=train_dir, transform=val_transform)

# Settings shared by both loaders; only the training loader shuffles.
_loader_opts = {'batch_size': 32, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_opts)


In [None]:
import torch
import torch.nn as nn
from torchvision import models


# ResNet-18 checkpoint read from a local input path (nothing is downloaded).
local_weights_path = '/kaggle/input/resnet/resnet18-f37072fd.pth'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the bare architecture, then fill it from the checkpoint on disk.
model = models.resnet18(weights=None)
# NOTE(review): torch.load unpickles the file. Only load checkpoints from a
# trusted source; consider weights_only=True on torch versions that support it.
state_dict = torch.load(local_weights_path, map_location='cpu')
model.load_state_dict(state_dict)

# Size the classifier head from the label mapping instead of a hard-coded 4,
# so the model always matches the number of classes found in the training CSV.
NUM_CLASSES = len(label2idx)
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
model = model.to(device)


In [None]:
import torch.optim as optim

# Cross-entropy over the model's class logits.
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter with a learning rate of 1e-4.
optimizer = optim.Adam(params=model.parameters(), lr=0.0001)


In [None]:
from sklearn.metrics import f1_score
from tqdm import tqdm

def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: Network to train; switched to training mode.
        loader: Iterable yielding ``(images, labels)`` tensor batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Sample-weighted mean loss over the batches actually processed, or
        ``0.0`` when the loader yields no batches.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0
    for images, labels in tqdm(loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        batch_size = images.size(0)
        # Weight by batch size so a smaller final batch is not over-counted.
        running_loss += loss.item() * batch_size
        samples_seen += batch_size
    # Divide by the samples actually processed rather than len(loader.dataset):
    # the two differ with drop_last or a sampler, and this also avoids a
    # ZeroDivisionError on an empty loader.
    if samples_seen == 0:
        return 0.0
    return running_loss / samples_seen

def validate(model, loader, num_classes=None):
    """Evaluate ``model`` on ``loader`` and report per-class F1 scores.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Iterable yielding ``(images, labels)`` tensor batches.
        num_classes: Number of classes to score. When omitted it is taken
            from the width of the model's output, so the function follows
            the model instead of assuming four classes.

    Returns:
        Tuple ``(min_f1, f1s)`` where ``f1s`` is the per-class F1 array for
        class indices ``0 .. num_classes - 1`` and ``min_f1`` is its minimum.
    """
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for images, labels in loader:
            outputs = model(images.to(device))
            if num_classes is None:
                # One logit per class: the second dimension is the class count.
                num_classes = outputs.shape[1]
            preds.extend(outputs.argmax(dim=1).cpu().tolist())
            targets.extend(labels.tolist())
    if num_classes is None:
        # No batch was seen, so nothing could be inferred; keep the class
        # count this function used before it became configurable.
        num_classes = 4
    f1s = f1_score(targets, preds, average=None, labels=list(range(num_classes)))
    min_f1 = f1s.min()
    return min_f1, f1s


In [None]:
EPOCHS = 10
# Start below any reachable score so the first epoch always writes a
# checkpoint. Starting at 0 with a strict '>' writes nothing when the minimum
# per-class F1 stays at 0 (a class that is never predicted), and the later
# load of 'best_model.pth' then fails or picks up a stale file.
best_min_f1 = float('-inf')

for epoch in range(EPOCHS):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    min_f1, f1s = validate(model, val_loader)
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Min F1={min_f1:.4f}, F1s={f1s}")
    # Keep the weights of the epoch with the best worst-class F1 so far.
    if min_f1 > best_min_f1:
        best_min_f1 = min_f1
        torch.save(model.state_dict(), 'best_model.pth')


In [None]:
# Test images get the deterministic evaluation transform; is_test=True makes
# the dataset return image ids in place of labels. Order is preserved.
test_dataset = SoilDataset(
    df=test_df,
    img_dir=test_dir,
    transform=val_transform,
    is_test=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    num_workers=2,
    shuffle=False,
)


In [None]:
# Reload the checkpoint saved by the training loop, then switch to eval mode.
best_state = torch.load('best_model.pth', map_location=device)
model.load_state_dict(best_state)
model.eval()


In [None]:
# Accumulators for predicted class indices and the matching image ids.
all_preds = []
image_ids = []

with torch.no_grad():
    for images, img_ids in tqdm(test_loader):
        logits = model(images.to(device))
        # Index of the largest logit per row is the predicted class.
        predicted = logits.argmax(dim=1)
        all_preds.extend(predicted.cpu().numpy())
        image_ids.extend(img_ids)


In [None]:
# Translate every predicted class index back to its soil-type name using the
# idx2label mapping defined earlier.
pred_labels = list(map(idx2label.__getitem__, all_preds))


In [None]:
import pandas as pd

# One row per test image: its id and the predicted soil type.
submission_columns = {'image_id': image_ids, 'soil_type': pred_labels}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")
