In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input mount and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder_path, file_name)
    for folder_path, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from PIL import Image

# Define dataset and label paths.
# Every competition file lives under one root directory, so the root is written
# once and the individual paths are derived from it (the resulting strings are
# identical to spelling each path out in full).
dataset_root_directory = '/kaggle/input/soil-classification-part-2/soil_competition-2025'
training_images_directory = f'{dataset_root_directory}/train'
test_images_directory = f'{dataset_root_directory}/test'
training_labels_csv_path = f'{dataset_root_directory}/train_labels.csv'
test_ids_csv_path = f'{dataset_root_directory}/test_ids.csv'

# Load training labels
labels_dataframe = pd.read_csv(training_labels_csv_path)

# Stratified split for training and validation:
# 20% of the rows are held out, the 'label' proportions are kept the same in
# both parts, and the fixed random_state makes the split repeatable.
training_dataframe, validation_dataframe = train_test_split(
    labels_dataframe,
    test_size=0.2,
    stratify=labels_dataframe['label'],
    random_state=42
)

# Image transformations: resize to 224x224, convert to a tensor, then normalise
# each channel. The same pipeline is used for training, validation and test.
image_transformation_pipeline = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet statistics
                         std=[0.229, 0.224, 0.225])
])

# Custom dataset for training and validation
class SoilImageDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs for training and validation.

    Args:
        dataframe: Table with an ``image_id`` column (file name relative to
            ``image_directory``) and a ``label`` column convertible to ``int``.
        image_directory: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image before it is
            returned.
    """

    def __init__(self, dataframe, image_directory, transform=None):
        self.dataframe = dataframe
        self.image_directory = image_directory
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Load the sample at positional ``index``.

        Returns:
            A ``(image, label)`` tuple: the (optionally transformed) RGB image
            and the label as a plain ``int``.
        """
        # Fetch the row once instead of running a positional lookup per column.
        sample_row = self.dataframe.iloc[index]
        full_image_path = os.path.join(self.image_directory, sample_row['image_id'])

        # Image.open keeps the file open until the pixel data is read; the
        # context manager guarantees the handle is released even for formats
        # that do not close themselves. convert() returns an independent copy.
        with Image.open(full_image_path) as source_image:
            image = source_image.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)
        return image, int(sample_row['label'])

# Seed PyTorch's global RNG so the shuffling order of the training loader and
# the random initialisation of the new classification head repeat across runs
# (the train/validation split is already pinned with random_state=42).
torch.manual_seed(42)

# Create datasets and dataloaders (both read from the training image folder;
# only the dataframe of rows differs)
training_dataset = SoilImageDataset(training_dataframe, training_images_directory, image_transformation_pipeline)
validation_dataset = SoilImageDataset(validation_dataframe, training_images_directory, image_transformation_pipeline)

training_dataloader = DataLoader(training_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

# Select computation device
computation_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pretrained ResNet18 model and modify final layer.
# `pretrained=True` is deprecated since torchvision 0.13 in favour of the
# `weights=` enum; IMAGENET1K_V1 is the checkpoint the legacy flag loads.
# The fallback keeps the cell working on older torchvision releases.
if hasattr(models, 'ResNet18_Weights'):
    binary_classification_model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    binary_classification_model = models.resnet18(pretrained=True)

# Replace the ImageNet head with a single sigmoid unit so the network outputs
# one probability per image.
binary_classification_model.fc = nn.Sequential(
    nn.Linear(binary_classification_model.fc.in_features, 1),
    nn.Sigmoid()
)
binary_classification_model = binary_classification_model.to(computation_device)

# Define binary classification loss and optimizer
binary_loss_function = nn.BCELoss()
adam_optimizer = optim.Adam(binary_classification_model.parameters(), lr=1e-4)

# Training function with validation
def train_soil_model(model, train_loader, val_loader, total_epochs=10,
                     optimizer=None, loss_function=None, device=None):
    """Train ``model`` and report the validation F1 score after every epoch.

    Args:
        model: Network whose output has shape ``(batch, 1)`` and holds
            probabilities (it is thresholded at 0.5 for validation).
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        total_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer that updates ``model``'s parameters. Defaults to
            the notebook-level ``adam_optimizer``; pass one explicitly when
            training any other model, otherwise that model is never updated.
        loss_function: Loss taking ``(probabilities, float_labels)``. Defaults
            to the notebook-level ``binary_loss_function``.
        device: Device the batches are moved to. Defaults to the notebook-level
            ``computation_device``.

    Returns:
        A list with one dict per epoch containing ``epoch`` (1-based),
        ``training_loss`` (sum of the per-batch losses, as printed) and
        ``validation_f1``.
    """
    # Fall back to the notebook globals only when nothing was supplied, so
    # existing four-argument calls behave exactly as before.
    optimizer = adam_optimizer if optimizer is None else optimizer
    loss_function = binary_loss_function if loss_function is None else loss_function
    device = computation_device if device is None else device

    history = []
    for epoch in range(total_epochs):
        model.train()
        cumulative_training_loss = 0.0

        for batch_images, batch_labels in train_loader:
            batch_images = batch_images.to(device)
            # Labels become float and gain a trailing axis to match the
            # model's (batch, 1) output.
            batch_labels = batch_labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            output_probabilities = model(batch_images)
            batch_loss = loss_function(output_probabilities, batch_labels)
            batch_loss.backward()
            optimizer.step()

            cumulative_training_loss += batch_loss.item()

        # Validation loop
        model.eval()
        validation_predictions = []
        validation_targets = []

        with torch.no_grad():
            for batch_images, batch_labels in val_loader:
                output_probabilities = model(batch_images.to(device))
                predicted_labels = (output_probabilities > 0.5).int().cpu().numpy()
                validation_predictions.extend(predicted_labels.flatten())
                # Targets are only compared on the host, so they are not
                # copied to the device first.
                validation_targets.extend(batch_labels.cpu().numpy())

        validation_f1 = f1_score(validation_targets, validation_predictions)
        print(f"Epoch {epoch+1}/{total_epochs}, Training Loss: {cumulative_training_loss:.4f}, Validation F1 Score: {validation_f1:.4f}")

        history.append({
            'epoch': epoch + 1,
            'training_loss': cumulative_training_loss,
            'validation_f1': validation_f1,
        })

    return history

# Run the training/validation schedule on the prepared loaders
train_soil_model(
    model=binary_classification_model,
    train_loader=training_dataloader,
    val_loader=validation_dataloader,
    total_epochs=10,
)

# Read the CSV that lists the test images to predict
test_image_ids_dataframe = pd.read_csv(test_ids_csv_path)

# Custom dataset for test images
class SoilTestImageDataset(Dataset):
    """Dataset yielding ``(image, image_id)`` pairs for unlabeled test images.

    Args:
        dataframe: Table with an ``image_id`` column (file name relative to
            ``image_directory``).
        image_directory: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image before it is
            returned.
    """

    def __init__(self, dataframe, image_directory, transform=None):
        self.dataframe = dataframe
        self.image_directory = image_directory
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Load the image at positional ``index``.

        Returns:
            A ``(image, image_filename)`` tuple; the file name is passed
            through so predictions can be matched back to their image.
        """
        image_filename = self.dataframe.iloc[index]['image_id']
        full_image_path = os.path.join(self.image_directory, image_filename)

        # Image.open keeps the file open until the pixel data is read; the
        # context manager guarantees the handle is released even for formats
        # that do not close themselves. convert() returns an independent copy.
        with Image.open(full_image_path) as source_image:
            image = source_image.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)
        return image, image_filename

# Wrap the test ids in a dataset and iterate them in file order (no shuffling)
test_dataset = SoilTestImageDataset(
    dataframe=test_image_ids_dataframe,
    image_directory=test_images_directory,
    transform=image_transformation_pipeline,
)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Inference: switch to eval mode and collect one record per test image
binary_classification_model.eval()
test_predictions = []

with torch.no_grad():
    for batch_images, image_names in test_dataloader:
        output_probabilities = binary_classification_model(batch_images.to(computation_device))
        # Threshold the probabilities at 0.5 and flatten to one label per image
        predicted_labels = (output_probabilities > 0.5).int().cpu().numpy().flatten()
        test_predictions.extend(
            {'image_id': image_name, 'label': predicted_label}
            for image_name, predicted_label in zip(image_names, predicted_labels)
        )

# Assemble the records into a table and write the submission CSV
submission_dataframe = pd.DataFrame(test_predictions)
submission_dataframe.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 150MB/s]


Epoch 1/10, Training Loss: 8.0836, Validation F1 Score: 0.9597
Epoch 2/10, Training Loss: 1.3351, Validation F1 Score: 1.0000
Epoch 3/10, Training Loss: 0.7120, Validation F1 Score: 1.0000
Epoch 4/10, Training Loss: 0.4389, Validation F1 Score: 1.0000
Epoch 5/10, Training Loss: 0.3365, Validation F1 Score: 1.0000
Epoch 6/10, Training Loss: 0.2537, Validation F1 Score: 1.0000
Epoch 7/10, Training Loss: 0.2382, Validation F1 Score: 1.0000
Epoch 8/10, Training Loss: 0.1856, Validation F1 Score: 1.0000
Epoch 9/10, Training Loss: 0.1281, Validation F1 Score: 1.0000
Epoch 10/10, Training Loss: 0.1146, Validation F1 Score: 1.0000
Submission file saved as 'submission.csv'
