In [1]:
!pip install -q torch torchvision timm pandas scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import timm
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import f1_score
import numpy as np

BATCH_SIZE = 32
IMG_SIZE = 224
NUM_EPOCHS = 20
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "swin_base_patch4_window7_224"
TRAIN_DIR = "/kaggle/input/soil-classification/soil_classification-2025/train"
TEST_DIR = "/kaggle/input/soil-classification/soil_classification-2025/test"
TRAIN_CSV = "/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv"
TEST_CSV = "/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv"
BEST_MODEL_PATH = "best_model.pth"  # Path to save the best model

df = pd.read_csv(TRAIN_CSV)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['soil_type'])

class SoilDataset(Dataset):
    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['image_id'])
        image = Image.open(img_path).convert("RGB")
        label = row['label'] if 'label' in row else -1
        if self.transform:
            image = self.transform(image)
        return image, label, row['image_id']

train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    #transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])

test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])


train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'])

train_dataset = SoilDataset(train_df, TRAIN_DIR, transform=train_transform)
val_dataset = SoilDataset(val_df, TRAIN_DIR, transform=test_transform)
test_df = pd.read_csv(TEST_CSV)
test_dataset = SoilDataset(test_df, TEST_DIR, transform=test_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model = timm.create_model(MODEL_NAME, pretrained=True, num_classes=len(label_encoder.classes_))
model.to(DEVICE)

from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Initialize best min F1 score
best_min_f1 = 0.0

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    correct = 0

    for images, labels, _ in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        images, labels = images.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()

    train_acc = correct / len(train_loader.dataset)
    val_acc = 0
    model.eval()
    val_preds = []
    val_labels = []
    
    with torch.no_grad():
        for images, labels, _ in val_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    
    val_acc = (torch.tensor(val_preds) == torch.tensor(val_labels)).float().mean().item()
    
    f1_scores = f1_score(val_labels, val_preds, average=None, labels=list(range(len(label_encoder.classes_))))
    min_f1 = f1_scores.min()
    
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Min F1 Score: {min_f1:.4f}")
    
    # Save the model if it has the best min F1 score so far
    if min_f1 > best_min_f1:
        best_min_f1 = min_f1
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"New best model saved with min F1 score: {best_min_f1:.4f}")

# Load the best model for inference
print(f"Loading best model with min F1 score: {best_min_f1:.4f}")
model.load_state_dict(torch.load(BEST_MODEL_PATH))

model.eval()
predictions = []
image_ids = []
with torch.no_grad():
    for images, _, ids in test_loader:
        images = images.to(DEVICE)
        outputs = model(images)
        preds = outputs.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())
        image_ids.extend(ids)

submission = pd.DataFrame({
    'image_id': image_ids,
    'soil_type': label_encoder.inverse_transform(predictions)
})
submission.to_csv("submission.csv", index=False)
print("Saved predictions to submission.csv using the best model")


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 31/31 [00:46<00:00,  1.51s/it]


Epoch [1/20], Train Acc: 0.8598, Val Acc: 0.9388, Min F1 Score: 0.8941
New best model saved with min F1 score: 0.8941


Epoch 2: 100%|██████████| 31/31 [00:39<00:00,  1.28s/it]


Epoch [2/20], Train Acc: 0.9519, Val Acc: 0.9510, Min F1 Score: 0.9070
New best model saved with min F1 score: 0.9070


Epoch 3: 100%|██████████| 31/31 [00:41<00:00,  1.34s/it]


Epoch [3/20], Train Acc: 0.9785, Val Acc: 0.9510, Min F1 Score: 0.9091
New best model saved with min F1 score: 0.9091


Epoch 4: 100%|██████████| 31/31 [00:45<00:00,  1.47s/it]


Epoch [4/20], Train Acc: 0.9795, Val Acc: 0.9673, Min F1 Score: 0.9398
New best model saved with min F1 score: 0.9398


Epoch 5: 100%|██████████| 31/31 [00:43<00:00,  1.41s/it]


Epoch [5/20], Train Acc: 0.9816, Val Acc: 0.9714, Min F1 Score: 0.9565
New best model saved with min F1 score: 0.9565


Epoch 6: 100%|██████████| 31/31 [00:45<00:00,  1.46s/it]


Epoch [6/20], Train Acc: 0.9928, Val Acc: 0.9714, Min F1 Score: 0.9412


Epoch 7: 100%|██████████| 31/31 [00:44<00:00,  1.43s/it]


Epoch [7/20], Train Acc: 0.9826, Val Acc: 0.9796, Min F1 Score: 0.9750
New best model saved with min F1 score: 0.9750


Epoch 8: 100%|██████████| 31/31 [00:45<00:00,  1.48s/it]


Epoch [8/20], Train Acc: 0.9887, Val Acc: 0.9469, Min F1 Score: 0.9375


Epoch 9: 100%|██████████| 31/31 [00:43<00:00,  1.42s/it]


Epoch [9/20], Train Acc: 0.9867, Val Acc: 0.9714, Min F1 Score: 0.9500


Epoch 10: 100%|██████████| 31/31 [00:44<00:00,  1.43s/it]


Epoch [10/20], Train Acc: 0.9918, Val Acc: 0.9673, Min F1 Score: 0.9302


Epoch 11: 100%|██████████| 31/31 [00:45<00:00,  1.46s/it]


Epoch [11/20], Train Acc: 0.9939, Val Acc: 0.9837, Min F1 Score: 0.9756
New best model saved with min F1 score: 0.9756


Epoch 12: 100%|██████████| 31/31 [00:46<00:00,  1.48s/it]


Epoch [12/20], Train Acc: 0.9765, Val Acc: 0.9633, Min F1 Score: 0.9302


Epoch 13: 100%|██████████| 31/31 [00:47<00:00,  1.53s/it]


Epoch [13/20], Train Acc: 0.9857, Val Acc: 0.9633, Min F1 Score: 0.9451


Epoch 14: 100%|██████████| 31/31 [00:47<00:00,  1.55s/it]


Epoch [14/20], Train Acc: 0.9918, Val Acc: 0.9837, Min F1 Score: 0.9756


Epoch 15: 100%|██████████| 31/31 [00:47<00:00,  1.53s/it]


Epoch [15/20], Train Acc: 0.9949, Val Acc: 0.9755, Min F1 Score: 0.9639


Epoch 16: 100%|██████████| 31/31 [00:47<00:00,  1.52s/it]


Epoch [16/20], Train Acc: 0.9949, Val Acc: 0.9878, Min F1 Score: 0.9787
New best model saved with min F1 score: 0.9787


Epoch 17: 100%|██████████| 31/31 [00:47<00:00,  1.52s/it]


Epoch [17/20], Train Acc: 0.9806, Val Acc: 0.9673, Min F1 Score: 0.9302


Epoch 18: 100%|██████████| 31/31 [00:47<00:00,  1.52s/it]


Epoch [18/20], Train Acc: 0.9969, Val Acc: 0.9755, Min F1 Score: 0.9512


Epoch 19: 100%|██████████| 31/31 [00:47<00:00,  1.52s/it]


Epoch [19/20], Train Acc: 0.9826, Val Acc: 0.9714, Min F1 Score: 0.9412


Epoch 20: 100%|██████████| 31/31 [00:46<00:00,  1.51s/it]


Epoch [20/20], Train Acc: 0.9877, Val Acc: 0.9796, Min F1 Score: 0.9684
Loading best model with min F1 score: 0.9787
Saved predictions to submission.csv using the best model
