In [2]:
import os, cv2, numpy as np, pandas as pd, torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from torchvision.models import resnet18, ResNet18_Weights
from PIL import Image
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight

# --- Dataset locations (Kaggle input mount) ---
BASE_PATH = "/kaggle/input/soil-classification/soil_classification-2025/"
TRAIN_IMG_PATH, TEST_IMG_PATH = (
    os.path.join(BASE_PATH, folder) for folder in ("train", "test")
)
LABEL_PATH, TEST_IDS_PATH = (
    os.path.join(BASE_PATH, csv_name) for csv_name in ("train_labels.csv", "test_ids.csv")
)

# --- Training labels: encode the soil_type strings as integer class ids ---
df = pd.read_csv(LABEL_PATH)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['soil_type'])

# --- 'balanced' class weights (as computed by scikit-learn), kept as a float
#     tensor so they can be handed to the loss function later ---
class_ids = np.unique(df['label'])
weights = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=class_ids, y=df['label']),
    dtype=torch.float,
)

def smart_crop(image, threshold=15):
    """Crop an image to the bounding box of its largest above-threshold region.

    The image is converted to grayscale (``COLOR_RGB2GRAY``) and binarised
    with ``cv2.threshold`` in ``THRESH_BINARY`` mode. External contours of
    the resulting mask are extracted, the one with the greatest area is
    selected, and the image is sliced to that contour's bounding rectangle.

    Args:
        image: Image array passed to ``cv2.cvtColor`` as RGB.
        threshold: Grayscale cut-off handed to ``cv2.threshold``.

    Returns:
        The slice of ``image`` covering the largest contour's bounding box,
        or ``image`` itself when no contour is found.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    _, foreground_mask = cv2.threshold(grayscale, threshold, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(foreground_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        biggest = max(contours, key=cv2.contourArea)
        left, top, width, height = cv2.boundingRect(biggest)
        return image[top:top + height, left:left + width]

    # Nothing exceeded the threshold: leave the image untouched.
    return image

class SoilDataset(Dataset):
    """Dataset that reads images listed in a DataFrame, crops and transforms them.

    Each item is loaded with OpenCV, converted from BGR to RGB, passed through
    ``smart_crop``, wrapped in a PIL image and finally run through the optional
    ``transform``.

    Args:
        dataframe: Table with an ``image_id`` column holding file names and,
            optionally, a ``label`` column.
        img_dir: Directory that the ``image_id`` file names are joined onto.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, dataframe, img_dir, transform=None):
        self.df = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample by positional index.

        Returns:
            A ``(image, label, img_name)`` tuple. ``label`` is ``-1`` when the
            DataFrame has no ``label`` column.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        # Fetch the row once instead of indexing the frame per column.
        row = self.df.iloc[idx]
        img_name = row['image_id']
        label = row['label'] if 'label' in self.df.columns else -1
        img_path = os.path.join(self.img_dir, img_name)

        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {img_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = smart_crop(image)
        image = Image.fromarray(image)
        if self.transform:
            image = self.transform(image)
        return image, label, img_name

# Preprocessing shared by both pipelines: fixed input size and per-channel
# normalisation statistics (these are the usual ImageNet mean/std values).
_INPUT_SIZE = (224, 224)
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, light augmentation (flip, small rotation,
# brightness/contrast/saturation jitter), then tensor conversion + normalisation.
train_transform = transforms.Compose([
    transforms.Resize(_INPUT_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
])

# Validation/test pipeline: same resize and normalisation, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(_INPUT_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
])

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def run_training(model_id, df, img_dir, transform, val_transform):
    """Train one ResNet-18 ensemble member and checkpoint its best epoch.

    The frame is split 80/20 (stratified on ``label``, fixed ``random_state``
    so every round sees the same partition). A pretrained ResNet-18 with a
    fresh classification head is fine-tuned for 15 epochs with class-weighted
    cross-entropy. After each epoch the per-class F1 on the validation split
    is computed, and the weights are saved whenever the *lowest* per-class F1
    improves.

    Relies on the module-level ``weights``, ``device`` and ``label_encoder``.

    Args:
        model_id: Zero-based round index; used in the log line and in the
            checkpoint file name ``/kaggle/working/best_model_{model_id}.pth``.
        df: Labelled DataFrame with ``image_id`` and ``label`` columns.
        img_dir: Directory containing the training images.
        transform: Transform applied to training images.
        val_transform: Transform applied to validation images.

    Returns:
        The model carrying its *final-epoch* weights; the best weights are in
        the checkpoint file, which the caller is expected to load.
    """
    print(f"\n🔁 Boost Round {model_id+1}")
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    train_ds = SoilDataset(train_df, img_dir, transform)
    val_ds = SoilDataset(val_df, img_dir, val_transform)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

    # Derive the head size from the encoder instead of hard-coding it, so the
    # model always matches the length of the class-weight vector.
    num_classes = len(label_encoder.classes_)
    class_ids = np.arange(num_classes)

    model = resnet18(weights=ResNet18_Weights.DEFAULT)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss(weight=weights.to(device))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Mixed precision only makes sense on CUDA; on CPU both the scaler and
    # autocast are disabled and become no-ops.
    use_amp = device.type == "cuda"
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    # Start below any achievable F1 so the first epoch always writes a
    # checkpoint, even if some class scores exactly 0.
    best_min_f1 = -1.0

    for epoch in range(15):
        model.train()
        loss_total = 0
        for images, labels, _ in tqdm(train_loader):
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            optimizer.zero_grad()
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            loss_total += loss.item()

        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for images, labels, _ in val_loader:
                images = images.to(device)
                outputs = model(images)
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.numpy())

        # Passing explicit labels keeps target_names aligned even when a class
        # is absent from the predictions; zero_division=0 scores such a class
        # as 0 without emitting a warning.
        report = classification_report(
            all_labels,
            all_preds,
            labels=class_ids,
            target_names=label_encoder.classes_,
            output_dict=True,
            zero_division=0,
        )
        min_f1 = min(report[c]['f1-score'] for c in label_encoder.classes_)
        print(f"Epoch {epoch+1} | Train Loss: {loss_total/len(train_loader):.4f} | Min F1: {min_f1:.4f}")
        if min_f1 > best_min_f1:
            best_min_f1 = min_f1
            torch.save(model.state_dict(), f"/kaggle/working/best_model_{model_id}.pth")
            print("✅ Saved best model for this round")

    return model

# --- Train three ensemble members; each is restored from the checkpoint
#     written during its own round before being kept for inference ---
models_list = []
for i in range(3):
    model = run_training(i, df.copy(), TRAIN_IMG_PATH, train_transform, val_transform)
    best_state = torch.load(f"/kaggle/working/best_model_{i}.pth")
    model.load_state_dict(best_state)
    model.eval()
    models_list.append(model)

# --- Test-set loader (no labels, no shuffling so order follows test_ids.csv) ---
test_ids = pd.read_csv(TEST_IDS_PATH)
test_df = pd.DataFrame({'image_id': test_ids['image_id']})
test_ds = SoilDataset(test_df, TEST_IMG_PATH, val_transform)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=2)

final_preds, image_names = [], []

# --- Soft voting: average the softmax probabilities of all members ---
with torch.no_grad():
    for images, _, names in tqdm(test_loader):
        images = images.to(device)
        outputs = []
        for member in models_list:
            outputs.append(torch.softmax(member(images), dim=1))
        avg_output = sum(outputs) / len(outputs)
        preds = torch.argmax(avg_output, dim=1).cpu().numpy()
        final_preds.extend(preds)
        image_names.extend(names)

# --- Map class ids back to soil-type names and write the submission ---
final_labels = label_encoder.inverse_transform(final_preds)
submission_path = "/kaggle/working/submission.csv"
submission = pd.DataFrame({'image_id': image_names, 'soil_type': final_labels})
submission.to_csv(submission_path, index=False)
print("📁 submission.csv generated successfully")

if not os.path.exists(submission_path):
    print("❌ submission.csv not found.")
else:
    print("✅ File saved at:", submission_path)
    display(pd.read_csv(submission_path).head(50))


🔁 Boost Round 1


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 194MB/s]
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:12<00:00,  2.52it/s]


Epoch 1 | Train Loss: 0.4901 | Min F1: 0.8333
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.03it/s]


Epoch 2 | Train Loss: 0.1338 | Min F1: 0.9524
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.99it/s]


Epoch 3 | Train Loss: 0.0884 | Min F1: 0.8989


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.89it/s]


Epoch 4 | Train Loss: 0.0721 | Min F1: 0.9684
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.06it/s]


Epoch 5 | Train Loss: 0.0567 | Min F1: 0.9639


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.10it/s]


Epoch 6 | Train Loss: 0.0585 | Min F1: 0.9684


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]


Epoch 7 | Train Loss: 0.0382 | Min F1: 0.9524


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]


Epoch 8 | Train Loss: 0.0308 | Min F1: 0.9684


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.04it/s]


Epoch 9 | Train Loss: 0.0265 | Min F1: 0.9787
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Epoch 10 | Train Loss: 0.0208 | Min F1: 0.9195


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:08<00:00,  3.86it/s]


Epoch 11 | Train Loss: 0.0223 | Min F1: 0.9620


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Epoch 12 | Train Loss: 0.0195 | Min F1: 0.9639


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.16it/s]


Epoch 13 | Train Loss: 0.0168 | Min F1: 0.9787


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.97it/s]


Epoch 14 | Train Loss: 0.0338 | Min F1: 0.9684


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.13it/s]


Epoch 15 | Train Loss: 0.0258 | Min F1: 0.9250

🔁 Boost Round 2


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.08it/s]


Epoch 1 | Train Loss: 0.4581 | Min F1: 0.8444
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.18it/s]


Epoch 2 | Train Loss: 0.1313 | Min F1: 0.8966
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.07it/s]


Epoch 3 | Train Loss: 0.0791 | Min F1: 0.9302
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Epoch 4 | Train Loss: 0.0454 | Min F1: 0.9524
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]


Epoch 5 | Train Loss: 0.0494 | Min F1: 0.8974


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.98it/s]


Epoch 6 | Train Loss: 0.0476 | Min F1: 0.9684
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.03it/s]


Epoch 7 | Train Loss: 0.0286 | Min F1: 0.9412


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.03it/s]


Epoch 8 | Train Loss: 0.0384 | Min F1: 0.9524


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.90it/s]


Epoch 9 | Train Loss: 0.0385 | Min F1: 0.9524


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.00it/s]


Epoch 10 | Train Loss: 0.0527 | Min F1: 0.9524


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.92it/s]


Epoch 11 | Train Loss: 0.0440 | Min F1: 0.9512


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.03it/s]


Epoch 12 | Train Loss: 0.0295 | Min F1: 0.9639


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.04it/s]


Epoch 13 | Train Loss: 0.0259 | Min F1: 0.9756
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]


Epoch 14 | Train Loss: 0.0298 | Min F1: 0.9639


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Epoch 15 | Train Loss: 0.0339 | Min F1: 0.8831

🔁 Boost Round 3


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.90it/s]


Epoch 1 | Train Loss: 0.4867 | Min F1: 0.8864
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.12it/s]


Epoch 2 | Train Loss: 0.1733 | Min F1: 0.9574
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.08it/s]


Epoch 3 | Train Loss: 0.1263 | Min F1: 0.9070


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.93it/s]


Epoch 4 | Train Loss: 0.0714 | Min F1: 0.9583
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  3.96it/s]


Epoch 5 | Train Loss: 0.0563 | Min F1: 0.9512


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Epoch 6 | Train Loss: 0.0384 | Min F1: 0.9524


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.08it/s]


Epoch 7 | Train Loss: 0.0687 | Min F1: 0.9639
✅ Saved best model for this round


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.08it/s]


Epoch 8 | Train Loss: 0.0396 | Min F1: 0.9639


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]


Epoch 9 | Train Loss: 0.0312 | Min F1: 0.9512


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.13it/s]


Epoch 10 | Train Loss: 0.0254 | Min F1: 0.9024


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:08<00:00,  3.87it/s]


Epoch 11 | Train Loss: 0.0232 | Min F1: 0.9524


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.01it/s]


Epoch 12 | Train Loss: 0.0204 | Min F1: 0.9500


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.09it/s]


Epoch 13 | Train Loss: 0.0136 | Min F1: 0.9367


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:08<00:00,  3.80it/s]


Epoch 14 | Train Loss: 0.0255 | Min F1: 0.9383


  with torch.cuda.amp.autocast():
100%|██████████| 31/31 [00:07<00:00,  4.06it/s]


Epoch 15 | Train Loss: 0.0190 | Min F1: 0.9639


100%|██████████| 11/11 [00:02<00:00,  4.12it/s]


📁 submission.csv generated successfully
✅ File saved at: /kaggle/working/submission.csv


Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
5,img_3d6e64a1.jpg,Alluvial soil
6,img_ccb81e64.jpeg,Alluvial soil
7,img_9a499fc5.jpeg,Alluvial soil
8,img_b957da08.jpeg,Alluvial soil
9,img_c62bd1a1.jpg,Alluvial soil


In [4]:
# Re-read the submission from disk and preview (up to) its first 50 rows.
saved_submission_file = "/kaggle/working/submission.csv"
df_sub = pd.read_csv(saved_submission_file)
df_sub.head(50)

Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
5,img_3d6e64a1.jpg,Alluvial soil
6,img_ccb81e64.jpeg,Alluvial soil
7,img_9a499fc5.jpeg,Alluvial soil
8,img_b957da08.jpeg,Alluvial soil
9,img_c62bd1a1.jpg,Alluvial soil


In [5]:
# Rebuild the prediction table from the in-memory inference results.
submission_df = pd.DataFrame({
    'image_id': image_names,
    'soil_type': label_encoder.inverse_transform(final_preds)
})

# Re-order the predictions to follow test_ids.csv exactly: a left join keyed
# on image_id keeps the row order of test_ids.
test_ids = pd.read_csv(TEST_IDS_PATH)
submission_df = submission_df.set_index('image_id')
ordered_submission = test_ids.set_index('image_id').join(submission_df, how='left').reset_index()

# A left join silently yields NaN for any test id without a prediction;
# refuse to write an incomplete submission instead of discovering it later.
missing_predictions = int(ordered_submission['soil_type'].isna().sum())
if missing_predictions:
    raise ValueError(
        f"{missing_predictions} test ids have no prediction; submission.csv was not written"
    )

ordered_submission.to_csv("/kaggle/working/submission.csv", index=False)
print("✅ Final submission.csv written in the exact order of test_ids.csv")

# np.array_equal reports False on a length mismatch, whereas an elementwise
# `==` wrapped in all() would raise in that situation.
order_matches = np.array_equal(
    ordered_submission['image_id'].to_numpy(),
    test_ids['image_id'].to_numpy(),
)
print("🧪 Sanity Check:", order_matches)

display(ordered_submission.head(50))


✅ Final submission.csv written in the exact order of test_ids.csv
🧪 Sanity Check: True


Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
5,img_3d6e64a1.jpg,Alluvial soil
6,img_ccb81e64.jpeg,Alluvial soil
7,img_9a499fc5.jpeg,Alluvial soil
8,img_b957da08.jpeg,Alluvial soil
9,img_c62bd1a1.jpg,Alluvial soil
