
# CSIRO Biomass Prediction - Baseline Solution

This notebook implements a baseline solution for the CSIRO Biomass Prediction competition.
It fine-tunes an ImageNet-pretrained EfficientNet-B0 to regress three biomass components directly (`Dry_Green_g`, `Dry_Clover_g`, `Dry_Dead_g`) and derives the remaining two targets from those predictions: `GDM_g = Dry_Green_g + Dry_Clover_g` and `Dry_Total_g = GDM_g + Dry_Dead_g`.


In [1]:

import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
import time

# Pick the compute device once for the notebook: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


## Dataset Class

In [2]:

class BiomassDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None, is_test=False):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
            is_test (bool): If True, csv_file is treated as test.csv (no targets).
        """
        self.root_dir = root_dir
        self.transform = transform
        self.is_test = is_test
        
        df = pd.read_csv(csv_file)
        
        if not is_test:
            # Pivot to have one row per image
            # We need Dry_Green_g, Dry_Clover_g, Dry_Dead_g
            self.data = df.pivot(index='image_path', columns='target_name', values='target').reset_index()
            # Ensure columns exist
            for col in ['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g']:
                if col not in self.data.columns:
                    self.data[col] = 0.0
            
            # Filter to keep only necessary columns
            self.data = self.data[['image_path', 'Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g']]
        else:
            # For test, we just need unique images
            self.data = df[['image_path']].drop_duplicates().reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = os.path.join(self.root_dir, self.data.iloc[idx]['image_path'])
        
        # Handle potential path issues (e.g. if csv has 'train/ID...' and root_dir is '.../train')
        # The csv seems to have 'train/ID...' and images are in 'train' folder.
        # So if root_dir is the base folder, it should work.
        
        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            # Try adjusting path if needed
            # If img_path is 's:/.../train/train/ID...'
            # Let's assume root_dir is the base directory containing 'train' and 'test' folders
            pass

        if self.transform:
            image = self.transform(image)

        if not self.is_test:
            # Get targets
            dry_green = self.data.iloc[idx]['Dry_Green_g']
            dry_clover = self.data.iloc[idx]['Dry_Clover_g']
            dry_dead = self.data.iloc[idx]['Dry_Dead_g']
            
            targets = torch.tensor([dry_green, dry_clover, dry_dead], dtype=torch.float32)
            return image, targets, self.data.iloc[idx]['image_path']
        else:
            return image, self.data.iloc[idx]['image_path']


## Model Architecture

In [3]:

class BiomassModel(nn.Module):
    """CNN backbone followed by a small MLP regression head.

    The backbone is torchvision's EfficientNet-B0 with its classifier
    replaced by ``nn.Identity``; if building it fails, a ResNet18 with its
    ``fc`` layer replaced by ``nn.Identity`` is used instead. The head maps
    the backbone features to ``num_outputs`` values and ends in a ReLU, so
    predictions are never negative.
    """

    def __init__(self, num_outputs=3):
        """
        Args:
            num_outputs (int): Number of regression targets produced by the head.
        """
        super().__init__()
        try:
            # weights='DEFAULT' asks torchvision for its default pretrained weights.
            backbone = models.efficientnet_b0(weights='DEFAULT')
            num_ftrs = backbone.classifier[1].in_features
            backbone.classifier = nn.Identity()  # Remove original classifier
        except Exception as exc:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still propagate, and report the
            # cause: a checkpoint saved with one backbone cannot be loaded
            # into the other, so the switch must be visible.
            print("EfficientNet not found, using ResNet18")
            print(f"Reason: {exc!r}")
            backbone = models.resnet18(pretrained=True)
            num_ftrs = backbone.fc.in_features
            backbone.fc = nn.Identity()

        self.backbone = backbone
        self.head = nn.Sequential(
            nn.Linear(num_ftrs, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_outputs),
            nn.ReLU()  # Ensure non-negative output
        )

    def forward(self, x):
        """Run the backbone, then the head, and return the head's output."""
        features = self.backbone(x)
        return self.head(features)


## Training Loop

In [4]:

def train_model(num_epochs=2, batch_size=16, learning_rate=1e-4,
                data_dir='/kaggle/input/csiro-biomass',
                checkpoint_path='best_model.pth', seed=42):
    """Train ``BiomassModel`` and checkpoint the weights with the lowest validation loss.

    The images listed in ``<data_dir>/train.csv`` are split 80/20 into
    train and validation subsets. Each epoch runs a training pass followed
    by a validation pass (MSE loss, Adam optimizer); whenever the validation
    loss improves, the model's ``state_dict`` is written to ``checkpoint_path``.

    Args:
        num_epochs (int): Number of passes over the training subset.
        batch_size (int): Batch size for both loaders.
        learning_rate (float): Adam learning rate.
        data_dir (str): Directory containing ``train.csv``; also used as the
            image root passed to ``BiomassDataset``.
        checkpoint_path (str): Where the best ``state_dict`` is saved.
        seed (int): Seed for the train/validation split and for torch's RNG.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Seed torch so weight init, dropout and sampler order repeat across runs.
    torch.manual_seed(seed)

    # Data Transforms: augmentation for training only; both pipelines share
    # the resize and the ImageNet mean/std normalisation.
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(15),
            transforms.ToTensor(),
            normalize,
        ]),
        'val': transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            normalize,
        ]),
    }

    # Two dataset instances over the same CSV so that each subset gets its
    # own transform; the samplers below restrict each one to its indices.
    csv_file = os.path.join(data_dir, 'train.csv')
    train_dataset = BiomassDataset(
        csv_file=csv_file,
        root_dir=data_dir,
        transform=data_transforms['train']
    )
    val_dataset = BiomassDataset(
        csv_file=csv_file,
        root_dir=data_dir,
        transform=data_transforms['val']
    )

    # Split image indices (both datasets are built from the same CSV, so an
    # index refers to the same image in either one).
    indices = list(range(len(train_dataset)))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=seed)

    loaders = {
        'train': DataLoader(train_dataset, batch_size=batch_size,
                            sampler=SubsetRandomSampler(train_indices), num_workers=0),
        'val': DataLoader(val_dataset, batch_size=batch_size,
                          sampler=SubsetRandomSampler(val_indices), num_workers=0),
    }

    # Model
    model = BiomassModel(num_outputs=3).to(device)

    # Loss and Optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    best_loss = float('inf')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            dataloader = loaders[phase]
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0.0

            for inputs, targets, _ in dataloader:
                inputs = inputs.to(device)
                targets = targets.to(device)

                # Gradients are only tracked (and stepped) during training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # criterion returns the batch mean; weight by batch size so the
                # epoch figure is a per-sample average even with a short last batch.
                running_loss += loss.item() * inputs.size(0)

            epoch_loss = running_loss / len(dataloader.sampler)  # Use sampler length
            print(f'{phase} Loss: {epoch_loss:.4f}')

            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved.")

    print(f'Best Val Loss: {best_loss:.4f}')


## Prediction & Submission

In [5]:

def predict(data_dir='/kaggle/input/csiro-biomass',
            checkpoint_path='best_model.pth',
            output_path='submission.csv',
            batch_size=32):
    """Run the saved model over the test images and write the submission CSV.

    For every unique image in ``<data_dir>/test.csv`` the model predicts
    ``Dry_Green_g``, ``Dry_Clover_g`` and ``Dry_Dead_g``. The two remaining
    targets are derived as ``GDM_g = Dry_Green_g + Dry_Clover_g`` and
    ``Dry_Total_g = GDM_g + Dry_Dead_g``. One row per (image, target) is
    written with ``sample_id`` formatted as ``<image id>__<target name>``.

    Args:
        data_dir (str): Directory containing ``test.csv``; also used as the
            image root passed to ``BiomassDataset``.
        checkpoint_path (str): ``state_dict`` file to load into the model.
        output_path (str): Destination of the submission CSV.
        batch_size (int): Number of images per forward pass.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Same deterministic preprocessing as the validation pipeline.
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # Dataset
    test_dataset = BiomassDataset(
        csv_file=os.path.join(data_dir, 'test.csv'),
        root_dir=data_dir,
        transform=transform,
        is_test=True
    )

    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    # Model
    model = BiomassModel(num_outputs=3).to(device)
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()

    results = []

    print("Starting prediction...")
    with torch.no_grad():
        for inputs, image_paths in test_loader:
            # One list of three Python floats per image in the batch, ordered
            # [Dry_Green_g, Dry_Clover_g, Dry_Dead_g].
            batch_outputs = model(inputs.to(device)).cpu().tolist()

            for img_path, (dry_green, dry_clover, dry_dead) in zip(image_paths, batch_outputs):
                # Derived targets
                gdm = dry_green + dry_clover
                dry_total = gdm + dry_dead

                # image_path is like 'test/ID1001187975.jpg'; the id is the
                # file name without directory or extension.
                image_id = os.path.splitext(os.path.basename(img_path))[0]

                predictions = {
                    'Dry_Green_g': dry_green,
                    'Dry_Clover_g': dry_clover,
                    'Dry_Dead_g': dry_dead,
                    'GDM_g': gdm,
                    'Dry_Total_g': dry_total,
                }
                for target_name, value in predictions.items():
                    results.append({'sample_id': f"{image_id}__{target_name}", 'target': value})

    # Explicit columns keep the header intact even when there are no rows.
    submission_df = pd.DataFrame(results, columns=['sample_id', 'target'])

    # Save
    submission_df.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")


## Execution

In [6]:

# Training settings for this run. Five epochs keeps the demonstration short;
# raise num_epochs for better results.
train_config = {
    "num_epochs": 5,
    "batch_size": 32,
    "learning_rate": 1e-4,
}
train_model(**train_config)

# Write the submission file from the checkpoint saved during training.
predict()


Using device: cuda


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 182MB/s]


Epoch 1/5
----------
train Loss: 628.3000
val Loss: 507.5863
Model saved.
Epoch 2/5
----------
train Loss: 599.4013
val Loss: 468.7160
Model saved.
Epoch 3/5
----------
train Loss: 544.3199
val Loss: 384.9710
Model saved.
Epoch 4/5
----------
train Loss: 460.0891
val Loss: 323.2371
Model saved.
Epoch 5/5
----------
train Loss: 379.9856
val Loss: 303.9390
Model saved.
Best Val Loss: 303.9390
Using device: cuda
Starting prediction...
Submission saved to submission.csv
