In [1]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

In [3]:
from PIL import Image
from collections import defaultdict
from torchvision.models import resnet34
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
# Dataset sub-directories whose images are inspected below.
DATA_ROOT = "/kaggle/input/csiro-biomass"
folders = [f"{DATA_ROOT}/{split}" for split in ("train", "test")]

In [5]:
def get_unique_sizes(directory, extensions=('.png', '.jpg', '.jpeg')):
    """Count the images under ``directory`` grouped by their pixel size.

    The directory is walked recursively. File names are matched
    case-insensitively against ``extensions``.

    Args:
        directory: Root folder to scan.
        extensions: File-name suffix (or iterable of suffixes) treated as
            images. Compared in lower case, so ``'.jpg'`` also matches
            ``'.JPG'``.

    Returns:
        A ``defaultdict(int)`` mapping each ``img.size`` tuple (PIL reports
        it as ``(width, height)``) to the number of files with that size.
        Files that cannot be opened are reported on stdout and skipped.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Names are lower-cased before matching, so the suffixes must be too.
    suffixes = tuple(ext.lower() for ext in extensions)

    size_counts = defaultdict(int)
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.lower().endswith(suffixes):
                continue
            try:
                with Image.open(os.path.join(root, file)) as img:
                    size_counts[img.size] += 1
            except Exception as e:
                # Deliberately best-effort: one unreadable file must not
                # abort the survey of the whole folder.
                print(f"Error {file}: {e}")

    return size_counts

In [10]:
# Print the image-size histogram of every dataset folder. Each histogram is
# preceded by its folder path so the results can be told apart.
for folder in folders:
    print(folder)
    print(get_unique_sizes(folder))
    print()

/kaggle/input/csiro-biomass/train
[]
defaultdict(<class 'int'>, {(2000, 1000): 357})

/kaggle/input/csiro-biomass/test
[]
defaultdict(<class 'int'>, {(2000, 1000): 1})



In [11]:
# Training labels; the preview below shows one row per (image, target_name).
train_csv_path = '/kaggle/input/csiro-biomass/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,ID1011485656__Dry_Clover_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
1,ID1011485656__Dry_Dead_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
2,ID1011485656__Dry_Green_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Green_g,16.2751
3,ID1011485656__Dry_Total_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Total_g,48.2735
4,ID1011485656__GDM_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,GDM_g,16.275


In [12]:
# Distinct regression targets present in the training labels.
train.loc[:, "target_name"].unique()

array(['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g',
       'GDM_g'], dtype=object)

In [14]:
class DatasetCS(Dataset):
    """Image dataset backed by a long-format DataFrame (one row per sample).

    Each row must provide ``image_path`` (joined onto ``images_dir``). Training
    rows also need ``target`` and ``target_name``; test rows need ``sample_id``.

    Args:
        df: DataFrame with the columns described above.
        images_dir: Directory that ``image_path`` values are relative to.
        transform: Optional callable applied to the RGB ``PIL.Image``.
        is_test: When True, items are ``(image, sample_id)`` instead of
            ``(image, target, target_type)``.
    """

    # Index of the regression head responsible for each target name.
    target_mapping = {
        'Dry_Green_g': 0,
        'Dry_Dead_g': 1,
        'Dry_Clover_g': 2,
        'GDM_g': 3,
        'Dry_Total_g': 4
    }

    def __init__(self,
                 df: pd.DataFrame,
                 images_dir: str,
                 transform: callable = None,
                 is_test: bool = False):
        self.df = df
        self.images_dir = images_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            ``(image, sample_id)`` for a test dataset, otherwise
            ``(image, target, target_type)`` where ``target`` is a float32
            scalar tensor and ``target_type`` is the integer head index.
        """
        row = self.df.iloc[idx]
        image_path = os.path.join(self.images_dir, row['image_path'])

        # Image.open keeps the file handle open; convert() returns an
        # independent in-memory copy, so the handle can be closed right away
        # instead of leaking one descriptor per sample.
        with Image.open(image_path) as source:
            image = source.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        if self.is_test:
            return image, row['sample_id']

        target = torch.tensor(row['target'], dtype=torch.float32)
        target_type = self.target_mapping[row['target_name']]
        return image, target, target_type

In [None]:
# transforms.Resize takes (height, width), whereas PIL reports these images as
# (width, height) = (2000, 1000). Resizing to (250, 500) keeps the 2:1
# landscape aspect ratio; (500, 250) would squash every photo into portrait.
IMAGE_SIZE = (250, 500)

# Per-channel ImageNet statistics used for z-score normalisation.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: light geometric and photometric augmentation.
train_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

# Validation/test pipeline: deterministic, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

In [None]:
class ResNet34(nn.Module):
    """ResNet-34 backbone, a shared MLP trunk and one regression head per target.

    Args:
        num_targets: Number of regression heads to create.
    """

    def __init__(self, num_targets=5):
        super().__init__()
        # Randomly initialised backbone. Its stock conv1 already takes
        # 3-channel input, so it is kept as built (re-creating it would only
        # throw away torchvision's own convolution initialisation).
        self.backbone = resnet34(weights=None)

        # Drop the classification layer and expose the pooled features.
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()

        self.shared_features = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
        )

        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Dropout(0.2),
                nn.Linear(256, 128),
                nn.ReLU(inplace=True),
                nn.Linear(128, 1)
            ) for _ in range(num_targets)
        ])

        self._initialize_weights()

    def _initialize_weights(self):
        """Initialise every Linear layer with N(0, 0.01) weights and zero bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x, target_type=None):
        """Run the network on a batch of images.

        Args:
            x: Batch of input images.
            target_type: Optional per-sample head indices (tensor or sequence
                of ints in ``[0, num_targets)``), one per batch element.

        Returns:
            Tensor of shape ``(batch, 1)`` holding each sample's selected head
            output when ``target_type`` is given, otherwise ``(batch,
            num_targets)`` with the outputs of all heads.
        """
        features = self.backbone(x)
        shared_out = self.shared_features(features)

        # The heads are tiny, so evaluating all of them on the whole batch is
        # cheaper than looping over samples in Python.
        all_outputs = torch.cat([head(shared_out) for head in self.heads], dim=1)
        if target_type is None:
            return all_outputs

        head_index = torch.as_tensor(
            target_type, dtype=torch.long, device=all_outputs.device
        ).reshape(-1, 1)
        # Pick, for every row, the column belonging to its requested head.
        return all_outputs.gather(1, head_index)
## Training loop

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; train when ``optimizer`` is given, else evaluate."""
    is_training = optimizer is not None
    model.train(is_training)
    total_loss = 0.0

    with torch.set_grad_enabled(is_training):
        for images, targets, target_types in loader:
            images = images.to(device)
            targets = targets.to(device)
            target_types = target_types.to(device)

            if is_training:
                optimizer.zero_grad()

            # Align predictions with the targets explicitly. A bare squeeze()
            # turns a batch of one into a 0-d tensor, which then broadcasts
            # silently inside the loss.
            outputs = model(images, target_types).reshape(targets.shape)
            loss = criterion(outputs, targets)

            if is_training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return total_loss / max(len(loader), 1)


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=15,
                scheduler=None, device=None):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Args:
        model: Module called as ``model(images, target_types)``.
        train_loader: Iterable of ``(images, targets, target_types)`` batches
            that supports ``len()``.
        val_loader: Same batch format as ``train_loader``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer over the model parameters.
        num_epochs: Number of passes over ``train_loader``.
        scheduler: Optional LR scheduler stepped once per epoch;
            ``ReduceLROnPlateau`` receives the validation loss.
        device: Device batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function does not depend on a notebook-level global.

    Returns:
        ``(train_losses, val_losses)``: per-epoch mean batch losses as floats.
        The model is left in eval mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        train_loss = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss = _run_epoch(model, val_loader, criterion, device)

        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    return train_losses, val_losses
## Train/validation split

# Split by image rather than by row. The label table holds several rows per
# image (one per target), so a row-level split would put the same photo in both
# subsets and the validation loss would no longer reflect unseen images.
# Assuming every image carries a row for each target, the target mix stays
# balanced without explicit stratification.
image_paths = train['image_path'].unique()
train_paths, val_paths = train_test_split(
    image_paths,
    test_size=0.2,
    random_state=42
)

# Positional row indices of each subset.
train_indices = np.flatnonzero(train['image_path'].isin(train_paths).to_numpy())
val_indices = np.flatnonzero(train['image_path'].isin(val_paths).to_numpy())

train_subset = train.iloc[train_indices].reset_index(drop=True)
val_subset = train.iloc[val_indices].reset_index(drop=True)

assert set(train_subset['image_path']).isdisjoint(val_subset['image_path']), \
    "an image appears in both the training and the validation subset"

print(f"train length: {len(train_subset)}")
print(f"val length: {len(val_subset)}")

# Datasets, loaders and device
train_dataset = DatasetCS(train_subset, '/kaggle/input/csiro-biomass', transform=train_transform)
val_dataset = DatasetCS(val_subset, '/kaggle/input/csiro-biomass', transform=val_transform)

# drop_last=True: BatchNorm1d cannot normalise a training batch of one sample,
# which is exactly what a trailing batch can be when the subset size is not a
# multiple of the batch size.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
## Model training

# Fix the RNG state so weight initialisation and batch shuffling repeat across
# runs (GPU kernels may still introduce small nondeterminism).
torch.manual_seed(42)

model = ResNet34(num_targets=5).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): the scheduler is created but not handed to the call below, so
# nothing steps it and the learning rate stays at its initial value.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)

train_losses, val_losses = train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs=3
)
## Test inference and submission

test = pd.read_csv('/kaggle/input/csiro-biomass/test.csv')
test_dataset = DatasetCS(test, '/kaggle/input/csiro-biomass', transform=val_transform, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=2)

# Head index for each target name; must match the order the model was trained with.
target_mapping = {
    'Dry_Green_g': 0, 'Dry_Dead_g': 1, 'Dry_Clover_g': 2,
    'GDM_g': 3, 'Dry_Total_g': 4
}

# sample_id -> head index, built once instead of scanning the whole frame for
# every prediction. drop_duplicates keeps the first row per id, matching the
# previous ``.iloc[0]`` lookup.
unique_rows = test.drop_duplicates('sample_id')
head_index_by_sample = {
    sample_id: target_mapping[target_name]
    for sample_id, target_name in zip(unique_rows['sample_id'], unique_rows['target_name'])
}

model.eval()
predictions = []
sample_ids = []

with torch.no_grad():
    for images, batch_sample_ids in test_loader:
        # Without target types the model returns one column per head.
        batch_outputs = model(images.to(device)).cpu()

        head_indices = torch.tensor(
            [head_index_by_sample[sample_id] for sample_id in batch_sample_ids],
            dtype=torch.long,
        )
        # Select each row's requested head in one batched operation.
        selected = batch_outputs.gather(1, head_indices.unsqueeze(1)).squeeze(1)

        predictions.extend(selected.tolist())
        sample_ids.extend(batch_sample_ids)

submission = pd.DataFrame({
    'sample_id': sample_ids,
    'target': predictions
})

submission.to_csv('submission.csv', index=False)
submission