# **PlantTraits**

### Imports

In [None]:
!pip install wandb -qU
# Log in to your W&B account
import wandb
wandb.login()

In [None]:
#!pip install -q kaggle
#!pip install -q albumentations

TODO befor running:Create New API Token and add it as kaggle.json to root

In [None]:
#!mkdir ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json

In [None]:
#!kaggle competitions download -c planttraits2024

In [None]:
#!unzip -q './content/planttraits2024.zip'

In [None]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import albumentations as A
import cv2

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import OneCycleLR
from torchvision.io import read_image
from torch.utils.data import Dataset, ConcatDataset, DataLoader, Subset, random_split
from torchvision import transforms

from sklearn.model_selection import train_test_split

import re

#import optuna

### Set the Device

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
random_state = 7

np.random.seed(0)

random.seed(0)

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
TRAIN_IMAGES_PATH = './content/train_images'
TEST_IMAGES_PATH = './content/test_images'

### Untility functions

In [None]:
mean_pattern = r'X\d+_mean'
sd_pattern = r'X\d+_sd'

In [None]:
def find_matching_elements(lst,pattern):
    matching_elements = [element for element in lst if re.search(pattern, element)]
    return matching_elements

In [None]:
train_df = pd.read_csv('./content/train.csv')
test_df = pd.read_csv('./content/test.csv')

In [None]:
train_df.sample(10)

In [None]:
len(train_df.columns), len(test_df.columns)

In [None]:
len(train_df), len(test_df)

In [None]:
targets = [col for col in train_df.columns if col not in test_df.columns]
targets = targets[:6]

In [None]:
targets

In [None]:
metadata_columns = [col for col in train_df.columns if col in test_df.columns and col != 'id']

 TODO: hiányzó értékek kezelésének pontosítása

In [None]:
# Check for missing values in the train DataFrame
train_missing_values = train_df.isnull().sum()
print("Missing values in Train DataFrame:")
print(train_missing_values)

In [None]:
train_df.fillna(train_df.mean(), inplace=True)

# Getting the log of the targets

In [None]:
columns_to_log = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean',
                  'X4_sd', 'X11_sd', 'X18_sd', 'X26_sd', 'X50_sd', 'X3112_sd']

# Apply log10 transformation to the specified columns
for column in columns_to_log:
    train_df[column] = np.log10(train_df[column] + 1)  # Adding 1 to avoid log(0)


In [None]:
train_df.isna().sum()['X4_mean']

In [None]:
train_df = train_df.dropna()

# Augmentation

In [None]:
class AlbumentationsTransform:
    def __init__(self, augmentation):
        self.augmentation = augmentation

    def __call__(self, img):
        img_np = np.array(img)
        augmented = self.augmentation(image=img_np)
        return augmented['image']

train_augmentation =A.Compose([
          A.HorizontalFlip(p=0.5),
          A.VerticalFlip(p=0.25),
          A.Rotate(limit=45, p=0.4),
          A.RandomResizedCrop(height=256, width=256, scale=(0.8, 1.0), p=0.5),
          A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
          A.CoarseDropout(max_holes=6, max_height=8, max_width=8, min_holes=1, fill_value=0, p=0.5),
          A.Sharpen(alpha=0.3, lightness=0.3, p=0.5),
          A.Resize(256, 256)
        ])
# train_augmentation = A.Compose([
#     A.HorizontalFlip(p=0.5),
#     A.VerticalFlip(p=0.5),
#     A.Rotate(limit=45, p=0.5),
#     A.RandomResizedCrop(height=256, width=256, scale=(0.8, 1.0), p=0.5),
#     A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
#     # A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02, p=0.2),
#     # A.ElasticTransform(alpha=1, sigma=15, alpha_affine=15, p=0.2),
#     # A.GaussianBlur(blur_limit=3, p=0.5),
#     # A.CLAHE(clip_limit=4.0, tile_grid_size=(8, 8), p=0.5),
#     #A.CoarseDropout(max_holes=8, max_height=8, max_width=8, min_holes=2, fill_value=0, p=0.5),
#     A.Sharpen(alpha=0.3, lightness=0.3, p=0.5),
#     # A.RandomShadow(shadow_roi=(0, 0.5, 1, 1), num_shadows_lower=1, num_shadows_upper=3, shadow_dimension=50, p=0.5),
#     A.Resize(256, 256)
# ])

test_val_augmentation = A.Compose([
    A.Resize(256, 256) # ViT 224
])

final_transforms = transforms.Compose([
    AlbumentationsTransform(test_val_augmentation),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [None]:
class PlantTraits2024_Dataset(Dataset):
    def __init__(self, df, columns, images_path, augmentation=None,
                 transform=None, augment_times=1, metadata_columns=None,
                 augment_metadata=False,
                 metadata_augmentation_std=0.1, target = None):

        self.df = df
        self.columns = columns
        self.images_path = images_path
        self.augmentation = augmentation
        self.transform = transform
        self.augment_times = augment_times
        self.augment_metadata = augment_metadata
        self.metadata_augmentation_std = metadata_augmentation_std
        self.target = target

    def __len__(self):
        return len(self.df) * self.augment_times

    def __getitem__(self, idx) :
        original_idx = idx // self.augment_times
        id = self.df.iloc[original_idx, 0]
        image_path = f'{self.images_path}/{id}.jpeg'
        bgr_image = cv2.imread(image_path)
        image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        filtered_df = self.df[self.df['id'] == id].iloc[0]

        meta_data = torch.as_tensor(filtered_df[self.columns].to_numpy().astype(np.float64))

        if (self.augment_metadata):
          meta_data = self.augment_tabular_data(meta_data)

        if self.augmentation:
            image = AlbumentationsTransform(self.augmentation)(image)

        if self.transform:
            image = self.transform(image)

        # target
        #target_values = meta_data[self.target]
        #feature_values = meta_data[[col for col in meta_data.columns if col not in self.target]]
        return image, meta_data[:self.target[0]], meta_data[self.target[0]:]

    def augment_tabular_data(self, meta_data):
        noise = np.random.normal(0, self.metadata_augmentation_std, meta_data.shape)
        augmented_meta_data = meta_data # + noise
        return torch.as_tensor(augmented_meta_data, dtype=torch.float32)

    def get_original_image_and_metadata(self, idx):
        image_name = self.df.iloc[idx, 0]
        image_path = f'{self.images_path}/{image_name}.jpeg'
        bgr_image = cv2.imread(image_path)
        image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

        meta_data = self.df.iloc[idx][self.columns].to_numpy().astype(np.float64)
        meta_data = torch.tensor(meta_data, dtype=torch.float64)

        # target
        #target_values = meta_data[self.target]
        #feature_values = meta_data[[col for col in meta_data.columns if col not in self.target]]
        return image, meta_data[:self.target[0]], meta_data[self.target[0]:]


In [None]:
def shorten_feature_name(name, max_length=15):
    if len(name) > max_length:
        return name[:max_length - 3] + "..."
    return name

def visualize_augmentations(dataset, idx, title, augment_times=4, n_features=5):
    features_to_display = dataset.columns[:n_features]
    original_image, meta_data, targets = dataset.get_original_image_and_metadata(idx)

    fig, axes = plt.subplots(1, augment_times + 1, figsize=(20, 4))
    fig.suptitle(title, fontsize=16)

    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])

    original_image = np.array(original_image)
    axes[0].imshow(original_image)
    axes[0].set_title("Original Image")
    axes[0].axis("off")

    for i in range(augment_times):
        augmented_image,  augmented_metadata, augmented_targets = dataset[idx * augment_times + i]
        augmented_image = augmented_image.permute(1, 2, 0).numpy()
        augmented_image = std * augmented_image + mean
        augmented_image = np.clip(augmented_image, 0, 1)
        axes[i + 1].imshow(augmented_image)
        axes[i + 1].set_title(f"Aug Version {i+1}")
        axes[i + 1].axis("off")

    plt.tight_layout()
    plt.show()

    header = ["Feature"] + ["Original"] + [f"Aug Version {i+1}" for i in range(augment_times)]
    header_format = "{:<30}" + "{:<15}" * (1 + augment_times)
    print(header_format.format(*header))

    data = torch.cat((meta_data, targets), -1)

    for feature in features_to_display:
        shortened_feature = shorten_feature_name(feature)
        row = [shortened_feature] + [f"{data[dataset.columns.index(feature)].item():.2f}"]
        for i in range(augment_times):
            _, augmented_metadata, augmented_targets = dataset[idx * augment_times + i]
            augmented_data = torch.cat((augmented_metadata, augmented_targets),-1)
            row.append(f"{augmented_data[dataset.columns.index(feature)].item():.2f}")
        print(header_format.format(*row))

In [None]:
def split_dataframe(train_df, test_df, train_size=0.95, val_size=0.05):
    if train_size + val_size != 1:
        raise ValueError("train_size, val_size, and test_size must sum to 1")

    train_df, val_df = train_test_split(train_df, train_size=train_size, random_state=random_state)
    return train_df, val_df, test_df


In [None]:
def create_dataset(train_df, test_df, columns, targets, train_images_path, test_images_path, augment_times, train_augmentation):

    train_df, val_df, test_df = split_dataframe(train_df.sample(frac=1, random_state=1), test_df.sample(frac=1, random_state=1))
    target = [i-1 for i, col in enumerate(train_df.columns) if col not in test_df.columns]
    target = target[:6]
    train_dataset = PlantTraits2024_Dataset(train_df, columns + targets, train_images_path, augmentation=train_augmentation, transform=final_transforms, augment_times=augment_times, augment_metadata=True,target=target)
    val_dataset = PlantTraits2024_Dataset(val_df, columns + targets, train_images_path, augmentation=test_val_augmentation, transform=final_transforms, augment_times=1, target=target)
    test_dataset = PlantTraits2024_Dataset(test_df, columns, test_images_path, augmentation=test_val_augmentation, transform=final_transforms, augment_times=1, target=target)

    return train_dataset, val_dataset, test_dataset

In [None]:
train_dataset, val_dataset, test_dataset = create_dataset(train_df, test_df, metadata_columns, targets, TRAIN_IMAGES_PATH, TEST_IMAGES_PATH, augment_times=3, train_augmentation=train_augmentation)

In [None]:
len(train_dataset), len(val_dataset), len(test_dataset)

In [None]:
pip freeze

# Visualization

In [None]:
#visualize_augmentations(dataset=train_dataset, title='Train dataset augmentations', idx=14, augment_times=2)

In [None]:
#visualize_augmentations(dataset=val_dataset, title='Validation dataset augmentations', idx=14, augment_times=2)

In [None]:
#visualize_augmentations(dataset=test_dataset, title='Test dataset augmentations', idx=14, augment_times=2)

#**Baselines**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader

## Create dataloaders


In [None]:
train_loader = DataLoader(train_dataset, batch_size= 48, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = 64, shuffle = False)
test_loader = DataLoader(test_dataset, batch_size= 64, shuffle=False)

In [None]:
class PlantTraitsPretrainedVGG16(nn.Module):
    def __init__(self):
        super().__init__()

        self.network = models.vgg16(pretrained=True)
        # Replace last layer
        self.network.classifier = nn.Sequential(
           nn.Linear(in_features=25088, out_features=4096, bias=True),
           nn.ReLU(inplace=True),
           nn.Dropout(p=0.3, inplace=False),
           nn.Linear(in_features=4096, out_features=2048, bias=True),
           nn.ReLU(inplace=True),
           nn.Dropout(p=0.3, inplace=False),
           nn.Linear(in_features=2048, out_features=6, bias=True),
        )

    def forward(self, xb):
        return self.network(xb)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = PlantTraitsPretrainedVGG16()
model = model.to(device)
model

In [None]:
!pip -q install torchmetrics

In [None]:
from torchmetrics.regression import R2Score

In [None]:
def r2_loss(output, target):
    target = target
    output = output
    target_mean = torch.mean(target)
    ss_tot = torch.sum((target - target_mean) ** 2)
    ss_res = torch.sum((target - output) ** 2)
    r2 = 1 - ss_res / ss_tot
    return 1 - r2

In [None]:
#Adatok GPU-ra való átmásolását végző függvény
def to_device(data, device):
  if isinstance(data, (list, tuple)):
    return [to_device(x, device) for x in data]
  return data.to(device, non_blocking=True)

In [None]:
class DeviceDataLoader():
  def __init__(self, dl, device):
    self.dl = dl
    self.device = device

  def __iter__(self):
    for b in self.dl:
      yield to_device(b, self.device)

  def __len__(self):
    return len(self.dl)

In [None]:
train_loader = DeviceDataLoader(train_loader, device)
val_loader = DeviceDataLoader(val_loader, device)
test_loader = DeviceDataLoader(test_loader, device)

In [None]:
from tqdm.notebook import tqdm #a progress bár esztétikus megjelenítésére

In [None]:
def save_checkpoint(state, filename="./content/model_checkpoint.pth.tar"):
    torch.save(state, filename)
    print(f"Checkpoint saved to {filename}")

def load_checkpoint(checkpoint_file, model, optimizer, scheduler, device):
    checkpoint = torch.load(checkpoint_file, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    return model

In [None]:
class EarlyStoppingR2:
    def __init__(self, patience=10, delta=0, save=None):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.save = save

    def __call__(self, val_score, epoch, model, optimizer, scheduler):
        if self.best_score is None:
            self.best_score = val_score
            print(f"Best model loss: {val_score}")
            if self.save:
                checkpoint_filename = './content/best_model.pth.tar'
                self.save({
                    'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }, filename=checkpoint_filename)
        elif val_score < self.best_score - self.delta:
            self.counter += 1
            print(f"EarlyStopping patience counter: {self.counter}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            if val_score > self.best_score:
                self.best_score = val_score
                self.counter = 0
                print(f"Best model loss: {val_score}")
                if self.save:
                    checkpoint_filename = './content/best_model.pth.tar'
                    self.save({
                        'epoch': epoch + 1,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                    }, filename=checkpoint_filename)

In [None]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def train(model, train_loader, val_loader, criterion, max_lr, grad_clip=0, opt_func=torch.optim.Adam,  weight_decay=0, epochs=10,
          architecture_name= "CNN", dataset_ratio = 0.1, resume = False, checkpoint_path='./content/best_model.pth.tar', experiment_name="exp"):
    torch.cuda.empty_cache()
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    r2_score_metrics = R2Score(num_outputs=6).to(device)
     # set up one cycle lr scheduler
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,  steps_per_epoch=len(train_loader))
    early_stopping = EarlyStoppingR2(patience=8, save=save_checkpoint)

    wandb.init(
      # Set the project where this run will be logged
      project="Plant_traits",
      # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
      name=experiment_name,
      # Track hyperparameters and run metadata
      config={
        "max_learning_rate": max_lr,
        "architecture": architecture_name,
        "epochs": epochs,
        "weight_decay": weight_decay,
        "dataset_ratio" : dataset_ratio,
        "criterion" : criterion,
        "grad_clip" : grad_clip,
        "opt_func" : opt_func
      })

    start_epoch = 0
    if resume and checkpoint_path:
        start_epoch = load_checkpoint(checkpoint_path, model, optimizer, sched, device)

    best_loss = float('inf')
    for epoch in range(start_epoch, start_epoch + epochs):
        model.train()
        epoch_loss = 0
        lrs = []
        for images, meta_data, targets in tqdm(train_loader):


            outputs = model(images)
            loss = criterion(outputs, targets)
            loss.backward()
            if grad_clip:
               nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            epoch_loss += loss.item() * images.size(0)
            lrs.append(get_lr(optimizer))
            sched.step()

        epoch_loss = epoch_loss / len(train_loader)
        print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

        model.eval()  # Set model to evaluation mode

        with torch.no_grad():
            epoch_val_loss = 0.0
            r2_epoch_score = 0.0
            for images, meta_data, targets in tqdm(val_loader):

                outputs = model(images)
                loss = criterion(outputs, targets)
                epoch_val_loss += loss.item() * images.size(0)
                r2_epoch_score += r2_score_metrics(outputs, targets)

            epoch_val_loss = epoch_val_loss /len(val_loader)
            r2_score = r2_epoch_score / len(val_loader)


            print(f'Epoch {epoch + 1}/{epochs} Validation Loss: {epoch_val_loss:.4f}')
            print(f'Validation R2 Score: {r2_score}')
            wandb.log({"val_loss": epoch_val_loss, "loss": epoch_loss, "r2_score": r2_score})
            early_stopping(r2_score, epoch, model, optimizer, sched)
            if early_stopping.early_stop:
                print("Early stopping triggered.")
                break

    # Mark the run as finished
    wandb.finish()

def test_model(model, criterion, test_loader):
    model.eval()  # Set model to evaluation mode
    epoch_loss = 0.0
    r2_score_metrics = R2Score(num_outputs=6).to(device)
    r2_epoch_score = 0
    with torch.no_grad():
       for images, meta_data, targets in test_loader:
            outputs = model(images)
            loss = criterion(outputs, targets)
            epoch_loss += loss.item() * images.size(0)
            r2_epoch_score += r2_score_metrics(outputs, targets)

    test_loss = epoch_loss / len(test_loader)
    r2_score = r2_epoch_score / len(val_loader)
    print(f'Test Loss: {test_loss:.4f}')
    print(f'Validation R2 Score: {r2_score}')


Train a
- ResNet (too slow)
- VGG
- Efficientnet

In [None]:
epochs = 7
opt_func = torch.optim.SGD
loss_func =r2_loss #  nn.MSELoss()
max_lr = 0.006387
grad_clip = 0.07608
weight_decay =  0.00009996
dataset_ratio =1
architecture_name = "Vgg16"

In [None]:
#train(model, train_loader, val_loader, criterion = loss_func, max_lr = max_lr, grad_clip=grad_clip, opt_func=opt_func, weight_decay= weight_decay, epochs=epochs,
#      architecture_name= architecture_name,dataset_ratio = dataset_ratio, experiment_name = "Vgg16" )

In [None]:
optimizer=opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
scheduler=torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,  steps_per_epoch=len(train_loader))
load_checkpoint("./content/best_model.pth.tar", model, optimizer, scheduler, device)

In [119]:
test_model(model, r2_loss, val_loader)

Test Loss: 8.2639
Validation R2 Score: 0.28465747833251953


In [None]:
# Function to extract outputs from different layers
def extract_outputs(model, xb):
    # Forward pass through the model
    output = model.network(xb)
    
    # Extract outputs from different layers
    output_L0 = output.detach().cpu().numpy()
    output_L1 = model.network.classifier[3].weight.detach().cpu().numpy()  # Output before last layer
    output_L2 = model.network.classifier[0].weight.detach().cpu().numpy()  # Output before last-1 layer
    
    return output_L0, output_L1

In [None]:
# Initialize lists to store data
ids = []
VGG_L0_outputs = []
VGG_L1_outputs = []
mylabels = []

In [None]:
# Iterate over the DataLoader
for batch in tqdm(train_loader, desc="Processing batches", unit="batch", leave=False):
    # Extract data from batch
    inputs, labels, ids_batch = batch
    
    # Forward pass through the model
    outputs_L0, outputs_L1 = extract_outputs(model, inputs)
    
    # Append data to lists
    ids.extend(ids_batch)
    VGG_L0_outputs.extend(outputs_L0)
    mylabels.extend(labels)

In [None]:
for i in range(len(mylabels)):
    mylabels[i] = mylabels[i].cpu()

In [None]:
# Create a DataFrame
test_features_df = pd.DataFrame({
    'labels': mylabels,
    'VGG_L0': VGG_L0_outputs
})

In [None]:
# Create a DataFrame
train_features_df = pd.DataFrame({
    'labels': mylabels,
    'VGG_L0': VGG_L0_outputs
})

In [None]:
train_merged = pd.merge(train_df,train_features_df, on='id', how='in')
test_merged = pd.merge(test_df,test_features_df, on='id', how='in')

In [None]:
X_train = train_merged.drop(columns=targets)
y_train = train_merged[targets]
X_test = test_merged.drop(columns=targets)
y_test = test_merged[targets]

In [None]:
import xgb
from sklearn.multioutput import MultiOutputRegressor

In [None]:
multi_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, random_state=42))

In [None]:
multi_xgb_model.fit(X_train, y_train)

In [None]:
y_pred_multi = multi_xgb_model.predict(X_val)

In [118]:
print("R² scores for each target (on scaled features):", r2_scores)

R² scores for each target (on scaled features): 0.32534680
