# Usage

Run the code below to train the model; validation and test-set prediction are optional and are switched on by the flags listed here.\
Parameters you will want to configure:
- `PATH`: the directory containing your training/testing data (`train.csv`, `test.csv`, `train_images/`, `test_images/`)
- `EN_VAL`: when enabled, splits the training data to produce a validation set for evaluation
- `EN_TST`: when enabled, the model predicts the test data after training

# Imports

In [None]:
import os
from PIL import Image

from catboost import CatBoostRegressor
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import timm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from tqdm.autonotebook import tqdm


# Classes

In [None]:
class Cfg():
    """Run-wide settings read by the cells of this notebook."""

    # image side length (not referenced by the cells visible here — TODO confirm use)
    IMG_SZ = 128
    # target columns read from train.csv; one CatBoost model is trained per entry
    TRG_COLS = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
    # directory holding train.csv / test.csv and the train_images / test_images folders
    PATH = 'C:\\data-fast\\cs480-kaggle\\data\\'
    N_TRG = len(TRG_COLS)
    # batch size used by every DataLoader
    BATCH_SZ = 32
    # random_state handed to CatBoost; set to an int (e.g. 42) for repeatable runs
    SEED = None
    # presumably the number of auxiliary input columns — unused in the visible cells, TODO confirm
    X_SZ = 163
    # device for the embedding model; CatBoost uses its GPU backend only when this
    # is 'cuda'. Falls back to the CPU so the notebook still runs without CUDA.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    # number of leading polynomial-feature columns kept after expansion
    N_PF = 1000
    # fraction of the training rows held out when EN_VAL is on
    VAL_SZ = 0.2
    # split off a validation set and report R2 on it
    EN_VAL = False
    # embed and predict the test set, then write the submission file
    EN_TST = True


CFG = Cfg()

In [None]:
class RawDataset(Dataset):
    """Dataset pairing each image file with its auxiliary feature row.

    Indexing yields ``((image, feature), label)`` when labels were supplied
    and ``(image, feature)`` when they were not.
    """

    def __init__(self, image_paths, features, labels=None, transform=None):
        """Store the parallel per-sample sequences; no file is read here.

        Args:
            image_paths: image file paths, one per sample.
            features: per-sample feature rows, indexed in step with ``image_paths``.
            labels: optional per-sample targets; ``None`` for unlabeled data.
            transform: optional callable applied to the loaded RGB image.
        """
        self.image_paths = image_paths
        self.features = features
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` as float32 tensors (plus its label, if any)."""
        # the context manager releases the file handle as soon as the pixels
        # have been decoded into the converted copy
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert('RGB')
        # same explicit None test as for labels, so a falsy-but-valid callable
        # is never skipped
        if self.transform is not None:
            image = self.transform(image)

        feature = torch.tensor(self.features[idx], dtype=torch.float32)
        if self.labels is None:
            return image, feature

        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return (image, feature), label

In [None]:
class EmbeddingModel(nn.Module):
    """Joins pooled EfficientNet-B7 and MobileViT-v2 feature maps with tabular inputs."""

    def __init__(self):
        super().__init__()

        # ImageNet-pretrained EfficientNet-B7, keeping everything except its
        # last two child modules
        effnet = models.efficientnet_b7(weights=models.EfficientNet_B7_Weights.IMAGENET1K_V1)
        self.efficientnet = nn.Sequential(*list(effnet.children())[:-2])

        # pretrained MobileViT-v2, keeping everything except its last child module
        vit = timm.create_model('mobilevitv2_100', pretrained=True)
        self.mobilevit = nn.Sequential(*list(vit.children())[:-1])

        # global average pool followed by flatten: one vector per image
        self.img_to_flat = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten())

    def forward(self, image_input, data_input):
        """Return ``[efficientnet vector | mobilevit vector | data_input]`` per sample."""
        pooled = [
            self.img_to_flat(backbone(image_input))
            for backbone in (self.efficientnet, self.mobilevit)
        ]
        return torch.cat((*pooled, data_input), dim=1)

# Data

In [None]:
# read the training table
train_data = pd.read_csv(f'{CFG.PATH}/train.csv')

# auxiliary feature columns are picked out by their name prefix
prefixes = ['WORLDCLIM_BIO', 'SOIL_', 'MODIS_', 'VOD_']
feature_cols = [c for c in train_data.columns if c.startswith(tuple(prefixes))]

train_filenames = train_data['id'].values
train_features = train_data[feature_cols].values
# targets are stored in log10 space
train_labels = np.log10(train_data[CFG.TRG_COLS].values)

# the test table uses the same feature columns and has no targets
if CFG.EN_TST:
    test_data = pd.read_csv(f'{CFG.PATH}/test.csv')
    test_filenames = test_data['id'].values
    test_features = test_data[feature_cols].values



In [None]:
# evaluation-time preprocessing: convert the image to a tensor, nothing else
transform = transforms.Compose([transforms.ToTensor()])

# training-time preprocessing: random flips and a mild colour jitter before
# the tensor conversion
_train_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.ToTensor(),
]
train_transform = transforms.Compose(_train_steps)

def _expand_poly(scaled):
    """Degree-2 polynomial expansion, truncated to the first CFG.N_PF columns."""
    return PolynomialFeatures(2).fit_transform(scaled)[:, :CFG.N_PF]


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the notebook-wide loader settings."""
    return DataLoader(dataset, batch_size=CFG.BATCH_SZ, shuffle=shuffle, num_workers=0, pin_memory=True)


train_image_paths = [
    os.path.join(f'{CFG.PATH}/train_images', f'{filename}.jpeg')
    for filename in train_filenames
]

scaler = StandardScaler()

# hold out the validation rows before any statistics are fitted
if CFG.EN_VAL:
    (train_image_paths, val_image_paths,
     train_features, val_features,
     train_labels, val_labels) = train_test_split(
        train_image_paths, train_features, train_labels,
        test_size=CFG.VAL_SZ, random_state=42,
    )

# the scaler is fitted on the (remaining) training rows only
train_features = _expand_poly(scaler.fit_transform(train_features))
train_dataset = RawDataset(train_image_paths, train_features, train_labels, transform=train_transform)
train_loader = _make_loader(train_dataset, shuffle=True)

if CFG.EN_VAL:
    val_features = _expand_poly(scaler.transform(val_features))
    val_dataset = RawDataset(val_image_paths, val_features, val_labels, transform=transform)
    val_loader = _make_loader(val_dataset, shuffle=False)

if CFG.EN_TST:
    test_image_paths = [
        os.path.join(f'{CFG.PATH}/test_images', f'{filename}.jpeg')
        for filename in test_filenames
    ]
    test_features = _expand_poly(scaler.transform(test_features))
    test_dataset = RawDataset(test_image_paths, test_features, None, transform=transform)
    test_loader = _make_loader(test_dataset, shuffle=False)

In [None]:
# instantiate embedding model (its constructor requests pretrained weights for both backbones)
embed_model = EmbeddingModel()
# move the model onto the configured device; being the last expression of the
# cell, the value returned by .to() is presumably what the notebook echoes
embed_model.to(CFG.DEVICE)

## Compute Embeds

### Generate Train + Validation Embeddings

In [None]:

def _embed_labelled(loader):
    """Run ``embed_model`` over a labelled loader and stack the results.

    Args:
        loader: iterable yielding ``((image_batch, feature_batch), label_batch)``.

    Returns:
        ``(features, labels)`` numpy arrays, each concatenated along axis 0.
        Both are collected from the same batches, so their rows stay aligned
        even when the loader shuffles.
    """
    feature_chunks, label_chunks = [], []
    with torch.no_grad():
        for (image_batch, feature_batch), label_batch in tqdm(loader):
            embedded = embed_model(
                image_batch.to(CFG.DEVICE),
                feature_batch.to(CFG.DEVICE),
            )
            feature_chunks.append(embedded.cpu().numpy())
            label_chunks.append(label_batch.cpu().numpy())
    return np.concatenate(feature_chunks, axis=0), np.concatenate(label_chunks, axis=0)


embed_model.eval()

# embed the training rows first, then the validation rows (when enabled)
train_features_catboost, train_labels_catboost = _embed_labelled(train_loader)
if CFG.EN_VAL:
    val_features_catboost, val_labels_catboost = _embed_labelled(val_loader)

# cache the embeddings next to the data directory; np.save appends ".npy"
np.save(f'{CFG.PATH}/../train_features_catboost', train_features_catboost)
np.save(f'{CFG.PATH}/../train_labels_catboost', train_labels_catboost)

if CFG.EN_VAL:
    np.save(f'{CFG.PATH}/../val_features_catboost', val_features_catboost)
    np.save(f'{CFG.PATH}/../val_labels_catboost', val_labels_catboost)

In [None]:
# read the cached training embeddings and labels back from disk
train_features_catboost, train_labels_catboost = (
    np.load(f'{CFG.PATH}/../train_features_catboost.npy'),
    np.load(f'{CFG.PATH}/../train_labels_catboost.npy'),
)

# the validation cache is read only when validation is enabled
if CFG.EN_VAL:
    val_features_catboost, val_labels_catboost = (
        np.load(f'{CFG.PATH}/../val_features_catboost.npy'),
        np.load(f'{CFG.PATH}/../val_labels_catboost.npy'),
    )

### Generate Test Embeddings

In [None]:
if CFG.EN_TST:
    embed_model.eval()

    # per-batch embeddings of the unlabeled test loader, which yields
    # (image_batch, feature_batch) pairs
    test_chunks = []
    with torch.no_grad():
        for image_batch, feature_batch in tqdm(test_loader):
            embedded = embed_model(
                image_batch.to(CFG.DEVICE),
                feature_batch.to(CFG.DEVICE),
            )
            test_chunks.append(embedded.cpu().numpy())

    # stack the batches into one array and cache it; np.save appends ".npy"
    test_features_catboost = np.concatenate(test_chunks, axis=0)
    np.save(f'{CFG.PATH}/../test_features_catboost', test_features_catboost)

In [None]:
# read the cached test embeddings back from disk (only when test prediction is enabled)
if CFG.EN_TST:
    test_features_catboost = np.load(f'{CFG.PATH}/../test_features_catboost.npy')

## Train Catboost

In [None]:
# one fitted regressor per target column, plus its validation R2 when available
cat_models = {}
scores = {}

for i, col in tqdm(enumerate(CFG.TRG_COLS), total=len(CFG.TRG_COLS)):
    model = CatBoostRegressor(
        depth=6,
        learning_rate=0.1,
        iterations=1000,
        loss_function='RMSE',
        verbose=True,
        task_type=('GPU' if CFG.DEVICE == 'cuda' else 'CPU'),
        random_state=CFG.SEED,
    )

    # column i of the label matrix belongs to target `col`
    y_train = train_labels_catboost[:, i]

    if not CFG.EN_VAL:
        model.fit(train_features_catboost, y_train, verbose=True)
        print(f'Target Trained: {col}')
    else:
        y_val = val_labels_catboost[:, i]
        model.fit(train_features_catboost, y_train, verbose=True,
                  eval_set=(val_features_catboost, y_val))

        # score the fitted model on the held-out rows
        y_curr_val_pred = model.predict(val_features_catboost)
        r2_col = r2_score(y_val, y_curr_val_pred)
        scores[col] = r2_col
        print(f'Target Trained: {col}, R2: {r2_col:.3f}')

    cat_models[col] = model

if CFG.EN_VAL:
    mean_r2 = np.mean(list(scores.values()))
    print(f'Mean R2: {mean_r2}')


# Generate Submission

In [None]:
if CFG.EN_TST:
    test_data = pd.read_csv(f'{CFG.PATH}/test.csv')

    # the id column comes first, followed by one column per target, named by
    # the part of the target name before the first underscore
    submission_columns = {'id': test_data['id']}
    for trait in CFG.TRG_COLS:
        log_predictions = cat_models[trait].predict(test_features_catboost)
        # the models were fitted on log10 targets, so map predictions back
        submission_columns[trait.split('_')[0]] = np.power(10, log_predictions)

    predictions_df = pd.DataFrame(submission_columns)
    predictions_df.to_csv(f'{CFG.PATH}/../submission.csv', index=False)


# Validation Variation

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Some R^2 values I've collected on different train/val splits
data = [
    0.2596423142137745,
    0.260868698490226,
    0.2574254267340103,
    0.2583413629448405,
]

# Mean and (population) standard deviation of the collected scores
mean = np.mean(data)
std_dev = np.std(data)

# Plot every score at x=0 with horizontal guides at the mean and mean +/- one std dev
plt.figure(figsize=(8, 6))
plt.axhline(y=mean, color='blue', linestyle='-', label=f'Mean $R^2$: {mean:.5f}')
# raw f-string: "\p" is not a valid escape, so the backslash of \pm must be
# kept literal for matplotlib's mathtext
plt.axhline(y=mean + std_dev, color='red', linestyle='--', label=rf'Std Dev ($\pm$): $\pm${std_dev:.5f}')
plt.axhline(y=mean - std_dev, color='red', linestyle='--')
plt.scatter([0]*len(data), data, marker='o', label='Measured Values')
# the x position carries no meaning, so hide that axis
plt.gca().get_xaxis().set_visible(False)
plt.title('Variation of Validation Scores')
plt.ylabel('$R^2$ Value')
plt.legend()
plt.grid(True)
plt.show()
