In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/metasub-data/metasub_training_testing_data.csv


# Import Libraries

In [2]:
# --- SYSTEM LIBRARIES (optional; GEOS/GDAL and X11/GL runtime packages that the geospatial and plotting Python packages may need) ---
!apt-get update -y && apt-get install -y \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext6 \
    libgeos-dev \
    gdal-bin \
    libgdal-dev \
    python3-gdal

# --- PYTHON LIBRARIES ---

!pip install --quiet \
    numpy \
    pandas \
    seaborn \
    matplotlib \
    scikit-learn \
    imbalanced-learn \
    xgboost==1.7.6 \
    lightgbm \
    catboost \
    optuna \
    shapely \
    geopandas \
    geopy \
    tqdm \
    folium \
    plotly \
    tab-transformer-pytorch \
    torch torchvision torchaudio \
    tabpfn==2.0.9 \
    tabpfn-extensions==0.0.4 \
    einops \
    adjusttext \
    sqlalchemy==2.0.41 \
    alembic \
    colorlog \
    huggingface-hub \
    fsspec \
    pyyaml \
    sympy \
    filelock \
    mako \
    tabpfn_client

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,918 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]           
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,773 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]     
Get:11 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [5,103 kB]
Get:12 https://ppa.launchpadcontent.net/graphics-dr

In [3]:
# ==================== OS & Warnings ====================
import os
import copy
import time
import random
import warnings
warnings.filterwarnings('ignore')

# ==================== Logging ====================
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ==================== Numerical & Data Processing ====================
import numpy as np
import pandas as pd

# ==================== Visualization ====================
import matplotlib.pyplot as plt
import seaborn as sns

# ==================== Scikit-learn ====================
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, StratifiedShuffleSplit, KFold, cross_val_score
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    log_loss, mean_squared_error, mean_absolute_error, r2_score
)

# ==================== Imbalanced Data ====================
from imblearn.over_sampling import SMOTE

# ==================== Geospatial ====================
import geopandas as gpd
from shapely.geometry import Point
from scipy.spatial import cKDTree
from geopy.distance import geodesic 

# ==================== Machine Learning Libraries ====================
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
import optuna

# ==================== Deep Learning - PyTorch ====================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils import clip_grad_norm_

# ==================== TabPFN ====================
from tabpfn import TabPFNClassifier, TabPFNRegressor
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier
from tabpfn_extensions.hpo import TunedTabPFNClassifier
from tabpfn_extensions.many_class.many_class_classifier import ManyClassClassifier


# CatBoost

In [4]:
class CatBoostClassifierOptimizer:
    """Tune, train and evaluate a multi-class CatBoost classifier on a fixed train/test split.

    Args:
        X_train, y_train: Training features and labels (also used for the
            cross-validation performed in ``objective``).
        X_test, y_test: Held-out features and labels used by ``evaluate``.
        random_state: Seed for the CV splitter and for CatBoost's ``random_seed``.
        n_trials: Maximum number of Optuna trials run by ``tune``.
        timeout: Time budget handed to ``study.optimize``.
        cat_features: Forwarded to ``CatBoostClassifier.fit`` as ``cat_features``;
            ``None`` declares no categorical features.
    """

    def __init__(self, X_train, y_train, X_test, y_test, random_state=42, n_trials=20, timeout=1200, cat_features=None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.random_state = random_state
        self.n_trials = n_trials
        self.timeout = timeout
        self.cat_features = cat_features
        self.best_params = None   # set by tune()
        self.final_model = None   # set by train()

    def default_params(self):
        """Return the untuned baseline parameter dict for ``CatBoostClassifier``."""
        return {
            'loss_function': 'MultiClass',
            'iterations': 300,
            'learning_rate': 0.1,
            'depth': 6,
            'l2_leaf_reg': 3.0,
            'random_seed': self.random_state,
            'verbose': False
        }

    @staticmethod
    def _cv_fit_kwargs(cv_func, fit_params):
        """Build the keyword argument that forwards ``fit_params`` to the estimator's ``fit``.

        scikit-learn 1.4 added ``params`` to ``cross_val_score`` and deprecated
        ``fit_params`` (removed in 1.6), so the keyword is chosen from what
        ``cv_func`` actually accepts.  An empty mapping needs no forwarding.

        Returns:
            ``{}``, ``{'params': fit_params}`` or ``{'fit_params': fit_params}``.
        """
        import inspect

        if not fit_params:
            return {}
        accepted = inspect.signature(cv_func).parameters
        keyword = 'params' if 'params' in accepted else 'fit_params'
        return {keyword: fit_params}

    def objective(self, trial):
        """Optuna objective: mean 5-fold stratified CV accuracy for one sampled configuration."""
        params = {
            'loss_function': 'MultiClass',
            'iterations': trial.suggest_int('iterations', 100, 400),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'depth': trial.suggest_int('depth', 3, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10.0),
            'random_seed': self.random_state,
            'verbose': False
        }
        model = CatBoostClassifier(**params)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        # Only forward cat_features when some were declared; fit() already defaults to None.
        fit_params = {} if self.cat_features is None else {'cat_features': self.cat_features}
        scores = cross_val_score(
            model, self.X_train, self.y_train, cv=skf, scoring='accuracy',
            **self._cv_fit_kwargs(cross_val_score, fit_params)
        )
        return scores.mean()

    def tune(self):
        """Run the Optuna search and return the best parameters plus the fixed settings."""
        # NOTE(review): objective() never calls trial.report(), so the MedianPruner
        # presumably has no intermediate values to act on -- confirm it is needed.
        study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2))
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        self.best_params = study.best_params
        # best_params only holds the sampled values; re-attach the constants used in objective().
        self.best_params.update({
            'loss_function': 'MultiClass',
            'random_seed': self.random_state,
            'verbose': False
        })
        return self.best_params

    def train(self, params):
        """Fit a ``CatBoostClassifier`` built from ``params`` on the training data and keep it."""
        model = CatBoostClassifier(**params)
        model.fit(self.X_train, self.y_train, cat_features=self.cat_features)
        self.final_model = model
        return model

    def evaluate(self, model=None):
        """Predict on the test split, print a classification report and the accuracy.

        Args:
            model: Model to evaluate; defaults to the one stored by ``train``.

        Returns:
            Tuple ``(preds, probs, acc)``.
        """
        if model is None:
            model = self.final_model
        preds = model.predict(self.X_test)
        probs = model.predict_proba(self.X_test)
        acc = accuracy_score(self.y_test, preds)
        print("\nClassification Report:")
        print(classification_report(self.y_test, preds))
        print(f"\nAccuracy: {acc:.4f}")
        return preds, probs, acc


def run_catboost_classifier(X_train, y_train, X_test, y_test, 
                           tune_hyperparams=False, random_state=42, 
                           n_trials=20, timeout=1200, params=None, verbose=False,
                           cat_features=None):
    """
    CatBoost classification wrapper for ensemble.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the reported accuracy.
        tune_hyperparams: When True, search hyperparameters with Optuna; ``params``
            is then not used.
        random_state: Seed forwarded to the optimizer.
        n_trials, timeout: Optuna search budget (only relevant when tuning).
        params: Optional overrides applied on top of the default parameters
            when not tuning.
        verbose: When True, print the chosen parameters and the evaluation report.
        cat_features: Optional categorical feature spec forwarded to the optimizer
            (defaults to None, i.e. none declared).

    Returns:
        Dict with keys ``'model'``, ``'predictions'``, ``'predicted_probabilities'``,
        ``'accuracy'`` and ``'params'``.
    """
    tuner = CatBoostClassifierOptimizer(X_train, y_train, X_test, y_test, 
                         random_state=random_state, n_trials=n_trials,timeout=timeout,cat_features=cat_features)

    if tune_hyperparams:
        best_params = tuner.tune()
        if verbose:
            print("Using tuned parameters:", best_params)
    else:
        best_params = tuner.default_params()
        if params:
            best_params.update(params)
        if verbose:
            print("Using default (or custom) parameters:", best_params)

    model = tuner.train(best_params)

    if verbose:
        # evaluate() also prints the classification report and accuracy.
        preds, probs, acc = tuner.evaluate(model)
    else:
        # Predict once and reuse the labels for the accuracy.
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)
        acc = accuracy_score(y_test, preds)

    return {
        'model': model,
        'predictions': preds,
        'predicted_probabilities': probs,
        'accuracy': acc,
        'params': best_params
    }



class CatBoostRegressionOptimizer:
    """Tune, train and evaluate a CatBoost regressor on a fixed train/test split.

    ``objective``/``tune`` search hyperparameters with Optuna, scoring each
    candidate by 5-fold cross-validated negative MAE on the training data.
    ``train`` fits a model from a parameter dict, and ``evaluate`` reports MAE
    and R2 on the held-out data.
    """

    def __init__(self, X_train, y_train, X_test, y_test,
                 random_state=42, n_trials=20, timeout=1200):
        # Data split
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
        # Search configuration
        self.random_state, self.n_trials, self.timeout = random_state, n_trials, timeout
        # Filled in by tune() and train() respectively
        self.best_params = None
        self.final_model = None

    def default_params(self):
        """Return the untuned baseline parameters (seed and verbosity are added in ``train``)."""
        objective_settings = {'loss_function': 'RMSE', 'eval_metric': 'RMSE'}
        tree_settings = {
            'learning_rate': 0.1,
            'depth': 6,
            'l2_leaf_reg': 3,
            'random_strength': 1,
            'bagging_temperature': 1,
            'border_count': 254,
            'iterations': 300,
        }
        return {**objective_settings, **tree_settings}

    def objective(self, trial):
        """Optuna objective: mean negative MAE over 5 shuffled folds for one sampled configuration."""
        search_space = {'loss_function': 'RMSE', 'eval_metric': 'RMSE'}
        # Sampling order is kept fixed so trials consume the sampler identically.
        search_space['learning_rate'] = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
        search_space['depth'] = trial.suggest_int("depth", 3, 10)
        search_space['l2_leaf_reg'] = trial.suggest_float("l2_leaf_reg", 1, 10)
        search_space['random_strength'] = trial.suggest_float("random_strength", 1e-9, 10, log=True)
        search_space['bagging_temperature'] = trial.suggest_float("bagging_temperature", 0, 10)
        search_space['border_count'] = trial.suggest_int("border_count", 1, 255)
        search_space['iterations'] = trial.suggest_int("iterations", 100, 500)

        estimator = CatBoostRegressor(**search_space, random_seed=self.random_state, verbose=False)
        folds = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        fold_scores = cross_val_score(
            estimator,
            self.X_train,
            self.y_train,
            cv=folds,
            scoring='neg_mean_absolute_error',
        )
        return np.mean(fold_scores)

    def tune(self):
        """Run the Optuna study and return the best parameters with the fixed loss settings re-attached."""
        median_pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
        study = optuna.create_study(direction='maximize', pruner=median_pruner)
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)

        tuned = study.best_params
        # The study only records sampled values; add back the constants used in objective().
        tuned['loss_function'] = 'RMSE'
        tuned['eval_metric'] = 'RMSE'
        self.best_params = tuned
        return self.best_params

    def train(self, params):
        """Fit a seeded, silent ``CatBoostRegressor`` built from ``params`` and remember it."""
        regressor = CatBoostRegressor(**params, random_seed=self.random_state, verbose=False)
        regressor.fit(self.X_train, self.y_train)
        self.final_model = regressor
        return regressor

    def evaluate(self, model=None):
        """Predict on the test split, print MAE and R2, and return ``(preds, mae, r2)``.

        Falls back to the model stored by ``train`` when ``model`` is None.
        """
        estimator = self.final_model if model is None else model
        predictions = estimator.predict(self.X_test)
        mae = mean_absolute_error(self.y_test, predictions)
        r2 = r2_score(self.y_test, predictions)
        print("\nRegression Report:")
        print(f"MAE:  {mae:.4f}")
        print(f"R2:   {r2:.4f}")
        return predictions, mae, r2


def run_catboost_regressor(X_train, y_train, X_test, y_test,
                           tune_hyperparams=False, random_state=42,
                           n_trials=20, timeout=1200, params=None, verbose=True):
    """CatBoost regression wrapper for ensemble with proper error handling.

    Args:
        X_train, y_train: Training features and targets. Targets may be a NumPy
            array, a pandas object or a nested sequence; when they have more than
            one column only the first column is used.
        X_test, y_test: Held-out features and targets used for MAE / R2.
        tune_hyperparams: When True, search hyperparameters with Optuna; ``params``
            is then not used.
        random_state: Seed forwarded to the optimizer.
        n_trials, timeout: Optuna search budget (only relevant when tuning).
        params: Optional overrides applied on top of the default parameters
            when not tuning.
        verbose: When True, print the chosen parameters, the evaluation report
            and any error.

    Returns:
        Dict with ``'model'``, ``'predictions'``, ``'mae'``, ``'r2_score'``,
        ``'params'`` and ``'skipped'``.  If anything raises, the failure is not
        propagated: the dict carries ``'model': None``, all-zero predictions,
        ``'skipped': True`` and the message under ``'error'``.
    """
    
    try:
        # Handle multi-dimensional targets.  np.ndim / np.asarray are used so that
        # pandas DataFrames and nested lists are sliced positionally instead of
        # raising on `y[:, 0]` and falling into the dummy-prediction path below.
        if np.ndim(y_train) > 1 and np.shape(y_train)[1] > 1:
            if verbose:
                print("Warning: CatBoost doesn't support multi-output regression natively. Using first dimension only.")
            y_train = np.asarray(y_train)[:, 0]
            if np.ndim(y_test) > 1:
                y_test = np.asarray(y_test)[:, 0]

        tuner = CatBoostRegressionOptimizer(X_train, y_train, X_test, y_test,
                                       random_state=random_state, n_trials=n_trials, timeout=timeout)

        if tune_hyperparams:
            best_params = tuner.tune()
            if verbose:
                print("Using tuned parameters:", best_params)
        else:
            best_params = tuner.default_params()
            if params:
                best_params.update(params)
            if verbose:
                print("Using default (or custom) parameters:", best_params)

        model = tuner.train(best_params)

        if verbose:
            # evaluate() also prints the regression report.
            preds, mae, r2 = tuner.evaluate(model)
        else:
            preds = model.predict(X_test)
            mae = mean_absolute_error(y_test, preds)
            r2 = r2_score(y_test, preds)

        return {
            'model': model,
            'predictions': preds,
            'mae': mae,
            'r2_score': r2,  # Use r2_score for consistency
            'params': best_params,
            'skipped': False
        }
        
    except Exception as e:
        # Deliberate best-effort: the ensemble keeps going with a placeholder result.
        if verbose:
            print(f"Error in CatBoost regressor: {e}")
        # Return dummy predictions on error; fall back to len() for inputs without .shape
        # so the handler itself cannot fail on plain lists.
        n_samples = X_test.shape[0] if hasattr(X_test, 'shape') else len(X_test)
        dummy_preds = np.zeros(n_samples)
        
        return {
            'model': None,
            'predictions': dummy_preds,
            'mae': float('inf'),
            'r2_score': -float('inf'),
            'params': params,
            'skipped': True,
            'error': str(e)
        }



# GrowNet

In [5]:
# This model is used for classification tasks
def grownet_classification_default_params():
    """Return a fresh dict of default hyperparameters for the GrowNet classifier."""
    # Ensemble shape
    ensemble = {
        "hidden_size": 256,
        "num_nets": 10,
        "boost_rate": 0.4,
    }
    # Per-stage optimisation settings
    optimisation = {
        "lr": 1e-3,
        "weight_decay": 1e-5,
        "batch_size": 128,
        "epochs_per_stage": 30,
        "early_stopping_steps": 7,
        "gradient_clip": 1.0,
    }
    # Data splitting / reproducibility
    splitting = {
        "val_split": 0.2,
        "test_split": 0.2,
        "random_state": 42,
    }
    return {**ensemble, **optimisation, **splitting}


class GrowNetClassificationTuner:
    """Optuna search over GrowNet classifier hyperparameters on a fixed train/validation split.

    Args:
        X_train, y_train: Data each trial's model is fitted on.
        X_val, y_val: Data each trial's model is scored on (validation accuracy).
        params: Base hyperparameter dict; sampled values are overlaid on a copy per trial.
        device: Device string passed to every ``GrowNetClassifierUnique``.
        n_trials: Maximum number of Optuna trials.
        timeout: Time budget handed to ``study.optimize``.
    """

    def __init__(self, X_train, y_train, X_val, y_val, params, device="cpu",n_trials = 20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.params = params
        self.device = device
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None   # set by tune()
        self.best_score = None    # set by tune()

    def objective(self,trial):
        """Train one GrowNet classifier with sampled hyperparameters and return its validation accuracy."""
        # Suggest hyperparameters (on a copy, so the base dict is left untouched)
        params = self.params.copy()
        params.update({
            "hidden_size": trial.suggest_categorical("hidden_size", [128, 256, 512]),
            "num_nets": trial.suggest_int("num_nets", 10, 30),
            "boost_rate": trial.suggest_float("boost_rate", 0.1, 0.8),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
            # and matches the CatBoost tuners in this notebook.
            "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
            "batch_size": trial.suggest_categorical("batch_size", [64, 128, 256]),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
            "epochs_per_stage": trial.suggest_int("epochs_per_stage", 5, 10),
            "gradient_clip": trial.suggest_float("gradient_clip", 0.5, 2.0),
        })

        # Train model
        model = GrowNetClassifierUnique(params, device=self.device)
        model.fit(self.X_train, self.y_train, X_val=self.X_val, y_val=self.y_val)
        val_metrics = model.evaluate(self.X_val, self.y_val)
        val_acc = val_metrics['class_accuracy']
        return val_acc  # maximize accuracy

    def tune(self):
        """Run the study, store and print the best result, and return ``(best_params, best_score)``."""
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        
        self.best_params = study.best_params
        self.best_score = study.best_value
        
        print(f"Best score: {self.best_score:.4f}")
        print(f"Best parameters: {self.best_params}")
        
        return self.best_params, self.best_score


# Pick the compute device once at module level: CUDA when torch reports it as available,
# otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Dataset class for continent classification
class GrowNetClassificationTrainDataset(Dataset):
    """Row-wise dataset pairing a feature row with its target row.

    Args:
        features: 2-D features indexed by row position.  pandas objects are
            converted to NumPy so that integer indexing selects rows rather
            than looking up a column label.
        n_targets: Per-row targets (same row order as ``features``); converted
            the same way.

    Each item is a dict with float tensors under ``'x'`` and ``'n_classes'``.
    """

    def __init__(self, features, n_targets):
        self.features = self._as_positional(features)
        self.n_targets = self._as_positional(n_targets)

    @staticmethod
    def _as_positional(data):
        """Return ``data.to_numpy()`` for pandas-like objects, otherwise ``data`` unchanged."""
        return data.to_numpy() if hasattr(data, "to_numpy") else data

    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, idx):
        return {
            'x': torch.tensor(self.features[idx], dtype=torch.float),
            'n_classes': torch.tensor(self.n_targets[idx], dtype=torch.float)
        }

    
# DynamicNet for classification
class GrowNetClassificationDynamicNet(nn.Module):
    """Additive ensemble of weak learners for classification.

    The ensemble logits are ``c0_classes + boost_rate * sum_k logits_k``.  Each weak
    learner is called as ``m(x, prev_features)`` and must return ``(features, logits)``;
    the features of one learner are handed to the next.

    Args:
        c0_classes: Tensor of initial logits, broadcast over the batch.
        lr: Initial value of the learnable ``boost_rate`` scalar.
    """

    def __init__(self, c0_classes, lr):
        super().__init__()
        # Plain list rather than nn.ModuleList: the weak learners are therefore not
        # registered submodules, which is why to()/parameters()/zero_grad() and the
        # mode switches below walk the list by hand.
        self.models = []
        self.c0_classes = c0_classes
        self.lr = lr

        # Create the boost rate next to c0_classes instead of on a module-level global device.
        self.boost_rate = nn.Parameter(torch.tensor(lr, requires_grad=True, device=c0_classes.device))

    def to(self, device):
        """Move the initial logits, the boost rate and every weak learner to ``device``; returns ``self``."""
        self.c0_classes = self.c0_classes.to(device)
        moved = self.boost_rate.to(device)
        if moved is not self.boost_rate:
            # A real device change yields a plain Tensor, which nn.Module refuses to
            # assign to a registered parameter -- wrap it in a new Parameter.
            self.boost_rate = nn.Parameter(moved.detach(), requires_grad=self.boost_rate.requires_grad)
        for m in self.models:
            m.to(device)
        return self

    def add(self, model):
        """Append a trained weak learner to the ensemble."""
        self.models.append(model)

    def parameters(self):
        """Return a list with every weak learner's parameters followed by ``boost_rate``."""
        params = []
        for m in self.models:
            params.extend(m.parameters())
        params.append(self.boost_rate)
        return params

    def zero_grad(self):
        """Zero the gradients of all weak learners."""
        for m in self.models:
            m.zero_grad()

    def to_eval(self):
        """Put all weak learners in evaluation mode."""
        for m in self.models:
            m.eval()

    def to_train(self):
        """Put all weak learners in training mode."""
        for m in self.models:
            m.train(True)

    def _initial_output(self, x):
        """Output of an empty ensemble: no features and the initial logits repeated per row."""
        batch = x.shape[0]
        return None, self.c0_classes.repeat(batch, 1)

    def _accumulate(self, x):
        """Chain the weak learners and return ``(last_features, summed_logits)``."""
        middle_feat_cum = None
        classes_pred = None
        for m in self.models:
            middle_feat_cum, classes_out = m(x, middle_feat_cum)
            # Out-of-place sum so no learner's output tensor is modified.
            classes_pred = classes_out if classes_pred is None else classes_pred + classes_out
        return middle_feat_cum, classes_pred

    def forward(self, x):
        """Ensemble prediction with the weak learners run under ``torch.no_grad()``.

        Returns:
            ``(features, logits)``; ``features`` is None while the ensemble is empty.
        """
        if len(self.models) == 0:
            return self._initial_output(x)

        with torch.no_grad():
            middle_feat_cum, classes_pred = self._accumulate(x)

        final_classes = self.c0_classes + self.boost_rate * classes_pred
        return middle_feat_cum, final_classes

    def forward_grad(self, x):
        """Same computation as ``forward`` but without disabling gradient tracking."""
        if len(self.models) == 0:
            return self._initial_output(x)

        middle_feat_cum, classes_pred = self._accumulate(x)

        final_classes = self.c0_classes + self.boost_rate * classes_pred
        return middle_feat_cum, final_classes


class GrowNetClassificationMLP(nn.Module):
    """Two-hidden-layer weak learner used as one boosting stage.

    ``forward`` returns both the last hidden representation and the class
    logits, so that a later stage can consume the hidden features.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super().__init__()
        # Normalises the concatenated input; forward() only uses it when
        # previous-stage features are supplied.
        self.bn = nn.BatchNorm1d(input_dim)

        # Hidden block 1
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_dim1)
        self.dropout1 = nn.Dropout(0.2)

        # Hidden block 2
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.dropout2 = nn.Dropout(0.4)

        # Logit head
        self.class_head = nn.Linear(hidden_dim2, output_dim)

    def forward(self, x, lower_f):
        """Return ``(shared_features, logits)`` for a batch.

        Args:
            x: Input feature batch.
            lower_f: Hidden features from the previous stage, or None for the
                first stage.  When given they are concatenated to ``x`` along
                dim 1 and the result is batch-normalised.
        """
        if lower_f is not None:
            x = self.bn(torch.cat([x, lower_f], dim=1))

        # Block 1: linear -> ReLU -> dropout -> batch norm
        hidden = self.fc1(x)
        hidden = self.relu1(hidden)
        hidden = self.dropout1(hidden)
        hidden = self.bn1(hidden)

        # Block 2: same ordering
        hidden = self.fc2(hidden)
        hidden = self.relu2(hidden)
        hidden = self.dropout2(hidden)
        shared_features = self.bn2(hidden)

        return shared_features, self.class_head(shared_features)

    @classmethod
    def get_model(cls, stage, params):
        """Build the learner for boosting stage ``stage`` from ``params``.

        Stage 0 takes ``params['feat_d']`` inputs; later stages additionally take
        ``params['hidden_size']`` features from the previous stage.
        """
        feat_d = params['feat_d']
        hidden = params['hidden_size']
        dim_in = feat_d if stage == 0 else feat_d + hidden
        return cls(dim_in, hidden, hidden, params['n_classes'])


class GrowNetClassifierUnique:
    """Stage-wise boosted ensemble of MLP weak learners for multi-class classification.

    Weak learners are trained one stage at a time; every stage after the first also
    receives the hidden features produced by the ensemble built so far.  Labels may
    be given as 1-D integer class ids or as one-hot rows.

    Args:
        params: Hyperparameter dict (defaults to
            ``grownet_classification_default_params()``).  The dict is stored by
            reference and ``fit`` writes ``'feat_d'`` and ``'n_classes'`` into it.
        device: Torch device string on which models and batches are placed.
    """

    def __init__(self, params = None, device="cpu"):
        if params is None:
            self.params = grownet_classification_default_params()
        else:
            self.params = params
        self.device = device
        self.net_ensemble = None           # built by fit()
        self.class_weights_tensor = None   # built by fit()

    def _one_hot(self, y):
        """Ensure y is one-hot encoded."""
        if y.ndim == 1:
            # Width is inferred from the largest label present in y.
            n_classes = np.max(y) + 1
            return np.eye(n_classes)[y]
        return y

    @staticmethod
    def _drop_singleton_batch(n_samples, batch_size):
        """Return True when the final training batch would contain exactly one sample.

        BatchNorm1d cannot compute batch statistics from a single row in training
        mode, so such a trailing batch has to be dropped.  Datasets that fit in a
        single batch are left alone.
        """
        return n_samples > batch_size and n_samples % batch_size == 1

    def fit(self, X_train, y_train, X_val=None, y_val = None):
        """Train the ensemble stage by stage with early stopping on validation loss.

        Args:
            X_train, y_train: Training features and labels.
            X_val, y_val: Optional validation data; when either is missing a
                stratified ``params['val_split']`` share of the training data is
                held out instead.

        Progress is printed per stage; nothing is returned.
        """

        # Determine the input feature size
        self.params['feat_d'] = X_train.shape[1]
        
        # One hot encode for this model
        y_train = self._one_hot(y_train)
        if y_val is not None:
            y_val = self._one_hot(y_val)

        # Determine the number of classes
        self.params['n_classes'] = y_train.shape[1]
        
        # Class weighting to deal with imbalanced datasets
        # Compute and store class weights tensor once
        class_labels_flat = np.argmax(y_train, axis=1)
        class_weights = compute_class_weight(
            class_weight="balanced",
            classes=np.unique(class_labels_flat),
            y=class_labels_flat
        )
        self.class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)


        # Split validation if not provided
        if X_val is None or y_val is None:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,test_size=self.params['val_split'],random_state=self.params['random_state'],stratify=class_labels_flat)

        train_ds = GrowNetClassificationTrainDataset(X_train, y_train)
        val_ds = GrowNetClassificationTrainDataset(X_val, y_val)

        print(f"Train size: {len(train_ds)}, Val size: {len(val_ds)}")

        # A trailing batch of one row would crash BatchNorm1d in training mode.
        train_loader = DataLoader(
            train_ds,
            batch_size=self.params['batch_size'],
            shuffle=True,
            drop_last=self._drop_singleton_batch(len(train_ds), self.params['batch_size']),
        )
        
        # Init ensemble: the starting logits are the log class frequencies of the training targets
        c0_classes = torch.tensor(np.log(np.mean(y_train, axis=0)), dtype=torch.float).unsqueeze(0).to(self.device)
        self.net_ensemble = GrowNetClassificationDynamicNet(c0_classes, self.params['boost_rate'])
        self.net_ensemble.to(self.device)
        
        best_val_loss = float("inf")
        best_stage = 0
        early_stop = 0
        lr = self.params["lr"]
        
        for stage in range(self.params['num_nets']):
            t0 = time.time()
            
            print(f"\nTraining weak learner {stage+1}/{self.params['num_nets']}")
            model = GrowNetClassificationMLP.get_model(stage, self.params).to(self.device)
            # Only the new weak learner is optimised in this stage.
            optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=self.params['weight_decay'])
            self.net_ensemble.to_train()
            
            stage_train_losses = []
            
            for epoch in range(self.params["epochs_per_stage"]):
                for batch in train_loader:
                    x = batch["x"].to(self.device)
                    targets = batch["n_classes"].to(self.device)
                    
                    # Residual targets for the new learner, derived from the current
                    # ensemble output without tracking gradients.
                    with torch.no_grad():
                        _, prev_logits = self.net_ensemble.forward(x)
                        prev_probs = torch.softmax(prev_logits, dim=1)
                        grad = targets - prev_probs
                        hessian = prev_probs * (1 - prev_probs)
                        hessian = hessian.sum(dim=1, keepdim=True)
                    
                    # From stage 1 on, the learner also receives the ensemble's hidden features.
                    middle_feat, logits = model(x, None if stage == 0 else self.net_ensemble.forward_grad(x)[0])
                    loss_stagewise = nn.MSELoss(reduction="none")
                    boosting_loss = loss_stagewise(self.net_ensemble.boost_rate * logits, grad)
                    boosting_loss = (boosting_loss * hessian).mean()
                    class_loss = F.cross_entropy(logits, torch.argmax(targets, dim=1), weight=self.class_weights_tensor)
                    # NOTE(review): the two losses are multiplied rather than summed -- confirm this is intended.
                    total_loss = class_loss * boosting_loss # Optionally combine with boosting_loss
                    
                    model.zero_grad()
                    total_loss.backward()
                    clip_grad_norm_(model.parameters(), self.params['gradient_clip'])
                    optimizer.step()
                    stage_train_losses.append(total_loss.item())
            
            self.net_ensemble.add(model)
            avg_stage_loss = np.mean(stage_train_losses)
            print(f"Stage {stage+1} finished | Avg Train Loss: {avg_stage_loss:.5f} | Time: {time.time() - t0:.1f}s")
            
            val_metrics = self.evaluate(X_val, y_val)
            val_loss = val_metrics['class_loss']
            print(f"Validation - Classification Acc: {val_metrics['class_accuracy']:.3f}")
            print(f"Boost rate: {self.net_ensemble.boost_rate.item():.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_stage = stage
                early_stop = 0
            else:
                early_stop += 1
                if early_stop > self.params["early_stopping_steps"]:
                    print("Early stopping!")
                    break
        # The best stage is only reported; learners added after it stay in the ensemble.
        print(f"\nBest model was at stage {best_stage+1} with Val Loss: {best_val_loss:.5f}")


    def evaluate(self, X, y):
        """Score the current ensemble on labelled data.

        Returns:
            Dict with ``'class_loss'`` (mean of the per-batch weighted cross-entropy),
            ``'class_accuracy'``, ``'predictions'``, ``'probabilities'`` and ``'targets'``.
        """
        self.net_ensemble.to_eval()
        all_preds = []
        all_preds_prob = []
        all_targets = []
        class_losses = []
        y = self._one_hot(y)

        loader = DataLoader(GrowNetClassificationTrainDataset(X, y), batch_size=self.params['batch_size'], shuffle=False)
        with torch.no_grad():
            for batch in loader:
                x = batch["x"].to(self.device)
                targets = batch["n_classes"].to(self.device)
                _, logits = self.net_ensemble.forward(x)
                loss = F.cross_entropy(logits, torch.argmax(targets, dim=1), weight=self.class_weights_tensor)
                class_losses.append(loss.item())
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                labs = torch.argmax(targets, dim=1).cpu().numpy()
                probs = torch.softmax(logits,dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_preds_prob.extend(probs)
                all_targets.extend(labs)
        acc = accuracy_score(all_targets, all_preds)
        return {
            'class_loss': np.mean(class_losses),
            'class_accuracy': acc,
            'predictions': all_preds,
            'probabilities': np.array(all_preds_prob),
            'targets': all_targets
        }
    
    def predict(self, X):
        """Predict class ids and class probabilities for unlabelled data.

        Returns:
            Dict with NumPy arrays under ``'predictions'`` and ``'probabilities'``.
        """
        self.net_ensemble.to_eval()
        all_preds = []
        all_preds_prob = []
        # The dataset class needs targets, so feed all-zero placeholders that are never read.
        loader = DataLoader(GrowNetClassificationTrainDataset(X, np.zeros((X.shape[0], self.params['n_classes']))), batch_size=self.params['batch_size'], shuffle=False)
        with torch.no_grad():
            for batch in loader:
                x = batch["x"].to(self.device)
                _, logits = self.net_ensemble.forward(x)
                probs = torch.softmax(logits,dim=1).cpu().numpy()
                preds = np.argmax(probs, axis=1)
                all_preds.extend(preds)
                all_preds_prob.extend(probs)
        return {
            'predictions': np.array(all_preds),
            'probabilities': np.array(all_preds_prob)        
        }
    
def run_grownet_classifier(X_train,y_train,X_test,y_test,params=None,
                tune_hyperparams = False, n_trials=20,timeout=1200, verbose=False):
    """GrowNet classification wrapper: optionally tune, then fit on the training data and score on the test data.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the reported metrics.
        params: Optional overrides laid on top of the default hyperparameters.
        tune_hyperparams: When True, run an Optuna search on an 80/20 stratified
            split of the training data and merge the best values into the parameters.
        n_trials, timeout: Search budget (only relevant when tuning).
        verbose: When True, print the chosen parameters and a classification report.

    Returns:
        Dict with ``'model'``, ``'predictions'``, ``'predicted_probabilities'``,
        ``'accuracy'`` and ``'params'``.
    """
    # Device is detected here rather than taken from the caller
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Defaults first, caller overrides on top
    merged = grownet_classification_default_params()
    if params is not None:
        merged.update(params)
    params = merged

    if tune_hyperparams:
        # Hold out a validation share of the training data for the search
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
        )
        tuner = GrowNetClassificationTuner(
            X_fit, y_fit, X_val, y_val, params,
            device=device, n_trials=n_trials, timeout=timeout,
        )
        best_params, _best_score = tuner.tune()
        params.update(best_params)
        if verbose:
            print("Using best params:", params)

    # The final model always sees the full training data
    model = GrowNetClassifierUnique(params, device=device)
    model.fit(X_train, y_train)

    results = model.evaluate(X_test, y_test)
    if verbose:
        print("\nClassification Report:")
        print(classification_report(results['targets'], results['predictions']))
        print("\nAccuracy:", results['class_accuracy'])

    summary = {'model': model}
    summary['predictions'] = results['predictions']
    summary['predicted_probabilities'] = results['probabilities']
    summary['accuracy'] = results['class_accuracy']
    summary['params'] = params
    return summary



# Set device globally (this repeats the identical selection made earlier in this cell, so the
# "Using device" line is printed a second time)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


def grownet_regression_default_params():
    """Return a fresh dict of default hyperparameters for the GrowNet regressor."""
    # Ensemble shape
    ensemble = {
        "hidden_size": 256,
        "num_nets": 10,
        "boost_rate": 0.4,
    }
    # Per-stage optimisation settings
    optimisation = {
        "lr": 1e-3,
        "weight_decay": 1e-5,
        "batch_size": 128,
        "epochs_per_stage": 30,
        "early_stopping_steps": 7,
        "gradient_clip": 1.0,
    }
    # Data splitting / reproducibility, plus the number of regression outputs
    data = {
        "val_split": 0.2,
        "test_split": 0.2,
        "random_state": 42,
        "n_outputs":3,
    }
    return {**ensemble, **optimisation, **data}


class GrowNetRegressionTuner:
    """Optuna search over GrowNet regression hyper-parameters.

    Each trial trains a ``GrowNetRegressorUnique`` on the training split and is
    scored by the negated RMSE on the validation split, which is why the study
    is created with ``direction='maximize'``.
    """

    def __init__(self, X_train, y_train, X_val, y_val, params, device="cpu",n_trials = 20, timeout=1200):
        """Store the data splits, the base ``params`` dict and the search budget."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.params = params
        self.device = device
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.best_score = None 

    def objective(self,trial):
        """Train one sampled configuration and return ``-RMSE`` on the validation split."""
        # Work on a copy so a trial never mutates the shared base configuration.
        params = self.params.copy()
        params.update({
            "hidden_size": trial.suggest_categorical("hidden_size", [128, 256, 512]),
            "num_nets": trial.suggest_int("num_nets", 10, 30),
            "boost_rate": trial.suggest_float("boost_rate", 0.1, 0.8),
            # suggest_float(log=True) replaces the deprecated suggest_loguniform
            # and matches the LightGBM tuners in this notebook.
            "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
            "batch_size": trial.suggest_categorical("batch_size", [64, 128, 256]),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
            "epochs_per_stage": trial.suggest_int("epochs_per_stage", 5, 10),
            "gradient_clip": trial.suggest_float("gradient_clip", 0.5, 2.0),
        })

        # Train model
        model = GrowNetRegressorUnique(params, device=self.device)
        model.fit(self.X_train, self.y_train, X_val=self.X_val, y_val=self.y_val)
        val_metrics = model.evaluate(self.X_val, self.y_val)
        # The study maximises, so negate the RMSE to minimise it.
        return -val_metrics['rmse']

    def tune(self):
        """Run the study and return ``(best_params, best_score)``.

        ``best_score`` is the objective value, i.e. the negated validation RMSE.
        """
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        
        self.best_params = study.best_params
        self.best_score = study.best_value
        
        print(f"Best score: {self.best_score:.4f}")
        print(f"Best parameters: {self.best_params}")
        
        return self.best_params, self.best_score


# Dataset class for regression
class GrowNetRegressionTrainDataset(Dataset):
    """Map-style dataset pairing each feature row with its regression target."""

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        # One sample per row of the feature matrix.
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Rows are converted to fresh float tensors on every access.
        feature_row = torch.tensor(self.features[idx], dtype=torch.float)
        target_row = torch.tensor(self.targets[idx], dtype=torch.float)
        return dict(x=feature_row, y=target_row)

    
# DynamicNet for regression
class GrowNetRegressionDynamicNet(nn.Module):
    """Additive ensemble of weak learners for GrowNet regression.

    The prediction is ``c0_coords + boost_rate * sum(weak learner outputs)``.
    Until a learner has been added, the prediction is just ``c0_coords``
    repeated for every row of the batch.

    Parameters:
    - c0_coords: tensor with the initial prediction (one row); it is repeated
      along the batch dimension.
    - lr: initial value of the learnable ``boost_rate`` scalar.
    """

    def __init__(self, c0_coords, lr):
        super().__init__()
        self.models = nn.ModuleList()

        self.c0_coords = c0_coords
        self.lr = lr
        # Create the boost rate next to c0_coords rather than on a module-level
        # `device` global, so the class does not depend on notebook state.
        self.boost_rate = nn.Parameter(
            torch.tensor(lr, dtype=torch.float, device=getattr(c0_coords, "device", None))
        )

    def to(self, device):
        """Move the ensemble to ``device`` and return ``self``."""
        self.c0_coords = self.c0_coords.to(device)
        # Move the parameter's storage in place. Re-assigning the attribute with
        # the plain tensor returned by `.to()` raises TypeError in nn.Module
        # whenever the device actually changes.
        self.boost_rate.data = self.boost_rate.data.to(device)
        for m in self.models:
            m.to(device)
        return self

    def add(self, model):
        """Append a trained weak learner to the ensemble."""
        self.models.append(model)

    def parameters(self):
        """Return a list with every weak-learner parameter plus ``boost_rate``."""
        params = []
        for m in self.models:
            params.extend(m.parameters())
        params.append(self.boost_rate)
        return params

    def zero_grad(self):
        """Clear the gradients of all weak learners."""
        for m in self.models:
            m.zero_grad()

    def to_eval(self):
        """Put every weak learner in evaluation mode."""
        for m in self.models:
            m.eval()

    def to_train(self):
        """Put every weak learner in training mode."""
        for m in self.models:
            m.train(True)

    def _initial_prediction(self, x):
        """Prediction of the empty ensemble: ``c0_coords`` repeated per row of ``x``."""
        return None, self.c0_coords.repeat(x.shape[0], 1)

    def _sum_weak_learners(self, x):
        """Run every weak learner on ``x`` and sum their outputs.

        Accumulation is keyed on the running sum itself. The weak learners in
        this notebook always return ``None`` as their feature output, so keying
        on the features (as before) made each learner overwrite the previous
        one instead of adding to it.
        """
        middle_feat_cum = None
        coords_pred = None
        for m in self.models:
            middle_feat_cum, coords_out = m(x, middle_feat_cum)
            # Out-of-place add so no learner's output tensor is modified.
            coords_pred = coords_out if coords_pred is None else coords_pred + coords_out
        return middle_feat_cum, coords_pred

    def forward(self, x):
        """Ensemble prediction with the weak learners evaluated under ``no_grad``.

        Returns ``(features, prediction)``; only ``boost_rate`` can receive
        gradients through the returned prediction.
        """
        if len(self.models) == 0:
            return self._initial_prediction(x)

        with torch.no_grad():
            middle_feat_cum, coords_pred = self._sum_weak_learners(x)
        final_coords = self.c0_coords + self.boost_rate * coords_pred
        return middle_feat_cum, final_coords

    def forward_grad(self, x):
        """Same as ``forward`` but gradients flow through every weak learner."""
        if len(self.models) == 0:
            return self._initial_prediction(x)

        middle_feat_cum, coords_pred = self._sum_weak_learners(x)
        final_coords = self.c0_coords + self.boost_rate * coords_pred
        return middle_feat_cum, final_coords


class GrowNetRegressionMLP(nn.Module):
    """Weak learner: input batch-norm, two hidden blocks and a linear regression head.

    Each hidden block is Linear -> ReLU -> Dropout -> BatchNorm. ``forward``
    accepts a second argument but does not use it, and always returns ``None``
    in the feature slot.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super().__init__()
        # Normalisation applied to the raw input features.
        self.bn = nn.BatchNorm1d(input_dim)

        # First hidden block.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_dim1)
        self.dropout1 = nn.Dropout(0.2)
        # Second hidden block.
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.dropout2 = nn.Dropout(0.4)
        # Output head.
        self.reg_head = nn.Linear(hidden_dim2, output_dim)

    def forward(self,x, lower_f):
        hidden = self.bn(x)

        hidden = self.fc1(hidden)
        hidden = self.relu1(hidden)
        hidden = self.dropout1(hidden)
        hidden = self.bn1(hidden)

        hidden = self.fc2(hidden)
        hidden = self.relu2(hidden)
        hidden = self.dropout2(hidden)
        hidden = self.bn2(hidden)

        return None, self.reg_head(hidden)

    @classmethod
    def get_model(cls,stage,params):
        """Build a learner from ``params``; ``stage`` is accepted but unused."""
        feat_d, width, n_out = params['feat_d'], params['hidden_size'], params['n_outputs']
        return cls(feat_d, width, width, n_out)


class GrowNetRegressorUnique:
    """Stage-wise boosted regressor built from ``GrowNetRegressionMLP`` weak learners.

    Parameters:
    - params: hyper-parameter dict; ``grownet_regression_default_params()`` is
      used when omitted. ``fit`` writes ``feat_d`` (and ``n_outputs`` for 2-D
      targets) into this dict.
    - device: torch device string used for the model and every batch.
    """

    def __init__(self, params = None, device="cpu"):
        if params is None:
            self.params = grownet_regression_default_params()
        else:
            self.params = params
        self.device = device
        self.net_ensemble = None

    def fit(self,X_train,y_train, X_val=None, y_val = None):
        """Train up to ``num_nets`` weak learners, one boosting stage each.

        When no validation data is given, ``val_split`` of the training data is
        held out. Training stops early once the validation RMSE has failed to
        improve for more than ``early_stopping_steps`` consecutive stages.
        """
        # Split validation if not provided
        if X_val is None or y_val is None:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,test_size=self.params['val_split'],random_state=self.params['random_state'])

        # Derive the network dimensions from the data (as NNClassifier.fit does)
        # so the default params, which carry no 'feat_d', can be used directly.
        self.params['feat_d'] = np.shape(X_train)[1]
        target_shape = np.shape(y_train)
        if len(target_shape) > 1:
            self.params['n_outputs'] = target_shape[1]

        train_ds = GrowNetRegressionTrainDataset(X_train, y_train)
        val_ds = GrowNetRegressionTrainDataset(X_val, y_val)
        # BatchNorm1d cannot process a single-sample batch in training mode, so
        # drop the trailing batch only when it would contain exactly one row.
        batch_size = self.params['batch_size']
        drop_last = len(train_ds) > 1 and len(train_ds) % batch_size == 1
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=drop_last)

        print(f"Train size: {len(train_ds)}, Val size: {len(val_ds)}")

        # Init ensemble: the starting prediction is the per-column target mean.
        c0 = torch.tensor(np.mean(y_train, axis=0), dtype=torch.float).unsqueeze(0).to(self.device)

        self.net_ensemble = GrowNetRegressionDynamicNet(c0, self.params['boost_rate'])
        self.net_ensemble.to(self.device)
        
        best_val_loss = float("inf")
        best_stage = 0
        early_stop = 0
        lr = self.params["lr"]
        
        for stage in range(self.params['num_nets']):
            t0 = time.time()
            
            print(f"\nTraining weak learner {stage+1}/{self.params['num_nets']}")
            model = GrowNetRegressionMLP.get_model(stage, self.params).to(self.device)
            optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=self.params['weight_decay'])
            # NOTE(review): this also puts the already-trained learners in train
            # mode, so their dropout/batch-norm are active while the residuals
            # are computed. Kept as in the original implementation.
            self.net_ensemble.to_train()
            
            stage_train_losses = self._train_weak_learner(model, optimizer, train_loader)

            self.net_ensemble.add(model)
            avg_stage_loss = np.mean(stage_train_losses) if stage_train_losses else float("nan")
            print(f"Stage {stage+1} finished | Avg Train Loss: {avg_stage_loss:.5f} | Time: {time.time() - t0:.1f}s")
            val_metrics = self.evaluate(X_val, y_val)
            val_loss = val_metrics['rmse']
            print(f"Validation - RMSE: {val_loss:.3f}")
            print(f"Boost rate: {self.net_ensemble.boost_rate.item():.4f}")
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_stage = stage
                early_stop = 0
            else:
                early_stop += 1
                if early_stop > self.params["early_stopping_steps"]:
                    print("Early stopping!")
                    break
        # NOTE(review): the ensemble is not rolled back to the best stage; the
        # message below is informational only.
        print(f"\nBest model was at stage {best_stage+1} with Val RMSE: {best_val_loss:.5f}")

    def _train_weak_learner(self, model, optimizer, train_loader):
        """Fit one weak learner to the current ensemble residuals.

        Returns the list of per-batch training losses.
        """
        loss_stagewise = nn.MSELoss(reduction="none")
        batch_losses = []
        for epoch in range(self.params["epochs_per_stage"]):
            for batch in train_loader:
                x = batch["x"].to(self.device)
                targets = batch["y"].to(self.device)

                # Residual of the current ensemble: the new learner's target.
                with torch.no_grad():
                    _, prev_preds = self.net_ensemble.forward(x)
                    grad = targets - prev_preds

                # Always pass None for lower_f (no feature extraction)
                middle_feat, preds = model(x, None)
                total_loss = loss_stagewise(self.net_ensemble.boost_rate * preds, grad).mean()

                model.zero_grad()
                total_loss.backward()
                clip_grad_norm_(model.parameters(), self.params['gradient_clip'])
                optimizer.step()
                batch_losses.append(total_loss.item())
        return batch_losses

    def evaluate(self, X, y):
        """Predict on ``X`` and return a dict with ``rmse``, ``r2``, ``predictions`` and ``targets``."""
        self.net_ensemble.to_eval()
        all_preds = []
        all_targets = []
        loader = DataLoader(GrowNetRegressionTrainDataset(X, y), batch_size=self.params['batch_size'], shuffle=False)
        with torch.no_grad():
            for batch in loader:
                x = batch["x"].to(self.device)
                targets = batch["y"].to(self.device)
                _, preds = self.net_ensemble.forward(x)
                all_preds.append(preds.cpu().numpy())
                all_targets.append(targets.cpu().numpy())
        all_preds = np.concatenate(all_preds, axis=0)
        all_targets = np.concatenate(all_targets, axis=0)
        rmse = np.sqrt(mean_squared_error(all_targets, all_preds))
        r2 = r2_score(all_targets, all_preds)
        return {
            'rmse': rmse,
            'r2': r2,
            'predictions': all_preds,
            'targets': all_targets
        }

    def predict(self, X):
        """Return the ensemble predictions for ``X`` as a NumPy array."""
        self.net_ensemble.to_eval()
        all_preds = []
        # The dataset class requires targets, so zero placeholders are supplied.
        placeholder_targets = np.zeros((X.shape[0], self.params['n_outputs']))
        loader = DataLoader(GrowNetRegressionTrainDataset(X, placeholder_targets), batch_size=self.params['batch_size'], shuffle=False)
        with torch.no_grad():
            for batch in loader:
                x = batch["x"].to(self.device)
                _, preds = self.net_ensemble.forward(x)
                all_preds.append(preds.cpu().numpy())
        all_preds = np.concatenate(all_preds, axis=0)
        return all_preds
    
def run_grownet_regressor(X_train, y_train, X_test, y_test, params=None,
                          tune_hyperparams=False, n_trials=20, timeout=1200, 
                          device=None, verbose=True):
    """Train and evaluate a GrowNet regressor, optionally tuning it first.

    Inputs may be NumPy arrays or anything ``np.asarray`` can convert (lists,
    pandas objects). 1-D targets are treated as a single output column.

    Returns a dict with ``model``, ``predictions``, ``rmse``, ``r2_score``,
    ``params`` and ``skipped``. Any exception is caught: the result then has
    ``skipped=True``, an ``error`` message, all-zero predictions and infinite
    error metrics.
    """
    
    try:
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            
        if verbose:
            print(f"Running GrowNet regressor on device: {device}")
            
        if params is None:
            params = grownet_regression_default_params()
        else:
            default = grownet_regression_default_params()
            default.update(params)
            params = default

        # Normalise the inputs so positional indexing and `.shape` work for
        # lists and pandas objects too (a no-op for ndarrays).
        X_train, X_test = np.asarray(X_train), np.asarray(X_test)
        y_train, y_test = np.asarray(y_train), np.asarray(y_test)

        # Handle both 1D and multi-dimensional targets; each array is checked
        # on its own so a 1-D y_test is never left unreshaped.
        if y_train.ndim == 1:
            y_train = y_train.reshape(-1, 1)
        if y_test.ndim == 1:
            y_test = y_test.reshape(-1, 1)
            
        params['feat_d'] = X_train.shape[1]
        params['n_outputs'] = y_train.shape[1]
        
        if tune_hyperparams:
            X_train_split, X_val, y_train_split, y_val = train_test_split(
                X_train, y_train, test_size=0.2, random_state=42
            )
            tuner = GrowNetRegressionTuner(X_train_split, y_train_split, X_val, y_val, 
                                         params, device=device, n_trials=n_trials, timeout=timeout)
            best_params, best_score = tuner.tune()
            params.update(best_params)
            if verbose:
                print("Using best params:", params)
                
        model = GrowNetRegressorUnique(params, device=device)
        model.fit(X_train, y_train)
        results = model.evaluate(X_test, y_test)
        
        if verbose:
            print("\nRegression Report:")
            print(f"RMSE: {results['rmse']:.4f}")
            print(f"R2 Score: {results['r2']:.4f}")
            
        return {
            'model': model,
            'predictions': results['predictions'],
            'rmse': results['rmse'],
            'r2_score': results['r2'],  # Use r2_score for consistency
            'params': params,
            'skipped': False
        }
        
    except Exception as e:
        if verbose:
            print(f"Error in GrowNet regressor: {e}")
        # Return dummy predictions on error. np.shape is used so the fallback
        # cannot itself fail on inputs that have no `.shape` attribute.
        test_shape = np.shape(X_test)
        n_samples = test_shape[0] if test_shape else 0
        target_shape = np.shape(y_train)
        output_dim = target_shape[1] if len(target_shape) > 1 else 1
        dummy_preds = np.zeros((n_samples, output_dim))
        
        return {
            'model': None,
            'predictions': dummy_preds,
            'rmse': float('inf'),
            'r2_score': -float('inf'),
            'params': params,
            'skipped': True,
            'error': str(e)
        }



Using device: cuda
Using device: cuda


# LightGBM

In [6]:
class LightGBMTuner:
    """Tune, train and evaluate a multiclass ``lgb.LGBMClassifier``.

    The number of classes is taken from the unique values of ``y_train``.
    Hyper-parameter search maximises the mean 5-fold stratified CV accuracy on
    the training data; evaluation uses the stored test split.
    """

    def __init__(self, X_train, y_train, X_test, y_test, random_state=42, n_trials=20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.random_state = random_state
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.final_model = None

    def _fixed_params(self):
        """Settings shared by the default, searched and tuned configurations."""
        return {
            'objective': 'multiclass',
            'num_class': len(np.unique(self.y_train)),
            'metric': 'multi_logloss',
            # LightGBM only applies `subsample` when the bagging frequency is
            # positive; without this the subsample setting has no effect.
            'subsample_freq': 1,
            'random_state': self.random_state,
            'min_gain_to_split': 1e-3,  # Suppress split gain warning
            'verbose': -1  # Suppress LightGBM output
        }

    def default_params(self):
        """Return the untuned baseline configuration as a new dict."""
        params = {
            'learning_rate': 0.1,
            'max_depth': 6,
            'num_leaves': 31,
            'min_child_samples': 20,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'n_estimators': 300,
        }
        params.update(self._fixed_params())
        return params

    def objective(self, trial):
        """Sample one configuration and return its mean stratified 5-fold CV accuracy."""
        params = {
            'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            'max_depth': trial.suggest_int("max_depth", 3, 12),
            'num_leaves': trial.suggest_int("num_leaves", 15, 256),
            'min_child_samples': trial.suggest_int("min_child_samples", 5, 100),
            'subsample': trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
            'n_estimators': trial.suggest_int("n_estimators", 100, 400),
        }
        params.update(self._fixed_params())
        model = lgb.LGBMClassifier(**params)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        scores = cross_val_score(model, self.X_train, self.y_train, cv=skf, scoring='accuracy')
        return scores.mean()

    def tune(self):
        """Run the Optuna study and return the best parameters merged with the fixed settings."""
        # The objective reports no intermediate values, so the pruner never
        # stops a trial early; it is kept to leave the study setup unchanged.
        study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2))
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        self.best_params = study.best_params
        self.best_params.update(self._fixed_params())
        return self.best_params

    def train(self, params):
        """Fit a classifier with ``params`` on the full training data and keep it as ``final_model``."""
        model = lgb.LGBMClassifier(**params)
        model.fit(self.X_train, self.y_train)
        self.final_model = model
        return model

    def evaluate(self, model=None):
        """Print a classification report for the test split.

        Uses ``final_model`` when ``model`` is omitted. Returns
        ``(predictions, probabilities, accuracy)``.
        """
        if model is None:
            model = self.final_model
        preds = model.predict(self.X_test)
        probs = model.predict_proba(self.X_test)
        acc = accuracy_score(self.y_test, preds)
        print("\nClassification Report:")
        print(classification_report(self.y_test, preds))
        print(f"\nAccuracy: {acc:.4f}")
        return preds, probs, acc



def run_lightgbm_classifier(X_train, y_train, X_test, y_test, 
                            tune_hyperparams=False, random_state=42,
                            n_trials=20, timeout=1200, params=None, verbose=False):
    """Train a LightGBM classifier and score it on the test split.

    With ``tune_hyperparams`` the parameters come from ``LightGBMTuner.tune()``
    and ``params`` is not used; otherwise the tuner defaults are updated with
    ``params``. ``verbose`` additionally prints the chosen parameters and a
    classification report.

    Returns a dict with ``model``, ``predictions``, ``predicted_probabilities``,
    ``accuracy`` and ``params``.
    """
    tuner = LightGBMTuner(X_train, y_train, X_test, y_test, 
                          random_state=random_state, n_trials=n_trials, timeout=timeout)
    if tune_hyperparams:
        best_params = tuner.tune()
        if verbose:
            print("Using tuned parameters:", best_params)
    else:
        best_params = tuner.default_params()
        if params:
            best_params.update(params)
        if verbose:
            print("Using default (or custom) parameters:", best_params)

    model = tuner.train(best_params)
    if verbose:
        # evaluate() prints the report and returns the same three values.
        preds, probs, acc = tuner.evaluate(model)
    else:
        # Predict once and reuse the result for the accuracy.
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)
        acc = accuracy_score(y_test, preds)

    return {
        'model': model,
        'predictions': preds,
        'predicted_probabilities': probs,
        'accuracy': acc,
        'params': best_params
    }


class LightGBMRegressorTuner:
    """Tune, train and evaluate a single-output ``lgb.LGBMRegressor``.

    Hyper-parameter search maximises the mean 5-fold CV negative MAE on the
    training data; evaluation reports MAE and R2 on the stored test split.
    """

    def __init__(self, X_train, y_train, X_test, y_test,
                 random_state=42, n_trials=20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.random_state = random_state
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.final_model = None

    @staticmethod
    def _fixed_params():
        """Settings shared by the default, searched and tuned configurations."""
        return {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            # LightGBM only applies `subsample` when the bagging frequency is
            # positive; without this the subsample setting has no effect.
            'subsample_freq': 1,
        }

    def _build_model(self, params):
        """Create an ``LGBMRegressor``; the seed and quiet logging are defaults.

        Values in ``params`` take precedence, so a caller-supplied
        ``random_state`` or ``verbose`` no longer collides with the keyword
        arguments passed here.
        """
        settings = {'random_state': self.random_state, 'verbose': -1}
        settings.update(params)
        return lgb.LGBMRegressor(**settings)

    def default_params(self):
        """Return the untuned baseline configuration as a new dict."""
        params = self._fixed_params()
        params.update({
            'learning_rate': 0.1,
            'max_depth': 6,
            'min_child_samples': 20,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_lambda': 1.0,
            'reg_alpha': 0.0,
            'n_estimators': 300,
        })
        return params

    def objective(self, trial):
        """Sample one configuration and return its mean 5-fold CV negative MAE."""
        params = self._fixed_params()
        params.update({
            'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            'max_depth': trial.suggest_int("max_depth", 3, 12),
            'min_child_samples': trial.suggest_int("min_child_samples", 5, 100),
            'subsample': trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
            'n_estimators': trial.suggest_int("n_estimators", 100, 400),
        })

        model = self._build_model(params)
        kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        scores = cross_val_score(model, self.X_train, self.y_train, cv=kf, scoring='neg_mean_absolute_error')
        return np.mean(scores)

    def tune(self):
        """Run the Optuna study and return the best parameters merged with the fixed settings."""
        # The objective reports no intermediate values, so the pruner never
        # stops a trial early; it is kept to leave the study setup unchanged.
        study = optuna.create_study(direction='maximize',
                                    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2))
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        self.best_params = study.best_params
        self.best_params.update(self._fixed_params())
        return self.best_params

    def train(self, params):
        """Fit a regressor with ``params`` on the full training data and keep it as ``final_model``."""
        model = self._build_model(params)
        model.fit(self.X_train, self.y_train)
        self.final_model = model
        return model

    def evaluate(self, model=None):
        """Print MAE and R2 for the test split.

        Uses ``final_model`` when ``model`` is omitted. Returns
        ``(predictions, mae, r2)``.
        """
        if model is None:
            model = self.final_model
        preds = model.predict(self.X_test)
        mae = mean_absolute_error(self.y_test, preds)
        r2 = r2_score(self.y_test, preds)
        print("\nRegression Report:")
        print(f"MAE:  {mae:.4f}")
        print(f"R2:   {r2:.4f}")
        return preds, mae, r2


def run_lightgbm_regressor(X_train, y_train, X_test, y_test,
                          tune_hyperparams=False, random_state=42,
                          n_trials=20, timeout=1200, params=None, verbose=True):
    """Train a single-output LightGBM regressor and score it on the test split.

    Targets may be arrays, lists or pandas objects. For 2-D targets only the
    first column is modelled (a warning is printed when ``verbose`` and more
    than one column is present).

    Returns a dict with ``model``, ``predictions``, ``mae``, ``r2_score``,
    ``params`` and ``skipped``. Any exception is caught: the result then has
    ``skipped=True``, an ``error`` message, all-zero predictions and infinite
    error metrics.
    """
    
    try:
        # Convert the targets so positional slicing works for lists and pandas
        # objects as well (a no-op for ndarrays).
        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        # Handle multi-dimensional targets
        if y_train.ndim > 1:
            if y_train.shape[1] > 1 and verbose:
                print("Warning: LightGBM doesn't support multi-output regression natively. Using first dimension only.")
            # Also flattens an (n, 1) column target to the 1-D shape LightGBM expects.
            y_train = y_train[:, 0]
        if y_test.ndim > 1:
            y_test = y_test[:, 0]

        tuner = LightGBMRegressorTuner(X_train, y_train, X_test, y_test,
                                      random_state=random_state, n_trials=n_trials, timeout=timeout)

        if tune_hyperparams:
            best_params = tuner.tune()
            if verbose:
                print("Using tuned parameters:", best_params)
        else:
            best_params = tuner.default_params()
            if params:
                best_params.update(params)
            if verbose:
                print("Using default (or custom) parameters:", best_params)

        model = tuner.train(best_params)
        if verbose:
            # evaluate() prints the report and returns the same three values.
            preds, mae, r2 = tuner.evaluate(model)
        else:
            preds = model.predict(X_test)
            mae = mean_absolute_error(y_test, preds)
            r2 = r2_score(y_test, preds)

        return {
            'model': model,
            'predictions': preds,
            'mae': mae,
            'r2_score': r2,  # Use r2_score for consistency
            'params': best_params,
            'skipped': False
        }
        
    except Exception as e:
        if verbose:
            print(f"Error in LightGBM regressor: {e}")
        # Return dummy predictions on error. np.shape is used so the fallback
        # cannot itself fail on inputs that have no `.shape` attribute.
        test_shape = np.shape(X_test)
        n_samples = test_shape[0] if test_shape else 0
        dummy_preds = np.zeros(n_samples)
        
        return {
            'model': None,
            'predictions': dummy_preds,
            'mae': float('inf'),
            'r2_score': -float('inf'),
            'params': params,
            'skipped': True,
            'error': str(e)
        }


# Neural Networks

In [7]:
def default_classification_params():
    """Return a freshly built dict of default settings for the NN classifier.

    A new dictionary (with a new ``hidden_dim`` list) is created on every call.
    """
    # Network shape and regularisation.
    network = dict(
        input_dim=200,
        hidden_dim=[128, 64],
        output_dim=7,
        use_batch_norm=True,
        initial_dropout=0.3,
        final_dropout=0.8,
    )
    # Optimiser and training-loop settings.
    training = dict(
        lr=1e-3,
        weight_decay=1e-5,
        batch_size=128,
        epochs=400,
        early_stopping_steps=20,
        gradient_clip=1.0,
    )
    # Data splitting and seeding.
    splitting = dict(val_split=0.2, test_split=0.2, random_state=42)
    return {**network, **training, **splitting}


class NNClassificationTuner:
    """Optuna search over ``NNClassifier`` hyper-parameters.

    Each trial trains an ``NNClassifier`` on the training split and is scored
    by its ``class_accuracy`` on the validation split.
    """

    # Hidden-layer layouts, encoded as strings because Optuna categorical
    # choices must be None/bool/int/float/str; decoded back to lists before use.
    _HIDDEN_DIM_CHOICES = (
        "64",
        "128",
        "128-64",
        "256-128-64",
        "256-128",
        "512-256-128-64",
    )

    def __init__(self, X_train, y_train, X_val=None, y_val=None, params=None, device="cpu", n_trials=20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.params = params
        self.device = device
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.best_score = None

    @staticmethod
    def _decode_hidden_dim(value):
        """Turn an encoded layout such as ``"128-64"`` into ``[128, 64]``."""
        if isinstance(value, str):
            return [int(width) for width in value.split("-")]
        return list(value)

    def objective(self, trial):
        """Train one sampled configuration and return its validation accuracy."""
        # `params` defaults to None in the constructor; fall back to the
        # notebook defaults instead of failing on `.copy()`.
        if self.params is None:
            params = default_classification_params()
        else:
            params = self.params.copy()
        params.update({
            "hidden_dim": self._decode_hidden_dim(
                trial.suggest_categorical("hidden_dim", self._HIDDEN_DIM_CHOICES)
            ),
            "initial_dropout": trial.suggest_float("initial_dropout", 0.1, 0.3),
            "final_dropout": trial.suggest_float("final_dropout", 0.5, 0.8),
            # suggest_float(log=True) replaces the deprecated suggest_loguniform.
            "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
            "batch_size": trial.suggest_categorical("batch_size", [64, 128, 256]),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
            "gradient_clip": trial.suggest_float("gradient_clip", 0.5, 2.0),
        })

        # Train model
        model = NNClassifier(params, device=self.device)
        model.fit(self.X_train, self.y_train, X_val=self.X_val, y_val=self.y_val)
        val_metrics = model.evaluate(self.X_val, self.y_val)
        val_acc = val_metrics['class_accuracy']
        return val_acc

    def tune(self):
        """Run the study and return ``(best_params, best_score)``.

        ``best_params['hidden_dim']`` is returned as a list of layer widths.
        """
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        
        best_params = dict(study.best_params)
        if "hidden_dim" in best_params:
            # Decode the string label so callers receive a list, as before.
            best_params["hidden_dim"] = self._decode_hidden_dim(best_params["hidden_dim"])
        self.best_params = best_params
        self.best_score = study.best_value
        
        print(f"Best score: {self.best_score:.4f}")
        print(f"Best parameters: {self.best_params}")
        
        return self.best_params, self.best_score


# Default device for this cell: use the GPU whenever torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Dataset class for classification
class ClassificationTrainDataset(Dataset):
    """Map-style dataset pairing each feature row with its integer class label."""

    def __init__(self, features, n_targets):
        self.features, self.n_targets = features, n_targets

    def __len__(self):
        # One sample per row of the feature matrix.
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Features become float tensors, labels become long tensors.
        feature_row = torch.tensor(self.features[idx], dtype=torch.float)
        label = torch.tensor(self.n_targets[idx], dtype=torch.long)
        return dict(x=feature_row, n_classes=label)
   

# Neural Network
class ClassificationNeuralNetwork(nn.Module):
    """Fully connected classifier with per-layer dropout and optional batch norm.

    Every hidden layer is Linear -> (BatchNorm) -> ReLU -> Dropout; the output
    layer is a plain Linear layer that returns raw class logits.

    Parameters:
    - input_dim: Number of input features.
    - output_dim: Number of classes.
    - hidden_dim: Sequence of hidden layer widths; defaults to [128, 64].
    - use_batch_norm: Add BatchNorm1d after every hidden Linear layer.
    - initial_dropout, final_dropout: Dropout rates of the first and last
      hidden layer; rates in between are linearly interpolated.
    - random_state: Seed applied to torch and NumPy before the layers are built.
      Note that this reseeds the global random generators.
    """

    def __init__(self, input_dim, output_dim, hidden_dim = None, use_batch_norm=True,
                  initial_dropout:float = 0.2, final_dropout:float =0.7, random_state=42):
        super().__init__()

        # Copy into a fresh list: avoids a shared mutable default and lets
        # callers pass tuples as well as lists.
        hidden_dim = [128, 64] if hidden_dim is None else list(hidden_dim)

        self.input_size = input_dim
        self.hidden_dim = hidden_dim
        self.output_size = output_dim
        self.dropout_initial = initial_dropout
        self.dropout_final = final_dropout
        self.use_batch_norm = use_batch_norm

        # Set random seeds
        torch.manual_seed(random_state)
        np.random.seed(random_state)

        # Build the neural network
        self.layers = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        # One dropout rate per hidden layer, interpolated from initial to final
        dropout_rates = np.linspace(initial_dropout, final_dropout, len(hidden_dim))

        # Create the layer architecture
        layer_sizes = [input_dim] + hidden_dim + [output_dim]
        n_hidden = len(layer_sizes) - 2

        for i in range(len(layer_sizes) - 1):
            # Add the linear layers first
            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))

            # Batch norm and dropout belong to hidden layers only, not the output layer
            if i < n_hidden:
                if self.use_batch_norm:
                    self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i + 1]))
                self.dropouts.append(nn.Dropout(float(dropout_rates[i])))

    def forward(self,x):
        """
        Forward propagation through the network.

        Parameters:
        - x: Input tensor

        Returns the raw class logits (no softmax is applied).
        """
        current_input = x

        # Forward pass through the hidden layers
        for i, (layer, dropout) in enumerate(zip(self.layers[:-1], self.dropouts)):
            # Linear transformation
            z = layer(current_input)

            # Batch normalization if enabled
            if self.use_batch_norm:
                z = self.batch_norms[i](z)

            # nn.Dropout is already the identity in eval mode, so no explicit
            # training check is needed.
            current_input = dropout(F.relu(z))

        # Output layer: raw logits, no activation
        return self.layers[-1](current_input)


class NNClassifier:
    """
    Training / inference wrapper around ``ClassificationNeuralNetwork``.

    The wrapper owns the hyperparameter dict, the torch model, the balanced
    class weights used by the loss, and the best checkpoint found during
    training (lowest validation loss).

    Attributes:
    - params: hyperparameter dict (``input_dim`` / ``output_dim`` are overwritten by ``fit``)
    - device: torch device string the model and batches are moved to
    - model: the fitted network, ``None`` until ``fit`` is called
    - class_weight_tensor: balanced class weights passed to ``CrossEntropyLoss``
    - best_model_state: ``state_dict`` copy of the epoch with the lowest validation loss
    - history: per-epoch mean train / validation losses of the most recent ``fit``
    """

    def __init__(self, params=None, device="cpu"):
        if params is None:
            self.params = default_classification_params()
        else:
            self.params = params
        self.device = device
        self.model = None
        self.class_weight_tensor = None
        self.best_model_state = None
        self.history = {'train_loss': [], 'val_loss': []}
    
    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """
        Train the model with early stopping on the validation loss.

        Parameters:
        - X_train, y_train: training features and labels
          (assumes labels are integer class indices 0..K-1 -- TODO confirm against caller)
        - X_val, y_val: optional validation data; when either is missing a
          stratified ``val_split`` fraction of the training data is held out

        After training, the weights of the epoch with the lowest validation
        loss are loaded back into ``self.model``.
        """
        print("Fit the model...")

        # Update the parameters to this input
        classes = np.unique(y_train)
        self.params['input_dim'] = X_train.shape[1]
        self.params['output_dim'] = len(classes)

        # Compute the class weights (on the full training labels, before any split)
        class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
        self.class_weight_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)

        # Split if validation is not given
        if X_val is None or y_val is None:
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=self.params['val_split'],
                random_state=self.params['random_state'], stratify=y_train
            )

        # Create datasets and dataloaders
        train_dataset = ClassificationTrainDataset(X_train, y_train)
        val_dataset = ClassificationTrainDataset(X_val, y_val)

        train_loader = DataLoader(train_dataset, batch_size=self.params['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=self.params['batch_size'], shuffle=False)

        print(f"Train size {len(train_dataset)}, Val size {len(val_dataset)}")

        # Initialize model
        self.model = ClassificationNeuralNetwork(
            input_dim=self.params['input_dim'],
            output_dim=self.params['output_dim'],
            hidden_dim=self.params['hidden_dim'],
            use_batch_norm=self.params['use_batch_norm'],
            initial_dropout=self.params['initial_dropout'],
            final_dropout=self.params['final_dropout'],
            random_state=self.params['random_state']
        ).to(self.device)

        # Loss function and optimizer for classification
        criterion_classification = nn.CrossEntropyLoss(weight=self.class_weight_tensor)
        optimizer = torch.optim.Adam(params=self.model.parameters(),
                                     lr=self.params['lr'],
                                     weight_decay=self.params['weight_decay'])

        best_val_loss = float('inf')
        early_stopping_counter = 0
        # Drop any checkpoint left over from a previous fit() so it can never
        # be restored into a freshly built (possibly differently shaped) model.
        self.best_model_state = None
        self.history = {'train_loss': [], 'val_loss': []}

        print("Strarting training.....")
        for epoch in range(self.params['epochs']):
            # Training phase
            self.model.train()
            train_loss = 0.0
            train_correct = 0
            train_total = 0

            for batch in train_loader:
                features = batch['x'].to(self.device)
                targets = batch['n_classes'].to(self.device)

                optimizer.zero_grad()

                # Forward pass and loss
                preds = self.model(features)
                total_loss = criterion_classification(preds, targets)

                # Backward pass
                total_loss.backward()

                # Gradient clipping
                if self.params['gradient_clip'] > 0:
                    clip_grad_norm_(self.model.parameters(), self.params['gradient_clip'])

                optimizer.step()

                train_loss += total_loss.item()

                # Running accuracy for the epoch
                _, predicted = torch.max(preds.data, 1)
                train_total += targets.size(0)
                train_correct += (predicted == targets).sum().item()

            # Validation phase
            self.model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                for batch in val_loader:
                    features = batch['x'].to(self.device)
                    targets = batch['n_classes'].to(self.device)

                    preds = self.model(features)
                    total_loss = criterion_classification(preds, targets)

                    val_loss += total_loss.item()

                    _, predicted = torch.max(preds.data, 1)
                    val_total += targets.size(0)
                    val_correct += (predicted == targets).sum().item()

            # Calculate averages (mean of per-batch losses)
            train_loss /= len(train_loader)
            val_loss /= len(val_loader)
            train_accuracy = 100 * train_correct / train_total
            val_accuracy = 100 * val_correct / val_total

            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)

            # Progress is only logged every 10th epoch ...
            if epoch % 10 == 0:
                print(f'Epoch [{epoch}/{self.params["epochs"]}], '
                      f'Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, '
                      f'Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')

            # ... but early stopping must be evaluated on EVERY epoch, otherwise
            # the patience counter only ticks once per 10 epochs and most
            # best-so-far epochs are never checkpointed.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
                # Save best model state
                self.best_model_state = copy.deepcopy(self.model.state_dict())
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= self.params['early_stopping_steps']:
                    print(f"Early stopping at epoch {epoch}")
                    break

        print(f"Training completed. Best validation loss: {best_val_loss:.4f}")

        # Restore the best checkpoint so evaluate()/predict() use it rather
        # than the weights of whatever epoch happened to run last.
        if self.best_model_state is not None:
            self.model.load_state_dict(self.best_model_state)

    def evaluate(self, X, y):
        """
        Evaluate the fitted model on labelled data.

        Returns a dict with:
        - 'class_loss': mean of the per-batch weighted cross-entropy losses
        - 'class_accuracy': accuracy over all samples
        - 'probabilities': array of softmax probabilities, one row per sample
        - 'predictions': list of predicted class indices
        - 'targets': list of the true labels as read from the dataset
        """

        self.model.eval()
        all_preds = []
        all_targets = []
        batch_losses = []
        all_preds_prob = []

        dataset = ClassificationTrainDataset(X, y)
        loader = DataLoader(dataset, batch_size=self.params['batch_size'], shuffle=False)

        criterion = nn.CrossEntropyLoss(weight=self.class_weight_tensor)

        with torch.no_grad():
            for batch in loader:
                features = batch['x'].to(self.device)
                targets = batch['n_classes'].to(self.device)

                preds = self.model(features)
                loss = criterion(preds, targets)
                batch_losses.append(loss.item())

                probs = F.softmax(preds, dim=1).cpu().numpy()
                _, predicted = torch.max(preds, 1)
                all_preds.extend(predicted.cpu().numpy())
                all_targets.extend(targets.cpu().numpy())
                all_preds_prob.extend(probs)

        acc = accuracy_score(all_targets, all_preds)

        return {
                'class_loss': np.mean(batch_losses),
                'class_accuracy': acc,
                'probabilities': np.array(all_preds_prob),
                'predictions': all_preds,
                'targets': all_targets
            }
    
    def predict(self, X):
        """
        Make predictions on new, unlabelled data.

        Returns a dict with:
        - 'predictions': list of predicted class indices (argmax of the softmax)
        - 'probabilities': array of softmax probabilities, one row per sample
        """

        self.model.eval()
        all_preds = []
        all_preds_prob = []

        # The dataset class requires targets; zeros are placeholders that are never read here
        dummy_targets = np.zeros(X.shape[0])
        dataset = ClassificationTrainDataset(X, dummy_targets)
        loader = DataLoader(dataset, batch_size=self.params['batch_size'], shuffle=False)

        with torch.no_grad():
            for batch in loader:
                features = batch['x'].to(self.device)
                outputs = self.model(features)

                probs = F.softmax(outputs, dim=1).cpu().numpy()
                preds = np.argmax(probs, axis=1)

                all_preds.extend(preds)
                all_preds_prob.extend(probs)

        return {
            'predictions': all_preds,
            'probabilities': np.array(all_preds_prob)
        }
    

def run_nn_classifier(X_train, y_train, X_test, y_test,
                      tune_hyperparams=False, params=None,
                      n_trials=20, timeout=1200, verbose=False):
    """
    Train an ``NNClassifier`` and score it on the test split.

    Parameters:
    - X_train, y_train: training features / labels
    - X_test, y_test: held-out features / labels used for the final evaluation
    - tune_hyperparams: run an ``NNClassificationTuner`` search on an 80/20
      stratified split of the training data before the final fit
    - params: optional overrides merged on top of ``default_classification_params()``
    - n_trials, timeout: forwarded to the tuner
    - verbose: print the device, tuned params and a classification report

    Returns a dict with keys 'model', 'predictions', 'predicted_probabilities',
    'accuracy' and 'params' (the merged dict actually used).
    """
    # Device is always auto-detected here
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    if verbose:
        print(f"Running neural network classifier on device: {device}")

    # Defaults first, caller overrides on top
    merged = default_classification_params()
    if params is not None:
        merged.update(params)
    params = merged

    # Network dimensions always follow the actual data
    params['input_dim'] = X_train.shape[1]
    params['output_dim'] = len(np.unique(y_train))

    if tune_hyperparams:
        # Hold out a validation split for the search only
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
        )
        tuner = NNClassificationTuner(X_fit, y_fit, X_val, y_val,
                                      params, device=device, n_trials=n_trials, timeout=timeout)
        best_params, _ = tuner.tune()
        params.update(best_params)
        if verbose:
            print("Using best params:", params)

    # Final model is fitted on the full training data
    model = NNClassifier(params, device=device)
    model.fit(X_train, y_train)

    results = model.evaluate(X_test, y_test)
    if verbose:
        print("\nClassification Report:")
        print(classification_report(results['targets'], results['predictions']))
        print("\nAccuracy:", results['class_accuracy'])

    summary = {'model': model}
    summary['predictions'] = results['predictions']
    summary['predicted_probabilities'] = results['probabilities']
    summary['accuracy'] = results['class_accuracy']
    summary['params'] = params
    return summary



def default_regression_params():
    """
    Build a fresh dict of default hyperparameters for the regression network.

    ``input_dim`` and ``output_dim`` are placeholders: ``NNRegressor.fit``
    overwrites both from the shape of the data it is given. A new dict (and a
    new ``hidden_dim`` list) is created on every call, so callers may mutate
    the result freely.
    """
    # Network architecture
    architecture = dict(
        input_dim=200,
        hidden_dim=[128, 64],
        output_dim=3,
        use_batch_norm=True,
        initial_dropout=0.2,
        final_dropout=0.5,
    )
    # Optimisation / training-loop settings
    training = dict(
        lr=1e-3,
        weight_decay=1e-5,
        batch_size=128,
        epochs=400,
        early_stopping_steps=50,
        gradient_clip=1.0,
        val_split=0.2,
        random_state=42,
    )
    return {**architecture, **training}


class NNRegressionTuner:
    """
    Optuna hyperparameter search for ``NNRegressor``.

    Each trial samples an architecture and optimiser settings, trains an
    ``NNRegressor`` on the training data and scores it by the negated
    validation MSE (the study is created with ``direction='maximize'``).

    Parameters:
    - X_train, y_train: data each trial is trained on
    - X_val, y_val: data each trial is scored on; when either is missing a
      split is carved out of the training data inside ``objective``
    - params: base hyperparameters the sampled values are layered on top of;
      ``None`` falls back to ``default_regression_params()``
    - device: torch device string passed to ``NNRegressor``
    - n_trials, timeout: forwarded to ``study.optimize``
    """

    def __init__(self, X_train, y_train, X_val=None, y_val=None, params=None, device="cpu", n_trials=20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.params = params
        self.device = device
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.best_score = None

    def objective(self, trial):
        """Train one sampled configuration and return its negated validation MSE."""
        # Shallow-copy the base config so trials never mutate the caller's dict
        base = default_regression_params() if self.params is None else self.params
        params = dict(base)
        params.update({
            "hidden_dim": trial.suggest_categorical(
                "hidden_dim",
                [
                    [64],
                    [128],
                    [128, 64],
                    [256, 128, 64],
                    [256, 128],
                    [512, 256, 128, 64]
                ]
            ),
            "initial_dropout": trial.suggest_float("initial_dropout", 0.1, 0.3),
            "final_dropout": trial.suggest_float("final_dropout", 0.5, 0.8),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
            "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
            "batch_size": trial.suggest_categorical("batch_size", [64, 128, 256]),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
            "gradient_clip": trial.suggest_float("gradient_clip", 0.5, 2.0),
        })

        # The constructor allows X_val / y_val to be omitted; scoring needs a
        # validation set, so carve one out of the training data in that case.
        X_fit, y_fit = self.X_train, self.y_train
        X_val, y_val = self.X_val, self.y_val
        if X_val is None or y_val is None:
            X_fit, X_val, y_fit, y_val = train_test_split(
                X_fit, y_fit,
                test_size=params.get('val_split', 0.2),
                random_state=params.get('random_state', 42)
            )

        # Train model
        model = NNRegressor(params, device=self.device)
        model.fit(X_fit, y_fit, X_val=X_val, y_val=y_val)
        val_metrics = model.evaluate(X_val, y_val)

        # The study in tune() is created with direction='maximize', so return
        # the negated MSE: a lower error gives a higher score.
        return -val_metrics['mse']

    def tune(self):
        """
        Run the search and return ``(best_params, best_score)``.

        ``best_params`` holds only the sampled hyperparameters;
        ``best_score`` is the best negated validation MSE.
        """
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        
        self.best_params = study.best_params
        self.best_score = study.best_value
        
        print(f"Best score (negative MSE): {self.best_score:.4f}")
        print(f"Best parameters: {self.best_params}")
        
        return self.best_params, self.best_score


# Pick the compute device once for the code below: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

class RegressionTrainDataset(Dataset):
    """
    Map-style dataset pairing feature rows with regression targets.

    ``features`` and ``targets`` are stored as given and indexed with
    ``[idx]``; every item is converted to float tensors on access.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        # Number of samples == size of the first axis of the feature container
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{'x': feature tensor, 'y': target tensor}`` for sample ``idx``."""
        sample = {}
        sample['x'] = torch.tensor(self.features[idx], dtype=torch.float)
        sample['y'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

   

# Neural Network for Regression
class RegressionNeuralNetwork(nn.Module):
    """
    Fully connected regression network.

    Every hidden stage is linear -> (optional) batch norm -> ReLU -> dropout;
    the output layer is a plain linear layer with no activation. Dropout
    rates are spaced linearly from ``initial_dropout`` (first hidden layer) to
    ``final_dropout`` (last hidden layer).

    Parameters:
    - input_dim: number of input features
    - output_dim: number of regression outputs
    - hidden_dim: sequence of hidden layer widths (list, tuple, ...)
    - use_batch_norm: insert ``BatchNorm1d`` after every hidden linear layer
    - initial_dropout, final_dropout: end points of the dropout schedule
    - random_state: seed passed to ``torch.manual_seed`` before the layers are created
    """

    def __init__(self, input_dim, output_dim, hidden_dim=(128, 64),
                 use_batch_norm=True, initial_dropout=0.2, final_dropout=0.5, random_state=42):
        super().__init__()
        # Copy into a list: avoids a shared mutable default and lets callers
        # pass any sequence (a tuple would break the list concatenation below).
        hidden_dim = list(hidden_dim)

        self.input_size = input_dim
        self.output_size = output_dim
        self.hidden_dim = hidden_dim
        self.use_batch_norm = use_batch_norm

        # Seed torch so the weight initialisation is reproducible
        torch.manual_seed(random_state)

        self.layers = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        layer_sizes = [input_dim] + hidden_dim + [output_dim]
        dropout_rates = np.linspace(initial_dropout, final_dropout, len(hidden_dim))

        n_hidden = len(layer_sizes) - 2
        for i in range(len(layer_sizes) - 1):
            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))
            # Batch norm and dropout belong to hidden layers only, never the output layer
            if i < n_hidden:
                if use_batch_norm:
                    self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i+1]))
                # Plain Python float instead of np.float64 for the dropout probability
                self.dropouts.append(nn.Dropout(float(dropout_rates[i])))

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        for i in range(len(self.layers) - 1):
            x = self.layers[i](x)
            if self.use_batch_norm:
                x = self.batch_norms[i](x)
            x = F.relu(x)
            # nn.Dropout is a no-op in eval mode
            x = self.dropouts[i](x)
        return self.layers[-1](x)  # Output layer: no activation



class NNRegressor:
    """
    Training / inference wrapper around ``RegressionNeuralNetwork``.

    Attributes:
    - params: hyperparameter dict (``input_dim`` / ``output_dim`` are overwritten by ``fit``)
    - device: torch device string the model and batches are moved to
    - model: the fitted network, ``None`` until ``fit`` is called
    - best_model_state: ``state_dict`` copy of the epoch with the lowest validation loss
    - target_scaler: initialised to ``None`` and not used by this class
    - history: per-epoch mean train / validation losses of the most recent ``fit``
    """

    def __init__(self, params=None, device="cpu"):
        if params is None:
            self.params = default_regression_params()
        else:
            self.params = params
        self.device = device
        self.model = None
        self.best_model_state = None
        self.target_scaler = None
        self.history = {'train_loss': [], 'val_loss': []}
    
    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """
        Train the model with MSE loss and early stopping on the validation loss.

        Parameters:
        - X_train: training features; ``X_train.shape[1]`` sets ``input_dim``
        - y_train: 2-D training targets; ``y_train.shape[1]`` sets ``output_dim``
        - X_val, y_val: optional validation data; when either is missing a
          ``val_split`` fraction of the training data is held out

        After training, the weights of the epoch with the lowest validation
        loss are loaded back into ``self.model``.
        """
        print("Fitting the model...")

        # Update the parameters to this input
        self.params['input_dim'] = X_train.shape[1]
        self.params['output_dim'] = y_train.shape[1]  

        # Split if validation is not given
        if X_val is None or y_val is None:
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=self.params['val_split'],
                random_state=self.params['random_state']
            )
        
        # Create datasets and dataloaders
        train_dataset = RegressionTrainDataset(X_train, y_train)
        val_dataset = RegressionTrainDataset(X_val, y_val)

        train_loader = DataLoader(train_dataset, batch_size=self.params['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=self.params['batch_size'], shuffle=False)

        print(f"Train size {len(train_dataset)}, Val size {len(val_dataset)}")

        # Initialize model
        self.model = RegressionNeuralNetwork(
            input_dim=self.params['input_dim'],
            output_dim=self.params['output_dim'],
            hidden_dim=self.params['hidden_dim'],
            use_batch_norm=self.params['use_batch_norm'],
            initial_dropout=self.params['initial_dropout'],
            final_dropout=self.params['final_dropout'],
            random_state=self.params['random_state']
        ).to(self.device)

        # Loss function and optimizer for regression
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(
            params=self.model.parameters(),
            lr=self.params['lr'],
            weight_decay=self.params['weight_decay']
        )
        
        best_val_loss = float('inf')
        early_stopping_counter = 0
        # Drop any checkpoint left over from a previous fit() so it can never
        # be restored into a freshly built (possibly differently shaped) model.
        self.best_model_state = None
        self.history = {'train_loss': [], 'val_loss': []}

        print("Starting training...")
        for epoch in range(self.params['epochs']):
            # Training phase
            self.model.train()
            train_loss = 0.0

            for batch in train_loader:
                features = batch['x'].to(self.device)
                targets = batch['y'].to(self.device)

                # Zero optimizer
                optimizer.zero_grad()

                # Forward pass
                preds = self.model(features)

                # Calculate loss
                loss = criterion(preds, targets)

                # Backward pass
                loss.backward()

                # Gradient clipping
                if self.params['gradient_clip'] > 0:
                    clip_grad_norm_(self.model.parameters(), self.params['gradient_clip'])

                optimizer.step()

                train_loss += loss.item()

            # Validation phase
            self.model.eval()
            val_loss = 0.0

            with torch.no_grad():
                for batch in val_loader:
                    features = batch['x'].to(self.device)
                    targets = batch['y'].to(self.device)

                    preds = self.model(features)
                    loss = criterion(preds, targets)
                    val_loss += loss.item()

            # Calculate averages (mean of per-batch losses)
            train_loss /= len(train_loader)
            val_loss /= len(val_loader)

            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)

            if epoch % 10 == 0:
                print(f'Epoch [{epoch}/{self.params["epochs"]}], '
                      f'Train Loss: {train_loss:.4f}, '
                      f'Val Loss: {val_loss:.4f}')

            # Early stopping (checked every epoch)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
                # Save best model state
                self.best_model_state = copy.deepcopy(self.model.state_dict())
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= self.params['early_stopping_steps']:
                    print(f"Early stopping at epoch {epoch}")
                    break

        print(f"Training completed. Best validation loss: {best_val_loss:.4f}")

        # Load best model
        if self.best_model_state is not None:
            self.model.load_state_dict(self.best_model_state)

    def evaluate(self, X, y):
        """
        Evaluate the fitted model on labelled data.

        Returns a dict with 'mse', 'mae', 'r2', 'rmse' (computed over all
        samples with the sklearn metric functions), plus the stacked
        'predictions' and 'targets' arrays.
        """
        self.model.eval()
        all_preds = []
        all_targets = []
        losses = []

        dataset = RegressionTrainDataset(X, y)
        loader = DataLoader(dataset, batch_size=self.params['batch_size'], shuffle=False)
        criterion = nn.MSELoss()

        with torch.no_grad():
            for batch in loader:
                features = batch['x'].to(self.device)
                targets = batch['y'].to(self.device)

                preds = self.model(features)
                loss = criterion(preds, targets)
                losses.append(loss.item())

                all_preds.extend(preds.cpu().numpy())
                all_targets.extend(targets.cpu().numpy())

        # Convert back to numpy arrays
        all_preds = np.array(all_preds)
        all_targets = np.array(all_targets)

        # Calculate metrics
        mse = mean_squared_error(all_targets, all_preds)
        mae = mean_absolute_error(all_targets, all_preds)
        r2 = r2_score(all_targets, all_preds)

        return {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'rmse': np.sqrt(mse),
            'predictions': all_preds,
            'targets': all_targets
        }
    
    def predict(self, X):
        """
        Make predictions on new, unlabelled data.

        Returns an array of shape ``(n_samples, output_dim)``, or
        ``(n_samples,)`` when the model has a single output.
        """
        self.model.eval()
        batch_outputs = []
        
        # The dataset class requires targets; zeros are placeholders that are never read here
        dummy_targets = np.zeros(X.shape[0])
        dataset = RegressionTrainDataset(X, dummy_targets)
        loader = DataLoader(dataset, batch_size=self.params['batch_size'], shuffle=False)
        
        with torch.no_grad():
            for batch in loader:
                features = batch['x'].to(self.device)
                # Keep the batch axis: squeezing here would collapse a trailing
                # one-sample batch from (1, k) to (k,) and corrupt the stacked
                # result (or fail outright for a single-output model).
                outputs = self.model(features)
                batch_outputs.append(outputs.cpu().numpy())
        
        if not batch_outputs:
            return np.array([])

        all_preds = np.concatenate(batch_outputs, axis=0)

        # Single-output models return a flat vector
        if all_preds.ndim == 2 and all_preds.shape[1] == 1:
            all_preds = all_preds[:, 0]
        
        return all_preds


def run_nn_regressor(X_train, y_train, X_test, y_test, device=None,
                     tune_hyperparams=False, params=None,
                     n_trials=20, timeout=1200, verbose=True):
    """
    Train an ``NNRegressor`` and score it on the test split.

    Parameters:
    - X_train, y_train: training features / targets (1-D targets are reshaped to a column)
    - X_test, y_test: held-out features / targets used for the final evaluation
    - device: torch device string; auto-detected when ``None``
    - tune_hyperparams: run an ``NNRegressionTuner`` search on an 80/20 split
      of the training data before the final fit
    - params: optional overrides merged on top of ``default_regression_params()``
    - n_trials, timeout: forwarded to the tuner
    - verbose: print the device, tuned params, metrics and any error

    Returns a dict with 'model', 'predictions', 'mse', 'mae', 'rmse',
    'r2_score', 'params' and 'skipped'. Any exception is caught: the result
    then has ``skipped=True``, an 'error' message, all-zero predictions and
    infinite error metrics.
    """
    try:
        # Resolve the device
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        if verbose:
            print(f"Running neural network regressor on device: {device}")

        # Defaults first, caller overrides on top
        merged = default_regression_params()
        if params is not None:
            merged.update(params)
        params = merged

        # Network input width follows the actual data
        params['input_dim'] = X_train.shape[1]

        # Promote 1-D targets to a single column
        targets_are_flat = len(y_train.shape) == 1
        if targets_are_flat:
            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

        params['output_dim'] = y_train.shape[1]

        if tune_hyperparams:
            # Hold out a validation split for the search only
            X_fit, X_val, y_fit, y_val = train_test_split(
                X_train, y_train, test_size=0.2, random_state=42
            )
            tuner = NNRegressionTuner(X_fit, y_fit, X_val, y_val,
                                      params, device=device, n_trials=n_trials, timeout=timeout)
            best_params, _ = tuner.tune()
            params.update(best_params)
            if verbose:
                print("Using best params:", params)

        # Final model is fitted on the full training data
        model = NNRegressor(params, device=device)
        model.fit(X_train, y_train)

        # Score on the held-out test set
        results = model.evaluate(X_test, y_test)

        if verbose:
            print(f"\nRegression Results:")
            for label, key in (("MSE", 'mse'), ("MAE", 'mae'), ("RMSE", 'rmse'), ("R2", 'r2')):
                print(f"{label}: {results[key]:.4f}")

        summary = {'model': model, 'predictions': results['predictions']}
        for key in ('mse', 'mae', 'rmse'):
            summary[key] = results[key]
        summary['r2_score'] = results['r2']  # exposed under 'r2_score' in the result
        summary['params'] = params
        summary['skipped'] = False
        return summary

    except Exception as e:
        if verbose:
            print(f"Error in neural network regressor: {e}")

        # Placeholder predictions shaped like the expected output
        n_outputs = y_train.shape[1] if len(y_train.shape) > 1 else 1
        placeholder = np.zeros((X_test.shape[0], n_outputs))
        worst = float('inf')

        return {
            'model': None,
            'predictions': placeholder,
            'mse': worst,
            'mae': worst,
            'rmse': worst,
            'r2_score': -worst,
            'params': params,
            'skipped': True,
            'error': str(e)
        }

Using device: cuda
Using device: cuda


# TabPFN

In [8]:
def run_tabpfn_classifier(X_train, y_train, X_test, y_test, tune_hyperparams=False, max_time=60, params=None, random_state=42):
    """
    Fit a TabPFN classifier and score it on the test split.

    The run is skipped (result dict with ``skipped=True`` and a ``reason``)
    when the resolved device is CPU, when the training labels contain more
    than 30 distinct classes, or when fitting / predicting raises.

    Parameters:
    - X_train, y_train, X_test, y_test: train / test features and labels
    - tune_hyperparams: use ``AutoTabPFNClassifier`` (time-budgeted) instead of
      a plain ``TabPFNClassifier``
    - max_time: time budget passed to ``AutoTabPFNClassifier``; overridden by ``params['max_time']``
    - params: optional dict; ``'device'`` and ``'max_time'`` are read from it
    - random_state: accepted but not forwarded to any model by this function

    Returns a dict with 'model', 'predictions', 'predicted_probabilities',
    'accuracy' and 'params'; skipped runs additionally carry 'skipped' and 'reason'.
    """
    os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "1"

    # Explicit device in params wins; otherwise use CUDA when available
    if params and 'device' in params:
        device = params['device']
    elif torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    n_classes = len(np.unique(y_train))

    def _skipped(reason):
        # Common shape of every "did not run" result
        return {
            'model': None,
            'predictions': None,
            'predicted_probabilities': None,
            'accuracy': None,
            'params': params,
            'skipped': True,
            'reason': reason
        }

    # Skip if device is CPU
    if device == 'cpu':
        print("TabPFNClassifier skipped: device is CPU.")
        return _skipped('cpu')

    # Skip if too many classes
    if n_classes > 30:
        print(f"TabPFNClassifier skipped: number of classes ({n_classes}) exceeds TabPFN's limit.")
        return _skipped('too_many_classes')

    try:
        # A max_time entry in params overrides the keyword argument
        if params and 'max_time' in params:
            max_time = params['max_time']

        if tune_hyperparams:
            print(f"Using AutoTabPFN for hyperparameter tuning with max_time={max_time}...")
            estimator = AutoTabPFNClassifier(device=device, max_time=max_time)
        else:
            estimator = TabPFNClassifier(device=device, ignore_pretraining_limits=True)

        estimator.fit(X_train, y_train)
        predicted_labels = estimator.predict(X_test)
        predicted_probs = estimator.predict_proba(X_test)
        accuracy = accuracy_score(y_test, predicted_labels)

        if tune_hyperparams:
            print(f"AutoTabPFN accuracy: {accuracy:.4f}")
            print("\nAutoTabPFN Classification Report:")
            print(classification_report(y_test, predicted_labels))
            reported_params = {'max_time': max_time, 'device': device}
        else:
            print("\nTabPFN Classification Report:")
            print(classification_report(y_test, predicted_labels))
            print(f"Accuracy: {accuracy:.4f}")
            reported_params = params

        return {
            'model': estimator,
            'predictions': predicted_labels,
            'predicted_probabilities': predicted_probs,
            'accuracy': accuracy,
            'params': reported_params
        }

    except Exception as e:
        print(f"Error running TabPFN: {e}")
        return _skipped(f'error: {str(e)}')

def run_tabpfn_regressor(X_train, y_train, X_test, y_test, tune_hyperparams=False, max_time=60, params=None):
    """
    Fit one TabPFN regressor per coordinate (x, y, z) and derive lat/lon.

    Column ``i`` of ``y_train`` / ``y_test`` is used as the target for the
    i-th coordinate. The three predictions are then converted to degrees with
    ``lat = arcsin(z)`` and ``lon = arctan2(y, x)``. The predicted z is
    clamped to [-1, 1] for that conversion only, so a regression output that
    slightly overshoots the valid range yields +/-90 degrees instead of NaN;
    the raw predictions are returned unchanged.

    Parameters:
    - X_train, y_train, X_test, y_test: train / test features and 3-column targets
    - tune_hyperparams: use ``AutoTabPFNRegressor`` (time-budgeted) instead of
      a plain ``TabPFNRegressor``
    - max_time: time budget for ``AutoTabPFNRegressor``; overridden by ``params['max_time']``
    - params: optional dict; ``'device'`` and ``'max_time'`` are read from it

    Returns a dict with 'models' (per coordinate), 'predictions'
    ``(n_samples, 3)``, 'lat_lon_predictions' ``(n_samples, 2)`` in degrees and
    'metrics' (per-coordinate mse / mae / r2). When the device is CPU or an
    error occurs, the arrays are filled with NaN, 'models' / 'metrics' are
    ``None`` and the dict also carries ``skipped=True`` and a 'reason'.
    """
    os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "1"
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if params and 'device' in params:
        device = params['device']

    def _nan_result(reason):
        # NaN arrays with the shapes a successful run would have produced
        n_samples = X_test.shape[0] if hasattr(X_test, "shape") and len(X_test.shape) > 0 else 0
        return {
            'models': None,
            'predictions': np.full((n_samples, 3), np.nan),
            'lat_lon_predictions': np.full((n_samples, 2), np.nan),
            'metrics': None,
            'skipped': True,
            'reason': reason
        }

    if device == 'cpu':
        print("TabPFNRegressor skipped: device is CPU.")
        return _nan_result('cpu')

    # Extract max_time from params if provided
    if params and 'max_time' in params:
        max_time = params['max_time']

    coord_names = ['x', 'y', 'z']
    models = {}
    coord_preds = []
    metrics = {}

    try:
        # Accept any array-like targets; the column slicing below needs ndarrays
        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        for i, coord in enumerate(coord_names):
            print(f"\n----- Predicting {coord.upper()} -----")
            
            if tune_hyperparams:
                print(f"Using AutoTabPFN for hyperparameter tuning with max_time={max_time}...")
                model = AutoTabPFNRegressor(device=device, max_time=max_time)
            else:
                model = TabPFNRegressor(device=device, ignore_pretraining_limits=True)
                
            model.fit(X_train, y_train[:, i])
            y_pred = model.predict(X_test)
            coord_preds.append(y_pred)

            mse = mean_squared_error(y_test[:, i], y_pred)
            mae = mean_absolute_error(y_test[:, i], y_pred)
            r2 = r2_score(y_test[:, i], y_pred)

            print(f"{coord.upper()} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")
            metrics[coord] = {'mse': mse, 'mae': mae, 'r2': r2}
            models[coord] = model

        preds = np.stack(coord_preds, axis=1)  # Shape: [n_samples, 3]

        # arcsin is only defined on [-1, 1]; clamp z so an overshooting
        # prediction does not turn the latitude into NaN.
        z_clamped = np.clip(preds[:, 2], -1.0, 1.0)
        lat_pred_deg = np.degrees(np.arcsin(z_clamped))
        lon_pred_deg = np.degrees(np.arctan2(preds[:, 1], preds[:, 0]))

        return {
            'models': models,
            'predictions': preds,
            'lat_lon_predictions': np.stack([lat_pred_deg, lon_pred_deg], axis=1),
            'metrics': metrics
        }
        
    except Exception as e:
        print(f"Error running TabPFNRegressor: {e}")
        return _nan_result(f'error: {str(e)}')

# XGBoost

In [9]:
class XGBoostTuner:
    """Tune, train and evaluate a multi-class XGBoost classifier.

    Bundles three steps around ``xgb.XGBClassifier``: an Optuna search scored by
    5-fold stratified cross-validated accuracy on the training data, a final fit
    on the full training set, and an evaluation on the held-out test set.

    Args:
        X_train, y_train: training features and class labels. ``num_class`` is
            derived from the distinct values in ``y_train`` (labels are
            presumably already encoded as 0..K-1 -- TODO confirm with callers).
        X_test, y_test: held-out features/labels used by ``evaluate``.
        random_state: seed for the CV splitter, the trial models and (unless the
            supplied params override it) the final model.
        n_trials: maximum number of Optuna trials.
        timeout: time budget handed to ``study.optimize``.
    """

    def __init__(self, X_train, y_train, X_test, y_test, random_state=42, n_trials=20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.random_state = random_state
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.final_model = None

    def _fixed_params(self):
        """Settings shared by every configuration; these are never searched."""
        return {
            'objective': 'multi:softprob',
            'num_class': len(np.unique(self.y_train)),
            'eval_metric': 'mlogloss',
            'tree_method': 'hist',
        }

    def default_params(self):
        """Return the untuned baseline configuration as a fresh dict."""
        return {
            **self._fixed_params(),
            'learning_rate': 0.1,
            'max_depth': 6,
            'min_child_weight': 1,
            'gamma': 0,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'lambda': 1.0,
            'alpha': 0.0,
            'n_estimators': 300,
        }

    def objective(self, trial):
        """Optuna objective: mean 5-fold stratified CV accuracy for one sampled configuration."""
        params = {
            **self._fixed_params(),
            'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            'max_depth': trial.suggest_int("max_depth", 3, 12),
            'min_child_weight': trial.suggest_int("min_child_weight", 1, 10),
            'gamma': trial.suggest_float("gamma", 0, 5),
            'subsample': trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
            'lambda': trial.suggest_float("lambda", 1e-3, 10.0, log=True),
            'alpha': trial.suggest_float("alpha", 1e-3, 10.0, log=True),
            'n_estimators': trial.suggest_int("n_estimators", 100, 400),
        }

        model = xgb.XGBClassifier(**params, random_state=self.random_state, verbosity=0, use_label_encoder=False)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        scores = cross_val_score(model, self.X_train, self.y_train, cv=skf, scoring='accuracy')
        return scores.mean()

    def tune(self):
        """Run the Optuna study and return the best parameters merged with the fixed settings.

        The result is also stored on ``self.best_params``.
        """
        # NOTE: objective() never calls trial.report(), so the MedianPruner has no
        # intermediate values to act on; it is kept so the study setup is unchanged.
        study = optuna.create_study(
            direction='maximize',
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2),
        )
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        self.best_params = study.best_params
        self.best_params.update(self._fixed_params())
        return self.best_params

    def train(self, params):
        """Fit the final classifier on the full training set and remember it.

        ``self.random_state`` is used as the seed unless ``params`` already
        carries its own ``random_state``; ``params`` itself is not modified.
        """
        # Previously the seed was dropped here, so the final model did not honour
        # the random_state the tuner was configured with.
        model_params = {'random_state': self.random_state, **params}
        model = xgb.XGBClassifier(**model_params, use_label_encoder=False)
        model.fit(self.X_train, self.y_train)
        self.final_model = model
        return model

    def evaluate(self, model=None):
        """Score ``model`` (default: the last trained model) on the held-out test set.

        Prints a classification report and the accuracy.

        Returns:
            (predicted labels, predicted class probabilities, accuracy)
        """
        if model is None:
            model = self.final_model
        preds = model.predict(self.X_test)
        probs = model.predict_proba(self.X_test)
        acc = accuracy_score(self.y_test, preds)
        print("\nClassification Report:")
        print(classification_report(self.y_test, preds))
        print(f"\nAccuracy: {acc:.4f}")
        return preds, probs, acc



def run_xgboost_classifier(X_train, y_train, X_test, y_test, 
                           tune_hyperparams=False, random_state=42, 
                           n_trials=20, timeout=1200, params=None, verbose=False):
    """Train (and optionally tune) an XGBoost classifier and score it on the test split.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
        tune_hyperparams: run the Optuna search of ``XGBoostTuner`` when True;
            otherwise start from the tuner's defaults.
        random_state, n_trials, timeout: forwarded to ``XGBoostTuner``.
        params: optional overrides applied on top of the default parameters
            (ignored when ``tune_hyperparams`` is True).
        verbose: print the chosen parameters and the classification report.

    Returns:
        dict with keys ``model``, ``predictions``, ``predicted_probabilities``,
        ``accuracy`` and ``params``.
    """
    tuner = XGBoostTuner(X_train, y_train, X_test, y_test, 
                         random_state=random_state, n_trials=n_trials, timeout=timeout)

    if tune_hyperparams:
        best_params = tuner.tune()
        if verbose:
            print("Using tuned parameters:", best_params)
    else:
        best_params = tuner.default_params()
        if params:
            best_params.update(params)
        if verbose:
            print("Using default (or custom) parameters:", best_params)

    model = tuner.train(best_params)

    if verbose:
        # evaluate() prints the report in addition to computing the metrics
        preds, probs, acc = tuner.evaluate(model)
    else:
        # Predict once and reuse the labels for the accuracy
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)
        acc = accuracy_score(y_test, preds)

    return {
        'model': model,
        'predictions': preds,
        'predicted_probabilities': probs,
        'accuracy': acc,
        'params': best_params
    }





class XGBoostRegressorTuner:
    """Tune, train and evaluate a single-output XGBoost regressor.

    Bundles an Optuna search (scored by 5-fold cross-validated negative MAE on
    the training data), a final fit on the full training set, and an evaluation
    on the held-out test set.

    Args:
        X_train, y_train: training features and target.
        X_test, y_test: held-out features/target used by ``evaluate``.
        random_state: seed for the CV splitter, the trial models and (unless the
            supplied params override it) the final model.
        n_trials: maximum number of Optuna trials.
        timeout: time budget handed to ``study.optimize``.
    """

    def __init__(self, X_train, y_train, X_test, y_test,
                 random_state=42, n_trials=20, timeout=1200):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.random_state = random_state
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = None
        self.final_model = None

    @staticmethod
    def _fixed_params():
        """Settings shared by every configuration; these are never searched."""
        return {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'tree_method': 'hist',
        }

    def default_params(self):
        """Return the untuned baseline configuration as a fresh dict."""
        return {
            **self._fixed_params(),
            'learning_rate': 0.1,
            'max_depth': 6,
            'min_child_weight': 1,
            'gamma': 0,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'lambda': 1.0,
            'alpha': 0.0,
            'n_estimators': 300,
        }

    def objective(self, trial):
        """Optuna objective: mean 5-fold CV negative MAE (higher is better) for one sampled configuration."""
        params = {
            **self._fixed_params(),
            'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            'max_depth': trial.suggest_int("max_depth", 3, 12),
            'min_child_weight': trial.suggest_int("min_child_weight", 1, 10),
            'gamma': trial.suggest_float("gamma", 0, 5),
            'subsample': trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
            'lambda': trial.suggest_float("lambda", 1e-3, 10.0, log=True),
            'alpha': trial.suggest_float("alpha", 1e-3, 10.0, log=True),
            'n_estimators': trial.suggest_int("n_estimators", 100, 400),
        }

        model = xgb.XGBRegressor(**params, random_state=self.random_state, verbosity=0)
        kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        scores = cross_val_score(model, self.X_train, self.y_train, cv=kf, scoring='neg_mean_absolute_error')
        return np.mean(scores)

    def tune(self):
        """Run the Optuna study and return the best parameters merged with the fixed settings.

        The result is also stored on ``self.best_params``.
        """
        # NOTE: objective() never calls trial.report(), so the MedianPruner has no
        # intermediate values to act on; it is kept so the study setup is unchanged.
        study = optuna.create_study(direction='maximize',
                                    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2))
        study.optimize(self.objective, n_trials=self.n_trials, timeout=self.timeout)
        self.best_params = study.best_params
        self.best_params.update(self._fixed_params())
        return self.best_params

    def train(self, params):
        """Fit the final regressor on the full training set and remember it.

        ``self.random_state`` is used as the seed unless ``params`` already
        carries its own ``random_state``; ``params`` itself is not modified.
        """
        # Previously the seed was dropped here, so the final model did not honour
        # the random_state the tuner was configured with.
        model_params = {'random_state': self.random_state, **params}
        model = xgb.XGBRegressor(**model_params)
        model.fit(self.X_train, self.y_train)
        self.final_model = model
        return model

    def evaluate(self, model=None):
        """Score ``model`` (default: the last trained model) on the held-out test set.

        Prints MAE and R2.

        Returns:
            (predictions, mean absolute error, R2 score)
        """
        if model is None:
            model = self.final_model
        preds = model.predict(self.X_test)
        mae = mean_absolute_error(self.y_test, preds)
        r2 = r2_score(self.y_test, preds)
        print("\nRegression Report:")
        print(f"MAE:  {mae:.4f}")
        print(f"R2:   {r2:.4f}")
        return preds, mae, r2


def run_xgboost_regressor(X_train, y_train, X_test, y_test,
                          tune_hyperparams=False, random_state=42,
                          n_trials=20, timeout=1200, params=None, verbose=True):
    """Train (and optionally tune) a single-output XGBoost regressor; never raises.

    Args:
        X_train, y_train: training features and target. A 2-D target with more
            than one column is reduced to its first column.
        X_test, y_test: evaluation features and target (reduced the same way).
        tune_hyperparams: run the Optuna search of ``XGBoostRegressorTuner``
            when True; otherwise start from the tuner's defaults.
        random_state, n_trials, timeout: forwarded to ``XGBoostRegressorTuner``.
        params: optional overrides applied on top of the default parameters
            (ignored when ``tune_hyperparams`` is True).
        verbose: print parameters, the regression report and any error.

    Returns:
        dict with keys ``model``, ``predictions``, ``mae``, ``r2_score``,
        ``params`` and ``skipped``. If anything fails, ``skipped`` is True,
        ``predictions`` is an all-zero vector with one entry per test row,
        ``mae``/``r2_score`` are +inf/-inf and ``error`` holds the message.
    """
    
    try:
        # Accept lists as well as arrays; previously `.shape` on a list raised
        # here and the fallback below raised again on `X_test.shape`.
        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        # Handle multi-dimensional targets
        if y_train.ndim > 1 and y_train.shape[1] > 1:
            if verbose:
                print("Warning: XGBoost doesn't support multi-output regression natively. Using first dimension only.")
            y_train = y_train[:, 0]
            if y_test.ndim > 1:
                y_test = y_test[:, 0]

        tuner = XGBoostRegressorTuner(X_train, y_train, X_test, y_test,
                                      random_state=random_state, n_trials=n_trials, timeout=timeout)

        if tune_hyperparams:
            best_params = tuner.tune()
            if verbose:
                print("Using tuned parameters:", best_params)
        else:
            best_params = tuner.default_params()
            if params:
                best_params.update(params)
            if verbose:
                print("Using default (or custom) parameters:", best_params)

        model = tuner.train(best_params)

        if verbose:
            # evaluate() prints the report in addition to computing the metrics
            preds, mae, r2 = tuner.evaluate(model)
        else:
            preds = model.predict(X_test)
            mae = mean_absolute_error(y_test, preds)
            r2 = r2_score(y_test, preds)

        return {
            'model': model,
            'predictions': preds,
            'mae': mae,
            'r2_score': r2,  # Use r2_score for consistency
            'params': best_params,
            'skipped': False
        }
        
    except Exception as e:
        if verbose:
            print(f"Error in XGBoost regressor: {e}")
        # Return dummy predictions on error. np.shape works for arrays, lists and
        # DataFrames alike, so the fallback itself cannot fail on a missing `.shape`.
        test_shape = np.shape(X_test)
        n_samples = test_shape[0] if test_shape else 0
        dummy_preds = np.zeros(n_samples)
        
        return {
            'model': None,
            'predictions': dummy_preds,
            'mae': float('inf'),
            'r2_score': -float('inf'),
            'params': params,
            'skipped': True,
            'error': str(e)
        }





# Main

In [10]:
# The main ensemble model



# Load and process the dataset

# Data processing function for hierarchical model
def process_data_hierarchical(df):
    """Turn the raw sample table into features, encoded labels and coordinate targets.

    Every column that is not a location/label column is treated as a continuous
    feature. ``continent`` and ``city`` are label-encoded, and unit-sphere
    ``x``/``y``/``z`` coordinates are standard-scaled.

    Side effect: when ``df`` lacks any of ``x``/``y``/``z``, the columns
    ``latitude_rad``, ``longitude_rad``, ``x``, ``y`` and ``z`` are written into
    the caller's ``df``.

    NOTE(review): the encoders and the coordinate scaler are fitted on all rows
    of ``df``; if the train/test split happens afterwards, test rows contribute
    to the scaling statistics -- confirm this is acceptable.

    Returns:
        dict with ``x_cont``, ``y_continent``, ``y_city``, ``y_coords``,
        ``y_latitude``, ``y_longitude``, ``encoders`` (continent / city / coord),
        ``continents`` and ``cities``.
    """
    # Targets and everything derived from them must not leak into the features
    non_feature_columns = [
        'latitude', 'longitude',
        'latitude_rad', 'longitude_rad', 'x', 'y', 'z',
        'scaled_x', 'scaled_y', 'scaled_z', 'continent', 'city'
    ]
    cont_cols = [name for name in df.columns if name not in non_feature_columns]
    feature_matrix = df[cont_cols].values

    # Integer-encode both label levels of the hierarchy
    continent_encoder = LabelEncoder()
    continent_codes = continent_encoder.fit_transform(df['continent'].values)

    city_encoder = LabelEncoder()
    city_codes = city_encoder.fit_transform(df['city'].values)

    # Derive unit-sphere Cartesian coordinates unless the frame already has them
    if any(axis not in df.columns for axis in ('x', 'y', 'z')):
        df['latitude_rad'] = np.deg2rad(df['latitude'])
        df['longitude_rad'] = np.deg2rad(df['longitude'])
        cos_latitude = np.cos(df['latitude_rad'])
        df['x'] = cos_latitude * np.cos(df['longitude_rad'])
        df['y'] = cos_latitude * np.sin(df['longitude_rad'])
        df['z'] = np.sin(df['latitude_rad'])

    # Standardise the xyz targets
    coord_scaler = StandardScaler()
    scaled_xyz = coord_scaler.fit_transform(df[['x', 'y', 'z']].values)

    continents = continent_encoder.classes_
    cities = city_encoder.classes_

    print(f"Continents: {len(continents)} ({continents})")
    print(f"Cities: {len(cities)}")
    print(f"Continuous features: {len(cont_cols)}")

    fitted_transformers = {
        'continent': continent_encoder,
        'city': city_encoder,
        'coord': coord_scaler
    }
    return {
        'x_cont': feature_matrix,
        'y_continent': continent_codes,
        'y_city': city_codes,
        'y_coords': scaled_xyz,  # scaled xyz targets (used by the neural-network models)
        'y_latitude': df['latitude'].values,  # raw degrees (used by XGBoost, no scaling needed)
        'y_longitude': df['longitude'].values,  # raw degrees (used by XGBoost, no scaling needed)
        'encoders': fitted_transformers,
        'continents': continents,
        'cities': cities
    }

# Hierarchical split (stratified by continent) that also keeps track of the row indices
def hierarchical_split(X_cont, y_continent, y_city, y_coords, y_lat, y_lon, test_size=0.2, random_state=42):
    """Split every array with one shared train/test partition stratified on continent.

    A single ``StratifiedShuffleSplit`` draw (stratified on ``y_continent``)
    yields the row indices; the same indices are then applied to all inputs.

    Returns:
        dict with the keys ``X_train``/``X_test``, ``y_cont_train``/``y_cont_test``,
        ``y_city_train``/``y_city_test``, ``y_coords_train``/``y_coords_test``,
        ``y_lat_train``/``y_lat_test``, ``y_lon_train``/``y_lon_test`` plus the
        raw ``train_idx`` and ``test_idx`` arrays.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, test_idx = next(splitter.split(X_cont, y_continent))

    # (key prefix, array) pairs in the order their slices appear in the result
    named_arrays = (
        ('X', X_cont),
        ('y_cont', y_continent),
        ('y_city', y_city),
        ('y_coords', y_coords),
        ('y_lat', y_lat),
        ('y_lon', y_lon),
    )

    split = {}
    for prefix, values in named_arrays:
        split[prefix + '_train'] = values[train_idx]
        split[prefix + '_test'] = values[test_idx]

    # Expose the indices so callers can map rows back to the original table
    split['train_idx'] = train_idx
    split['test_idx'] = test_idx
    return split

# Distance between two points on the earth
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """
    Calculate the great circle distance between two points on the earth.

    Inputs may be scalars or NumPy arrays (they are combined element-wise).

    Args:
        lat1, lon1: latitude/longitude of the first point(s), in degrees.
        lat2, lon2: latitude/longitude of the second point(s), in degrees.
        radius_km: sphere radius in kilometers (default: 6371.0).

    Returns:
        Distance(s) in the same unit as ``radius_km`` (kilometers by default).
    """
    # Convert from degrees to radians
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; clamp it to the valid range first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return radius_km * c

# Converting cartesian co-ordinates values to latitude and longitude
def xyz_to_latlon(xyz_coords):
    """
    Map Cartesian (x, y, z) rows to (latitude, longitude) rows in degrees.

    Columns 0, 1 and 2 of ``xyz_coords`` are read as x, y and z. The result has
    latitude in column 0 and longitude in column 1.
    """
    x_axis, y_axis, z_axis = (xyz_coords[:, k] for k in range(3))

    # z is clamped to [-1, 1] so arcsin stays defined for slightly off-sphere inputs
    latitude_deg = np.degrees(np.arcsin(np.clip(z_axis, -1, 1)))
    longitude_deg = np.degrees(np.arctan2(y_axis, x_axis))

    return np.stack([latitude_deg, longitude_deg], axis=1)

# Plot the points on the world map for visualization
def plot_points_on_world_map(true_lat, true_long, predicted_lat, predicted_long, filename,
                             world_shapefile="/home/chandru/binp37/data/geopandas/ne_110m_admin_0_countries.shp"):
    """
    Plots true and predicted latitude and longitude on a world map.
    Args:
        true_lat: True latitude values
        true_long: True longitude values
        predicted_lat: Latitudes predicted by the model
        predicted_long: Longitudes predicted by the model
        filename: Path and the name of the file to save the plot.
        world_shapefile: Path of the country-outline file read with
            ``gpd.read_file``. The default is the original author's local path;
            pass an explicit path when running anywhere else (e.g. on Kaggle).
    Returns:
        None. The figure is written to ``filename`` and then shown.
    """
    # Base layer: country outlines
    world = gpd.read_file(world_shapefile)
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    world.plot(ax=ax, color='lightgray', edgecolor='black')
    # Plot true locations (Point takes x=longitude, y=latitude)
    geometry_true = [Point(xy) for xy in zip(true_long, true_lat)]
    geo_df_true = gpd.GeoDataFrame(geometry=geometry_true, crs=world.crs)
    geo_df_true.plot(ax=ax, marker='o', color='blue', markersize=15, label='True Locations')
    # Plot predicted locations
    geometry_predicted = [Point(xy) for xy in zip(predicted_long, predicted_lat)]
    geo_df_predicted = gpd.GeoDataFrame(geometry=geometry_predicted, crs=world.crs)
    geo_df_predicted.plot(ax=ax, marker='x', color='red', markersize=15, label='Predicted Locations')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('True vs. Predicted Locations on World Map')
    ax.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename) # Save the plot as an image
    plt.show()

# Train the ensemble models on classification tasks -> Continent and city classification
def train_hierarchical_layer(
        X_train,
        X_test,
        y_train,
        y_test,
        run_xgboost_classifier=None,
        run_grownet_classifier=None,
        run_nn_classifier=None,
        run_tabpfn_classifier=None,
        run_lightgbm_classifier=None,
        run_catboost_classifier = None,
        tune_hyperparams=False,
        apply_smote = False,
        random_state=42,
        n_splits=3,
        accuracy_threshold=0.8):
    """
    Efficient single-stage hierarchical layer:
    1. Run all models with default params in CV to filter AND generate meta-features
    2. Tune hyperparameters only for filtered models
    3. Train final ensemble

    Each ``run_*_classifier`` argument is either None (model disabled) or a
    callable invoked as ``fn(X_train, y_train, X_test, y_test,
    tune_hyperparams=..., params=..., verbose=True)`` that returns a dict with
    at least ``accuracy`` and ``predicted_probabilities`` (optionally
    ``skipped``; ``params`` is read after a tuning call).

    Args:
        X_train, X_test: feature arrays (indexed with integer index arrays).
        y_train, y_test: class labels for the two splits.
        tune_hyperparams: run Stage 2 tuning for the models that pass Stage 1.
        apply_smote: oversample each CV training fold with SMOTE.
        random_state: seed for the CV splitter, SMOTE and the meta model.
        n_splits: number of stratified CV folds in Stage 1.
        accuracy_threshold: minimum mean CV accuracy for a model to be kept.

    Returns:
        (meta_model, meta_X_train, meta_X_test, train_preds, test_preds)

    Raises:
        ValueError: if no model function is supplied, or if no model reaches
            ``accuracy_threshold``.
    """
    
    # Define all possible models with their configurations
    model_configs = {
        'xgb': {
            'name': 'XGBoost',
            'function': run_xgboost_classifier,
            'enabled': run_xgboost_classifier is not None,
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'grownet': {
            'name': 'GrowNet',
            'function': run_grownet_classifier,
            'enabled': run_grownet_classifier is not None,
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'nn': {
            'name': 'Neural Network',
            'function': run_nn_classifier,
            'enabled': run_nn_classifier is not None,
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'tabpfn': {
            'name': 'TabPFN',
            'function': run_tabpfn_classifier,
            'enabled': run_tabpfn_classifier is not None,
            'tune_params': {'max_time_options': [30, 60, 120, 180]}
        },
        'lightgbm': {
            'name': 'LightGBM',
            'function':run_lightgbm_classifier,
            'enabled':run_lightgbm_classifier is not None,
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'catboost': {
            'name': 'CatBoost',
            'function':run_catboost_classifier,
            'enabled':run_catboost_classifier is not None,
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        }
    }
    
    # Filter to only enabled models
    enabled_models = {k: v for k, v in model_configs.items() if v['enabled']}
    
    if not enabled_models:
        raise ValueError("At least one model function must be provided (not None)")
    
    print(f"Enabled models: {list(enabled_models.keys())}")
    
    # STAGE 1: Single CV loop to filter models AND generate meta-features
    print("STAGE 1: Running cross-validation to filter models and generate meta-features...")
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_train_samples = X_train.shape[0]
    n_classes = len(np.unique(y_train))

    # Track accuracies and out-of-fold predictions
    model_fold_accuracies = {model_key: [] for model_key in enabled_models.keys()}
    oof_probs = {model_key: np.zeros((n_train_samples, n_classes)) for model_key in enabled_models.keys()}
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        print(f"Processing Fold {fold+1}/{n_splits}")
        
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]
        
        if apply_smote:
            # Oversample the training part of the fold only; honour the caller's
            # seed (this was hard-coded to 42 before).
            X_fold_train, y_fold_train = SMOTE(random_state=random_state).fit_resample(X_fold_train, y_fold_train)
        
        for model_key, config in enabled_models.items():
            print(f"  Running {config['name']} on fold {fold+1}...")
            try:
                fold_result = config['function'](
                    X_fold_train, y_fold_train, X_fold_val, y_fold_val,
                    tune_hyperparams=False, params=None, verbose=True
                )
                
                if fold_result.get('skipped', False):
                    print(f"  {config['name']} was skipped on fold {fold+1}")
                    model_fold_accuracies[model_key].append(0.0)
                    oof_probs[model_key][val_idx] = np.full((len(val_idx), n_classes), 1.0/n_classes)
                else:
                    accuracy = fold_result['accuracy']
                    # Store the probabilities BEFORE recording the accuracy: if the
                    # shape is wrong this raises, and the except-branch then records
                    # the fold exactly once (previously it was recorded twice).
                    oof_probs[model_key][val_idx] = fold_result['predicted_probabilities']
                    print(f"  {config['name']} fold {fold+1} accuracy: {accuracy:.4f}")
                    model_fold_accuracies[model_key].append(accuracy)
                    
            except Exception as e:
                # A failing model counts as 0 accuracy with uninformative (uniform) probabilities
                logging.error(f"Error running {config['name']} on fold {fold+1}: {e}")
                model_fold_accuracies[model_key].append(0.0)
                oof_probs[model_key][val_idx] = np.full((len(val_idx), n_classes), 1.0/n_classes)

    # Calculate average accuracies and filter models
    model_avg_accuracies = {k: np.mean(v) for k, v in model_fold_accuracies.items()}
    passed_models = [k for k, acc in model_avg_accuracies.items() if acc >= accuracy_threshold]
    
    print(f"Model average accuracies: {model_avg_accuracies}")
    print(f"Models passing threshold ({accuracy_threshold*100:.1f}%): {passed_models}")
    
    if not passed_models:
        raise ValueError(f"No models met the accuracy threshold of {accuracy_threshold*100:.1f}%.")

    # STAGE 2: Hyperparameter tuning only for passed models
    best_params = {}
    if tune_hyperparams:
        print("STAGE 2: Tuning hyperparameters for filtered models...")
        
        # Inner hold-out carved from the training data; uses its own fixed seed (101)
        X_train_hyper, X_test_hyper, y_train_hyper, y_test_hyper = train_test_split(
            X_train, y_train, test_size=0.2, random_state=101, stratify=y_train
        )
        
        for model_key in passed_models:
            config = enabled_models[model_key]
            print(f"Tuning {config['name']} hyperparameters...")
            
            try:
                if model_key == 'tabpfn':
                    # Special handling for TabPFN
                    best_params[model_key] = _tune_tabpfn_hyperparams(
                        config['function'], X_train_hyper, y_train_hyper, 
                        X_test_hyper, y_test_hyper, config['tune_params']['max_time_options']
                    )
                else:
                    # Standard tuning for other models
                    result = config['function'](
                        X_train_hyper, y_train_hyper, X_test_hyper, y_test_hyper,
                        tune_hyperparams=True, verbose=True, **config['tune_params']
                    )
                    best_params[model_key] = result['params']
                
                print(f"Best {config['name']} params: {best_params[model_key]}")
                
            except Exception as e:
                # Fall back to the model's defaults when tuning fails
                logging.error(f"Error tuning {config['name']}: {e}")
                best_params[model_key] = None
    else:
        best_params = {model_key: None for model_key in passed_models}

    # Create meta training features from out-of-fold predictions
    meta_feature_list = [oof_probs[model_key] for model_key in passed_models]
    meta_X_train = np.hstack(meta_feature_list)
    print(f"Meta training features shape: {meta_X_train.shape}")

    # STAGE 3: Train final models on full training data
    print("STAGE 3: Training final models on full training data...")
    test_results = {}
    for model_key in passed_models:
        config = enabled_models[model_key]
        print(f"Training final {config['name']} model on full training data...")
        
        try:
            result = config['function'](
                X_train, y_train, X_test, y_test,
                tune_hyperparams=False,
                params=best_params[model_key],
                verbose=True
            )
            test_results[model_key] = result['predicted_probabilities']
            print(f"Successfully trained final {config['name']} model")
            
        except Exception as e:
            # Keep the meta-feature layout intact with uniform probabilities
            logging.error(f"Error training final {config['name']} model: {e}")
            test_results[model_key] = np.full((X_test.shape[0], n_classes), 1.0/n_classes)

    # Create meta test features
    meta_test_feature_list = [test_results[model_key] for model_key in passed_models]
    meta_X_test = np.hstack(meta_test_feature_list)
    print(f"Meta test features shape: {meta_X_test.shape}")

    # Train meta model
    print("Training meta model...")
    meta_model = xgb.XGBClassifier(objective='multi:softprob', random_state=random_state)
    meta_model.fit(meta_X_train, y_train)

    # Make predictions
    train_preds = meta_model.predict(meta_X_train)
    test_preds = meta_model.predict(meta_X_test)

    # Print summary
    print(f"\nSummary:")
    print(f"- Used models: {passed_models}")
    print(f"- Meta features: {meta_X_train.shape[1]}")
    print(f"- Meta model train accuracy: {accuracy_score(y_train, train_preds):.4f}")
    print(f"- Meta model test accuracy: {accuracy_score(y_test, test_preds):.4f}")

    return meta_model, meta_X_train, meta_X_test, train_preds, test_preds


def _tune_tabpfn_hyperparams(tabpfn_function, X_train, y_train, X_test, y_test, max_time_options):
    """Pick the ``max_time`` budget that gives TabPFN the highest accuracy.

    ``tabpfn_function`` is called once per entry of ``max_time_options`` with
    ``tune_hyperparams=True``. Runs that report ``skipped`` or raise are
    ignored (errors are logged). The first option is returned when no run
    scores above 0.0; ties keep the earlier option.

    Returns:
        dict of the form ``{'max_time': <chosen budget>}``.
    """
    chosen = {'max_time': max_time_options[0]}
    top_accuracy = 0.0

    print(f"Tuning TabPFN with max_time options: {max_time_options}")

    for max_time in max_time_options:
        try:
            outcome = tabpfn_function(
                X_train, y_train, X_test, y_test,
                tune_hyperparams=True, max_time=max_time
            )

            if not outcome.get('skipped', False):
                accuracy = outcome['accuracy']
                print(f"TabPFN with max_time={max_time}: accuracy={accuracy:.4f}")

                # Strict comparison: an equal score does not displace the earlier winner
                if accuracy > top_accuracy:
                    top_accuracy = accuracy
                    chosen['max_time'] = max_time

        except Exception as e:
            logging.error(f"Error testing TabPFN with max_time={max_time}: {e}")

    return chosen

# Train the ensemble models on regression tasks -> Co-ordinates predictions
def _fit_predict_lat_lon(config, X_fit, X_eval, y_fit_lat, y_eval_lat, y_fit_lon, y_eval_lon,
                         y_fit_coords, y_eval_coords, coord_scaler, params, verbose):
    """
    Fit one configured regressor on the ``*_fit`` data and predict [lat, lon] for ``X_eval``.

    ``config['prediction_type']`` selects the strategy:

    * ``'sequential'``: a latitude model is trained first; its predictions are
      appended as an extra feature column and a longitude model is trained on
      the augmented features.
    * ``'xyz'``: a single model predicts the scaled coordinate target, which is
      mapped back through ``coord_scaler.inverse_transform`` and ``xyz_to_latlon``.

    The model wrapper is always called with ``tune_hyperparams=False``.

    Returns:
        tuple: ``(predictions, skipped_stage)``. ``predictions`` is the stacked
        [lat, lon] array, or ``None`` when the wrapper returned a result flagged
        ``'skipped'``; in that case ``skipped_stage`` is ``'Latitude'``,
        ``'Longitude'`` or ``'XYZ'``, otherwise it is ``None``.

    Raises:
        ValueError: if ``config['prediction_type']`` is neither strategy.
    """
    run_model = config['function']
    kind = config['prediction_type']

    if kind == 'sequential':
        # Predict latitude first
        lat_result = run_model(
            X_fit, y_fit_lat, X_eval, y_eval_lat,
            tune_hyperparams=False, params=params, verbose=verbose
        )
        if lat_result.get('skipped', False):
            return None, 'Latitude'

        # NOTE(review): the fit rows get predictions from the model that was
        # (presumably) trained on those same rows, while the eval rows get
        # held-out predictions -- confirm this asymmetry is intended.
        lat_pred_fit = lat_result['model'].predict(X_fit)
        lat_pred_eval = lat_result['predictions']

        # Augment features with latitude predictions
        X_fit_aug = np.hstack([X_fit, lat_pred_fit.reshape(-1, 1)])
        X_eval_aug = np.hstack([X_eval, lat_pred_eval.reshape(-1, 1)])

        # Predict longitude
        lon_result = run_model(
            X_fit_aug, y_fit_lon, X_eval_aug, y_eval_lon,
            tune_hyperparams=False, params=params, verbose=verbose
        )
        if lon_result.get('skipped', False):
            return None, 'Longitude'

        return np.stack([lat_pred_eval, lon_result['predictions']], axis=1), None

    if kind == 'xyz':
        result = run_model(
            X_fit, y_fit_coords, X_eval, y_eval_coords,
            tune_hyperparams=False, params=params, verbose=verbose
        )
        if result.get('skipped', False):
            return None, 'XYZ'

        xyz_pred = result['predictions']
        # Ensure predictions are 2D (three coordinate columns per sample)
        if xyz_pred.ndim == 1:
            xyz_pred = xyz_pred.reshape(-1, 3)
        xyz_rescaled = coord_scaler.inverse_transform(xyz_pred)
        return xyz_to_latlon(xyz_rescaled), None

    raise ValueError(f"Unknown prediction_type: {kind!r}")


def train_hierarchical_coordinate_layer(
        X_train, X_test, y_train_lat, y_train_lon,
        y_test_lat, y_test_lon, y_train_coords,
        y_test_coords, coord_scaler,
        run_xgboost_regressor = None,
        run_grownet_regressor = None,
        run_nn_regressor = None,
        run_tabpfn_regressor = None,
        run_lightgbm_regressor = None,
        run_catboost_regressor = None,
        tune_hyperparams = False,
        random_state = 42,
        n_splits = 3
    ):
    """
    Three-stage hierarchical coordinate prediction:
    1. Cross-validate every enabled model with default params and select the
       one with the lowest average per-fold median haversine distance
    2. Optionally tune hyperparameters, only for that best model
    3. Retrain the best model on the full training set and score it on the test set

    Args:
        X_train, X_test: feature matrices; indexed with integer index arrays
            and passed to ``np.hstack``, so array-like with NumPy semantics.
        y_train_lat, y_train_lon, y_test_lat, y_test_lon: latitude / longitude targets.
        y_train_coords, y_test_coords: scaled coordinate targets used by the
            'xyz' models (three columns per sample after prediction).
        coord_scaler: object whose ``inverse_transform`` undoes the scaling of
            the coordinate targets.
        run_*_regressor: model wrappers; a model is enabled when its wrapper is
            not ``None``. Each is called as ``fn(X_tr, y_tr, X_te, y_te,
            tune_hyperparams=..., params=..., verbose=...)`` and must return a
            dict providing ``'predictions'`` (plus ``'model'`` for the
            sequential models), optionally ``'skipped'`` and ``'params'``.
        tune_hyperparams: run stage 2 when true.
        random_state: seed for the stage-1 ``KFold`` shuffling.
        n_splits: number of stage-1 folds.

    Returns:
        dict with keys ``'test_preds'`` ([lat, lon] per test sample),
        ``'test_metrics'`` (median / mean / 95th / 99th percentile distance and
        the raw distances), ``'enabled_models'`` (a one-element list holding the
        selected model key), ``'best_model'`` and ``'best_params'``.

    Raises:
        ValueError: when no model wrapper is provided, or when the final
            training run reports ``'skipped'``.
        Exception: any error raised during the final training is logged and re-raised.
    """

    model_configs = {
        'xgb':{
            'name':'XGBoost',
            'function':run_xgboost_regressor,
            'enabled': run_xgboost_regressor is not None,
            'prediction_type':'sequential',
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'grownet':{
            'name':'GrowNet',
            'function':run_grownet_regressor,
            'enabled':run_grownet_regressor is not None,
            'prediction_type':'xyz',
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'nn':{
            'name':'Neural Network',
            'function': run_nn_regressor,
            'enabled':run_nn_regressor is not None,
            'prediction_type':'xyz',
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'tabpfn':{
            'name':'TabPFN',
            'function':run_tabpfn_regressor,
            'enabled':run_tabpfn_regressor is not None,
            'prediction_type':'xyz',
            'tune_params': {'n_trials': 20, 'max_time_options': [30, 60, 120]}
        },
        'lightgbm':{
            'name':'LightGBM',
            'function':run_lightgbm_regressor,
            'enabled':run_lightgbm_regressor is not None,
            'prediction_type':'sequential',
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        },
        'catboost':{
            'name':'CatBoost',
            'function':run_catboost_regressor,
            'enabled':run_catboost_regressor is not None,
            'prediction_type':'sequential',
            'tune_params': {'n_trials': 50, 'timeout': 1800}
        }
    }

    enabled_models = {k: v for k,v in model_configs.items() if v['enabled']}

    if not enabled_models:
        raise ValueError("At least one model function must be provided (not None)")

    print(f"Enabled models: {list(enabled_models.keys())}")

    # STAGE 1: Run all models with default parameters to calculate average median distance
    print("STAGE 1: Running all models with default parameters...")

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Track average median distances across folds for each model
    model_avg_median_distances = {}

    for model_key, config in enabled_models.items():
        print(f"Running {config['name']} with default parameters...")
        fold_median_distances = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
            y_fold_train_lat, y_fold_val_lat = y_train_lat[train_idx], y_train_lat[val_idx]
            y_fold_train_lon, y_fold_val_lon = y_train_lon[train_idx], y_train_lon[val_idx]
            y_fold_train_coords, y_fold_val_coords = y_train_coords[train_idx], y_train_coords[val_idx]
            y_fold_val_combined = np.stack([y_fold_val_lat, y_fold_val_lon], axis=1)

            try:
                val_predictions, skipped_stage = _fit_predict_lat_lon(
                    config, X_fold_train, X_fold_val,
                    y_fold_train_lat, y_fold_val_lat,
                    y_fold_train_lon, y_fold_val_lon,
                    y_fold_train_coords, y_fold_val_coords,
                    coord_scaler, params=None, verbose=False
                )

                # A skipped fold counts as a failure: inf makes the model's mean inf
                if skipped_stage is not None:
                    fold_median_distances.append(float('inf'))
                    continue

                # Calculate median distance for this fold
                distances = haversine_distance(
                    y_fold_val_combined[:, 0], y_fold_val_combined[:, 1],
                    val_predictions[:, 0], val_predictions[:, 1]
                )
                fold_median = np.median(distances)
                fold_median_distances.append(fold_median)
                print(f"  {config['name']} fold {fold+1} median distance: {fold_median:.2f} km")

            except Exception as e:
                logging.error(f"Error running {config['name']} on fold {fold+1}: {e}")
                fold_median_distances.append(float('inf'))

        model_avg_median_distances[model_key] = np.mean(fold_median_distances)
        print(f"{config['name']} average median distance: {model_avg_median_distances[model_key]:.2f} km")

    # Select best model by lowest average median distance
    best_model = min(model_avg_median_distances, key=model_avg_median_distances.get)
    print(f"Best model by average median distance: {best_model}")
    print(f"Model average median distances: {model_avg_median_distances}")

    # A non-finite score means at least one fold failed or was skipped for the
    # selected model (and therefore for every enabled model); surface that
    # instead of silently continuing with an arbitrary pick.
    if not np.isfinite(model_avg_median_distances[best_model]):
        logging.warning(
            f"No enabled model completed every CV fold; continuing with '{best_model}' "
            f"although its average median distance is {model_avg_median_distances[best_model]}"
        )

    # STAGE 2: Hyperparameter tuning only for the best model
    best_params = None
    if tune_hyperparams:
        print("STAGE 2: Tuning hyperparameters for the best model...")

        # One call keeps features and both targets aligned on the same hold-out split
        (X_train_hyper, X_test_hyper,
         y_train_hyper_lat, y_test_hyper_lat,
         y_train_hyper_coords, y_test_hyper_coords) = train_test_split(
            X_train, y_train_lat, y_train_coords, test_size=0.2, random_state=101
        )

        config = enabled_models[best_model]
        print(f"Tuning {config['name']} hyperparameters...")

        try:
            tune_params = config['tune_params'].copy()
            tune_params.update({
                'tune_hyperparams': True,
                'verbose': True
            })

            if config['prediction_type'] == 'sequential':
                # For sequential models, tune on latitude prediction; the same
                # params are later reused for the longitude model
                result = config['function'](
                    X_train_hyper, y_train_hyper_lat, X_test_hyper, y_test_hyper_lat,
                    **tune_params
                )
                best_params = result.get('params')

            elif config['prediction_type'] == 'xyz':
                if best_model == 'tabpfn':
                    # Special handling for TabPFN: only the time budget is searched
                    best_params = _tune_tabpfn_regressor_hyperparams(
                        config['function'], X_train_hyper, y_train_hyper_coords,
                        X_test_hyper, y_test_hyper_coords, tune_params['max_time_options']
                    )
                else:
                    result = config['function'](
                        X_train_hyper, y_train_hyper_coords, X_test_hyper, y_test_hyper_coords,
                        **tune_params
                    )
                    best_params = result.get('params')

            print(f"Best {config['name']} params: {best_params}")

        except Exception as e:
            # Tuning is best-effort: fall back to default parameters
            logging.error(f"Error tuning {config['name']}: {e}")
            best_params = None

    # STAGE 3: Final training with tuned parameters (defaults when best_params is None)
    print("STAGE 3: Final training with tuned parameters...")

    config = enabled_models[best_model]

    try:
        test_preds, skipped_stage = _fit_predict_lat_lon(
            config, X_train, X_test,
            y_train_lat, y_test_lat,
            y_train_lon, y_test_lon,
            y_train_coords, y_test_coords,
            coord_scaler, params=best_params, verbose=True
        )

        if skipped_stage is not None:
            raise ValueError(f"{skipped_stage} prediction failed for {config['name']}")

    except Exception as e:
        logging.error(f"Error training final {config['name']} model: {e}")
        raise

    # Calculate distance metrics
    def calculate_distance_metrics(y_true, y_pred):
        """Summarise haversine distances between [lat, lon] rows of y_true and y_pred."""
        distances = haversine_distance(y_true[:, 0], y_true[:, 1], y_pred[:, 0], y_pred[:, 1])
        return {
            'median_distance': np.median(distances),
            'mean_distance': np.mean(distances),
            'percentile_95': np.percentile(distances, 95),
            'percentile_99': np.percentile(distances, 99),
            'distances': distances
        }

    y_test_combined = np.stack([y_test_lat, y_test_lon], axis=1)
    test_metrics = calculate_distance_metrics(y_test_combined, test_preds)

    print(f"\nSummary:")
    print(f"- Used model: {best_model}")
    print(f"- Test median distance: {test_metrics['median_distance']:.2f} km")
    print(f"- Test mean distance: {test_metrics['mean_distance']:.2f} km")
    print(f"- Test 95th percentile: {test_metrics['percentile_95']:.2f} km")

    return {
        'test_preds': test_preds,
        'test_metrics': test_metrics,
        'enabled_models': [best_model],
        'best_model': best_model,
        'best_params': best_params
    }

def _tune_tabpfn_regressor_hyperparams(tabpfn_function, X_train, y_train, X_test, y_test, max_time_options):
    """
    Pick the TabPFN regressor ``max_time`` budget that yields the highest R².

    Every entry of ``max_time_options`` is evaluated in order through
    ``tabpfn_function(X_train, y_train, X_test, y_test, tune_hyperparams=True,
    max_time=<option>, verbose=False)``. Results flagged ``'skipped'`` are
    ignored; a result without an ``'r2_score'`` key is scored as ``-inf``.
    Exceptions raised for an option are logged and that option is dropped.
    Only a strictly better R² replaces the current choice, so the first option
    is returned when no run produces a finite score.

    Returns:
        dict: ``{'max_time': <selected option>}``.
    """
    selection = {'max_time': max_time_options[0]}
    top_r2 = float('-inf')

    print(f"Tuning TabPFN regressor with max_time options: {max_time_options}")

    for budget in max_time_options:
        try:
            outcome = tabpfn_function(
                X_train, y_train, X_test, y_test,
                tune_hyperparams=True, max_time=budget, verbose=False
            )

            if not outcome.get('skipped', False):
                score = outcome.get('r2_score', float('-inf'))
                print(f"TabPFN regressor with max_time={budget}: R²={score:.4f}")

                # Strict comparison: ties keep the earlier option
                if score > top_r2:
                    top_r2, selection['max_time'] = score, budget

        except Exception as e:
            logging.error(f"Error testing TabPFN regressor with max_time={budget}: {e}")

    return selection

# Load the raw table and run the hierarchical preprocessing
df = pd.read_csv("/kaggle/input/metasub-data/metasub_training_testing_data.csv")
processed_data = process_data_hierarchical(df)

# Features and targets pulled out of the preprocessing result
X_cont, y_cont, y_cities, y_coords, y_latitude, y_longitude = (
    processed_data[key]
    for key in ('x_cont', 'y_continent', 'y_city', 'y_coords', 'y_latitude', 'y_longitude')
)

# Split features and every target with the same train/test partition
split_data = hierarchical_split(X_cont, y_cont, y_cities, y_coords, y_latitude, y_longitude)

# Original features
X_train_cont = split_data['X_train']
X_test_cont = split_data['X_test']

# Continent targets
y_train_cont = split_data['y_cont_train']
y_test_cont = split_data['y_cont_test']

# City targets
y_train_city = split_data['y_city_train']
y_test_city = split_data['y_city_test']

# Latitude targets
y_train_lat = split_data['y_lat_train']
y_test_lat = split_data['y_lat_test']

# Longitude targets
y_train_lon = split_data['y_lon_train']
y_test_lon = split_data['y_lon_test']

# Co-ordinate targets
y_train_coords = split_data['y_coords_train']
y_test_coords = split_data['y_coords_test']

# Continent layer: trained on the original features only.
# A base model is switched off by passing None; only TabPFN is enabled here.
continent_model, meta_X_train_cont, meta_X_test_cont, cont_train_preds, cont_test_preds = train_hierarchical_layer(
    X_train=X_train_cont,
    X_test=X_test_cont,
    y_train=y_train_cont,
    y_test=y_test_cont,
    run_xgboost_classifier=None,
    run_grownet_classifier=None,
    run_nn_classifier=None,
    run_tabpfn_classifier=run_tabpfn_classifier,
    run_lightgbm_classifier=None,
    run_catboost_classifier=None,
    tune_hyperparams=False,
    apply_smote=True,
    n_splits=5,
    accuracy_threshold=0.90  # 90% accuracy threshold for the continent layer
)

# City layer: original features plus the continent layer's meta-features
X_train_city = np.hstack((X_train_cont, meta_X_train_cont))
X_test_city = np.hstack((X_test_cont, meta_X_test_cont))

(
    city_model,
    meta_X_train_city,
    meta_X_test_city,
    city_train_preds,
    city_test_preds,
) = train_hierarchical_layer(
    X_train=X_train_city,
    X_test=X_test_city,
    y_train=y_train_city,
    y_test=y_test_city,
    # Base models used for the city layer
    run_xgboost_classifier=run_xgboost_classifier,
    run_lightgbm_classifier=run_lightgbm_classifier,
    # Base models left out of the city layer (None disables a model)
    run_grownet_classifier=None,
    run_catboost_classifier=None,
    run_nn_classifier=None,
    run_tabpfn_classifier=None,
    tune_hyperparams=False,
    apply_smote=False,
    n_splits=5,
    accuracy_threshold=0.91,  # 91% accuracy threshold for the city layer
)

# Coordinate layer: city-layer inputs plus the city layer's meta-features
X_train_coord = np.hstack((X_train_city, meta_X_train_city))
X_test_coord = np.hstack((X_test_city, meta_X_test_city))

coords_results = train_hierarchical_coordinate_layer(
    X_train=X_train_coord,
    X_test=X_test_coord,
    # Latitude / longitude targets
    y_train_lat=y_train_lat,
    y_test_lat=y_test_lat,
    y_train_lon=y_train_lon,
    y_test_lon=y_test_lon,
    # Scaled coordinate targets and the scaler that undoes the scaling
    y_train_coords=y_train_coords,
    y_test_coords=y_test_coords,
    coord_scaler=processed_data['encoders']['coord'],
    # Regressors competing for the coordinate layer
    run_xgboost_regressor=run_xgboost_regressor,
    run_tabpfn_regressor=run_tabpfn_regressor,
    # Regressors left out (None disables a model)
    run_nn_regressor=None,
    run_grownet_regressor=None,
    run_lightgbm_regressor=None,
    run_catboost_regressor=None,
    tune_hyperparams=False,
    n_splits=5,
)


# Report the metrics of every layer and persist the test predictions
save_dir = "saved_results/"
os.makedirs(save_dir, exist_ok=True)

# --- Continent layer ---
print("\nContinent Prediction - Test Set:")
continent_report = classification_report(
    y_test_cont, cont_test_preds, target_names=processed_data['continents']
)
print(continent_report)
# Test features, true continents and predicted continents
for _file_name, _values in (
    ("x_test.npy", X_test_cont),
    ("y_test_cont.npy", y_test_cont),
    ("y_pred_cont.npy", cont_test_preds),
):
    np.save(os.path.join(save_dir, _file_name), _values)

# --- City layer ---
print("\nCity Prediction - Test Set:")
city_report = classification_report(y_test_city, city_test_preds)
print(city_report)
# True and predicted cities
for _file_name, _values in (
    ("y_test_city.npy", y_test_city),
    ("y_pred_city.npy", city_test_preds),
):
    np.save(os.path.join(save_dir, _file_name), _values)

# --- Co-ordinate layer ---
coord_test_metrics = coords_results['test_metrics']
print("Coordinate prediction results:")
print(f"Test Median Distance: {coord_test_metrics['median_distance']:.2f} km")
print(f"Test Mean Distance: {coord_test_metrics['mean_distance']:.2f} km")
print(f"Test 95th Percentile: {coord_test_metrics['percentile_95']:.2f} km")

# True [lat, lon] pairs (stored as float32) and the predicted pairs
y_test_latlon = np.stack([y_test_lat, y_test_lon], axis=1).astype(np.float32)
np.save(os.path.join(save_dir, "y_test_coord.npy"), y_test_latlon)
np.save(os.path.join(save_dir, "y_pred_coord.npy"), coords_results['test_preds'])


# Error calculations
def error_calc(test_conts,pred_conts,test_city,pred_city,test_lat,pred_lat,test_lon,pred_lon,
               continent_names=None,city_names=None):
    """
    Print an error breakdown of the hierarchical predictions on one evaluation set.

    The report contains, in order: median / mean / max haversine error, the
    coordinate error grouped by whether continent (C) and city (Z) were
    predicted correctly together with the resulting expected error E[D], the
    overall in-radius accuracy, and the in-radius accuracy per true continent,
    per true city and per continent/city pair.

    Args:
        test_conts, pred_conts: true and predicted continent indices.
        test_city, pred_city: true and predicted city indices.
        test_lat, pred_lat, test_lon, pred_lon: true and predicted coordinates.
        continent_names: sequence mapping a continent index to its name.
            Defaults to ``processed_data['continents']``.
        city_names: sequence mapping a city index to its name.
            Defaults to ``processed_data['cities']``.

    Returns:
        None. Everything is printed.
    """
    # Fall back to the notebook-level lookup tables when none are supplied
    if continent_names is None:
        continent_names = processed_data['continents']
    if city_names is None:
        city_names = processed_data['cities']

    default_thresholds = (1, 5, 50, 100, 250, 500, 1000, 5000)

    def support_map(names):
        """Map every distinct name to the number of rows carrying it."""
        unique_names, counts = np.unique(names, return_counts=True)
        return dict(zip(unique_names, counts))

    def in_radius_percentages(distances, thresholds):
        """Percentage of distances that are <= each threshold (labels kept as '<r km')."""
        return {f"<{r} km": np.mean(distances <= r) * 100 for r in thresholds}

    error_df = pd.DataFrame({
        'true_cont': test_conts,
        'pred_cont': pred_conts,
        'true_city': test_city,
        'pred_city': pred_city,
        'true_lat': test_lat,
        'true_lon': test_lon,
        'pred_lat': pred_lat,
        'pred_lon': pred_lon
    })

    # Assign true and predicted continent and city names
    error_df['true_cont_name'] = error_df['true_cont'].map(lambda i: continent_names[i])
    error_df['pred_cont_name'] = error_df['pred_cont'].map(lambda i: continent_names[i])

    error_df['true_city_name'] = error_df['true_city'].map(lambda i: city_names[i])
    error_df['pred_city_name'] = error_df['pred_city'].map(lambda i: city_names[i])

    cont_support_map = support_map(error_df['true_cont_name'])
    city_support_map = support_map(error_df['true_city_name'])

    # Step 1: Compute the correctness
    error_df['continent_correct'] = error_df['true_cont'] == error_df['pred_cont']
    error_df['city_correct'] = error_df['true_city'] == error_df['pred_city']

    # Step 2: Calculate the haversine distance
    error_df['coord_error'] = haversine_distance(error_df['true_lat'],error_df['true_lon'],error_df['pred_lat'],error_df['pred_lon'])

    # Print the distance error statistics
    print(f"The median distance error is {np.median(error_df['coord_error'].values)}")
    print(f"The mean distance error is {np.mean(error_df['coord_error'].values)}")
    print(f"The max distance error is {np.max(error_df['coord_error'].values)}")

    # Step 3: Group into 4 categories (continent correct/wrong x city correct/wrong)
    cont_tag = np.where(error_df['continent_correct'], 'C_correct', 'C_wrong')
    city_tag = np.where(error_df['city_correct'], 'Z_correct', 'Z_wrong')
    error_df['error_group'] = [f"{c} {z}" for c, z in zip(cont_tag, city_tag)]

    # Step 4: Aggregate the coordinate error per category
    group_stats = error_df.groupby('error_group')['coord_error'].agg(
        count='count',
        mean_error_km='mean',
        median_error_km='median'
    )

    # Step 5: Calculate proportion and expected error.
    #
    # P(C=C*)        : probability that the predicted continent is correct
    # P(Z=Z*)        : probability that the predicted city is correct
    # E(D|condition) : expected distance error under that condition
    #
    # E(D) = P(C=C*,  Z=Z*)  * E(D|C=C*,  Z=Z*)    -> continent and city both correct
    #      + P(C=C*,  Z!=Z*) * E(D|C=C*,  Z!=Z*)   -> continent correct, city wrong
    #      + P(C!=C*, Z=Z*)  * E(D|C!=C*, Z=Z*)    -> city correct, continent wrong
    #      + P(C!=C*, Z!=Z*) * E(D|C!=C*, Z!=Z*)   -> continent and city both wrong
    total = len(error_df)
    group_stats['proportion'] = group_stats['count'] / total
    group_stats['weighted_error'] = group_stats['mean_error_km'] * group_stats['proportion']
    expected_total_error = group_stats['weighted_error'].sum()
    print(group_stats)
    print(f"Expected Coordinate Error E[D]: {expected_total_error:.2f} km")

    def compute_in_radius_metrics(y_true, y_pred, thresholds=None):
        """
        Compute % of predictions within given distance thresholds
        y_true, y_pred: numpy arrays of shape (N, 2) for [lat, lon]
        thresholds: List of distance thresholds in kilometers (default: [1, 5, 50, 100, 250, 500, 1000, 5000])
        """
        if thresholds is None:
            thresholds = default_thresholds

        distances = haversine_distance(
            y_true[:, 0], y_true[:, 1], y_pred[:, 0], y_pred[:, 1]
        )

        return in_radius_percentages(distances, thresholds)

    metrics = compute_in_radius_metrics(y_true=np.stack([test_lat,test_lon],axis=1), y_pred=np.stack([pred_lat,pred_lon],axis=1))

    print("In-Radius Accuracy Metrics:")
    for k, v in metrics.items():
        print(f"{k:>8}: {v:.2f}%")

    def in_radius_by_group(df, group_col, thresholds=None):
        """
        Compute in-radius accuracy for a group column (continent, city, or continent+city)
        Returns a DataFrame with one row per group and one column per threshold.
        """
        if thresholds is None:
            thresholds = default_thresholds

        # Work on a copy so the caller's frame is not modified
        df = df.copy()
        df['coord_error'] = haversine_distance(
            df['true_lat'].values, df['true_lon'].values,
            df['pred_lat'].values, df['pred_lon'].values
        )

        results = {
            group_name: in_radius_percentages(group_df['coord_error'].values, thresholds)
            for group_name, group_df in df.groupby(group_col)
        }

        return pd.DataFrame(results).T  # Transpose for better readability

    continent_metrics = in_radius_by_group(error_df, group_col='true_cont_name')
    print("In-Radius Accuracy per Continent")
    continent_metrics['continent_support'] = continent_metrics.index.map(cont_support_map)
    print(continent_metrics.round(2))

    city_metrics = in_radius_by_group(error_df, group_col='true_city_name')
    print("In-Radius Accuracy per City")
    city_metrics['city_support'] = city_metrics.index.map(city_support_map)
    print(city_metrics.round(2))

    error_df['continent_city'] = error_df['true_cont_name'] + " / " + error_df['true_city_name']
    cont_city_metrics = in_radius_by_group(error_df, group_col='continent_city')
    # Look the city up directly instead of parsing it back out of the label,
    # which would break for names containing "/". The value is the city's
    # support, so the column is labelled accordingly.
    pair_to_city = dict(zip(error_df['continent_city'], error_df['true_city_name']))
    cont_city_metrics['city_support'] = cont_city_metrics.index.map(pair_to_city).map(city_support_map)
    print("In-Radius Accuracy per Continent-City")
    print(cont_city_metrics.round(2))


# Run the error breakdown on the test-set predictions of all three layers
print("Starting error calculations...")
coord_test_preds = coords_results['test_preds']  # columns: [lat, lon]
error_calc(
    test_conts=y_test_cont,
    pred_conts=cont_test_preds,
    test_city=y_test_city,
    pred_city=city_test_preds,
    test_lat=y_test_lat,
    pred_lat=coord_test_preds[:, 0],
    test_lon=y_test_lon,
    pred_lon=coord_test_preds[:, 1],
)

# Plot the points on the world map
#print("Plotting points on world map...")
#plot_points_on_world_map(true_lat = y_test_lat,
#                         true_long=y_test_lon,
#                         predicted_lat=coords_results['test_preds'][:,0],
#                         predicted_long=coords_results['test_preds'][:,1],
#                         filename="test.png")

Continents: 7 (['east_asia' 'europe' 'middle_east' 'north_america' 'oceania'
 'south_america' 'sub_saharan_africa'])
Cities: 40
Continuous features: 200
Enabled models: ['tabpfn']
STAGE 1: Running cross-validation to filter models and generate meta-features...
Processing Fold 1/5
  Running TabPFN on fold 1...
Processing Fold 2/5
  Running TabPFN on fold 2...
Processing Fold 3/5
  Running TabPFN on fold 3...
Processing Fold 4/5
  Running TabPFN on fold 4...
Processing Fold 5/5
  Running TabPFN on fold 5...
Model average accuracies: {'tabpfn': 0.0}
Models passing threshold (90.0%): []


ValueError: No models met the accuracy threshold of 90.0%.