**Pipeline** for running TabPFN and AutoGluon together and combining them into an ensemble. Made to run in Colab or StudioLab.

In [None]:
### installments

!pip install autogluon
!sudo apt-get install graphviz graphviz-dev
!pip install pygraphviz
!pip install tabpfn
!pip install sklearn
!pip install numpy

In [None]:
# All imports

import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from autogluon.tabular import TabularDataset, TabularPredictor
from IPython.display import Image, display
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier


In [None]:
# Connect to Google Drive for data access when running on Colab;
# otherwise fall back to the local data folder.

try:
    from google.colab import drive
except ImportError:
    # Not on Colab (e.g. StudioLab or a local checkout): the google.colab
    # module does not exist there, so read the folds from the local path.
    base_path = '../../data/361092'
else:
    drive.mount('/content/drive')
    base_path = '/content/drive/My Drive/361092'

In [None]:
# Seed shared by the cells below: it is the default for load_fold's random_seed
# argument and is passed explicitly to fit_tabpfn.
random_seed = 42

In [None]:
## Function for loading one of the 10 folds of the property dataset and concatenating the X and y values for train and test respectively.

#base_path = '../../data/361092' # Use this when running locally

def load_fold(fold_number, random_seed=random_seed, sample_size=None, concat_test=False):
    """
    Load one fold of the dataset from ``base_path`` as AutoGluon TabularDatasets.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` parquet files from
    ``{base_path}/{fold_number}/``.

    :param fold_number: Name of the fold sub-folder under ``base_path``
    :param random_seed: Seed used when ``sample_size`` triggers subsampling
    :param sample_size: If given, the training frame is randomly subsampled to at
        most this many rows; the test data is always returned in full
    :param concat_test: If True, the test features and target are joined into one frame
    :return: ``(train, test)`` when ``concat_test`` is True, otherwise
        ``(train, test_X, test_y)``; ``train`` always contains features and target
    """
    fold_dir = f'{base_path}/{fold_number}'
    df_X_train = pd.read_parquet(f'{fold_dir}/X_train.parquet')
    df_y_train = pd.read_parquet(f'{fold_dir}/y_train.parquet')
    df_X_test = pd.read_parquet(f'{fold_dir}/X_test.parquet')
    df_y_test = pd.read_parquet(f'{fold_dir}/y_test.parquet')

    # Features and target side by side in a single training frame.
    df_train = pd.concat([df_X_train, df_y_train], axis=1)

    # Previously sample_size and random_seed were accepted but ignored.
    if sample_size is not None:
        df_train = df_train.sample(n=min(sample_size, len(df_train)), random_state=random_seed)

    train_dataset = TabularDataset(df_train)

    if concat_test:
        df_test = pd.concat([df_X_test, df_y_test], axis=1)
        return train_dataset, TabularDataset(df_test)

    return train_dataset, TabularDataset(df_X_test), TabularDataset(df_y_test)

# Name of the target (label) column; read as a global by fit_gluon and evaluate_gluon.
label_property = 'oz252'


In [None]:
## Function to fit the model using AutoGluon

def fit_gluon(
    train_dataset,
    problem_type='regression',
    hyperparameters=None,
    eval_metric='r2',
    presets='medium_quality',
    time_limit=100,
    fit_weighted_ensemble=None,
    num_cpus=None,
    num_gpus=None,
    auto_stack=None,
    num_bag_folds=None,
    num_bag_sets=None,
    num_stack_levels=None,
    num_trials=None,
    verbosity=None,
    ag_args_fit=None,
    feature_prune=None,
    excluded_model_types=None,
    keep_only_best=None,
):
    """
    Create a TabularPredictor for the ``label_property`` column and fit it.

    ``train_dataset``, ``presets`` and ``time_limit`` are always forwarded to
    ``fit``. Every other tuning argument is forwarded only when it is not None,
    so that the library's own default applies otherwise.

    :param train_dataset: Training data, passed to ``fit`` as ``train_data``
    :param problem_type: Passed to the TabularPredictor constructor
    :param eval_metric: Passed to the TabularPredictor constructor
    :return: The fitted predictor
    """
    # Arguments that are only passed on when the caller set them explicitly.
    optional_settings = {
        'hyperparameters': hyperparameters,
        'auto_stack': auto_stack,
        'num_bag_folds': num_bag_folds,
        'num_bag_sets': num_bag_sets,
        'num_stack_levels': num_stack_levels,
        'num_trials': num_trials,
        'verbosity': verbosity,
        'ag_args_fit': ag_args_fit,
        'feature_prune': feature_prune,
        'excluded_model_types': excluded_model_types,
        'fit_weighted_ensemble': fit_weighted_ensemble,
        'num_cpus': num_cpus,
        'num_gpus': num_gpus,
        'keep_only_best': keep_only_best,
    }

    predictor = TabularPredictor(label=label_property, problem_type=problem_type, eval_metric=eval_metric)

    fit_args = dict(train_data=train_dataset, presets=presets, time_limit=time_limit)
    for option_name, option_value in optional_settings.items():
        if option_value is not None:
            fit_args[option_name] = option_value

    predictor.fit(**fit_args)
    return predictor



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## Function to evaluate the predictor

def evaluate_gluon(model, test_dataset):
    """
    Score a fitted predictor on a test frame that includes the target column.

    :param model: Fitted predictor
    :param test_dataset: Test data containing the ``label_property`` column
    :return: Tuple of (scores from ``evaluate_predictions``, leaderboard computed
        on the test data with ``only_pareto_frontier=True``, IPython ``Image`` of
        the ensemble diagram written by ``plot_ensemble_model``)
    """
    leaderboard = model.leaderboard(test_dataset, only_pareto_frontier=True)

    # Split the frame into target and features before predicting.
    true_values = test_dataset[label_property]
    features = test_dataset.drop(columns=[label_property])
    predictions = model.predict(features)
    test_score = model.evaluate_predictions(y_true=true_values, y_pred=predictions)

    # plot_ensemble_model returns the path of the PNG it wrote.
    L2_diagram = Image(filename=model.plot_ensemble_model())

    return test_score, leaderboard, L2_diagram

In [None]:
## Build one training set and one test set from all 10 folds

# NOTE(review): if these are ordinary cross-validation folds, rows in one fold's
# test split also appear in other folds' training splits, so full_train and
# full_test would overlap — TODO confirm this is intended.

train_parts = []
test_parts = []

for fold_number in range(1, 11):
    # concat_test=True makes load_fold return exactly two frames (train, test);
    # the default returns three and cannot be unpacked into two names.
    train_dataset, test_dataset = load_fold(fold_number, random_seed=random_seed, concat_test=True)
    train_parts.append(train_dataset)
    test_parts.append(test_dataset)

# Concatenate once instead of growing the frames inside the loop. ignore_index
# gives every row a unique label, which keeps later column-wise joins unambiguous.
full_train = pd.concat(train_parts, ignore_index=True)
full_test = pd.concat(test_parts, ignore_index=True)

In [None]:
## Function to turn the target variable into 10 equally distributed classes


def create_interval_classes(df, target_column, n_intervals=10):
    """
    Create equally populated interval classes for the target variable.

    The input frame is not modified; a copy with the extra column is returned.
    When the target has many tied values, several percentile boundaries can
    coincide. Duplicate boundaries are merged, so fewer than ``n_intervals``
    classes may be produced in that case.

    :param df: DataFrame containing the target variable
    :param target_column: Name of the target variable column
    :param n_intervals: Number of intervals to create (default 10)
    :return: Copy of ``df`` with a new 'interval_class' column, and the interval boundaries used
    :raises ValueError: If fewer than two distinct boundaries can be derived
    """
    target_values = df[target_column].to_numpy()

    # Percentile-based edges give (roughly) the same number of rows per interval.
    percentiles = np.linspace(0, 100, n_intervals + 1)
    # pd.cut rejects repeated bin edges, so collapse duplicates first.
    interval_boundaries = np.unique(np.percentile(target_values, percentiles))

    if len(interval_boundaries) < 2:
        raise ValueError(
            f"Cannot build intervals for '{target_column}': fewer than two distinct boundaries"
        )

    # One label per remaining interval.
    interval_labels = [f'Interval_{i+1}' for i in range(len(interval_boundaries) - 1)]

    # Copy so the caller's frame does not silently gain a target-derived column.
    df = df.copy()
    df['interval_class'] = pd.cut(df[target_column],
                                  bins=interval_boundaries,
                                  labels=interval_labels,
                                  include_lowest=True)

    return df, interval_boundaries

In [None]:
## Create interval classes for the full dataset

# Pass a copy: create_interval_classes adds an 'interval_class' column to the
# frame it is given, and full_train must not receive that target-derived column
# because it is later used to build the AutoGluon training data.
full_train_w_interval, boundaries = create_interval_classes(full_train.copy(), label_property)
# Drop the original target so only the features and the interval class remain.
train_for_tabpfn = full_train_w_interval.drop(columns=[label_property])

In [None]:
## Function to fit the model using TabPFN

def fit_tabpfn(data=full_train_w_interval, n=1000, device='cpu', N_ensemble_configurations=None, random_seed=random_seed):
    """
    Fit a TabPFN classifier on a random sample of ``data``.

    :param data: Frame with the feature columns and an 'interval_class' column;
        the ``label_property`` column is dropped from the features if present
    :param n: Number of rows to sample for fitting (capped at ``len(data)``)
    :param device: Device passed to TabPFNClassifier
    :param N_ensemble_configurations: Passed to TabPFNClassifier when not None
        # NOTE(review): this keyword presumably exists only in older TabPFN
        # releases — confirm against the installed version
    :param random_seed: Seed for the row sampling
    :return: The fitted classifier
    """
    # Forward the caller's settings instead of hard-coded values.
    classifier_kwargs = {'device': device}
    if N_ensemble_configurations is not None:
        classifier_kwargs['N_ensemble_configurations'] = N_ensemble_configurations
    classifier = TabPFNClassifier(**classifier_kwargs)

    # Sample from the frame that was passed in, never asking for more rows than exist.
    sample = data.sample(n=min(n, len(data)), random_state=random_seed)

    y_train = sample['interval_class']
    # The interval class is the label here, and the raw target it was derived
    # from must not be used as a feature either.
    X_train = (
        sample
        .drop(columns=['interval_class'])
        .drop(columns=[label_property], errors='ignore')
    )

    classifier.fit(X_train, y_train)

    return classifier


In [None]:
## Function to predict using the TabPFN classifier and add the prediction as a new column in the data

def predict_tabpfn(classifier, data):
    """
    Predict with ``classifier`` and store the result in ``data['interval_class_pred']``.

    ``data`` is modified in place and also returned. An 'interval_class_pred'
    column left over from an earlier call is excluded from the features handed
    to the classifier, so the cell can be re-run on the same frame.

    :param classifier: Fitted classifier exposing ``predict``
    :param data: DataFrame of feature columns
    :return: The same ``data`` object with the 'interval_class_pred' column set
    """
    # drop() returns a new frame, so `data` itself is untouched until the assignment.
    features = data.drop(columns=['interval_class_pred'], errors='ignore')
    data['interval_class_pred'] = classifier.predict(features)

    return data


In [None]:
## Fit the TabPFN model, predict the interval class for the training data (without the original target) and for the test data, and add that prediction as a feature to both sets.

# NOTE(review): the device string is presumably handed to torch, which knows
# 'cuda' but not 'gpu'; use device='cpu' on a runtime without a GPU.
classifier = fit_tabpfn(data=full_train_w_interval, n=1000, device='cuda', N_ensemble_configurations=None, random_seed=random_seed)

# Predict on feature columns only: the true interval class (train) and the
# target column (test) are not features.
train_features = train_for_tabpfn.drop(columns=['interval_class'])
tabpfn_preicted_train = predict_tabpfn(classifier, train_features)

test_features = (
    full_test
    .drop(columns=[label_property])
    .drop(columns=['interval_class_pred'], errors='ignore')  # left over when re-running the cell
)
tabpfn_preicted_test = predict_tabpfn(classifier, test_features)
# Later cells evaluate on full_test itself, so it needs the new feature as well.
# Assign by position to avoid any dependence on row labels.
full_test['interval_class_pred'] = tabpfn_preicted_test['interval_class_pred'].to_numpy()

# Training frame for AutoGluon: original columns plus the TabPFN prediction.
# The true 'interval_class' is derived from the target and must not be a feature.
train_with_tab_pred = (
    full_train
    .drop(columns=['interval_class'], errors='ignore')
    .assign(interval_class_pred=tabpfn_preicted_train['interval_class_pred'].to_numpy())
)

In [None]:
## Evaluate the tabpfn classifier...

In [None]:
## Training the AutoGluon predictor

# Fit on the training frame that carries the TabPFN interval prediction as an extra column.
model = fit_gluon(train_with_tab_pred, time_limit=100)
# Get the leaderboard
# NOTE(review): `extra_info` looks like a boolean flag in the AutoGluon API, so the
# list is presumably only treated as truthy — verify whether `extra_metrics` was intended.
leaderboard = model.leaderboard(extra_info=['r2'])

In [None]:
# Evaluate the model

test_score, leaderboard, L2_diagram = evaluate_gluon(model, full_test)

# Surface the results that evaluate_gluon computed.
print(f'Test score: {test_score}')
display(L2_diagram)

# Plot from a copy so the leaderboard itself stays untouched.
df = leaderboard.copy()

# Set the style of the plots
sns.set(style="whitegrid")

# Bar plot of the validation score (R2) per model, using the explicit
# figure/axes interface rather than pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='model', y='score_val', data=df, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Validation Score (R2) by Model')
ax.set_xlabel('Model')
ax.set_ylabel('Validation Score (R2)')
fig.tight_layout()
plt.show()


In [None]:
## Make different predictors for an eventual ensemble (maybe different metrics, hyperparameters, etc.)

# Starts out empty; nothing in the visible cells appends to this list.
predictors = []

In [None]:
## Function to load predictors

def load_predictors(start_time, end_time, folder='AutogluonModels'):
    """
    Load AutoGluon predictors created within a specified time interval.

    Entries in ``folder`` are expected to be named ``ag-<YYYYMMDD_HHMMSS>[-...]``.
    Entries that do not start with ``ag-`` or whose timestamp part cannot be
    parsed are skipped. Predictors are loaded in sorted name order.

    :param start_time: Start of the time interval (str in format 'YYYYMMDD_HHMMSS')
    :param end_time: End of the time interval (str in format 'YYYYMMDD_HHMMSS')
    :param folder: Folder containing the AutoGluon models
    :return: List of loaded predictors (empty if none fall inside the interval)
    :raises ValueError: If ``start_time`` or ``end_time`` is not in the expected format
    """
    time_format = '%Y%m%d_%H%M%S'
    start_datetime = datetime.strptime(start_time, time_format)
    end_datetime = datetime.strptime(end_time, time_format)

    predictors = []

    # os.listdir order is arbitrary; sorting makes the result order reproducible.
    for item in sorted(os.listdir(folder)):
        if not item.startswith('ag-'):
            continue

        try:
            model_time = datetime.strptime(item.split('-')[1], time_format)
        except ValueError:
            # Not a timestamped model folder; ignore it rather than abort the scan.
            continue

        if start_datetime <= model_time <= end_datetime:
            predictors.append(TabularPredictor.load(os.path.join(folder, item)))

    return predictors

In [None]:
## Simple averaging ensemble

# Time window (folder-name timestamps) of the predictors to combine.
start_time = '20240704_124200'
end_time = '20240704_125900'

loaded_predictors = load_predictors(start_time, end_time)
if not loaded_predictors:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f'No AutoGluon predictors found between {start_time} and {end_time}')

y_test = full_test[label_property]
full_test_X = full_test.drop(columns=[label_property])

# Unweighted mean of the individual predictors' predictions.
avg_ensemble = sum(predictor.predict(full_test_X) for predictor in loaded_predictors)
ten_fold_ensemble = avg_ensemble / len(loaded_predictors)

# Calculate R2 score
r2 = r2_score(y_test, ten_fold_ensemble)

print(f'R2 score: {r2}')

In [None]:
## creating more complex ensemble...