### This notebook loads the property dataset and runs **AutoGluon** on it. A basic model is fitted first, then different configurations are tried, and finally possible **third-level ensembling** of the models is explored.

In [None]:
### Install dependencies

# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install autogluon
# -y answers apt's confirmation prompt so the cell can run unattended.
!sudo apt-get install -y graphviz graphviz-dev
%pip install pygraphviz
%pip install tabpfn

In [9]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [7]:
random_seed = 42

In [2]:
# Mount Google Drive only when the Colab helper package is importable, so that
# this cell does not abort a run outside Colab.
try:
    from google.colab import drive
except ImportError:
    drive = None  # google.colab could not be imported: not running on Colab

if drive is not None:
    drive.mount('/content/drive')
    base_path = '/content/drive/My Drive/361092'


Mounted at /content/drive


In [2]:
## Function for loading one of the 10 folds of the property dataset and concatenating the X and y values for train and test respectively.
import pandas as pd

import sys

# Pick the data folder for the current environment instead of always
# overwriting it with the local path: the Drive folder when the google.colab
# package is loaded, the repository-relative folder otherwise.
if 'google.colab' in sys.modules:
    base_path = '/content/drive/My Drive/361092'
else:
    base_path = '../../data/361092'

def load_fold(fold_number, random_seed=42, sample_size=None):
    """Load one fold of the property dataset as AutoGluon TabularDatasets.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` parquet files from
    ``{base_path}/{fold_number}/`` and joins features and target column-wise.

    :param fold_number: Name of the fold sub-folder under ``base_path``
    :param random_seed: Seed passed to ``sample`` when ``sample_size`` is given
    :param sample_size: If truthy, number of rows to sample from each split.
        A split with fewer rows than ``sample_size`` is returned in full
        (shuffled) instead of raising.
    :return: Tuple ``(train_dataset, test_dataset)``
    """
    fold_dir = f'{base_path}/{fold_number}'
    df_X_train = pd.read_parquet(f'{fold_dir}/X_train.parquet')
    df_y_train = pd.read_parquet(f'{fold_dir}/y_train.parquet')
    df_X_test = pd.read_parquet(f'{fold_dir}/X_test.parquet')
    df_y_test = pd.read_parquet(f'{fold_dir}/y_test.parquet')

    # Concatenate the X and y values column-wise and convert to TabularDataset
    train_dataset = TabularDataset(pd.concat([df_X_train, df_y_train], axis=1))
    test_dataset = TabularDataset(pd.concat([df_X_test, df_y_test], axis=1))

    if sample_size:
        # The same sample_size is used for both splits; cap it per split because
        # sample() raises when asked for more rows than are available.
        train_dataset = train_dataset.sample(
            n=min(sample_size, len(train_dataset)), random_state=random_seed
        )
        test_dataset = test_dataset.sample(
            n=min(sample_size, len(test_dataset)), random_state=random_seed
        )

    return train_dataset, test_dataset

# Also instantiate the target column
label_property = 'oz252'


In [3]:
## Function to fit the model, with most of the hyperparameters present and set to default/None. (Add more hyperparameters if desirable)

from autogluon.tabular import TabularPredictor

def fit_gluon(
    train_dataset,
    problem_type='regression',
    hyperparameters=None,
    eval_metric='r2',
    presets='medium_quality',
    time_limit=100,
    fit_weighted_ensemble=None,
    num_cpus = None,
    num_gpus=None,
    auto_stack=None,
    num_bag_folds=None,
    num_bag_sets=None,
    num_stack_levels=None,
    num_trials=None,
    verbosity=None,
    ag_args_fit=None,
    feature_prune=None,
    excluded_model_types=None,
    keep_only_best=None,
):
    """Create a TabularPredictor for ``label_property`` and fit it.

    ``train_dataset``, ``presets`` and ``time_limit`` are always forwarded to
    ``fit``. Every other tuning argument is forwarded only when the caller
    passed something other than None, so unset options are left out of the
    ``fit`` call entirely.

    :return: The predictor, after ``fit`` has been called on it
    """
    # Optional settings, listed in the order they are forwarded to fit().
    optional_settings = {
        'hyperparameters': hyperparameters,
        'auto_stack': auto_stack,
        'num_bag_folds': num_bag_folds,
        'num_bag_sets': num_bag_sets,
        'num_stack_levels': num_stack_levels,
        'num_trials': num_trials,
        'verbosity': verbosity,
        'ag_args_fit': ag_args_fit,
        'feature_prune': feature_prune,
        'excluded_model_types': excluded_model_types,
        'fit_weighted_ensemble': fit_weighted_ensemble,
        'num_cpus': num_cpus,
        'num_gpus': num_gpus,
        'keep_only_best': keep_only_best,
    }

    fit_args = {'train_data': train_dataset, 'presets': presets, 'time_limit': time_limit}
    # Compare against None explicitly: falsy values such as 0 or False are real settings.
    fit_args.update(
        {name: value for name, value in optional_settings.items() if value is not None}
    )

    predictor = TabularPredictor(
        label=label_property, problem_type=problem_type, eval_metric=eval_metric
    )
    predictor.fit(**fit_args)
    return predictor



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
## Function to evaluate a fitted model on a test set.
from IPython.display import Image, display

def evaluate_gluon(model, test_dataset):
    """Score a fitted predictor on ``test_dataset``.

    :param model: Object exposing ``leaderboard``, ``predict`` and
        ``evaluate_predictions``
    :param test_dataset: Frame containing the features and the
        ``label_property`` column
    :return: Tuple ``(test_score, leaderboard)`` where ``leaderboard`` is
        requested with ``only_pareto_frontier=True`` and ``test_score`` is
        the result of ``evaluate_predictions`` on the model's predictions
    """
    board = model.leaderboard(test_dataset, only_pareto_frontier=True)

    # Predict from the features only; the target column is held back for scoring.
    features = test_dataset.drop(columns=[label_property])
    predictions = model.predict(features)
    score = model.evaluate_predictions(
        y_true=test_dataset[label_property], y_pred=predictions
    )

    # The ensemble diagram (model.plot_ensemble_model()) is not produced here.
    return score, board

In [None]:
## First train the model separately on each of the 10 folds to check that the results are consistent

for fold_number in range(1, 11): # iterates over folds 1 through 10
    # Each iteration rebinds train_dataset, test_dataset, model, test_score and
    # leaderboard; after the loop they hold the values from the last fold.
    train_dataset, test_dataset = load_fold(fold_number, random_seed=random_seed)
    model = fit_gluon(train_dataset, time_limit=100, verbosity=1, keep_only_best=True)
    test_score, leaderboard = evaluate_gluon(model, test_dataset)
    print(f'Fold {fold_number}\n\n: {test_score} \n\n')
    print(leaderboard.head())
    #print(model.fit_summary())




In [None]:
## Train separately on each of the 10 folds again, this time with time_limit=300
## and the 'medium_quality' + 'optimize_for_deployment' presets

for fold_number in range(1, 11): # iterates over folds 1 through 10
    # Each iteration rebinds train_dataset, test_dataset, model, test_score and
    # leaderboard; after the loop they hold the values from the last fold.
    train_dataset, test_dataset = load_fold(fold_number, random_seed=random_seed)
    model = fit_gluon(train_dataset, time_limit=300, presets=['medium_quality', 'optimize_for_deployment'], verbosity=1)
    test_score, leaderboard = evaluate_gluon(model, test_dataset)
    print(f'Fold {fold_number}\n\n: {test_score} \n\n')
    print(leaderboard.head())
    #print(model.fit_summary())




In [21]:
## Making sets of all 10 folds

# Collect every fold first and concatenate once at the end, instead of
# re-copying the growing frame with pd.concat on every iteration.
fold_trains = []
fold_tests = []

for fold_number in range(1, 11):
    train_dataset, test_dataset = load_fold(fold_number, random_seed=random_seed)
    fold_trains.append(train_dataset)
    fold_tests.append(test_dataset)

# NOTE(review): if the folds are cross-validation splits of one dataset, the
# same rows appear in several train splits and in another fold's test split,
# so full_train would contain duplicates and overlap full_test - confirm.
full_train = pd.concat(fold_trains)
full_test = pd.concat(fold_tests)

In [15]:
from datetime import datetime, timedelta
import os

def load_predictors(start_time, end_time, folder='AutogluonModels'):
    """
    Load AutoGluon predictors created within a specified time interval.

    Entries of ``folder`` are matched by name: only names starting with
    ``ag-`` are considered, and the text between the first and second ``-``
    is parsed as the creation timestamp. Entries whose timestamp cannot be
    parsed are skipped.

    :param start_time: Start of the time interval (str in format 'YYYYMMDD_HHMMSS')
    :param end_time: End of the time interval (str in format 'YYYYMMDD_HHMMSS')
    :param folder: Folder containing the AutoGluon models
    :return: List of loaded predictors, in sorted order of their folder names
    :raises ValueError: If ``start_time`` or ``end_time`` is not in the expected format
    """
    time_format = '%Y%m%d_%H%M%S'
    start_datetime = datetime.strptime(start_time, time_format)
    end_datetime = datetime.strptime(end_time, time_format)

    predictors = []

    # os.listdir returns entries in arbitrary order; sort so the returned list
    # is reproducible between runs.
    for item in sorted(os.listdir(folder)):
        if not item.startswith('ag-'):
            continue

        try:
            model_time = datetime.strptime(item.split('-')[1], time_format)
        except ValueError:
            # Name does not carry a parsable timestamp; skip it instead of
            # aborting the whole scan.
            continue

        if start_datetime <= model_time <= end_datetime:
            predictors.append(TabularPredictor.load(os.path.join(folder, item)))

    return predictors

In [23]:
## Creating ensemble of the trained AutoGluon models on each fold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Time window used to select the predictor folders to load
start_time = '20240704_124200'
end_time = '20240704_125900'

loaded_predictors = load_predictors(start_time, end_time)
if not loaded_predictors:
    raise ValueError(f'No predictors found between {start_time} and {end_time}')

# NOTE(review): full_test is the concatenation of every fold's test split, so a
# predictor may be scored on rows it was trained on - confirm this is intended.
y_test = full_test[label_property]
full_test_X = full_test.drop(columns=[label_property])


# Simple averaging ensemble: divide by the number of predictors actually
# loaded rather than a hard-coded 10, so the mean stays correct when the time
# window matches a different number of models.
ten_fold_ensemble = sum(
    predictor.predict(full_test_X) for predictor in loaded_predictors
) / len(loaded_predictors)


# Calculate R2 score
r2 = r2_score(y_test, ten_fold_ensemble)

print(f'R2 score: {r2}')

R2 score: 0.47235185168196536


In [None]:
## Display the feature importance on the whole dataset trained for a longer
## period so we interpret the dataset more correctly
# NOTE(review): `model` and `test_dataset` are whatever the most recently run
# cells left in the kernel (both are rebound by the per-fold loops) - confirm
# that the intended model and data are in scope before running this cell.

model.feature_importance(test_dataset)

In [None]:
## Training the model on the concatenation of all 10 folds
# Requires full_train / full_test from the "Making sets of all 10 folds" cell.

# Rebinds the notebook-level `model`, `test_score` and `leaderboard`.
model = fit_gluon(full_train, time_limit=100)
test_score, leaderboard = evaluate_gluon(model, full_test)
print(f'Full Train: {test_score}')
# The value returned by plot_ensemble_model() is used as the image filename.
display(Image(filename=model.plot_ensemble_model()))

In [2]:
## Training the model again without the least important columns
# Requires full_train / full_test from the "Making sets of all 10 folds" cell
# to have been run first; otherwise this cell fails with a NameError.

# Columns removed from both the train and the test frame before refitting
columns_to_drop = ['oz173', 'oz1', 'oz4', 'oz181', 'oz11', 'oz31', 'oz178', 'oz83', 'oz87', 'oz234', 'oz96', 'oz135', 'oz206', 'oz40', 'oz73', 'oz42', 'oz112', 'oz113', 'oz71', 'oz222', 'oz108', 'oz46' ]

# drop() returns new frames; full_train and full_test are left unchanged
full_train_drop = full_train.drop(columns=columns_to_drop)
full_test_drop = full_test.drop(columns=columns_to_drop)

# Rebinds the notebook-level `model`, `test_score` and `leaderboard`.
model = fit_gluon(full_train_drop, time_limit=100)
test_score, leaderboard = evaluate_gluon(model, full_test_drop)
print(f'Full Train: {test_score}')
# The value returned by plot_ensemble_model() is used as the image filename.
display(Image(filename=model.plot_ensemble_model()))


NameError: name 'full_train' is not defined

In [25]:
## Testing to see if TabPFN works well

from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier



In [None]:
# Taken from Supriya's code, adjust to local variables

import matplotlib.pyplot as plt
import seaborn as sns
from autogluon.tabular import TabularPredictor



# Get the leaderboard of the most recently fitted predictor. This notebook
# keeps its predictor in `model`; the name `predictor` is never defined here.
# Only the 'model' and 'score_val' columns are plotted, so the boolean
# `extra_info` flag (previously passed a list) is not needed.
leaderboard = model.leaderboard()


# Work on a copy so plotting never touches the leaderboard frame itself
df = leaderboard.copy()

# Set the style of the plots
sns.set(style="whitegrid")

# Plot for validation score (r2)
plt.figure(figsize=(12, 6))
sns.barplot(x='model', y='score_val', data=df)
plt.xticks(rotation=45, ha='right')
plt.title('Validation Score (R2) by Model')
plt.xlabel('Model')
plt.ylabel('Validation Score (R2)')
plt.tight_layout()
plt.show()