### This is a script to load the property dataset and run **AutoGluon** on it. First a basic model is run, then different configurations are tried. Finally, possible **third-level ensembling** of the models is examined.

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Seed handed to load_fold(); there it is used as the random_state when a sample_size is given.
random_seed = 42

In [15]:
## Function for loading one of the 10 folds of the property dataset and concatenating the X and y values for train and test respectively. 
import pandas as pd

def load_fold(fold_number, random_seed=42, sample_size=None, data_dir='../../data/361092'):
    """Load one fold of the property dataset as AutoGluon train/test datasets.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` parquet files from
    ``{data_dir}/{fold_number}/`` and joins features and target column-wise.

    Parameters
    ----------
    fold_number : int or str
        Name of the fold sub-directory inside ``data_dir``.
    random_seed : int, default 42
        ``random_state`` used for sub-sampling; only relevant when
        ``sample_size`` is given.
    sample_size : int or None, default None
        If truthy, number of rows to draw (without replacement) from each split.
        The value is capped at the number of rows of the split, so a test split
        that is smaller than ``sample_size`` is returned in full (shuffled)
        instead of raising ``ValueError``.
    data_dir : str, default '../../data/361092'
        Directory that contains one sub-directory per fold.

    Returns
    -------
    tuple
        ``(train_dataset, test_dataset)``.
    """
    fold_dir = f'{data_dir}/{fold_number}'
    frames = {
        name: pd.read_parquet(f'{fold_dir}/{name}.parquet')
        for name in ('X_train', 'y_train', 'X_test', 'y_test')
    }

    # Concatenating the X and y values for train and test respectively.
    # NOTE(review): pd.concat(axis=1) aligns on the index -- assumes the X and y
    # files of a split share the same index; confirm against the data export.
    df_train = pd.concat([frames['X_train'], frames['y_train']], axis=1)
    df_test = pd.concat([frames['X_test'], frames['y_test']], axis=1)

    def _to_dataset(df):
        # Convert to AutoGluon's TabularDataset and optionally sub-sample it.
        dataset = TabularDataset(df)
        if sample_size:
            # Never ask for more rows than the split has; sample() would raise.
            n_rows = min(sample_size, len(dataset))
            dataset = dataset.sample(n=n_rows, random_state=random_seed)
        return dataset

    return _to_dataset(df_train), _to_dataset(df_test)

# Name of the target (label) column. fit_gluon() reads this module-level
# variable when it constructs the TabularPredictor.
label_property = 'oz252'


In [19]:
## Function to fit the model, with most of the hyperparameters present and set to default/None. (Add more hyperparameters if desirable)

from autogluon.tabular import TabularPredictor

def fit_gluon(train_dataset, problem_type='regression', hyperparameters=None, eval_metric='r2', presets='medium_quality', time_limit=100, auto_stack=None, num_bag_folds=None, num_bag_sets=None, num_stack_levels=None, num_trials=None, verbosity=None, ag_args_fit=None, feature_prune=None, excluded_model_types=None, label=None):
    """Create a TabularPredictor and fit it on ``train_dataset``.

    Only arguments that are not ``None`` are forwarded to ``predictor.fit``, so
    AutoGluon's own defaults (and those of ``presets``) apply otherwise.

    Parameters
    ----------
    train_dataset : training data passed as ``train_data``.
    problem_type, eval_metric : forwarded to the ``TabularPredictor`` constructor.
    hyperparameters, presets, time_limit, auto_stack, num_bag_folds,
    num_bag_sets, num_stack_levels, verbosity, ag_args_fit,
    excluded_model_types : forwarded to ``predictor.fit`` under the same name.
    num_trials : int or None
        Number of hyperparameter-optimisation trials. Forwarded as
        ``hyperparameter_tune_kwargs`` (local scheduler, automatic searcher),
        because ``fit`` does not accept a ``num_trials`` keyword.
    feature_prune : bool, dict or None
        ``True`` enables feature pruning with AutoGluon's default configuration,
        a dict is used as the pruning configuration, ``False``/``None`` leaves
        pruning off. Forwarded as ``feature_prune_kwargs``, because ``fit`` does
        not accept a ``feature_prune`` keyword.
    label : str or None
        Target column. Defaults to the module-level ``label_property``.

    Returns
    -------
    The fitted ``TabularPredictor``.
    """
    target = label_property if label is None else label
    predictor = TabularPredictor(label=target, problem_type=problem_type, eval_metric=eval_metric)

    fit_args = {
        'train_data': train_dataset,
        'presets': presets,
        'time_limit': time_limit,
    }

    # Optional arguments that map 1:1 onto fit() keywords; None means "not set".
    optional_args = {
        'hyperparameters': hyperparameters,
        'auto_stack': auto_stack,
        'num_bag_folds': num_bag_folds,
        'num_bag_sets': num_bag_sets,
        'num_stack_levels': num_stack_levels,
        'verbosity': verbosity,
        'ag_args_fit': ag_args_fit,
        'excluded_model_types': excluded_model_types,
    }
    fit_args.update({key: value for key, value in optional_args.items() if value is not None})

    # fit() rejects unknown keywords, so translate the two convenience
    # arguments into the keywords AutoGluon actually understands.
    if num_trials is not None:
        fit_args['hyperparameter_tune_kwargs'] = {
            'num_trials': num_trials,
            'scheduler': 'local',
            'searcher': 'auto',
        }
    if isinstance(feature_prune, dict):
        fit_args['feature_prune_kwargs'] = feature_prune
    elif feature_prune:
        # An empty dict asks AutoGluon for its default pruning configuration.
        fit_args['feature_prune_kwargs'] = {}

    predictor.fit(**fit_args)
    return predictor



In [11]:
## Function to evaluate a fitted model on a test set.

def evaluate_gluon(model, test_dataset):
    """Score a fitted predictor on held-out data.

    Calls ``model.evaluate(test_dataset)`` first and then
    ``model.leaderboard(test_dataset)``.

    Returns
    -------
    tuple
        ``(test_score, leaderboard)`` exactly as returned by the two calls.
    """
    # Tuple elements are evaluated left to right: evaluate(), then leaderboard().
    return model.evaluate(test_dataset), model.leaderboard(test_dataset)

In [9]:
## Dictionary containing the hyperparameters for the different AutoGluon models
# First we make one dict; later on we try different varieties for the level 1 and level 2 models

# Think about doing this instead:
# from autogluon.common import space...

# Custom model configurations, keyed by AutoGluon model name.
# A list value holds several configurations for the same model key.
hyperparameters = {
    # Two LightGBM variants.
    'GBM': [
        {
            'learning_rate': 0.1,
            'num_leaves': 31,
            'feature_fraction': 0.9,
        },
        {
            'learning_rate': 0.05,
            'num_leaves': 45,
            'feature_fraction': 0.8,
        },
    ],
    # CatBoost.
    'CAT': {'iterations': 1000, 'depth': 7, 'learning_rate': 0.1, 'l2_leaf_reg': 3},
    # XGBoost.
    'XGB': {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.8},
    # PyTorch neural network.
    'NN_TORCH': {'num_epochs': 10, 'learning_rate': 1e-3, 'layers': [100, 100]},
    # Random forest.
    'RF': {'n_estimators': 100, 'max_depth': 15, 'min_samples_split': 2},
}

In [23]:
## Train the model separately per fold to check that the scores are consistent across folds.
## Only folds 1-3 are run here; each fit gets a 300 s time limit.

for fold_number in range(1, 4): # just the first three folds to start with
    train_dataset, test_dataset = load_fold(fold_number, random_seed=random_seed)
    model = fit_gluon(train_dataset, time_limit=300)
    test_score, leaderboard = evaluate_gluon(model, test_dataset)
    print(f'Fold {fold_number}: {test_score}')
    print(leaderboard)


No path specified. Models will be saved in: "AutogluonModels\ag-20240630_131314"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       4.24 GB / 15.80 GB (26.8%)
Disk Space Avail:   180.40 GB / 952.46 GB (18.9%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels\ag-20240630_131314"
Train Data Rows:    7996
Train Data Columns: 62
Label Column:       oz252
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    4340.35 MB
	Train Data (Original)  Memory Usage: 2.72 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the fea

Fold 1: {'r2': 0.09103472251615252, 'root_mean_squared_error': -0.025396391264573628, 'mean_squared_error': -0.0006449766892633117, 'mean_absolute_error': -0.018279427089901424, 'pearsonr': 0.3077854135530402, 'median_absolute_error': -0.013118152965545637}
                  model  score_test  score_val eval_metric  pred_time_test  \
0   WeightedEnsemble_L2    0.091035   0.119931          r2        1.377175   
1              CatBoost    0.089001   0.089654          r2        0.075433   
2         LightGBMLarge    0.079066   0.087833          r2        0.013531   
3               XGBoost    0.077817   0.065368          r2        0.053601   
4            LightGBMXT    0.077281   0.087110          r2        0.021812   
5              LightGBM    0.072527   0.078010          r2        0.040905   
6         ExtraTreesMSE    0.062175   0.086427          r2        0.585132   
7       RandomForestMSE    0.048230   0.078550          r2        0.564182   
8       NeuralNetFastAI    0.009104   0.

Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3949.02 MB
	Train Data (Original)  Memory Usage: 2.72 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 19 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Useless Original Features (Count: 1): ['oz115']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference time.


Fold 2: {'r2': 0.11939901741047843, 'root_mean_squared_error': -0.026706333421830396, 'mean_squared_error': -0.0007132282448379752, 'mean_absolute_error': -0.019365957073507745, 'pearsonr': 0.3510551809526267, 'median_absolute_error': -0.014009070114135769}
                  model  score_test  score_val eval_metric  pred_time_test  \
0   WeightedEnsemble_L2    0.119399   0.104477          r2        1.843945   
1         LightGBMLarge    0.104824   0.091638          r2        0.059174   
2         ExtraTreesMSE    0.101813   0.074513          r2        1.306422   
3       RandomForestMSE    0.099283   0.065050          r2        1.221158   
4              LightGBM    0.088354   0.059196          r2        0.064543   
5              CatBoost    0.078540   0.061511          r2        0.211820   
6               XGBoost    0.074381   0.056009          r2        0.133981   
7            LightGBMXT    0.073264   0.067152          r2        0.116357   
8       NeuralNetFastAI    0.064369   0.

AutoGluon Version:  1.1.1
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       3.95 GB / 15.80 GB (25.0%)
Disk Space Avail:   179.66 GB / 952.46 GB (18.9%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels\ag-20240630_131800"
Train Data Rows:    7996
Train Data Columns: 62
Label Column:       oz252
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    4021.21 MB
	Train Data (Original)  Memory Usage: 2.72 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 19 features to boolean dtype 

Fold 3: {'r2': 0.11276829470738381, 'root_mean_squared_error': -0.026501213816332244, 'mean_squared_error': -0.0007023143337389591, 'mean_absolute_error': -0.01921755821923169, 'pearsonr': 0.3369754256798945, 'median_absolute_error': -0.013829783905029314}
                  model  score_test  score_val eval_metric  pred_time_test  \
0   WeightedEnsemble_L2    0.112768   0.113052          r2        0.501587   
1         ExtraTreesMSE    0.103558   0.097630          r2        0.355873   
2         LightGBMLarge    0.094741   0.094309          r2        0.018516   
3               XGBoost    0.084844   0.073653          r2        0.042531   
4              LightGBM    0.081558   0.079029          r2        0.026268   
5       RandomForestMSE    0.081055   0.068119          r2        0.436017   
6            LightGBMXT    0.080822   0.088135          r2        0.028094   
7              CatBoost    0.072188   0.090169          r2        0.053167   
8       NeuralNetFastAI    0.055000   0.0

In [24]:
## Repeat the per-fold consistency check with a longer time limit (900 s per fit).
## Only folds 1-3 are run here.

for fold_number in range(1, 4): # just the first three folds to start with
    train_dataset, test_dataset = load_fold(fold_number, random_seed=random_seed)
    model = fit_gluon(train_dataset, time_limit=900)
    test_score, leaderboard = evaluate_gluon(model, test_dataset)
    print(f'Fold {fold_number}: {test_score}')
    print(leaderboard)


No path specified. Models will be saved in: "AutogluonModels\ag-20240630_140716"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       4.86 GB / 15.80 GB (30.8%)
Disk Space Avail:   178.03 GB / 952.46 GB (18.7%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "AutogluonModels\ag-20240630_140716"
Train Data Rows:    7996
Train Data Columns: 62
Label Column:       oz252
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    4965.45 MB
	Train Data (Original)  Memory Usage: 2.72 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the fea

In [None]:
## Function to plot the results

In [None]:
## Training the model on all 10 folds

In [None]:
## Creating an ensemble of the 10 models??