# Download datasets and libraries

In [None]:
!pip install h2o
!pip install auto-sklearn==0.14.6
!pip install autogluon
!pip install autokeras

In [None]:
# to download data uncomment the following lines

# open-ml datasets

# !wget https://api.openml.org/data/v1/download/22103263/houses.arff
# !wget https://api.openml.org/data/v1/download/3626/kin8nm.arff
# !wget https://api.openml.org/data/v1/download/31/credit-g.arff
# !wget https://api.openml.org/data/v1/download/16787463/Satellite.arff
# !wget https://api.openml.org/data/v1/download/6/letter.arff

In [None]:
# to download kaggle datasets, upload your kaggle API token into files panel, then run the cell
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle

In [None]:
!rm -r datasets
!mkdir datasets

In [None]:
!kaggle competitions download -c tabular-playground-series-jan-2022 -p datasets
!unzip datasets/tabular-playground-series-jan-2022.zip -d datasets/tmp
!mv datasets/tmp/train.csv datasets/seasonal_product_sales_jan_2022.csv
!rm -r datasets/tmp

# add other datasets as well

# Split datasets

In [None]:
import pandas as pd
from scipy.io.arff import loadarff 
from pathlib import Path
import os
from sklearn.model_selection import train_test_split

In [None]:
# constants
# NOTE(review): SPLITS_DIR is not read by the code visible in this notebook;
# Dataset builds its splits path from the literal 'splits' instead.
SPLITS_DIR = 'splits'
TEST_RATIO = 0.2  # passed as `test_size` to train_test_split in Dataset.make_splits
TARGET_COL = 'y'  # unified name given to every dataset's target column

In [None]:
class Dataset:
    """A benchmark dataset together with the paths of its train/test splits.

    The raw file is looked up as ``datasets/<name>.arff`` and, when that file
    does not exist, as ``datasets/<name>.csv``. Whatever the target column is
    called in the raw file, it is exposed under ``TARGET_COL`` in the loaded
    frame and in the split CSV files.
    """

    def __init__(self, name, task, init_target, fit_kwargs=None):
        """Store the dataset description and make sure the splits dir exists.

        Args:
            name: Base file name of the dataset (without extension).
            task: Task tag; any tag containing ``'clf'`` counts as
                classification (see ``is_classification``).
            init_target: Name of the target column in the raw file.
            fit_kwargs: Optional mapping of per-system extra fit arguments.
        """
        self.name = name
        self.task = task
        self.init_target = init_target
        self.fit_kwargs = fit_kwargs

        splits_dir = Path('datasets') / Path('splits')
        # parents=True: do not fail when `datasets/` itself is missing yet.
        splits_dir.mkdir(parents=True, exist_ok=True)

        self.train_csv_path = str(splits_dir / f'{self.name}_train.csv')
        self.test_csv_path = str(splits_dir / f'{self.name}_test.csv')

    def _load(self):
        """Read the raw dataset and move its target column to ``TARGET_COL``.

        Returns:
            A DataFrame whose last column is ``TARGET_COL``.
        """
        filename = os.path.join('datasets', self.name + '.arff')
        if os.path.isfile(filename):
            raw_data = loadarff(filename)
            df_data = pd.DataFrame(raw_data[0])
            # loadarff yields byte strings for nominal attributes; decode them.
            string_cols = df_data.select_dtypes(include='object').columns
            if len(string_cols) > 0:
                df_data[string_cols] = df_data[string_cols].stack().str.decode('utf-8').unstack()
        else:
            filename = os.path.join('datasets', self.name + '.csv')
            df_data = pd.read_csv(filename)
        # Pop-then-assign instead of copy-then-drop: when the raw target is
        # already named TARGET_COL, dropping it would delete the target.
        df_data[TARGET_COL] = df_data.pop(self.init_target)
        return df_data

    @property
    def is_classification(self):
        """True when the task tag contains ``'clf'``."""
        return 'clf' in self.task

    def make_splits(self, random_state=None):
        """Split the dataset into train/test parts and write both as CSV.

        Classification datasets are split with stratification on the target.

        Args:
            random_state: Optional seed forwarded to ``train_test_split`` to
                make the split reproducible; ``None`` keeps it random.
        """
        df = self._load()
        stratify = df[TARGET_COL].values if self.is_classification else None
        train, test = train_test_split(
            df,
            test_size=TEST_RATIO,
            shuffle=True,
            stratify=stratify,
            random_state=random_state,
        )

        train.to_csv(self.train_csv_path, index=False)
        test.to_csv(self.test_csv_path, index=False)

    def load_splits(self):
        """Read the split CSV files back.

        Columns that are of object dtype in the train split are converted to
        ``category`` dtype in both frames (each frame is converted on its own).

        Returns:
            A ``(train, test)`` tuple of DataFrames.
        """
        train = pd.read_csv(self.train_csv_path)
        test = pd.read_csv(self.test_csv_path)
        string_cols = train.select_dtypes(include='object').columns
        train[string_cols] = train[string_cols].astype('category')
        test[string_cols] = test[string_cols].astype('category')
        return train, test

In [None]:
# Extra fit arguments per system, keyed by the wrapper's class name
# (`run` looks them up with `system_cls.__name__`).
# 'label_type' is not forwarded to the model: AutoKerasAML.fit pops it and
# AutoKerasAML.predict_test casts the predictions to that type.
default_fit_args = {'AutoKerasAML': {'epochs': 50, 'batch_size': 1024}}
large_dataset_fit_args = {'AutoKerasAML': {'epochs': 50, 'batch_size': 256}}
small_datasets_fit_args = { 'AutoKerasAML': {'epochs': 100, 'batch_size': 128} }
forest_cover_type_fit_args = {'AutoKerasAML': {'epochs': 50, 'batch_size': 1024, 'label_type': int}}

# Define datasets
# Each entry is Dataset(name, task, init_target, fit_kwargs); `name` must match
# a file datasets/<name>.arff or datasets/<name>.csv.
# Commented-out entries are disabled datasets.
datasets = [
    Dataset('houses', 'reg', 'medianhousevalue', small_datasets_fit_args),
    # Dataset('kin8nm', 'reg', "'y'", default_fit_args),
    # Dataset('credit-g', 'binary_clf', 'class', default_fit_args),
    # Dataset('Satellite', 'binary_clf', 'Target', default_fit_args),
    # Dataset('letter', 'multi_clf', 'class', default_fit_args),
    Dataset('enamad_w2v', 'multi_clf', 'tag', large_dataset_fit_args),
    
    Dataset('seasonal_product_sales_jan_2022', 'reg', 'num_sold', default_fit_args),
    Dataset('loan_loss', 'reg', 'loss', large_dataset_fit_args),
    Dataset('product_fail_aug_2022', 'binary_clf', 'failure', small_datasets_fit_args),
    Dataset('insurance_claim_mar_2021', 'binary_clf', 'target', default_fit_args),
    Dataset('titanic_apr_2021', 'binary_clf', 'Survived', default_fit_args),
    Dataset('forest_cover_type', 'multi_clf', 'Cover_Type', forest_cover_type_fit_args),
    Dataset('product_category_jun_2021', 'multi_clf', 'target', default_fit_args),
]

## make splits

In [None]:
for dataset in datasets:
    print(f'making splits of `{dataset.name}`')
    dataset.make_splits()

# Implement systems wrapper

In [None]:
class AutoMLSystem:
    """Common interface of the benchmarked AutoML wrappers.

    Every hook is a no-op that returns ``None``; concrete systems override
    the ones they need.
    """

    def set_dataset(self, dataset):
        """Define the dataset and task."""

    def fit(self, time_budget, fit_kwargs=None):
        """Start training in the pre-determined time budget."""

    def predict_test(self):
        """Return the model predictions on the test data."""

    def clean_up(self):
        """Free up disk and memory."""

## H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML


class H2OAML(AutoMLSystem):
    """AutoMLSystem wrapper around H2O AutoML."""

    def __init__(self, mem_gb=32):
        """Initialise H2O, passing `mem_gb` as its `max_mem_size`."""
        h2o.init(max_mem_size=mem_gb)

    def set_dataset(self, dataset):
        """Import the dataset's train/test CSV splits as H2O frames."""
        self.dataset = dataset
        self.train = h2o.import_file(self.dataset.train_csv_path, header=1)
        self.test = h2o.import_file(self.dataset.test_csv_path, header=1)

        if not self.dataset.is_classification:
            return
        # Classification targets are converted to factors in both frames.
        self.train[TARGET_COL] = self.train[TARGET_COL].asfactor()
        self.test[TARGET_COL] = self.test[TARGET_COL].asfactor()

    def fit(self, time_budget, fit_kwargs=None):
        """Train H2O AutoML with `max_runtime_secs=time_budget`.

        `fit_kwargs` is accepted for interface compatibility and not used.
        """
        predictors = self.train.columns
        predictors.remove(TARGET_COL)

        # Run AutoML
        self.aml = H2OAutoML(max_runtime_secs=time_budget)
        self.aml.train(x=predictors, y=TARGET_COL, training_frame=self.train)

    def predict_test(self):
        """Return the leader model's `predict` column for the test frame."""
        leader_output = self.aml.leader.predict(self.test)
        predictions_df = leader_output.as_data_frame()
        return predictions_df['predict'].values

    def clean_up(self):
        """Shut the H2O cluster down."""
        h2o.cluster().shutdown()

In [None]:
# h2o_aml = H2OAML()
# h2o_aml.set_dataset(datasets[4])
# h2o_aml.fit(60)
# result = h2o_aml.predict_test()
# h2o_aml.clean_up()
# result

## Auto-sklearn

In [None]:
from autosklearn.regression import AutoSklearnRegressor
from autosklearn.classification import AutoSklearnClassifier


class AutoSklearnAML(AutoMLSystem):
    """AutoMLSystem wrapper around auto-sklearn."""

    def set_dataset(self, dataset):
        """Load the splits and remember every non-target column as a feature."""
        self.dataset = dataset
        self.train, self.test = self.dataset.load_splits()
        self._feature_cols = [*self.train.columns]
        self._feature_cols.remove(TARGET_COL)

    def fit(self, time_budget, fit_kwargs=None):
        """Fit a classifier or regressor with `time_budget` as its time limit.

        `fit_kwargs` is accepted for interface compatibility and not used.
        """
        if self.dataset.is_classification:
            estimator_cls = AutoSklearnClassifier
        else:
            estimator_cls = AutoSklearnRegressor
        self.aml = estimator_cls(
            time_left_for_this_task=time_budget,
            memory_limit=16 * 1024,
        )
        print('start fitting ...')
        features = self.train[self._feature_cols]
        target = self.train[TARGET_COL]
        self.aml.fit(features, target, dataset_name=self.dataset.name)

    def predict_test(self):
        """Return the fitted model's predictions for the test features."""
        return self.aml.predict(self.test[self._feature_cols])

In [None]:
# ask_aml = AutoSklearnAML()
# ask_aml.set_dataset(datasets[3])
# ask_aml.fit(60)
# result = ask_aml.predict_test()
# result

## Auto-Gluon

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor


class AutoGluonAML(AutoMLSystem):
    """AutoMLSystem wrapper around AutoGluon's TabularPredictor."""

    def set_dataset(self, dataset):
        """Load the split CSVs and remember every non-target column."""
        self.dataset = dataset
        self.train = TabularDataset(self.dataset.train_csv_path)
        self.test = TabularDataset(self.dataset.test_csv_path)
        self._feature_cols = [*self.train.columns]
        self._feature_cols.remove(TARGET_COL)

    def fit(self, time_budget, fit_kwargs=None):
        """Fit a predictor with `time_limit=time_budget`.

        Models are written under ``outputs/autogluon_<budget>/<dataset name>``.
        `fit_kwargs` is accepted for interface compatibility and not used.
        """
        task = self.dataset.task
        # Any task other than the two classification tags is regression.
        problem_type = {
            'binary_clf': 'binary',
            'multi_clf': 'multiclass',
        }.get(task, 'regression')
        # Only binary classification overrides the evaluation metric.
        eval_metric = 'roc_auc' if task == 'binary_clf' else None

        print('start fitting ...')
        output_dir = os.path.join('outputs', f'autogluon_{time_budget}', self.dataset.name)
        self.aml = TabularPredictor(
            label=TARGET_COL,
            problem_type=problem_type,
            eval_metric=eval_metric,
            path=output_dir,
        )
        self.aml = self.aml.fit(self.train, time_limit=time_budget, presets='medium_quality')

    def predict_test(self):
        """Return the predictor's output for the test features."""
        return self.aml.predict(self.test[self._feature_cols])

In [None]:
# agl = AutoGluonAML()
# agl.set_dataset(datasets[3])
# agl.fit(60)
# test_res = agl.predict_test()
# test_res.values

## AutoKeras

In [None]:
import autokeras as ak
import time
import copy
import numpy as np


class AutoKerasAML(AutoMLSystem):
    """AutoMLSystem wrapper around AutoKeras structured-data models."""

    def set_dataset(self, dataset):
        """Load the splits and mean-impute NaNs in numeric feature columns."""
        print('reading data ...')
        self.dataset = dataset
        self.train_df, self.test_df = self.dataset.load_splits()
        self._feature_cols = [*self.train_df.columns]
        self._feature_cols.remove(TARGET_COL)

        # impute nan values. without imputation fitting can't be ran on large datasets
        def impute(df):
            # Each frame is filled with its own column means.
            column_means = {
                col: df[col].mean()
                for col in df.select_dtypes(include=np.number).columns
                if df[col].isnull().values.any()
            }
            if not column_means:
                return df
            return df.fillna(column_means)

        self.train_df[self._feature_cols] = impute(self.train_df[self._feature_cols])
        self.test_df[self._feature_cols] = impute(self.test_df[self._feature_cols])

    def fit(self, time_budget, fit_kwargs=None):
        """Run single-trial searches one after another until the budget is spent.

        Args:
            time_budget: Budget in seconds; a new trial is started as long as
                the elapsed whole seconds are below it.
            fit_kwargs: Optional extra arguments for the model's ``fit``. A
                ``'label_type'`` entry is removed first and kept on the
                instance for ``predict_test``.
        """
        if self.dataset.is_classification:
            model_cls = ak.StructuredDataClassifier
        else:
            model_cls = ak.StructuredDataRegressor

        # Work on a shallow copy so the caller's mapping is left untouched.
        fit_kwargs = copy.copy(fit_kwargs or {})
        self.label_type = fit_kwargs.pop('label_type', None)

        features = self.train_df[self._feature_cols]
        targets = self.train_df[[TARGET_COL]]
        output_dir = os.path.join('outputs', f'autokeras_{time_budget}')
        started_at = time.time()
        elapsed_time = 0

        while elapsed_time < time_budget:
            print(f'start new trial, args = {fit_kwargs} ...')
            self.aml = model_cls(
                project_name=self.dataset.name,
                max_trials=1,
                directory=output_dir,
                overwrite=False)
            self.aml.fit(features, targets, verbose=True, **fit_kwargs)
            elapsed_time = int(time.time() - started_at)
            print('elapsed time:', elapsed_time)

    def predict_test(self):
        """Return flattened test predictions, cast to `label_type` when set."""
        raw = self.aml.predict(self.test_df[self._feature_cols])
        predictions = raw.reshape(-1)
        if self.label_type is None:
            return predictions
        print('label type is:', self.label_type)
        return predictions.astype(self.label_type)

In [None]:
# d = datasets[3]
# akml = AutoKerasAML()
# akml.set_dataset(d)
# akml.fit(60, fit_kwargs=d.fit_kwargs['AutoKerasAML'])
# akml.predict_test()

# Benchmarking

In [None]:
systems_cls = [H2OAML, AutoSklearnAML, AutoGluonAML, AutoKerasAML]
time_budgets = [5*60, 20*60]
# time_budgets = [20*60]
# datasets = datasets[2:3] + datasets[5:]

In [None]:
min_train_time_h = sum(time_budgets) * len(systems_cls) * len(datasets) / 3600
print('Minimum required time (h):', min_train_time_h)

## Metrics

In [None]:
from sklearn.metrics import (
    mean_squared_error, r2_score,
    accuracy_score, f1_score
)

def calculate_metrics(predictions, dataset):
    """Score test-set predictions against the dataset's test labels.

    Args:
        predictions: Predicted values for the rows of the dataset's test
            split, in the split's row order.
        dataset: Dataset whose test split provides the true labels.

    Returns:
        ``{'accuracy', 'f1_macro'}`` for classification datasets, otherwise
        ``{'rmse', 'r2'}``.
    """
    _, test = dataset.load_splits()
    labels = test[TARGET_COL].values

    if dataset.is_classification:
        return {
            'accuracy': accuracy_score(labels, predictions),
            'f1_macro': f1_score(labels, predictions, average='macro'),
        }
    return {
        # mean_squared_error yields the MSE; take the square root so the
        # stored value really is the RMSE its key announces.
        'rmse': mean_squared_error(labels, predictions) ** 0.5,
        'r2': r2_score(labels, predictions),
    }

In [None]:
import time


def run(system_cls, time_budget, dataset):
    """Benchmark one AutoML system on one dataset under one time budget.

    Args:
        system_cls: AutoMLSystem subclass, instantiated without arguments.
        time_budget: Training budget handed to the system's ``fit``.
        dataset: Dataset to load, train on and evaluate against.

    Returns:
        A dict with ``system``, ``budget``, ``dataset``, ``task`` and
        ``status`` (``'success'`` or ``'failed'``). Depending on how far the
        run got it also holds ``load_time``, ``train_time``,
        ``inference_time``, one ``metric_<name>`` entry per metric, and
        ``exception`` (the error message) when the run failed.
    """
    system_name = system_cls.__name__
    result = {
        'system': system_name,
        'budget': time_budget,
        'dataset': dataset.name,
        'task': dataset.task,
        'status': 'failed'
    }

    # Extra fit arguments are registered per system class name.
    fit_kwargs = (dataset.fit_kwargs or {}).get(system_name)

    aml = None
    try:
        print('start loading system ...')
        t = time.time()
        aml = system_cls()
        aml.set_dataset(dataset)
        result['load_time'] = time.time() - t

        print('start training ...')
        t = time.time()
        aml.fit(time_budget, fit_kwargs)
        result['train_time'] = time.time() - t

        print('start predicting ...')
        t = time.time()
        predictions = aml.predict_test()
        result['inference_time'] = time.time() - t

        print('caculating metrics ...')
        print(predictions)
        for metric_name, value in calculate_metrics(predictions, dataset).items():
            result[f'metric_{metric_name}'] = value

        result['status'] = 'success'

    except Exception as e:
        # A failing system must not abort the whole benchmark; record it.
        print('EXCEPTION:', e)
        result['exception'] = str(e)

    finally:
        # Release the system's resources even when loading, training or
        # predicting failed; otherwise they stay allocated for later runs.
        if aml is not None:
            try:
                aml.clean_up()
            except Exception as e:
                print('CLEAN-UP EXCEPTION:', e)

    print(result)
    return result

In [None]:
all_results = []

# Budgets form the outermost loop, so every dataset/system pair is run with
# the first budget before any run with the next budget starts.
for b in time_budgets:
    for dataset in datasets:
        for sys in systems_cls:
            r = run(sys, b, dataset)
            all_results.append(r)
            # Rewrite the whole CSV after every run so the results gathered
            # so far are on disk before the next run starts.
            pd.DataFrame(all_results).to_csv('results.csv', index=False)

# Result Analysis

In [None]:
pd.options.plotting.backend = 'plotly'

In [None]:
results_df = pd.read_csv('results_final.csv')
results_df

## Error

In [None]:
results_with_error = results_df[~results_df['exception'].isnull()]
results_with_error['system'].plot(kind='bar')

In [None]:
results_with_error['dataset'].plot(kind='bar')

In [None]:
# drop erroneous results: remove every row of any system that raised at least once
errorness_systems = list(results_with_error['system'].unique())
selected_rows = results_df['system'].isin(errorness_systems)
# .copy() gives an independent frame, so the columns assigned to `results_df`
# in later cells are not written to a slice of the loaded frame
# (avoids pandas' SettingWithCopyWarning).
results_df = results_df[~selected_rows].copy()

## Deviation from training budget

In [None]:
results_df['extra_train_time'] = (results_df['train_time'] - results_df['budget']) / results_df['budget']

print('Deviation from training budget:')
results_df.groupby('system')['extra_train_time'].mean().plot(kind='bar')

## Load / Inference time 

In [None]:
results_df.groupby('system')['load_time'].mean().plot(kind='bar', title='Load time')

In [None]:
results_df.groupby('system')['inference_time'].mean().plot(kind='bar', title='Inference time')

## Average results

In [None]:
metrics = [c for c in results_df.columns if c.startswith('metric')]
# Select the regression metrics by name rather than by position, so the
# grouping does not depend on the column order of the results CSV.
reg_metrics = [m for m in metrics if m in ('metric_rmse', 'metric_r2')]
clf_metrics = [m for m in metrics if m not in reg_metrics]

In [None]:
def show_average_metrics(task, metrics):
    """Display, per metric, a bar chart of each system's mean for `task`.

    Returns the rows of `results_df` that belong to `task`.
    """
    is_task_row = results_df['task'] == task
    task_results = results_df[is_task_row]
    for metric in metrics:
        per_system_mean = task_results.groupby('system')[metric].mean()
        chart = per_system_mean.T.plot(kind='bar', barmode='group', title=f'{task}-{metric}')
        display(chart)
    return task_results

### Regression

In [None]:
reg_results = show_average_metrics('reg', reg_metrics)

In [None]:
reg_results.groupby(['dataset'])['metric_rmse'].std()

In [None]:
reg_results[reg_results['dataset'] == 'seasonal_product_sales_jan_2022']

### Binary classification

In [None]:
bclf_results = show_average_metrics('binary_clf', clf_metrics)

In [None]:
bclf_results.groupby(['dataset'])['metric_f1_macro'].std()

### Multi-class classification

In [None]:
mclf_results = show_average_metrics('multi_clf', clf_metrics)

In [None]:
mclf_results.groupby(['dataset'])['metric_f1_macro'].std()

## Distinct datasets

In [None]:
# For every (dataset, budget) setting: standard deviation of each metric over
# that setting's rows (NaN -> 0), then the largest of those deviations per
# setting, sorted from most to least spread.
# NOTE(review): the maximum is taken across metrics without rescaling them —
# confirm that comparing them on their raw scales is intended.
top_std_settings = results_df.groupby(['dataset', 'budget'])[metrics].std().fillna(0).T.max().sort_values(ascending=False)
top_std_settings.head(n=8)

In [None]:
dataset, budget = top_std_settings.index[0]
results_df[(results_df['dataset'] == dataset) & (results_df['budget'] == budget)]

## Improvement over time

In [None]:
# `quality` is the row-wise maximum of R2 and macro-F1 after replacing NaN
# with 0. Presumably each row has only one of the two filled in (R2 for
# regression, F1 for classification) — verify against the results file.
# Because of the fillna(0), a row whose only value is negative gets quality 0.
quality_metrics = ['metric_r2', 'metric_f1_macro']
results_df['quality'] = results_df[quality_metrics].fillna(0).max(axis=1)
results_df

In [None]:
def covert_setting_index_to_col(series, main_col):
    """Turn a series indexed by ('system', main_col) into a wide table.

    The result has one row per `main_col` value and one column per system,
    each cell holding that system's maximum value for the row.
    """
    flat = series.to_frame()
    flat.columns = ['unk']
    flat = flat.reset_index()
    # One sparse record per input row; only the row's own system is filled in.
    records = [
        {main_col: row[main_col], row['system']: row['unk']}
        for _, row in flat.iterrows()
    ]
    wide = pd.DataFrame(records)
    return wide.groupby(main_col).max()

In [None]:
d = covert_setting_index_to_col(results_df.groupby(['system', 'budget'])['quality'].mean(), 'budget')
d.T.plot(kind='bar', barmode='group')