In [None]:
# Show the GPU(s) visible to this runtime before installing or training anything.
!nvidia-smi

# Install (restart runtime after this)

In [None]:
!pip install autokeras
!pip install autogluon

In [None]:
import os
import shutil
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

# Datasets

## Download

In [None]:
# upload your kaggle API token into files panel, then run the cell
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle

In [None]:
!rm -r datasets
!mkdir datasets

In [None]:
# Download the datasets from Kaggle and unzip them under datasets/.
!kaggle datasets download -d navoneel/brain-mri-images-for-brain-tumor-detection -p datasets --unzip
# brain_tumor_samples() only reads datasets/brain_tumor_dataset, so the top-level
# yes/ and no/ folders are removed (presumably duplicate copies shipped in the
# archive - TODO confirm).
!rm -r datasets/yes datasets/no
!kaggle datasets download -d die9origephit/children-vs-adults-images -p datasets/child_vs_adult --unzip
!kaggle datasets download -d dhruvildave/english-handwritten-characters-dataset -p datasets/english_handwritten_char --unzip
# Optional datasets, currently disabled (their entries in the `datasets` list
# further down are commented out as well).
# !kaggle datasets download -d hasibalmuzdadid/shoe-vs-sandal-vs-boot-dataset-15k-images -p datasets --unzip
# !mv datasets/Shoe\ vs\ Sandal\ vs\ Boot\ Dataset datasets/show_sandal_boot
# !kaggle datasets download -d muratkokludataset/rice-image-dataset -p datasets --unzip
# !kaggle datasets download -d plameneduardo/sarscov2-ctscan-dataset -p datasets/covid_ct_scan --unzip

## Index functions

In [None]:
# Folder that holds every downloaded dataset.
DATASETS_ROOT = 'datasets'
# Sub-folder of DATASETS_ROOT where each Dataset writes its train.csv / test.csv.
INDEX_DIR = '_index'
# Column / key names used in the sample index tables:
# CLASS - class name of a sample, LABEL - integer index of that class,
# PATH - path of the image file.
CLASS = 'class'
LABEL = 'label'
PATH = 'image'

In [None]:
def is_image(file_name):
    """Return True when ``file_name`` ends with .png, .jpg or .jpeg.

    The comparison ignores letter case.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return file_name.lower().endswith(image_suffixes)

In [None]:
def get_class_based_samples(dataset_dir, classes):
    """Collect image samples from a ``<dataset_dir>/<class>/`` folder layout.

    Args:
        dataset_dir: ``pathlib.Path`` of the folder holding one sub-folder per class.
        classes: names of the class sub-folders to scan.

    Returns:
        A list of ``{CLASS: <class name>, PATH: <file path as str>}`` dicts, one
        per image file found directly inside a class folder. Files that are not
        images are printed and left out; sub-directories are ignored.
    """
    samples = []
    for class_name in classes:
        for entry in (dataset_dir / class_name).glob('*'):
            if entry.is_dir():
                continue
            entry_path = str(entry)
            if is_image(entry_path):
                samples.append({CLASS: class_name, PATH: entry_path})
            else:
                # Print skipped files so unexpected content gets noticed.
                print(entry)
    return samples

In [None]:
def brain_tumor_samples():
    """Index the brain-MRI tumor dataset (class folders ``yes`` and ``no``).

    link: https://www.kaggle.com/datasets/navoneel/brain-mri-images-for-brain-tumor-detection
    """
    return get_class_based_samples(
        Path(DATASETS_ROOT) / 'brain_tumor_dataset', ['yes', 'no'])

In [None]:
def child_vs_adult_samples():
    """Index the children-vs-adults dataset.

    The ``train`` and ``test`` folders of the download are both scanned and
    their samples are returned as one list (train samples first).

    link: https://www.kaggle.com/datasets/die9origephit/children-vs-adults-images
    """
    root = Path(DATASETS_ROOT) / 'child_vs_adult'
    class_names = ['adults', 'children']
    samples = []
    for split_name in ('train', 'test'):
        samples.extend(get_class_based_samples(root / split_name, class_names))
    return samples

In [None]:
def english_handwritten_char_samples():
    """Index the English handwritten characters dataset from its ``english.csv``.

    The CSV columns ``image`` and ``label`` are renamed to PATH and CLASS, and
    every image path is prefixed with the dataset folder.

    link: https://www.kaggle.com/datasets/dhruvildave/english-handwritten-characters-dataset
    """
    root = Path(DATASETS_ROOT) / 'english_handwritten_char'
    samples_df = pd.read_csv(str(root / 'english.csv')).rename(
        columns={'image': PATH, 'label': CLASS})
    samples_df[PATH] = [os.path.join(root, rel_path) for rel_path in samples_df[PATH]]
    return samples_df.to_dict('records')

In [None]:
def show_sandal_boot_samples():
    """Index the shoe / sandal / boot dataset (class folders Boot, Sandal, Shoe).

    link: https://www.kaggle.com/datasets/hasibalmuzdadid/shoe-vs-sandal-vs-boot-dataset-15k-images
    """
    return get_class_based_samples(
        Path(DATASETS_ROOT) / 'show_sandal_boot', ['Boot', 'Sandal', 'Shoe'])

In [None]:
def rice_samples():
    """Index the rice image dataset (one class folder per rice variety).

    link: https://www.kaggle.com/datasets/muratkokludataset/rice-image-dataset
    """
    varieties = ['Arborio', 'Basmati', 'Ipsala', 'Jasmine', 'Karacadag']
    return get_class_based_samples(
        Path(DATASETS_ROOT) / 'Rice_Image_Dataset', varieties)

In [None]:
def covid_ct_scan_samples():
    """Index the SARS-CoV-2 CT-scan dataset (class folders COVID and non-COVID).

    link: https://www.kaggle.com/datasets/plameneduardo/sarscov2-ctscan-dataset
    """
    return get_class_based_samples(
        Path(DATASETS_ROOT) / 'covid_ct_scan', ['COVID', 'non-COVID'])

## Dataset object

In [None]:
class Dataset:
    """Sample index of one image-classification dataset plus its train/test split files.

    Attributes:
        name: dataset name; also the sub-folder name used for its split CSVs.
        index_df: one row per sample with the columns CLASS, PATH and LABEL.
        classes: class names in order of first appearance; LABEL is the
            position of a sample's class in this list.
        split_test_ratio: ``test_size`` passed to ``train_test_split``.
        random_state: seed passed to ``train_test_split``.
        index_dir: folder (``DATASETS_ROOT/INDEX_DIR/<name>``) holding
            ``train.csv`` and ``test.csv``.
    """

    def __init__(self, name, samples_generator_function, split_test_ratio,
                 random_state=42):
        """Build the sample index.

        Args:
            name: dataset name.
            samples_generator_function: zero-argument callable returning the
                samples as records with the keys CLASS and PATH.
            split_test_ratio: share of the samples that goes into the test split.
            random_state: seed for the split, so that re-running the notebook
                benchmarks every system on the same train/test split. Pass
                ``None`` for a different random split on every call.

        Raises:
            ValueError: if the generator function returns no samples.
        """
        self.name = name

        self.index_df = pd.DataFrame(samples_generator_function())
        if self.index_df.empty:
            raise ValueError(
                f'dataset {name!r}: no samples found; '
                f'has it been downloaded into {DATASETS_ROOT!r}?')

        self.classes = self.index_df[CLASS].unique().tolist()
        # Dict lookup instead of list.index() for every row.
        class_to_label = {c: i for i, c in enumerate(self.classes)}
        self.index_df[LABEL] = self.index_df[CLASS].map(class_to_label)

        self.split_test_ratio = split_test_ratio
        self.random_state = random_state
        self.index_dir = str(Path(DATASETS_ROOT) / INDEX_DIR / name)

    def make_splits(self):
        """Write a shuffled, class-stratified train/test split to the index folder."""
        train, test = train_test_split(
            self.index_df, test_size=self.split_test_ratio,
            shuffle=True, stratify=self.index_df[CLASS].values,
            random_state=self.random_state)
        os.makedirs(self.index_dir, exist_ok=True)
        train.to_csv(self.train_csv_path, index=False)
        test.to_csv(self.test_csv_path, index=False)

    @property
    def train_csv_path(self):
        """Path of the train split CSV."""
        return os.path.join(self.index_dir, 'train.csv')

    @property
    def test_csv_path(self):
        """Path of the test split CSV."""
        return os.path.join(self.index_dir, 'test.csv')

    def train_test_index_df(self):
        """Read the split CSVs written by ``make_splits``; returns ``(train_df, test_df)``."""
        return (
            pd.read_csv(self.train_csv_path),
            pd.read_csv(self.test_csv_path)
        )

In [None]:
# Datasets taking part in the benchmark: (name, sample-index function, test ratio).
# The commented-out entries belong to the downloads that are disabled above.
datasets = [
    Dataset('brain_tumor', brain_tumor_samples, 0.3),
    Dataset('child_vs_adult', child_vs_adult_samples, 0.2),
    # Dataset('english_hand_written_char', english_handwritten_char_samples, 0.15),
    # Dataset('show_sandal_boot', show_sandal_boot_samples, 0.1),
    # Dataset('rice', rice_samples, 0.1),
    # Dataset('covid_ct_scan', covid_ct_scan_samples, 0.15),
]

In [None]:
# Write train.csv / test.csv for every dataset; the AutoML wrappers below read these files.
for d in datasets:
    d.make_splits()

# Implement system wrappers

In [None]:
class AutoMLSystem:
    """Common interface of the AutoML systems compared in the benchmark.

    The base methods do nothing and return ``None``; each wrapper overrides them.
    """

    def set_dataset(self, dataset):
        """Prepare ``dataset`` (a ``Dataset`` object) for training and testing."""

    def fit(self, time_budget):
        """Train on the train split within ``time_budget``."""

    def predict_test(self):
        """Predict the test split; wrappers return ``(predictions, labels)``."""

## AutoGluon

In [None]:
import autogluon.core as ag
from autogluon.vision import ImagePredictor, ImageDataset


class AutoGluonImageClassiferAML(AutoMLSystem):
    """AutoMLSystem wrapper around AutoGluon's ``ImagePredictor``."""

    def set_dataset(self, dataset):
        """Load the train and test split CSVs of ``dataset`` as ``ImageDataset`` objects."""
        self.dataset = dataset

        def load_split(csv_path):
            return ImageDataset(csv_path, dataset.classes, image_column=PATH)

        self.train_dataset = load_split(dataset.train_csv_path)
        self.test_dataset = load_split(dataset.test_csv_path)

    def fit(self, time_budget):
        """Train a predictor with ``time_budget`` as AutoGluon's ``time_limit``.

        Artifacts are written to ``outputs/ag/<time_budget>/<dataset name>``.
        """
        output_dir = os.path.join(
            'outputs', 'ag', str(time_budget), self.dataset.name)
        self.aml = ImagePredictor(path=output_dir)
        # No validation split is passed in; per the original author's note,
        # `fit` then splits the training data randomly with a 90/10 ratio.
        self.aml.fit(
            self.train_dataset,
            time_limit=time_budget,
            presets='medium_quality_faster_train',
        )
        print('fit_summary:', self.aml.fit_summary())

    def predict_test(self):
        """Return ``(predictions, labels)`` for the test split."""
        true_labels = self.test_dataset[LABEL]
        predictions = self.aml.predict(self.test_dataset)
        return predictions, true_labels

In [None]:
# Smoke test: fit AutoGluon on the first dataset with a budget of 60 and
# predict its test split (p = predictions, l = labels).
dataset = datasets[0]
agaml = AutoGluonImageClassiferAML()
agaml.set_dataset(dataset)
agaml.fit(60)
p, l = agaml.predict_test()
# p, l

In [None]:
# NOTE(review): presumably lists the model names the predictor can choose from -
# confirm against the autogluon.vision documentation.
agaml.aml.list_models()

## AutoKeras

In [None]:
!rm -r datasets/_tf

In [None]:
import autokeras as ak
import time


class AutoKerasImageClassiferAML(AutoMLSystem):
    """AutoMLSystem wrapper around AutoKeras' ``ImageClassifier``."""

    def set_dataset(self, dataset):
        """Copy the splits into ``<split>/<class>/`` folders and open them as tf datasets.

        The images listed in the train/test CSVs of ``dataset`` are copied to
        ``DATASETS_ROOT/_tf/<dataset name>/<split>/<class>/`` and loaded with
        ``tf.keras.utils.image_dataset_from_directory`` (unbatched, 256x256).
        """
        self.dataset = dataset

        def convert_to_tf_dataset_compatible_dir():
            new_dataset_dir = Path(DATASETS_ROOT) / '_tf' / dataset.name
            train_dir = new_dataset_dir / 'train'
            test_dir = new_dataset_dir / 'test'
            # The copy is cached per dataset name only: an existing folder is
            # reused as is, even after the split CSVs were regenerated. Delete
            # DATASETS_ROOT/_tf to force a rebuild.
            if new_dataset_dir.exists():
                return train_dir, test_dir

            train_df, test_df = dataset.train_test_index_df()
            for sp_dir, df in ((train_dir, train_df), (test_dir, test_df)):
                for i, r in df.iterrows():
                    class_dir = sp_dir / r[CLASS]
                    class_dir.mkdir(parents=True, exist_ok=True)
                    src = Path(r[PATH])
                    # Prefix the file name with the row index: samples coming
                    # from different source folders may share a file name and
                    # would otherwise overwrite each other in class_dir.
                    shutil.copy(src, class_dir / f'{i}_{src.name}')
            return train_dir, test_dir

        train_dir, test_dir = convert_to_tf_dataset_compatible_dir()
        image_size = (256, 256)

        self.train_dataset = tf.keras.utils.image_dataset_from_directory(
            train_dir, class_names=dataset.classes,
            batch_size=None, image_size=image_size)
        # shuffle=False keeps a fixed order, so predictions and labels read
        # from this dataset in predict_test line up.
        self.test_dataset = tf.keras.utils.image_dataset_from_directory(
            test_dir, class_names=dataset.classes,
            batch_size=None, image_size=image_size, shuffle=False)

    def fit(self, time_budget):
        """Run search rounds until ``time_budget`` seconds have elapsed.

        The budget is only checked between rounds, so the last round can
        overrun it. Artifacts go to ``outputs/ak/<time_budget>``.
        """
        start_time = time.time()
        elapsed_time = 0

        while elapsed_time < time_budget:
            print('start new trial ...')
            # NOTE(review): overwrite=False presumably makes later rounds
            # continue the same project instead of starting over - confirm
            # against the AutoKeras documentation.
            self.aml = ak.ImageClassifier(
                project_name=self.dataset.name, max_trials=2,
                directory=os.path.join('outputs', 'ak', str(time_budget)),
                overwrite=False)
            self.aml.fit(self.train_dataset, epochs=20, verbose=True)
            # self.aml.fit(self.train_dataset, verbose=True)
            elapsed_time = int(time.time() - start_time)
            print('elapsed time:', elapsed_time)

    def predict_test(self):
        """Return ``(predictions, labels)`` for the test split as integer arrays."""
        labels = self.test_dataset.map(lambda x, y: y)
        labels_list = [label.numpy() for label in labels]
        predictions = self.aml.predict(self.test_dataset).reshape(-1).astype(int)
        return predictions, np.array(labels_list)

In [None]:
# Smoke test: fit AutoKeras on the first dataset with a budget of 10 seconds.
# The budget is only checked between search rounds, so one full round always runs.
dataset = datasets[0]
akaml = AutoKerasImageClassiferAML()
akaml.set_dataset(dataset)
akaml.fit(10)
akaml.predict_test()

# Benchmarking

In [None]:
!rm -r outputs

In [None]:
# Systems under test and the time budgets (in seconds) each of them gets per dataset.
systems_cls = [AutoGluonImageClassiferAML, AutoKerasImageClassiferAML]
time_budgets = [5*60, 15*60]

In [None]:
# Lower bound for the whole benchmark: every system gets every budget on every
# dataset; the budgets are in seconds, hence the division by 3600.
min_train_time_h = sum(time_budgets) * len(systems_cls) * len(datasets) / 3600
print('Minimum required time (h):', min_train_time_h)

In [None]:
from sklearn.metrics import (
    accuracy_score, f1_score
)

def calculate_metrics(predictions, labels):
    """Score ``predictions`` against ``labels``.

    Returns:
        A dict with the keys ``accuracy`` and ``f1_macro`` (macro-averaged F1).
    """
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro'),
    }

In [None]:
import time


def run(system_cls, time_budget, dataset):
    """Benchmark one AutoML system on one dataset with one time budget.

    Args:
        system_cls: ``AutoMLSystem`` subclass to instantiate.
        time_budget: budget handed to the system's ``fit``.
        dataset: ``Dataset`` to train and test on.

    Returns:
        A result dict with the keys ``system``, ``budget``, ``dataset`` and
        ``status`` (``'success'`` or ``'failed'``). Each finished stage adds
        its duration in seconds (``load_time``, ``train_time``,
        ``inference_time``); a successful run adds one ``metric_<name>`` entry
        per metric. A failed run adds ``exception`` holding the exception type
        and message. Exceptions are not re-raised, so one failing combination
        does not stop the benchmark.
    """
    result = {
        'system': system_cls.__name__,
        'budget': time_budget,
        'dataset': dataset.name,
        'status': 'failed'
    }

    try:
        print('start loading system ...')
        t = time.time()
        aml = system_cls()
        aml.set_dataset(dataset)
        result['load_time'] = time.time() - t

        print('start training ...')
        t = time.time()
        aml.fit(time_budget)
        result['train_time'] = time.time() - t

        print('start predicting ...')
        t = time.time()
        predictions, labels = aml.predict_test()
        result['inference_time'] = time.time() - t

        print('caculating metrics ...')
        for metric_name, value in calculate_metrics(predictions, labels).items():
            result[f'metric_{metric_name}'] = value

        result['status'] = 'success'

    except Exception as e:
        import traceback

        # Keep the exception type: str(e) alone is empty for exceptions
        # raised without a message and hides what kind of error occurred.
        message = f'{type(e).__name__}: {e}'
        print('EXCEPTION:', message)
        # Show where it failed; the exception itself is swallowed on purpose.
        traceback.print_exc()
        result['exception'] = message

    print(result)
    return result

In [None]:
# Run every (budget, dataset, system) combination.
# NOTE(review): `sys` below is a loop variable holding a system class, not the
# standard-library module of the same name.
all_results = []

for b in time_budgets:
    for dataset in datasets:
        for sys in systems_cls:
            r = run(sys, b, dataset)
            all_results.append(r)
            # results.csv is rewritten after every run, so it always holds
            # the results collected so far.
            pd.DataFrame(all_results).to_csv('results.csv', index=False)

In [None]:
# Colab only: download results.csv through the browser.
from google.colab import files
files.download('results.csv') 

# Result analysis

In [None]:
# Use plotly for every DataFrame/Series .plot() call below.
pd.options.plotting.backend = 'plotly'

# Reload the results from disk, so the analysis also works without re-running the benchmark.
results_df = pd.read_csv('results.csv')
results_df

In [None]:
# One bar chart per metric column: mean value per system over all datasets and budgets.
metrics = [c for c in results_df.columns if c.startswith('metric')]

for metric in metrics:
    # .T on the grouped result (a Series) leaves it unchanged.
    display(results_df.groupby('system')[metric].mean().T.plot(kind='bar', barmode='group', title=f'{metric}'))

In [None]:
# Standard deviation of macro-F1 per dataset, taken over all systems and budgets.
results_df.groupby(['dataset'])['metric_f1_macro'].std()

In [None]:
# quality = row-wise maximum of the listed metrics; a missing metric value
# (e.g. from a failed run) counts as 0.
quality_metrics = ['metric_f1_macro']
results_df['quality'] = results_df[quality_metrics].fillna(0).max(axis=1)
results_df

In [None]:
def covert_setting_index_to_col(series, main_col):
    """Turn a series indexed by 'system' and ``main_col`` into one column per system.

    Args:
        series: values whose index has a level named 'system' and a level
            named ``main_col``.
        main_col: name of the index level that becomes the index of the result.

    Returns:
        A DataFrame indexed by ``main_col`` with one column per system; when
        several values fall into the same cell, the maximum is kept.
    """
    flat = series.to_frame(name='unk').reset_index()
    records = [
        {main_col: key, system: value}
        for key, system, value in zip(flat[main_col], flat['system'], flat['unk'])
    ]
    return pd.DataFrame(records).groupby(main_col).max()

In [None]:
# Mean quality per (system, budget), reshaped to one row per budget and one
# column per system, then transposed for the grouped bar chart.
d = covert_setting_index_to_col(results_df.groupby(['system', 'budget'])['quality'].mean(), 'budget')
d.T.plot(kind='bar', barmode='group')