# Install requirements

In [None]:
!pip install autokeras

!pip install autogluon
!pip install mxnet>=1.9

# Download and read dataset

In [None]:
import os
import datetime
import time

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak
from matplotlib import pyplot as plt

# Global backend for pandas `.plot()` calls; the commented line is the plotly
# alternative (individual calls may still override it via `backend=`).
# pd.options.plotting.backend = 'plotly'
pd.options.plotting.backend = 'matplotlib'

## Weather dataset

In [None]:
def get_weather_dataset():
    """Download the Jena climate archive and return it as a DataFrame.

    The archive is cached under the local 'dataset' directory and the
    'Date Time' column is parsed into pandas timestamps.
    """
    archive_path = tf.keras.utils.get_file(
        fname='jena_climate_2009_2016.csv.zip',
        origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
        cache_dir='dataset',
        extract=True,
    )
    # Dropping the '.zip' suffix gives the path where the extracted CSV is
    # expected (assumes get_file returns the archive path — TODO confirm for
    # the installed TensorFlow/Keras version).
    extracted_csv = os.path.splitext(archive_path)[0]

    weather = pd.read_csv(extracted_csv)
    weather['Date Time'] = pd.to_datetime(weather['Date Time'], format='%d.%m.%Y %H:%M:%S')
    return weather

In [None]:
# Load the weather data, then show its row count and first rows.
weather_df = get_weather_dataset()
print('dataset size:', len(weather_df))
weather_df.head()

In [None]:
# Summarise the columns of the weather frame.
weather_df.info()

In [None]:
# Plot the 'p (mbar)' column against the timestamp index.
weather_df.set_index('Date Time', drop=True)['p (mbar)'].plot()

In [None]:
# Plot the 'T (degC)' column against the timestamp index.
weather_df.set_index('Date Time', drop=True)['T (degC)'].plot()

In [None]:
# Plot the 'wv (m/s)' column against the timestamp index.
weather_df.set_index('Date Time', drop=True)['wv (m/s)'].plot()

In [None]:
# Tabulate the gaps between consecutive (sorted) timestamps to see how
# regular the sampling interval is.
dt = weather_df['Date Time'].sort_values()
interval = dt - dt.shift(1)
interval.value_counts()

## Air quality

In [None]:
def get_air_quality_dataset():
    """Download the UCI Air Quality archive and return a cleaned DataFrame.

    The raw 'Date' and 'Time' columns are merged into 'Date_Time' and parsed
    into timestamps, the last two raw columns and every row with missing
    values are dropped, and all columns after the first are cast to float
    (assumes 'Date_Time' ends up as the first column).
    """
    archive_path = tf.keras.utils.get_file(
        origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip",
        fname="AirQualityUCI.csv.zip",
        cache_dir='dataset',
        extract=True,
    )
    # The archive is stored as '<csv name>.zip', so stripping the suffix gives
    # the path where the extracted CSV is expected.
    extracted_csv = os.path.splitext(archive_path)[0]

    raw = pd.read_csv(extracted_csv, sep=";", parse_dates=[['Date', 'Time']])
    kept_columns = raw.columns[:-2]
    air = raw[kept_columns].dropna()
    # Replace commas with dots (presumably decimal commas) so the float cast
    # below succeeds.
    air = air.replace(",", ".", regex=True)
    air['Date_Time'] = pd.to_datetime(air['Date_Time'], format='%d/%m/%Y %H.%M.%S')

    value_columns = air.columns[1:]
    air[value_columns] = air[value_columns].astype(float)
    return air

In [None]:
# Load the air-quality data, then show its row count and first rows.
air_df = get_air_quality_dataset()
print('dataset size:', len(air_df))
air_df.head()

In [None]:
# Summarise the columns of the air-quality frame.
air_df.info()

In [None]:
# Plot the 'AH' column over time; this call uses the plotly backend instead
# of the globally configured one.
air_df.set_index('Date_Time', drop=True)['AH'].plot(backend='plotly')

In [None]:
# Tabulate the gaps between consecutive (sorted) timestamps to see how
# regular the sampling interval is.
dt = air_df['Date_Time'].sort_values()
interval = dt - dt.shift(1)
interval.value_counts()

## Covid

In [None]:
def get_covid_dataset():
    """Fetch the AutoGluon Covid training CSV with 'Date' parsed as timestamps."""
    source_url = "https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv"
    return pd.read_csv(source_url, parse_dates=["Date"])

In [None]:
# Load the covid data, then show its row count and first rows.
covid_df = get_covid_dataset()
print('dataset count:', len(covid_df))
covid_df.head()

In [None]:
# Summarise the columns of the covid frame.
covid_df.info()

In [None]:
# Histogram of the 'Date' column (plotly backend): how many rows fall on each date range.
covid_df['Date'].plot(kind='hist', backend='plotly')

In [None]:
# Histogram of the 'name' column (plotly backend): how many rows each series name has.
covid_df['name'].plot(kind='hist', backend='plotly')

# Define datasets

In [None]:
from autogluon.timeseries import TimeSeriesDataFrame


class Dataset:
    """Benchmark dataset descriptor that stores its train/test splits as CSV files.

    `split()` converts the source frame to a `TimeSeriesDataFrame`, holds out
    the last `prediction_length` time steps of every series, and writes both
    the full data (test) and the truncated data (train) under `SPLITS_DIR`.
    `read_split()` loads one of those files back as a plain DataFrame.
    """

    # Directory (relative to the working directory) holding the split CSVs.
    SPLITS_DIR = 'splits'

    def __init__(self, name, df, time_col, target_col, prediction_length, item_id_col=None):
        """Describe a dataset.

        Args:
            name: Identifier used in the split file names.
            df: Source DataFrame.
            time_col: Name of the timestamp column in `df`.
            target_col: Name of the column to forecast.
            prediction_length: Number of trailing time steps per series that
                are held out of the train split; must be positive.
            item_id_col: Column in `df` identifying each series, or None when
                `df` holds a single series.

        Raises:
            ValueError: If `prediction_length` is not positive. A value of 0
                would make the train slice `[:-0]`, i.e. empty.
        """
        if prediction_length <= 0:
            raise ValueError(
                f'prediction_length must be positive, got {prediction_length}')

        self.name = name
        self.df = df
        self.initial_time_col = time_col
        self.initial_item_id = item_id_col
        self.target_col = target_col
        self.prediction_length = prediction_length

        # Column names expected in the split CSVs (presumably the index level
        # names TimeSeriesDataFrame writes out — confirm against AutoGluon).
        self.time_col = 'timestamp'
        self.item_id = 'item_id'

    def _split_path(self, split):
        """Return the CSV path for the given split name (e.g. 'train', 'test')."""
        return os.path.join(self.SPLITS_DIR, f'{self.name}_{split}.csv')

    def split(self):
        """Write `<name>_test.csv` (full data) and `<name>_train.csv` (truncated)."""
        data = self.df.copy()
        if self.initial_item_id is None:
            # Single-series data: give every row the same constant item id.
            data[self.item_id] = 0
        else:
            data = data.rename(columns={self.initial_item_id: self.item_id})

        data = TimeSeriesDataFrame.from_data_frame(
            data,
            id_column=self.item_id,
            timestamp_column=self.initial_time_col
        )

        test_data = data.copy()  # the full data set
        # the data set with the last prediction_length time steps excluded, i.e., akin to `a[:-5]`
        train_data = data.slice_by_timestep(slice(None, -self.prediction_length))

        os.makedirs(self.SPLITS_DIR, exist_ok=True)
        test_data.to_csv(self._split_path('test'))
        train_data.to_csv(self._split_path('train'))

    def read_split(self, split):
        """Load a split written by `split()` with its time column parsed as datetimes."""
        df = pd.read_csv(self._split_path(split))
        df[self.time_col] = pd.to_datetime(df[self.time_col])
        return df


In [None]:
# Datasets used by the benchmark. `prediction_length` is the number of trailing
# time steps held out per series (NOTE(review): presumably hours for the air
# quality data and days for covid — confirm against the sampling intervals).
# The covid datasets contain several series, identified by the 'name' column.
datasets = [
    # Dataset('weather_hour', weather_df, 'Date Time', 'T (degC)', prediction_length=12),
    Dataset('air_quality_day', air_df, 'Date_Time', 'AH', prediction_length=24),
    Dataset('air_quality_week', air_df, 'Date_Time', 'AH', prediction_length=24*7),
    Dataset('covid_3day', covid_df, 'Date', 'ConfirmedCases', prediction_length=3, item_id_col='name'),
    Dataset('covid_week', covid_df, 'Date', 'ConfirmedCases', prediction_length=7, item_id_col='name'),
]

In [None]:
# Write the train/test CSVs for every dataset.
# Note: `d` stays bound at module level after the loop.
for d in datasets:
    print(f'splitting {d.name}')
    d.split()

## Check

In [None]:
!head covid_week_test.csv

In [None]:
# Last rows of the in-memory air-quality frame, for comparison with the split files.
air_df.tail()

In [None]:
!tail air_quality_day_test.csv

In [None]:
!tail air_quality_day_train.csv

# AutoML systems

In [None]:
class TimeSeriesAutoMLSystem:
    """Common interface of the benchmarked AutoML systems.

    The methods here are no-op placeholders that return None; subclasses
    override them.
    """

    def set_dataset(self, dataset):
        """Bind the dataset to train and evaluate on."""
        return None

    def fit(self, time_budget):
        """Train within the given time budget."""
        return None

    def predict_test(self):
        """Produce forecasts for the held-out horizon."""
        return None

## AutoGluon

In [None]:
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame


class AutoGluonTSSystem(TimeSeriesAutoMLSystem):
    """Benchmark adapter around AutoGluon's `TimeSeriesPredictor`."""

    def _load_split(self, split):
        """Read one stored split and wrap it as a `TimeSeriesDataFrame`."""
        return TimeSeriesDataFrame.from_data_frame(
            self.dataset.read_split(split),
            id_column=self.dataset.item_id,
            timestamp_column=self.dataset.time_col,
        )

    def set_dataset(self, dataset: Dataset):
        """Remember the dataset and load its train and test splits."""
        self.dataset = dataset
        self.train = self._load_split('train')
        self.test = self._load_split('test')

    def fit(self, time_budget: int):
        """Train a predictor on the train split, limited to `time_budget`.

        Models are stored under 'outputs/autogluon-<budget>/<dataset name>'.
        """
        output_dir = os.path.join('outputs', f'autogluon-{time_budget}', self.dataset.name)
        predictor = TimeSeriesPredictor(
            path=output_dir,
            target=self.dataset.target_col,
            prediction_length=self.dataset.prediction_length,
            eval_metric='MAPE',
        )
        # Publish the predictor before fitting so it stays reachable even if
        # fitting raises.
        self.aml = predictor
        predictor.fit(
            train_data=self.train,
            presets='medium_quality',
            time_limit=time_budget,
        )

    def predict_test(self):
        """Forecast from the train split and return the 'mean' column."""
        forecast = self.aml.predict(self.train)
        return forecast['mean']

In [None]:
# Smoke test: train AutoGluon on the first dataset with a budget of 60 and forecast.
aml = AutoGluonTSSystem()
aml.set_dataset(datasets[0])
aml.fit(time_budget=60)
p = aml.predict_test()

In [None]:
# Leaderboard of the trained models, evaluated on the test split.
aml.aml.leaderboard(aml.test, silent=True)

In [None]:
# Display the mean forecasts.
aml.predict_test()

## AutoKeras

In [None]:
import autokeras as ak


class AutoKerasTSAutoML(TimeSeriesAutoMLSystem):
    """Benchmark adapter around AutoKeras' `TimeseriesForecaster`.

    Only datasets holding a single series are supported.
    """

    def set_dataset(self, dataset):
        """Load the stored splits, sorted by time, and pick the feature columns.

        Raises:
            ValueError: If the train split contains more than one item id.
        """
        self.dataset = dataset
        self.train_df = self.dataset.read_split('train').sort_values(dataset.time_col)
        self.test_df = self.dataset.read_split('test').sort_values(dataset.time_col)

        n_items = len(self.train_df[self.dataset.item_id].unique())
        if n_items > 1:
            raise ValueError("Can't make traning on multiple items datasets")

        # Features are every column except the target, the item id and the timestamp.
        self._feature_cols = list(self.train_df.columns)
        self._feature_cols.remove(self.dataset.target_col)
        self._feature_cols.remove(self.dataset.item_id)
        self._feature_cols.remove(self.dataset.time_col)

    def fit(self, time_budget):
        """Run one-trial searches repeatedly until `time_budget` seconds have elapsed.

        The budget is only checked between trials, so a trial that is already
        running is allowed to finish.
        """
        x, y = self.train_df[self._feature_cols], self.train_df[[self.dataset.target_col]]
        start_time = time.time()
        elapsed_time = 0
        # Use the dataset bound by set_dataset(); reading a notebook-level
        # global here would train with whatever dataset it last pointed to.
        lb = self.dataset.prediction_length
        # Batch size: the lookback itself when it is at most 64, otherwise the
        # lookback divided into `lb // 64` chunks.
        bs = 64
        if lb > bs:
            c = int(lb / bs)
            bs = int(lb / c)
        else:
            bs = lb

        while elapsed_time < time_budget:
            print('start new trial ...')
            self.aml = ak.TimeseriesForecaster(
                lookback=lb,
                predict_from=1,
                predict_until=lb,
                max_trials=1,
                directory=os.path.join('outputs', f'autokeras_{time_budget}'),
                project_name=self.dataset.name,
                # metrics='mean_absolute_percentage_error',
                overwrite=False
            )
            self.aml.fit(x, y, verbose=True, batch_size=bs, epochs=10)
            elapsed_time = int(time.time() - start_time)
            print('elapsed time:', elapsed_time)

    def predict_test(self):
        """Predict from the feature columns of the test split."""
        x = self.test_df[self._feature_cols]
        p = self.aml.predict(x)
        return p

In [None]:
# Smoke test: train AutoKeras on the first dataset with a 60-second budget.
d = datasets[0]

akml = AutoKerasTSAutoML()
akml.set_dataset(d)
akml.fit(60)

In [None]:
# Debugging cell: build the prediction input by hand.
# `self` is bound at module level here, presumably so lines can be pasted
# to and from method bodies unchanged.
self = akml
autokeras_aml = self.aml

x = self.train_df[self._feature_cols]
x = autokeras_aml.read_for_predict(x)
x
# p = self.aml.predict(x, y=self.dataset.target_col)
# p

In [None]:
# Call the `predict` of the forecaster's parent class, bypassing the override
# defined on the forecaster's own class.
y_pred = super(type(autokeras_aml), autokeras_aml).predict(x=x)

In [None]:
# Display the raw predictions.
y_pred

# Benchmarking

In [None]:
# Benchmark configuration: systems to run, time budgets in seconds, and datasets.
# Only the AutoGluon system is enabled.
systems_cls = [AutoGluonTSSystem]
time_budgets = [5*60, 15*60]   # [5*60, 20*60]
selected_datasets = datasets

In [None]:
from sklearn.metrics import mean_absolute_percentage_error


def calculate_metrics(predictions, dataset):
    """Score forecasts against the stored test split.

    Args:
        predictions: Forecast values whose index identifies the rows of the
            test split (indexed by item id and timestamp) to compare against.
        dataset: Dataset providing `read_split`, `item_id`, `time_col` and
            `target_col`.

    Returns:
        A dict mapping metric name to value; currently only 'MAPE'.
    """
    index_cols = [dataset.item_id, dataset.time_col]
    truth = dataset.read_split('test').set_index(index_cols, drop=True)
    # Keep only the ground-truth rows that were actually forecast.
    aligned_truth = truth.loc[predictions.index][dataset.target_col]

    return {
        'MAPE': mean_absolute_percentage_error(aligned_truth, predictions),
    }

In [None]:
def run(system_cls, time_budget, dataset):
    """Run one benchmark combination and return its result record.

    The record always holds 'system', 'budget', 'dataset' and 'status'.
    Timings ('load_time', 'train_time', 'inference_time') and 'metric_*'
    entries are added as each stage completes. Any exception is caught,
    printed and stored under 'exception', leaving 'status' as 'failed'.
    """
    system_name = system_cls.__name__
    result = dict(
        system=system_name,
        budget=time_budget,
        dataset=dataset.name,
        status='failed',
    )

    try:
        print(f'Start run:\n\tSystem: {system_name}\n\tBudget: {time_budget}\n\tDataset: {dataset.name}\n')

        print('start loading system ...')
        stage_start = time.time()
        aml = system_cls()
        aml.set_dataset(dataset)
        result['load_time'] = time.time() - stage_start

        print('start training ...')
        stage_start = time.time()
        aml.fit(time_budget)
        result['train_time'] = time.time() - stage_start

        print('start predicting ...')
        stage_start = time.time()
        predictions = aml.predict_test()
        result['inference_time'] = time.time() - stage_start

        print('caculating metrics ...')
        metrics = calculate_metrics(predictions, dataset)
        result.update({f'metric_{key}': value for key, value in metrics.items()})

        result['status'] = 'success'

    except Exception as e:
        # A failing combination must not abort the whole benchmark.
        print('EXCEPTION:', e)
        result['exception'] = str(e)

    print(result)
    return result

In [None]:
# Run every (budget, dataset, system) combination.
# results.csv is rewritten after each run, so completed results are already
# on disk if a later run is interrupted.
# NOTE(review): the loop variable `sys` would shadow the standard `sys` module
# if it were imported in this notebook.
all_results = []

for b in time_budgets:
    for dataset in selected_datasets:
        for sys in systems_cls:
            r = run(sys, b, dataset)
            all_results.append(r)
            pd.DataFrame(all_results).to_csv('results.csv', index=False)

# Results

In [None]:
# Reload the saved benchmark results from disk and display them.
results_df = pd.read_csv('results.csv')
results_df

# Draft

## Air quality

In [None]:
# Draft: convert the air-quality frame without passing an id column.
# NOTE(review): air_df has no 'item_id' column unless the following cell has
# already run, so this call presumably fails on a fresh kernel — confirm.
air_train_data = TimeSeriesDataFrame.from_data_frame(
    air_df,
    timestamp_column="Date_Time",
)

In [None]:
# Add a constant item id, then convert. Note this mutates the shared `air_df`
# frame in place.
air_df['item_id'] = 0
air_train_data = TimeSeriesDataFrame.from_data_frame(
    air_df,
    timestamp_column="Date_Time",
)

In [None]:
# Hold out the last `prediction_length` time steps for testing.
prediction_length = 24 * 7

air_test_data = air_train_data.copy()  # the full data set

# the data set with the last prediction_length time steps excluded, i.e., akin to `a[:-5]`
air_train_data = air_train_data.slice_by_timestep(slice(None, -prediction_length))

In [None]:
# Sanity check: number of held-out rows (expected to equal prediction_length
# for a single series).
len(air_test_data) - len(air_train_data)

In [None]:
# Train a predictor for the 'AH' target with the 'low_quality' preset and a
# time limit of 300.
air_predictor = TimeSeriesPredictor(
    path="autogluon-air-quality-test",
    target="AH",
    prediction_length=prediction_length,
    eval_metric="MAPE",
)
air_predictor.fit(
    train_data=air_train_data,
    presets="low_quality",
    time_limit=300
)

In [None]:
# Leaderboard of the trained models, evaluated on the full data.
air_predictor.leaderboard(air_test_data, silent=True)

In [None]:
# NOTE(review): this forecasts from the full data (air_test_data), whereas the
# covid draft forecasts from the train split — presumably the result lies
# beyond the last observed timestamp; confirm this is intended.
predictions = air_predictor.predict(air_test_data)
predictions

In [None]:
# Number of forecast rows.
len(predictions)

## Covid

In [None]:
# Convert the covid frame, using 'name' as the series id and 'Date' as the timestamp.
train_data = TimeSeriesDataFrame.from_data_frame(
    covid_df,
    id_column="name",
    timestamp_column="Date",
)

In [None]:
# Plot the series of two example items on one wide figure.
plt.figure(figsize=(20, 3))
for country in ["United Kingdom_", "Germany_"]:
    plt.plot(train_data.loc[country], label=country)
plt.legend()

In [None]:
# Hold out the last `prediction_length` time steps of every series for testing.
prediction_length = 7

test_data = train_data.copy()  # the full data set

# the data set with the last prediction_length time steps excluded, i.e., akin to `a[:-5]`
train_data = train_data.slice_by_timestep(slice(None, -prediction_length))

In [None]:
# Display the truncated training data.
train_data

In [None]:
# Visualise the train/test split for one series and shade the held-out window.
plt.figure(figsize=(20, 3))
plt.plot(test_data.loc["Germany_"], label="test")
plt.plot(train_data.loc["Germany_"], label="train")

# Bounds of the held-out window: last test timestamp and last train timestamp.
test_range = (
    test_data.loc["Germany_"].index.max(),
    train_data.loc["Germany_"].index.max(),
)

# Shade between the two bounds, from 0 up to the series maximum.
plt.fill_betweenx(
    y=(0, test_data.loc["Germany_"]["ConfirmedCases"].max()),
    x1=test_range[0],
    x2=test_range[1],
    alpha=0.1,
    label="test forecast horizon",
)

plt.legend()

In [None]:
# Train a predictor for 'ConfirmedCases' with the 'medium_quality' preset and
# a time limit of 15*60.
prediction_length=7

predictor = TimeSeriesPredictor(
    path="autogluon-covidforecast",
    target="ConfirmedCases",
    prediction_length=prediction_length,
    eval_metric="MAPE",
)
predictor.fit(
    train_data=train_data,
    presets="medium_quality",
    time_limit=15*60
)

In [None]:
# Leaderboard of the trained models, evaluated on the full data.
predictor.leaderboard(test_data, silent=True)

In [None]:
# Forecast from the truncated training data.
predictions = predictor.predict(train_data)
predictions

In [None]:
# List the item ids present in the forecasts.
predictions.reset_index()['item_id'].unique()

In [None]:
# Series to plot. Other ids tried: 'Afghanistan_', 'France_', 'Germany_', 'Cyprus_'.
country = 'Iran_'
plt.figure(figsize=(20, 3))

ytrue = train_data.loc[country]["ConfirmedCases"]
# Copy so that the continuity row added below is never written into `predictions`.
ypred = predictions.loc[country].copy()

# prepend the last value of true range to predicted range for plotting continuity
# (one value per forecast column, however many columns the predictor returns)
ypred.loc[ytrue.index[-1]] = [ytrue.iloc[-1]] * ypred.shape[1]
ypred = ypred.sort_index()

# Show the whole held-out horizon, not a fixed number of trailing steps.
ytrue_test = test_data.loc[country]["ConfirmedCases"].iloc[-prediction_length:]

plt.plot(ytrue.iloc[-30:], label="Training Data")
plt.plot(ypred["mean"], label="Mean Forecasts")
plt.plot(ytrue_test, label="Actual")

# Shade the band between the 0.1 and 0.9 forecast columns.
plt.fill_between(
    ypred.index, ypred["0.1"], ypred["0.9"], color="red", alpha=0.1
)
plt.title(f"COVID Case Forecasts in {country}, compared to actual trajectory")
_ = plt.legend()