### Demo Notebook that uses the FLAML framework's AutoML feature for task-oriented ML models

In [None]:
# Install FLAML and the OpenML client into the kernel's environment.
%pip install flaml openml
# Upgrade pandas and install dask with its "complete" extra.
%pip install --upgrade pandas "dask[complete]"
# Install FLAML again, this time with its "automl" extra.
%pip install "flaml[automl]" 

**NOTE:** On an M1 Mac, installing `flaml[automl]` requires `libomp`; install it first with `brew install libomp`.

2. Classification Example

Download **[Telco Churn](https://www.openml.org/search?type=data&sort=runs&id=42178&status=active)** Dataset from OpenML datasets. The task is to predict if a customer is going to churn.

In [None]:
from flaml.automl.data import load_openml_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load OpenML dataset 42178 as a ready-made train/test split. If anything goes
# wrong (e.g. no network access), fall back to synthetic classification data so
# the remaining cells can still run.
try:
    X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=42178, data_dir='./')
except Exception as download_error:
    # Catch `Exception` only: the handler used to also name `ServerError`, which
    # is never imported, so a failed download raised NameError instead of
    # reaching this fallback.
    from sklearn.datasets import make_classification

    # Make the substitution visible; results below are NOT about Telco churn
    # when this branch runs.
    print('Could not load OpenML dataset 42178 ({0!r}); '
          'using synthetic classification data instead.'.format(download_error))

    X, y = make_classification(n_samples=539383, n_features=7)
    X = pd.DataFrame(X)  # wrap in a DataFrame so the later X_train.head() call works
    X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Preview the first rows of the training features.
X_train.head()

3. Run FLAML

In [None]:
''' import AutoML class from flaml package '''
from flaml import AutoML
# Single AutoML instance that the following cells fit, query, pickle and use for prediction.
automl = AutoML()

In [None]:
# Keyword arguments that are later expanded into automl.fit(**settings).
# Metric options are listed in the documentation:
# https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric
settings = dict(
    time_budget=600,                       # total running time in seconds
    metric='accuracy',                     # optimization metric
    task='classification',                 # task type
    log_file_name='chrun-experiment.log',  # flaml log file, read back later via settings['log_file_name']
    seed=7654321,                          # random seed
)

In [None]:
# Run the AutoML search on the training split; `settings` is expanded into keyword arguments.
automl.fit(X_train=X_train, y_train=y_train, **settings)

In [None]:
'''retrieve best config and best learner'''
# Summarise the winning trial. Labels are spelled out correctly
# ("learner", "hyperparameter") so the printed report reads cleanly.
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
# The optimisation metric is 'accuracy' and best_loss is reported as a loss,
# so accuracy is recovered as 1 - loss.
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

In [None]:
# Display the estimator held by the selected model (presumably the underlying fitted learner — confirm in the FLAML docs).
automl.model.estimator

In [None]:
'''pickle and save the automl object'''
import pickle

# Serialise the AutoML object to disk with the newest pickle protocol.
with open('automl.pkl', 'wb') as pickle_out:
    pickle.dump(automl, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)
'''load pickled automl object'''
# Read it straight back; the file was written just above, so it is trusted input.
with open('automl.pkl', 'rb') as pickle_in:
    automl = pickle.load(pickle_in)

In [None]:
'''compute predictions of testing dataset'''
# Hard label predictions for the held-out split.
y_pred = automl.predict(X_test)
# Keep only column 1 of the probability matrix
# (presumably the positive class — confirm against the model's class order).
y_pred_proba = automl.predict_proba(X_test)[:, 1]

print('Predicted labels', y_pred)
print('True labels', y_test)

In [None]:
''' compute different metric values on testing dataset'''
# Import from flaml.automl.ml so this cell uses the same package layout as the
# other cells (flaml.automl.data) rather than the legacy top-level flaml.ml path.
from flaml.automl.ml import sklearn_metric_loss_score

# The helper is used as a loss here: accuracy and roc_auc are reported as
# 1 - loss, while log_loss is printed as returned.
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))

In [None]:
from flaml.automl.data import get_output_from_log

# Replay the search log written during fit(). The time budget is taken from
# `settings` so the replay covers the whole run; the previous hard-coded 240 s
# was shorter than the 600 s search budget and cut the history short.
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
    get_output_from_log(filename=settings['log_file_name'], time_budget=settings['time_budget'])

# Print every logged configuration, in the order returned from the log.
for config in config_history:
    print(config)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Logged values are losses; convert to accuracies (1 - loss) before plotting.
trial_accuracy = 1 - np.array(valid_loss_history)
best_so_far_accuracy = 1 - np.array(best_valid_loss_history)

fig, ax = plt.subplots()
ax.set_title('Learning Curve')
ax.set_xlabel('Wall Clock Time (s)')
ax.set_ylabel('Validation Accuracy')
# One point per logged trial, plus a step line tracking the best value so far.
ax.scatter(time_history, trial_accuracy)
ax.step(time_history, best_so_far_accuracy, where='post')
plt.show()