# Install Packages

In [1]:
# !conda update -y -n base -c defaults conda

In [2]:
# !conda install -y numpy scipy scikit-learn pandas joblib pytorch

In [3]:
# !pip install deap update_checker tqdm stopit xgboost
# !pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 distributed>=2.10.0
# !pip install tpot

In [1]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !conda install -y -c conda-forge ipywidgets

In [2]:
# !pip install IProgress

# Import Packages

In [1]:
import json
import pandas as pd
import numpy as np
import tpot
from autosklearn.metrics import mean_absolute_error as auto_sklearn_MAE
import time

# Read Data

In [2]:
data = {}
for h in [0, 30, 120, 180]:
    data['processed_dep_h{}'.format(h)] = pd.read_csv("/mnt/data/Christophe/processed_dep_h{}.csv".format(h))
data.keys()

dict_keys(['processed_dep_h0', 'processed_dep_h30', 'processed_dep_h120', 'processed_dep_h180'])

In [3]:
with open("/mnt/data/Christophe/csv_docs.json", "r") as f:
    docs = json.load(f)

# Select and Slice Data

In [4]:
h=30
df = data['processed_dep_h{}'.format(h)]

X_train = df[df['dtype']=="TRAIN"]
X_train.pop("dtype")
y_train = X_train.pop("t_taxi")

X_val = df[df['dtype']=="VALIDATE"]
X_val.pop("dtype")
y_val = X_val.pop("t_taxi")

X_test = df[df['dtype']=="TEST"]
X_test.pop("dtype")
y_test = X_test.pop("t_taxi")

# Model Evaluation Function

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time

def model_eval(y, y_pred, name=None, file=None, verbose=True, **kwargs):
    report = {}
    if name:
        report['name'] = name
        if verbose:
            print(name)
    
    report["RMSE"] = mean_squared_error(y, y_pred, squared=False)
    report["MAE"] = mean_absolute_error(y, y_pred)
    report["% <2 min"] = sum(abs(y-y_pred) < 2*60)/len(y)*100
    report["% <5 min"] = sum(abs(y-y_pred) < 5*60)/len(y)*100
    report["% <7 min"] = sum(abs(y-y_pred) < 7*60)/len(y)*100
    report["time"] = str(pd.Timestamp(round(time.time()), unit='s'))
    
    for kwarg in kwargs:
        report[kwarg] = kwargs[kwarg]
    
    if file is not None:
        with open(file, "a") as f:
            f.write(str(report)+"\n")
    if verbose:
        print(report)
    return(report)

In [7]:
# Validation Data is not used separately -> include into training data
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])
preprocessor = gen_preprocessor(list(X_train.columns))
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Model Preprocessor Function

In [6]:
def gen_preprocessor(X_columns):
    ColumnTransformations = []
    num_cols = []
    cat_cols = []
    
    from sklearn.preprocessing import QuantileTransformer
    for col in X_columns:
        if 'circular' in docs[col]['type']:
            ColumnTransformations.append(
            (
                col + '_qcut_' + str(docs[col]['n_bins']),
                QuantileTransformer(n_quantiles=docs[col]['n_bins']),
                [col]
            ))
        if 'num' in docs[col]['type']:
            num_cols.append(col)
        if 'cat' in docs[col]['type']:
            cat_cols.append(col)

    from sklearn.preprocessing import StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    num_trans = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    ColumnTransformations.append(("num_trans", num_trans, num_cols))

    from sklearn.preprocessing import OneHotEncoder
    ColumnTransformations.append(("onehotencode", OneHotEncoder(handle_unknown='ignore'), cat_cols))

    from sklearn.compose import ColumnTransformer
    return ColumnTransformer(ColumnTransformations, remainder='drop', n_jobs=-1)

# TPOT

In [8]:
t0= time.time()
print(pd.Timestamp(t0, unit='s'))

automl = tpot.TPOTRegressor(
    generations=10,
    population_size=10,
    #config_dict='TPOT light',
    verbosity=2,
    #_jobs=2,
)

automl.fit(X_train.toarray(), y_train)
y_pred = automl.predict(X_test.toarray())

t1= time.time()
print(pd.Timestamp(t0, unit='s'))
print(pd.Timedelta(t1-t0, unit='s'))
      
model_eval(y_test, y_pred, name="")

automl.export("automl_tpot_export.py")

2021-04-13 10:24:57.230084658


Traceback (most recent call last):
  File "/home/sim/miniconda3/envs/venv_tf/lib/python3.8/site-packages/stopit/utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "/home/sim/miniconda3/envs/venv_tf/lib/python3.8/site-packages/tpot/decorators.py", line 57, in time_limited_call
    func(*args)
  File "/home/sim/miniconda3/envs/venv_tf/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/sim/miniconda3/envs/venv_tf/lib/python3.8/site-packages/sklearn/linear_model/_coordinate_descent.py", line 1314, in fit
    mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/home/sim/miniconda3/envs/venv_tf/lib/python3.8/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/sim/miniconda3/envs/venv_tf/lib/python3.8/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "

Optimization Progress:   0%|          | 0/110 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -33741.39694242393

Generation 2 - Current best internal CV score: -33741.39694242393

Generation 3 - Current best internal CV score: -33510.42535492859

Generation 4 - Current best internal CV score: -33426.914543837695

Generation 5 - Current best internal CV score: -33173.21287535768

Generation 6 - Current best internal CV score: -31666.31390364268

Generation 7 - Current best internal CV score: -30436.283321665564

Generation 8 - Current best internal CV score: -30202.58603458874

Generation 9 - Current best internal CV score: -30202.58603458874

Generation 10 - Current best internal CV score: -30202.58603458874

Best pipeline: LinearSVR(DecisionTreeRegressor(input_matrix, max_depth=9, min_samples_leaf=6, min_samples_split=6), C=5.0, dual=False, epsilon=0.0001, loss=squared_epsilon_insensitive, tol=1e-05)
2021-04-13 10:24:57.230084658
0 days 05:48:42.815656424
{'RMSE': 166.51795108401382, 'MAE': 121.33314731462242, '% <2 min': 61.006

In [6]:
y_pred = automl.predict(X_test.toarray())

# Evaluate TPOT

In [10]:
for h in [0, 30, 120, 180]:
    df = data['processed_dep_h{}'.format(h)]

    X_test = df[df['dtype']=="TEST"]
    X_test.pop("dtype")
    y_test = X_test.pop("t_taxi")
    
    X_test = preprocessor.transform(X_test)
    
    y_pred = automl.predict(X_test.toarray())
    
    model_eval(y_test, y_pred, name="auto-tpot_G10_N10_h30, SIMPLE TEST, h{}".format(h),  file="model_auto-tpot.txt", t=6*60*60)

auto-tpot_G10_N10_h30, SIMPLE TEST, h0
{'name': 'auto-tpot_G10_N10_h30, SIMPLE TEST, h0', 'RMSE': 162.21414004438628, 'MAE': 118.30037508362065, '% <2 min': 61.56960015874591, '% <5 min': 94.68598075205874, '% <7 min': 98.2478420478222, 'time': '2021-04-13 19:26:31', 't': 21600}
auto-tpot_G10_N10_h30, SIMPLE TEST, h30
{'name': 'auto-tpot_G10_N10_h30, SIMPLE TEST, h30', 'RMSE': 166.51795108401382, 'MAE': 121.33314731462242, '% <2 min': 61.00602661256638, '% <5 min': 93.89780615390735, '% <7 min': 97.82007677466834, 'time': '2021-04-13 19:26:33', 't': 21600}
auto-tpot_G10_N10_h30, SIMPLE TEST, h120
{'name': 'auto-tpot_G10_N10_h30, SIMPLE TEST, h120', 'RMSE': 213.04641690976166, 'MAE': 155.28512968409746, '% <2 min': 51.37614678899083, '% <5 min': 86.87383291385889, '% <7 min': 94.39798652269221, 'time': '2021-04-13 19:26:38', 't': 21600}
auto-tpot_G10_N10_h30, SIMPLE TEST, h180
{'name': 'auto-tpot_G10_N10_h30, SIMPLE TEST, h180', 'RMSE': 209.653821332903, 'MAE': 152.67455369511435, '% <2