In [None]:
import numpy as np
import os
from strat_meta.loader import Dataset
import random

random.seed(42)

In [35]:
loader = Dataset('data', None, None, None)
dataset = loader.load('train_dataset.csv')

In [36]:
print(dataset.shape)
dataset.head(5)

(200000, 8)


Unnamed: 0,duration,dist,valhalla_eta,n_jam,n_slow,n_normal,n_free,hour
0,102,1981.492433,78.575,0,0,0,1,22
1,550,1906.145205,302.529,0,0,1,0,14
2,214,1777.758584,166.511,0,1,2,1,23
3,820,1965.234345,129.549,0,5,1,0,12
4,692,1952.65442,146.625,0,0,1,1,8


In [38]:
from sklearn.model_selection import train_test_split
Y = dataset.pop('duration')
X_train, X_test, y_train, y_test = train_test_split(dataset, Y, test_size=0.2, shuffle=False)

In [41]:
from strat_meta.GME import GeneticMetaEstimator, Method, MetaParameters, Metric
from sklearn.ensemble import RandomForestRegressor as RFR

In [42]:
best_result = dict()

In [45]:
unique_hours = X_train.hour.unique()
unique_hours.sort()

distances = X_train['dist'].to_numpy()
distances.sort()

distances = [ (arr[0], arr[-1]) for arr in np.array_split(distances, 7) ]

In [46]:
def mimmax_norm(x, bounds):
    _min, _max = bounds
    x = np.clip(x, _min, _max)
    return (x - _min) / (_max - _min)

In [47]:
def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

from sklearn.metrics import mean_absolute_error as mae

ranges = dict()

ranges['MAE'] = (0, 1200) #[215.06808055530894, 2394.9915966386557]
ranges['MAPE'] = (0, 90) #[20.396595005977968, 85.07578865159266]
ranges['pred_time'] =(0.0001, 0.009) #[0.0006735324859619141, 0.007704019546508789]
metrics = [Metric('MAE', mae), Metric('MAPE', mape)]

In [48]:
weights = {'MAPE' : 0.5, 'MAE': 0, 'pred_time': 0.5}
def norm_eval(values):
    
    norm_res = dict()
    for key in ('MAPE', 'MAE', 'pred_time'):
        #print(values.method.history)
        norm_res[key] = mimmax_norm(values[key], ranges[key]) * weights[key]

    return norm_res

In [49]:
from datetime import datetime

In [None]:
for hour in unique_hours:
    hourly_batch = X_train.loc[X_train.hour == hour]
    hourly_batch = hourly_batch.sort_values(by=['dist'])
    
    hourly_batch_test = X_test.loc[X_test.hour == hour]
    hourly_batch_test = hourly_batch_test.sort_values(by=['dist'])
    
    for dist_low, dist_high in distances:
        dist_hourly_batch = hourly_batch.loc[hourly_batch.dist > dist_low]
        dist_hourly_batch = dist_hourly_batch.loc[dist_hourly_batch.dist < dist_high]
        
        dist_hourly_batch_test = hourly_batch_test.loc[hourly_batch_test.dist > dist_low]
        dist_hourly_batch_test = dist_hourly_batch_test.loc[dist_hourly_batch_test.dist < dist_high]
        

        rfr_method = Method('RFR', RFR)
        rfr_params = MetaParameters({'n_estimators': (10, 100, int, 3), 
                                 'criterion': ["mae", "mse"],
                                 'min_samples_split': (0.0001, 1.0, float, 0.1),
                                 'max_features' :["auto", "sqrt", "log2"]})

        rfr_estimator = GeneticMetaEstimator(rfr_method, rfr_params, 10, 40, norm_eval, 0.5, 0.3, metrics, True, 
                                                                                                 dist_hourly_batch,
                                                                                                 dist_hourly_batch_test,
                                                                                                 y_train.loc[dist_hourly_batch.index],
                                                                                                 y_test.loc[dist_hourly_batch_test.index])
        
        log_rfr, best_rfr = rfr_estimator.run(verbose=False)
        stats = 'MAPE: '
        stats += str(best_rfr.method.history['MAPE'][best_rfr.best_exp]) + " MAE: "
        stats += str(best_rfr.method.history['MAE'][best_rfr.best_exp]) + " training time: "
        stats += str(best_rfr.method.history['train_time'][best_rfr.best_exp]) + " prediction time: "
        stats += str(best_rfr.method.history['pred_time'][best_rfr.best_exp])
        
        print(f' {datetime.utcnow().isoformat()} --- {hour}:00, {dist_low} - {dist_high}: best rfr is {best_rfr}, stats: {stats}')
        best_result[(hour, (dist_low, dist_high))] = best_rfr

Best = RFR: {'n_estimators': 29, 'criterion': 'mae', 'min_samples_split': 0.4889823602723493, 'max_features': 'auto'}, best score = 0.4518360769169999, #best exp = 32
Best fit = 0.4518360769169999
 2022-02-11T11:38:56.562704 --- 0:00, 46.98900744456714 - 1737.6633460570708: best rfr is RFR: {'n_estimators': 29, 'criterion': 'mae', 'min_samples_split': 0.4889823602723493, 'max_features': 'auto'}, best score = 0.4518360769169999, #best exp = 32, stats: MAPE: 65.27203108062002 MAE: 50.46286472148541 training time: 0.028296232223510742 prediction time: 0.0016880035400390625
Best = RFR: {'n_estimators': 72, 'criterion': 'mae', 'min_samples_split': 0.45485531968860937, 'max_features': 'auto'}, best score = 0.3427300909047361, #best exp = 16
Best fit = 0.3427300909047361
 2022-02-11T11:39:15.831145 --- 0:00, 1737.6671687792225 - 1859.4420644862284: best rfr is RFR: {'n_estimators': 72, 'criterion': 'mae', 'min_samples_split': 0.45485531968860937, 'max_features': 'auto'}, best score = 0.342730

In [17]:
from sklearn.svm import SVR
best_result_svr = dict()
ranges = dict()

ranges['MAE'] = (0, 1200)
ranges['MAPE'] = (0, 80)
ranges['pred_time'] =(0.0001, 0.15)
metrics = [Metric('MAE', mae), Metric('MAPE', mape)]

for hour in unique_hours:
    hourly_batch = X_train.loc[X_train.hour == hour]
    hourly_batch = hourly_batch.sort_values(by=['dist'])
    
    hourly_batch_test = X_test.loc[X_test.hour == hour]
    hourly_batch_test = hourly_batch_test.sort_values(by=['dist'])
    
    for dist_low, dist_high in distances:
        dist_hourly_batch = hourly_batch.loc[hourly_batch.dist > dist_low]
        dist_hourly_batch = dist_hourly_batch.loc[dist_hourly_batch.dist < dist_high]
        
        dist_hourly_batch_test = hourly_batch_test.loc[hourly_batch_test.dist > dist_low]
        dist_hourly_batch_test = dist_hourly_batch_test.loc[dist_hourly_batch_test.dist < dist_high]
        

        svr_method = Method('SVR', SVR)
        svr_params = MetaParameters({'kernel': ["rbf", "sigmoid"], 
                                 'gamma': ['auto', 'scale'],
                                 'tol' :(0.001, 1.0, float, 0.01),
                                 'C':(0.1, 3, float, 0.1),
                                    })

        svr_estimator = GeneticMetaEstimator(svr_method, svr_params, 10, 40, norm_eval, 0.5, 0.3, metrics, True,
                                                                                                 dist_hourly_batch,
                                                                                                 dist_hourly_batch_test,
                                                                                                 y_train.loc[dist_hourly_batch.index],
                                                                                                 y_test.loc[dist_hourly_batch_test.index])
        
        log_svr, best_svr = svr_estimator.run(verbose=False)
        stats = 'MAPE: '
        stats += str(best_svr.method.history['MAPE'][best_svr.best_exp]) + " MAE: "
        stats += str(best_svr.method.history['MAE'][best_svr.best_exp]) + " training time: "
        stats += str(best_svr.method.history['train_time'][best_svr.best_exp]) + " prediction time: "
        stats += str(best_svr.method.history['pred_time'][best_svr.best_exp])
        
        print(f' {datetime.utcnow().isoformat()} --- {hour}:00, {dist_low} - {dist_high}: best svr is {best_svr}, stats: {stats}')
        best_result_svr[(hour, (dist_low, dist_high))] = best_svr

Best = SVR: {'kernel': 'rbf', 'gamma': 'scale', 'tol': 0.3804366892077403, 'C': 1.7443121597333808}, best score = 0.2923002318309641, #best exp = 34
Best fit = 0.2923002318309641
 2022-02-08T07:26:17.503178 --- 0:00, 501.0 - 2032.0: best svr is SVR: {'kernel': 'rbf', 'gamma': 'scale', 'tol': 0.3804366892077403, 'C': 1.7443121597333808}, best score = 0.2923002318309641, #best exp = 34, stats: MAPE: 45.27924019971293 MAE: 542.0690129735875 training time: 0.07370281219482422 prediction time: 0.0028896331787109375
Best = SVR: {'kernel': 'rbf', 'gamma': 'scale', 'tol': 0.4449897055689021, 'C': 3}, best score = 0.347334554218457, #best exp = 16
Best fit = 0.347334554218457
 2022-02-08T07:26:37.286604 --- 0:00, 2032.0 - 3581.0: best svr is SVR: {'kernel': 'rbf', 'gamma': 'scale', 'tol': 0.4449897055689021, 'C': 3}, best score = 0.347334554218457, #best exp = 16, stats: MAPE: 54.30702251296752 MAE: 505.9448002067495 training time: 0.07042145729064941 prediction time: 0.002473115921020508
Best 

In [18]:
from sklearn.ensemble import GradientBoostingRegressor as GBR

ranges = dict()
best_result_gbr = dict()

ranges['MAE'] = (0, 2000)
ranges['MAPE'] = (0, 100)
ranges['pred_time'] =(0.0001, 0.001)
metrics = [Metric('MAE', mae), Metric('MAPE', mape)]

for hour in unique_hours:
    hourly_batch = X_train.loc[X_train.hour == hour]
    hourly_batch = hourly_batch.sort_values(by=['dist'])
    
    hourly_batch_test = X_test.loc[X_test.hour == hour]
    hourly_batch_test = hourly_batch_test.sort_values(by=['dist'])
    
    for dist_low, dist_high in distances:
        dist_hourly_batch = hourly_batch.loc[hourly_batch.dist > dist_low]
        dist_hourly_batch = dist_hourly_batch.loc[dist_hourly_batch.dist < dist_high]
        
        dist_hourly_batch_test = hourly_batch_test.loc[hourly_batch_test.dist > dist_low]
        dist_hourly_batch_test = dist_hourly_batch_test.loc[dist_hourly_batch_test.dist < dist_high]
        

        gbr_method = Method('GBR', GBR)
        gbr_params = MetaParameters({'loss': ["huber", "quantile"],
                                 'learning_rate': (0.001, 2.0, float, 0.01),
                                 'n_estimators': (10, 100, int, 3), 
                                 'criterion': ["friedman_mse", "mse", "mae"]
                                    })

        gbr_estimator = GeneticMetaEstimator(gbr_method, gbr_params, 10, 20, norm_eval, 0.5, 0.3, metrics, True,
                                                                                                 dist_hourly_batch,
                                                                                                 dist_hourly_batch_test,
                                                                                                 y_train.loc[dist_hourly_batch.index],
                                                                                                 y_test.loc[dist_hourly_batch_test.index])
        
        log_gbr, best_gbr = gbr_estimator.run(False)
        stats = 'MAPE: '
        stats += str(best_gbr.method.history['MAPE'][best_gbr.best_exp]) + " MAE: "
        stats += str(best_gbr.method.history['MAE'][best_gbr.best_exp]) + " training time: "
        stats += str(best_gbr.method.history['train_time'][best_gbr.best_exp]) + " prediction time: "
        stats += str(best_gbr.method.history['pred_time'][best_gbr.best_exp])
        
        print(f' {datetime.utcnow().isoformat()} --- {hour}:00, {dist_low} - {dist_high}: best GBR is {best_gbr}, stats: {stats}')
        best_result_gbr[(hour, (dist_low, dist_high))] = best_gbr

Best = GBR: {'loss': 'huber', 'learning_rate': 0.6538744148622248, 'n_estimators': 23, 'criterion': 'friedman_mse'}, best score = 0.2252820368923038, #best exp = 19
Best fit = 0.2252820368923038
 2022-02-08T12:12:06.519363 --- 0:00, 501.0 - 2032.0: best GBR is GBR: {'loss': 'huber', 'learning_rate': 0.6538744148622248, 'n_estimators': 23, 'criterion': 'friedman_mse'}, best score = 0.2252820368923038, #best exp = 19, stats: MAPE: 39.02787174749288 MAE: 538.288588947517 training time: 0.06095385551452637 prediction time: 0.00015425682067871094
Best = GBR: {'loss': 'huber', 'learning_rate': 1.1458091298339654, 'n_estimators': 30, 'criterion': 'mae'}, best score = 0.21234184589448732, #best exp = 14
Best fit = 0.21234184589448732
 2022-02-08T12:13:41.366105 --- 0:00, 2032.0 - 3581.0: best GBR is GBR: {'loss': 'huber', 'learning_rate': 1.1458091298339654, 'n_estimators': 30, 'criterion': 'mae'}, best score = 0.21234184589448732, #best exp = 14, stats: MAPE: 34.320557289249024 MAE: 398.80582

In [21]:
import pickle

with open('results/rfr_dict.pkl', 'wb') as f:
    pickle.dump(best_result, f)
with open('results/svr_dict.pkl', 'wb') as f:
    pickle.dump(best_result_svr, f)
with open('results/rfr_dict_gbr.pkl', 'wb') as f:
    pickle.dump(best_result_gbr, f)