# Machine Learning Model Selection

In [0]:
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from tqdm import tqdm

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so the data files
# referenced by the following cells are reachable. This step is interactive:
# it asks for an authorization code the first time it runs in a session.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Folder on the mounted drive that holds the preprocessed NumPy arrays
data_dir = os.path.abspath('/content/drive/My Drive/Projects/YapAiTek_Challenge/_data')

# Full paths of the train features, test features and train target files
train_data_fp, test_data_fp, y_true_fp = (
    os.path.join(data_dir, file_name)
    for file_name in ('X_train_final.npy', 'X_test_final.npy', 'y_train_final.npy')
)

In [0]:
# Loading data-sets: train features, train target and test features
train = np.load(train_data_fp)
target = np.load(y_true_fp)
test = np.load(test_data_fp)

# Fail fast if features and targets are misaligned: the cross-validation
# cells below need exactly one target entry per training row.
assert train.shape[0] == target.shape[0], (
    f'train has {train.shape[0]} rows but target has {target.shape[0]}'
)

In [0]:
# Shared experiment settings used by the estimator and cross-validation cells
rs, cv = 13, 3  # rs: random_state seed, cv: number of cross-validation folds

## NMAE metric (negated mean absolute error, scikit-learn's `neg_mean_absolute_error`; closer to zero is better)

In [0]:
# Candidate regressors keyed by display name. Every estimator that accepts a
# seed gets random_state=rs; n_jobs=-1 is set wherever it is passed below.
estimators = dict(
    KNN=KNeighborsRegressor(n_neighbors=10),
    RandomForest=RandomForestRegressor(n_jobs=-1, random_state=rs),
    ExtraTree=ExtraTreesRegressor(n_jobs=-1, random_state=rs),
    GradientBoosting=GradientBoostingRegressor(random_state=rs),
    AdaBoost=AdaBoostRegressor(random_state=rs),
    XGB=XGBRegressor(n_jobs=-1, random_state=rs),
)

In [19]:
%%time

# Per-estimator cross-validation results for the negated-MAE scorer.
# Each entry stores the raw fold scores, their mean and their variance
# (ndarray.var(), i.e. NumPy's default population variance).
estimators_scores = {}

for est_name, model in tqdm(estimators.items(), desc='NMAE cross-validation'):
    # tqdm.write emits log lines without tearing the active progress bar,
    # which a plain print() inside the loop does.
    tqdm.write(f'\nEvaluating {est_name} estimator...')
    # scikit-learn maximizes scores, so MAE is reported negated:
    # values closer to zero mean a smaller error.
    scores = cross_val_score(estimator=model, X=train, y=target, cv=cv,
                             scoring='neg_mean_absolute_error', n_jobs=-1)
    estimators_scores[est_name] = {
        'scores': scores,
        'mean': scores.mean(),
        'var': scores.var()
    }
    tqdm.write(f'{est_name} mean score: {scores.mean()} \n')




  0%|          | 0/6 [00:00<?, ?it/s][A[A[A


Evaluating KNN estimator...





 17%|█▋        | 1/6 [34:28<2:52:22, 2068.55s/it][A[A[A

KNN mean score: -1.6742676792247215 


Evaluating RandomForest estimator...





 33%|███▎      | 2/6 [46:26<1:50:53, 1663.33s/it][A[A[A

RandomForest mean score: -1.6890654422261033 


Evaluating ExtraTree estimator...





 50%|█████     | 3/6 [56:38<1:07:24, 1348.00s/it][A[A[A

ExtraTree mean score: -1.6015738346776205 


Evaluating GradientBoosting estimator...





 67%|██████▋   | 4/6 [1:00:35<33:49, 1014.56s/it][A[A[A

GradientBoosting mean score: -1.6199754747319106 


Evaluating AdaBoost estimator...





 83%|████████▎ | 5/6 [1:02:16<12:20, 740.72s/it] [A[A[A

AdaBoost mean score: -3.483656357673277 


Evaluating XGB estimator...





100%|██████████| 6/6 [1:03:47<00:00, 545.61s/it][A[A[A


[A[A[A

XGB mean score: -1.6193738421040382 

CPU times: user 943 ms, sys: 814 ms, total: 1.76 s
Wall time: 1h 3min 47s


In [32]:
# Show the full cross-validation record of every estimator ...
for name, record in estimators_scores.items():
    print(f'{name} estimator cross_val scores:')
    print(record, '\n')

# ... and keep only the mean score, rounded to 5 decimals, for ranking
means = {name: round(record['mean'], 5) for name, record in estimators_scores.items()}

KNN estimator cross_val scores:
{'scores': array([-1.73327426, -1.59060038, -1.6989284 ]), 'mean': -1.6742676792247215, 'var': 0.00369671453672359} 

RandomForest estimator cross_val scores:
{'scores': array([-1.74485013, -1.5619733 , -1.7603729 ]), 'mean': -1.6890654422261033, 'var': 0.008116365173660446} 

ExtraTree estimator cross_val scores:
{'scores': array([-1.62388991, -1.53359478, -1.64723682]), 'mean': -1.6015738346776205, 'var': 0.002401422338395368} 

GradientBoosting estimator cross_val scores:
{'scores': array([-1.58867514, -1.59992848, -1.6713228 ]), 'mean': -1.6199754747319106, 'var': 0.0013393803924348358} 

AdaBoost estimator cross_val scores:
{'scores': array([-3.1651333 , -3.66126518, -3.62457059]), 'mean': -3.483656357673277, 'var': 0.05095288307401801} 

XGB estimator cross_val scores:
{'scores': array([-1.59022777, -1.59896065, -1.66893311]), 'mean': -1.6193738421040382, 'var': 0.0012407708695984546} 



In [33]:
# Rank estimators from best to worst by mean negated-MAE score
# (descending order: the value closest to zero comes first)
sorted(
    means.items(),
    key=lambda name_and_mean: name_and_mean[1],
    reverse=True,
)

[('ExtraTree', -1.60157),
 ('XGB', -1.61937),
 ('GradientBoosting', -1.61998),
 ('KNN', -1.67427),
 ('RandomForest', -1.68907),
 ('AdaBoost', -3.48366)]

## $R^2$ metric (higher is better; KNN is not included in this round)

In [0]:
# Candidate regressors for the R^2 round. NOTE: this rebinds `estimators`;
# unlike the earlier definition it has no 'KNN' entry, so re-running the
# NMAE cell after this one would evaluate a different set of models.
estimators = dict(
    RandomForest=RandomForestRegressor(n_jobs=-1, random_state=rs),
    ExtraTree=ExtraTreesRegressor(n_jobs=-1, random_state=rs),
    GradientBoosting=GradientBoostingRegressor(random_state=rs),
    AdaBoost=AdaBoostRegressor(random_state=rs),
    XGB=XGBRegressor(n_jobs=-1, random_state=rs),
)

In [35]:
%%time

# Per-estimator cross-validation results for the R^2 scorer.
# Each entry stores the raw fold scores, their mean and their variance
# (ndarray.var(), i.e. NumPy's default population variance).
estimators_scores2 = {}

for est_name, model in tqdm(estimators.items(), desc='R^2 cross-validation'):
    # tqdm.write emits log lines without tearing the active progress bar,
    # which a plain print() inside the loop does.
    tqdm.write(f'\nEvaluating {est_name} estimator...')
    scores = cross_val_score(estimator=model, X=train, y=target, cv=cv,
                             scoring='r2', n_jobs=-1)
    estimators_scores2[est_name] = {
        'scores': scores,
        'mean': scores.mean(),
        'var': scores.var()
    }
    tqdm.write(f'\n{est_name} mean score: {scores.mean()} \n')




  0%|          | 0/5 [00:00<?, ?it/s][A[A[A


Evaluating RandomForest estimator...





 20%|██        | 1/5 [12:14<48:59, 734.90s/it][A[A[A


RandomForest mean score: 0.6613523792942447 


Evaluating ExtraTree estimator...





 40%|████      | 2/5 [23:36<35:56, 718.78s/it][A[A[A


ExtraTree mean score: 0.6905771510577857 


Evaluating GradientBoosting estimator...





 60%|██████    | 3/5 [27:48<19:17, 578.83s/it][A[A[A


GradientBoosting mean score: 0.6901622836287785 


Evaluating AdaBoost estimator...





 80%|████████  | 4/5 [29:44<07:20, 440.15s/it][A[A[A


AdaBoost mean score: 0.13407228236632673 


Evaluating XGB estimator...





100%|██████████| 5/5 [31:15<00:00, 335.38s/it][A[A[A


[A[A[A


XGB mean score: 0.6905353055379363 

CPU times: user 762 ms, sys: 619 ms, total: 1.38 s
Wall time: 31min 15s


In [36]:
# Show the full R^2 cross-validation record of every estimator ...
for name, record in estimators_scores2.items():
    print(f'{name} estimator cross_val scores (R^2):')
    print(record, '\n')

# ... and keep only the mean score, rounded to 5 decimals, for ranking
means2 = {name: round(record['mean'], 5) for name, record in estimators_scores2.items()}

RandomForest estimator cross_val scores (R^2):
{'scores': array([0.62321993, 0.72754431, 0.6332929 ]), 'mean': 0.6613523792942447, 'var': 0.0022075963384341727} 

ExtraTree estimator cross_val scores (R^2):
{'scores': array([0.67976943, 0.72757407, 0.66438795]), 'mean': 0.6905771510577857, 'var': 0.0007238177159276799} 

GradientBoosting estimator cross_val scores (R^2):
{'scores': array([0.68171311, 0.70209149, 0.68668225]), 'mean': 0.6901622836287785, 'var': 7.526838451231936e-05} 

AdaBoost estimator cross_val scores (R^2):
{'scores': array([ 0.27905747,  0.15198563, -0.02882625]), 'mean': 0.13407228236632673, 'var': 0.01595917513559031} 

XGB estimator cross_val scores (R^2):
{'scores': array([0.68140439, 0.70257982, 0.68762171]), 'mean': 0.6905353055379363, 'var': 7.897765589191111e-05} 



In [37]:
# Rank estimators from best to worst by mean R^2 score (highest first)
sorted(
    means2.items(),
    key=lambda name_and_mean: name_and_mean[1],
    reverse=True,
)

[('ExtraTree', 0.69058),
 ('XGB', 0.69054),
 ('GradientBoosting', 0.69016),
 ('RandomForest', 0.66135),
 ('AdaBoost', 0.13407)]

In [38]:
# Display the XGB estimator from the current `estimators` dict to inspect its
# full parameter set, including the defaults that were not passed explicitly
estimators['XGB']

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=13,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)