In [49]:
# =============================================================================
# Decision tree regressor on the rev17 simulation results
# =============================================================================


# Add the repository root to sys.path so the project's own modules (src.*) can be imported
import sys
import os
import re
# Locate the repository root: the leading part of the current working
# directory up to and including "TFM-master". The pattern uses \S*, so a
# working directory containing whitespace before "TFM-master" does not match.
_root_matches = re.findall(r'(^\S*TFM-master)', os.getcwd())
if not _root_matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise RuntimeError(
        "Could not find a 'TFM-master' directory in the current working "
        f"directory ({os.getcwd()!r}); run this notebook from inside the "
        "repository.")
root_project = _root_matches[0]

# Re-running this cell must not keep appending the same entry to sys.path.
if root_project not in sys.path:
    sys.path.append(root_project)

from src.utils.help_func import results_searchcv, make_train_val_test,plot_predictions
from src.features.add_features import features_graph, features_pop
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
from scipy.stats import randint
from yellowbrick.model_selection import LearningCurve, FeatureImportances
from yellowbrick.regressor import ResidualsPlot
from sklearn.metrics import r2_score
import joblib
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

# Location of the pickled decision-tree model for this data revision.
PATH = f"{root_project}/models/decision_tree_rev17.pkl"

# Columns kept for modelling; every other column of the frame is dropped.
features = [
    'Tr',
    'inf_pow_1', 'inf_pow_2',
    'mort_pow_1', 'mort_pow_2', 'mort_pow_3',
    'n_closed',
    'react_time',
    'total_deceased',
    'betweenness', 'degree', 'closeness',
    'country_pop',
]

# Read the processed simulation results, run them through the project's
# features_graph and features_pop helpers, swap exact zeros in
# `total_deceased` for machine epsilon (so a later log transform of that
# column stays finite) and finally keep only the columns listed above.
df = (
    pd.read_csv(f'{root_project}/data/processed/simulation_results_rev17_wide.csv')
    .pipe(features_graph)
    .pipe(features_pop)
    .assign(total_deceased=lambda frame: frame['total_deceased'].replace(0, np.finfo(float).eps))
    [features]
)

In [50]:
# Split the frame into a train/validation part (used by the cross-validated
# search below) and a test part kept aside for the final evaluation.
X_train_val, y_train_val, X_test, y_test = make_train_val_test(df, out_mode=1)

# The tree is fitted on log(y) and its predictions are mapped back with exp,
# so predictions (and the scores derived from them) are on the original scale.
# np.log needs strictly positive targets: zeros in `total_deceased` were
# replaced by machine epsilon when the data was prepared.
# random_state is pinned so repeated runs give the same tree, consistent with
# the plain DecisionTreeRegressor(random_state=42) search later in the notebook.
pipe = Pipeline([
    ('model', TransformedTargetRegressor(
        regressor=DecisionTreeRegressor(random_state=42),
        func=np.log,
        inverse_func=np.exp)),
])

# scipy's randint excludes the upper bound: depth in 8..17, leaf size in 2..19.
# The `model__regressor__` prefix routes each parameter through the pipeline
# step and the target-transform wrapper down to the tree itself.
param_dist = dict(
    model__regressor__max_depth=randint(low=8, high=18),
    model__regressor__min_samples_leaf=randint(low=2, high=20),
)

# 100 random draws from the distributions above, evaluated with the default
# cross-validation and scoring; random_state fixes which candidates are drawn.
random_search = RandomizedSearchCV(pipe,
                                   param_distributions=param_dist, verbose=2,
                                   n_iter=100,
                                   random_state=42, n_jobs=-1)


random_search.fit(X_train_val, y_train_val)

Train_validation set: (31008, 12)
Test set: (7751, 12)
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    8.1s finished


RandomizedSearchCV(estimator=Pipeline(steps=[('model',
                                              TransformedTargetRegressor(func=<ufunc 'log'>,
                                                                         inverse_func=<ufunc 'exp'>,
                                                                         regressor=DecisionTreeRegressor()))]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'model__regressor__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fab929b1850>,
                                        'model__regressor__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fabf0647730>},
                   random_state=42, verbose=2)

In [51]:
# Report the cross-validation results of the fitted search together with its
# metrics on the test split (helper imported from src.utils.help_func).
results_searchcv(random_search, X_test, y_test)

Cross-val best score:
0.7797399782908527
Cross-val std:
0.010940607862023716
Best parameters found:
{'model__regressor__max_depth': 14, 'model__regressor__min_samples_leaf': 3}
Score in test:
0.7703850038632898
R^2 in test
0.7703850038632898
MAE in test:
476740829.3852974


In [52]:
# Peek at the test-set predictions of the best estimator found by the search.
# NOTE(review): values of ~2.2e-16 are presumably leaves whose training
# targets were all zero, i.e. the machine-epsilon placeholder substituted for
# zeros in `total_deceased` when the data was prepared.
random_search.predict(X_test)

array([3.98876834e+09, 2.22044605e-16, 2.22044605e-16, ...,
       5.34057623e+09, 3.02107339e+09, 5.71321517e+09])

In [53]:
# Baseline: search the same depth / leaf-size ranges on a plain decision tree,
# i.e. without the log transform of the target.
# NOTE: `param_dist` and `random_search` are rebound here, so the previously
# fitted (log-target) search is no longer reachable after this cell runs.
param_dist = {
    'max_depth': randint(8, 18),           # upper bound excluded: 8..17
    'min_samples_leaf': randint(2, 20),    # upper bound excluded: 2..19
}

random_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=50,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

random_search.fit(X_train_val, y_train_val)

# Cross-validation summary of the search plus its metrics on the test split.
results_searchcv(random_search, X_test, y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 235 out of 250 | elapsed:    3.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.0s finished


Cross-val best score:
0.8388588434253961
Cross-val std:
0.0061789086729539255
Best parameters found:
{'max_depth': 14, 'min_samples_leaf': 19}
Score in test:
0.8447317165640315
R^2 in test
0.8447317165640315
MAE in test:
427604825.8813874
