In [2]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import time

from xgboost import XGBRFClassifier, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

import multiprocessing

import warnings

In [3]:
# Input location, relative to the notebook's working directory.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# unchanged in case other cells refer to it.
dir   = "DATA/"
file1 = 'BHBH_Delay_Time.csv'

# Read the catalogue and discard the two 'Unnamed' columns (presumably index
# columns left over from earlier CSV exports) in a single chained expression.
BHBH = pd.read_csv(dir + file1).drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
BHBH.head()

Unnamed: 0,Index,ID,name,BWorldtime,Mass_0,Radius_0,Zams_0,Phase_0,RemnantType_0,Mass_1,...,Events,EventsPlus,Mzams_0,Mzams_1,Semimajor_ini,Eccentricity_ini,Z,alpha,Delay_Time,Eccentricity_Delay
0,0,360308,0_792857804219921,5.411991,18.34303,7.8e-05,21.22227,7,6,21.09631,...,RB:RE:S:RB:RE:S,RB:RE:ehSt:RB:RE:erSt,41.0592,34.5745,24.87463,0.2619,0.0004,0.5,99934.1,6.409107e-11
1,1,360337,0_285120263308905,4.134819,50.99943,0.000217,92.2436,7,6,51.78028,...,S:S,hhS:hrS,92.2436,53.641,81917.701038,0.3955,0.0004,0.5,5.520443e+16,4.93482e-14
2,2,360367,0_303570516402716,4.31159,33.98611,0.000144,41.10912,7,6,30.83786,...,RB:RE:RB:RE:S:RB:RE:S,RB:RE:RB:RE:ehSt:RB:RE:erSt,74.5563,46.8471,127.422203,0.2217,0.0004,0.5,710833.7,1.981434e-10
3,3,360429,0_321152334354104,5.738531,42.6148,0.000181,50.85,7,6,33.31328,...,RB:S:RE:RB:S,RB:hhS:RE:RB:hrS,50.85,32.3486,316.947946,0.0107,0.0004,0.5,12149660.0,2.425785e-12
4,4,360471,0_850952161687565,4.503749,49.74939,0.000211,65.2221,7,6,45.91471,...,RB:S:RE:RB:S,RB:hhS:RE:RB:hrS,65.2221,46.7855,1548.08029,0.0751,0.0004,0.5,5995947000.0,1.042263e-11


In [4]:
# Columns left out of the feature matrix: identifiers, the Events* columns,
# and the two delay quantities (Delay_Time itself is the regression target).
excluded_columns = [
    'Index', 'ID', 'name',
    'Delay_Time', 'Eccentricity_Delay',
    'Events', 'EventsAll', 'EventsPlus',
]

# Features as a plain NumPy array; target kept as a pandas Series.
X = BHBH.drop(columns=excluded_columns).to_numpy()
Y = BHBH['Delay_Time']

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
X_train

array([[7.727871e+00, 8.429761e+00, 3.580374e-05, ..., 3.920000e-01,
        6.000000e-03, 3.000000e+00],
       [6.081992e+00, 3.356409e+01, 1.425568e-04, ..., 5.154000e-01,
        2.000000e-04, 3.000000e+00],
       [5.721070e+00, 2.755586e+01, 1.170381e-04, ..., 6.117000e-01,
        4.000000e-03, 3.000000e+00],
       ...,
       [7.759990e+00, 3.273191e+01, 1.390223e-04, ..., 2.948000e-01,
        1.000000e-04, 3.000000e+00],
       [8.284873e+00, 1.374775e+01, 5.839085e-05, ..., 1.623000e-01,
        4.000000e-03, 5.000000e-01],
       [6.702390e+00, 9.183860e+00, 3.900663e-05, ..., 4.573000e-01,
        2.000000e-03, 5.000000e+00]])

## Scikit-learn Random Forest

In [None]:
# Baseline random-forest regressor. The hyper-parameters are collected in one
# mapping so they can be scanned and tweaked in a single place.
rf_settings = dict(
    # ensemble size and split-quality measure
    n_estimators=10,
    criterion='squared_error',
    # limits on the shape of each tree
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=1.0,
    max_leaf_nodes=None,
    min_impurity_decrease=.0,
    # sampling of the training rows
    bootstrap=True,
    oob_score=False,
    # runtime behaviour: parallelism, seed, logging
    n_jobs=None,
    random_state=0,
    verbose=1,
)
regressor = RandomForestRegressor(**rf_settings)

In [None]:
# Fit on the training split only. Fitting on the full (X, Y) would expose the
# forest to the rows held out in X_test / Y_test, so any score computed on the
# test split afterwards would be inflated by data leakage instead of measuring
# generalisation.
regressor.fit(X_train, Y_train)

In [None]:
# Predict Delay_Time for the test-split features (X_test) with the fitted forest.
Y_pred = regressor.predict(X_test)

In [None]:
# Report regressor.score (R^2 for scikit-learn regressors) on each split,
# training first and test second.
for split_heading, split_X, split_Y in (
    ('Score of the regressor for the training set:\n', X_train, Y_train),
    ('\nScore of the regressor for the test set:\n', X_test, Y_test),
):
    print(split_heading, regressor.score(split_X, split_Y))

In [None]:
# Histogram of the predicted delay times (Y_pred) on log-log axes.
fig, ax = plt.subplots(figsize=(15, 7))

# 100 log-spaced bin edges spanning the full range of the predictions.
# (Automatic edges via np.histogram_bin_edges on BHBH.Delay_Time < 1e18 with
# bins='rice' were tried earlier; per the original note, bins='fd' does not work.)
log_lo = np.log10(min(Y_pred))
log_hi = np.log10(max(Y_pred))
b = np.logspace(log_lo, log_hi, 100)
entries, edges, _ = ax.hist(Y_pred, bins=b)

# Midpoint of each bin.
bin_centers = (edges[:-1] + edges[1:]) / 2

ax.set_title('Distribution of the Delay times')

# Both axes are logarithmic.
ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlabel('Delay Time [Myr]')
ax.set_ylabel('Counts')

In [None]:
# Histogram of the test-split delay times (Y_test) on log-log axes.
fig, ax = plt.subplots(figsize=(15, 7))

# 100 log-spaced bin edges spanning the full range of the test targets.
# (Automatic edges via np.histogram_bin_edges on BHBH.Delay_Time < 1e18 with
# bins='rice' were tried earlier; per the original note, bins='fd' does not work.)
log_lo = np.log10(min(Y_test))
log_hi = np.log10(max(Y_test))
b = np.logspace(log_lo, log_hi, 100)
entries, edges, _ = ax.hist(Y_test, bins=b)

# Midpoint of each bin.
bin_centers = (edges[:-1] + edges[1:]) / 2

ax.set_title('Distribution of the Delay times')

# Both axes are logarithmic.
ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlabel('Delay Time [Myr]')
ax.set_ylabel('Counts')

In [6]:
# Hyper-parameter grid explored by the search (2 x 3 x 3 = 18 candidates);
# edit the lists to widen or narrow it.
parameters = {
    'n_estimators': [10, 100],
    'criterion'   : ['squared_error', 'absolute_error', 'poisson'],
    'max_depth'   : [None, 2, 10],
}

# Parallelise at ONE level only. Previously the forest asked for
# cpu_count()-1 jobs inside each of the cpu_count()-2 grid-search workers,
# which oversubscribes CPU and memory and is the most likely reason the
# workers were killed (TerminatedWorkerError / SIGKILL) during the fit.
# max(1, ...) also keeps n_jobs valid on machines with two or fewer cores,
# where cpu_count()-2 would be 0 (rejected by joblib) or negative.
n_search_jobs = max(1, multiprocessing.cpu_count() - 2)

# Base estimator: single-threaded (the search supplies the parallelism) and
# seeded so that candidate scores are reproducible between runs.
regressor = RandomForestRegressor(n_estimators=10, criterion='squared_error',
                                  n_jobs=1, random_state=0)

# model for the gridsearch: 5-fold cross-validation over `parameters`
Model_GS = GridSearchCV(regressor, parameters, cv=5, verbose=1,
                        n_jobs=n_search_jobs)

In [7]:
# Run the cross-validated grid search on the training split.
Model_GS.fit(X_train, Y_train)

print ('----------RESULTS FOR GRIDSEARCH----------\n')

# Each heading is followed by its value on the next line.
print("Best parameters set found:", Model_GS.best_params_, sep="\n")
print("Score with best parameters:", Model_GS.best_score_, sep="\n")

print("\nAll scores on the grid:")

# Tabulate the per-candidate results as a DataFrame for a readable display.
GridSearch_Results = pd.DataFrame(Model_GS.cv_results_)
GridSearch_Results

Fitting 5 folds for each of 18 candidates, totalling 90 fits


exception calling callback for <Future at 0x7f5ad1f29490 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/joblib/parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/home/ubuntu/miniconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 531, in apply_async

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}