In [1]:
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve,f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy
import pandas as pd
import pickle
import time
from bayes_opt import BayesianOptimization
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

#### mof

In [2]:
# dataset with FRAX CRFs and GRS
with open('ready_whi_sp23', 'rb') as file_handler:
    data = pickle.load(file_handler)
    X1, Y1 = data.get('X', []).values, data.get('Y', []).values
x_train1, x_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=0.2,random_state=98)
sm = SMOTE(random_state=2)
x_train_s1, y_train_s1 = sm.fit_resample(x_train1, y_train1)

In [3]:
# dataset with FRAX CRFs (no grs)
with open('ready_whi_sp23', 'rb') as file_handler:
    data = pickle.load(file_handler)
    X2, Y2 = data.get('X_nogrs', []).values, data.get('Y', []).values
x_train2, x_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.2,random_state=98)
sm = SMOTE(random_state=2)
x_train_s2, y_train_s2 = sm.fit_resample(x_train2, y_train2)

In [4]:
# Bayesian optimization for Model 4 (FRAX CRFs + GRS)
from sklearn.ensemble import RandomForestClassifier as RFC
def rfc_cv(n_estimators, min_samples_split, max_features, max_depth, data, targets):
    estimator = RFC(
        n_estimators= int(n_estimators),
        min_samples_split=min_samples_split,
        max_features = max_features,
        max_depth = max_depth
    )
    cval = cross_val_score(estimator, data, targets,
                           scoring='roc_auc', cv=5)
    return cval.mean()

def optimize_rfc(data, targets):
    def rfc_crossval(n_estimators, min_samples_split, max_features, max_depth):
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            max_depth=int(max_depth),           
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (20, 200),
            "min_samples_split":(2, 20),
            "max_features": (0.1, 0.999),
            'max_depth': (1,8)
        },
        random_state=45,
        verbose=0
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)
    
if __name__ == "__main__":
    print("--- Optimizing Random Forest with GRS---")
    optimize_rfc(data=x_train_s1, targets=y_train_s1)    

--- Optimizing Random Forest with GRS---
Final result: {'target': 0.8778842865533523, 'params': {'max_depth': 7.673980115788044, 'max_features': 0.9985845161097171, 'min_samples_split': 2.25878383683403, 'n_estimators': 89.91446418046922}}


In [5]:
# Bayesian optimization for Model 2 (FRAX CRFs)
def optimize_rfc(data, targets):
    def rfc_crossval(n_estimators, min_samples_split, max_features, max_depth):
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            max_depth=int(max_depth),           
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (20, 200),
            "min_samples_split": (2, 20),
            "max_features": (0.1, 0.999),
            'max_depth': (1,8)
        },
        random_state=45,
        verbose=0
    )
    optimizer.maximize(n_iter=5)

    print("Final result:", optimizer.max)


if __name__ == "__main__":
    print("--- Optimizing Random Forest no GRS---")
    optimize_rfc(data=x_train_s2, targets=y_train_s2)

--- Optimizing Random Forest no GRS---
Final result: {'target': 0.8714351435021763, 'params': {'max_depth': 7.976003315817541, 'max_features': 0.9986730679991317, 'min_samples_split': 9.440530888774544, 'n_estimators': 99.2896873402521}}


In [6]:
# grid search for Model 3 (FRAX CRFs + GRS)
parameters = {
        'n_estimators': [20, 200, 400],
        'min_samples_split': [2, 10, 20],
        'max_features': [0.1, 0.5, 0.999],
        'max_depth': [1,5,8]
     }
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    rf = RandomForestClassifier(random_state=45)
    model = GridSearchCV(estimator=rf, 
                    param_grid=parameters,cv=5).fit(x_train_s1, y_train_s1)
    print(model.best_params_)

{'max_depth': 8, 'max_features': 0.999, 'min_samples_split': 10, 'n_estimators': 100}


In [7]:
# grid search for Model 1 (FRAX CRFs)
parameters = {
        'n_estimators': [20, 200, 400],
        'min_samples_split': [2, 10, 20],
        'max_features': [0.1, 0.5, 0.999],
        'max_depth': [1,5,8]
     }
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    rf = RandomForestClassifier(random_state=45)
    model = GridSearchCV(estimator=rf, 
                    param_grid=parameters,cv=5).fit(x_train_s2, y_train_s2)
    print(model.best_params_)

{'max_depth': 8, 'max_features': 0.999, 'min_samples_split': 2, 'n_estimators': 20}


#### hip

In [8]:
# dataset with FRAX CRFs and GRS
with open('ready_whi_sp23_hip_py38', 'rb') as file_handler:
    data = pickle.load(file_handler)
    X1, Y1 = data.get('X', []).values, data.get('Y', []).values
x_train1, x_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=0.2,random_state=98)
sm = SMOTE(random_state=2)
x_train_s1, y_train_s1 = sm.fit_resample(x_train1, y_train1)

In [9]:
# dataset with FRAX CRF (no grs)
with open('ready_whi_sp23_hip_py38', 'rb') as file_handler:
    data = pickle.load(file_handler)
    X2, Y2 = data.get('X_nogrs', []).values, data.get('Y', []).values
x_train2, x_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.2,random_state=98)
sm = SMOTE(random_state=2)
x_train_s2, y_train_s2 = sm.fit_resample(x_train2, y_train2)

In [10]:
from sklearn.ensemble import RandomForestClassifier as RFC
def rfc_cv(n_estimators, min_samples_split, max_features, max_depth, data, targets):
    estimator = RFC(
        n_estimators= int(n_estimators),
        min_samples_split=min_samples_split,
        max_features = max_features,
        max_depth = max_depth
    )
    cval = cross_val_score(estimator, data, targets,
                           scoring='roc_auc', cv=5)
    return cval.mean()

def optimize_rfc(data, targets):
    def rfc_crossval(n_estimators, min_samples_split, max_features, max_depth):
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            max_depth=int(max_depth),           
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (20, 200),
            "min_samples_split":(2, 20),
            "max_features": (0.1, 0.999),
            'max_depth': (1,8)
        },
        random_state=45,
        verbose=0
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)
    
if __name__ == "__main__":
    print("--- Optimizing Random Forest with GRS---")
    optimize_rfc(data=x_train_s1, targets=y_train_s1)

--- Optimizing Random Forest with GRS---
Final result: {'target': 0.960325165077627, 'params': {'max_depth': 7.984688295988707, 'max_features': 0.7700001278592569, 'min_samples_split': 2.417242716040132, 'n_estimators': 89.61723237512966}}


In [11]:
# Bayesisn optimization for Model 2 (FRAX CRFs)
def optimize_rfc(data, targets):
    def rfc_crossval(n_estimators, min_samples_split, max_features, max_depth):
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            max_depth=int(max_depth),           
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (20, 200),
            "min_samples_split": (2, 20),
            "max_features": (0.1, 0.999),
            'max_depth': (1,8)
        },
        random_state=45,
        verbose=0
    )
    optimizer.maximize(n_iter=5)

    print("Final result:", optimizer.max)


if __name__ == "__main__":
    print("--- Optimizing Random Forest no GRS---")
    optimize_rfc(data=x_train_s2, targets=y_train_s2)

--- Optimizing Random Forest no GRS---
Final result: {'target': 0.9578460853657077, 'params': {'max_depth': 7.115950711497945, 'max_features': 0.565296273916827, 'min_samples_split': 8.856182048595624, 'n_estimators': 281.6501024210875}}


In [12]:
# grid search for Model 3 (FRAX CRFs + GRS)
parameters = {
        'n_estimators': [20, 100, 400],
        'min_samples_split': [2, 10, 20],
        'max_features': [0.1, 0.5, 0.999],
        'max_depth': [1,5,8]
     }
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    rf = RandomForestClassifier(random_state=45)
    model = GridSearchCV(estimator=rf, 
                    param_grid=parameters,cv=5).fit(x_train_s1, y_train_s1)
    print(model.best_params_)

{'max_depth': 5, 'max_features': 0.999, 'min_samples_split': 10, 'n_estimators': 100}


In [13]:
# grid search for Model 1 (FRAX CRFs)
parameters = {
        'n_estimators': [20, 100, 400],
        'min_samples_split': [2, 10, 20],
        'max_features': [0.1, 0.5, 0.999],
        'max_depth': [1,5,8]
     }
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    rf = RandomForestClassifier(random_state=45)
    model = GridSearchCV(estimator=rf, 
                    param_grid=parameters,cv=5).fit(x_train_s2, y_train_s2)
    print(model.best_params_)

{'max_depth': 8, 'max_features': 0.999, 'min_samples_split': 2, 'n_estimators': 20}
