In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import pickle
import numpy as np
import pandas as pd
import re

import lightgbm

import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_contour

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Read the data

In [2]:
train_data = np.load("full_obs_data_train.npz")
val_data = np.load("full_obs_data_val.npz")
test_data = np.load("full_obs_data_test.npz")

dn = train_data["dn"]
senior = train_data["senior"]
topo = train_data["topo"]

dn = np.hstack([np.zeros((dn.shape[0],1)),dn])
senior = np.hstack([np.ones((senior.shape[0],1)),senior])
topo = np.hstack([2 * np.ones((topo.shape[0],1)),topo])

Xy_train = np.concatenate([dn,senior,topo],axis=0)
X_train, y_train = Xy_train[:, :-1], Xy_train[:,-1]


dn = val_data["dn"]
senior = val_data["senior"]
topo = val_data["topo"]

dn = np.hstack([np.zeros((dn.shape[0],1)),dn])
senior = np.hstack([np.ones((senior.shape[0],1)),senior])
topo = np.hstack([2 * np.ones((topo.shape[0],1)),topo])

Xy_val = np.concatenate([dn,senior,topo],axis=0)
X_val, y_val = Xy_val[:, :-1], Xy_val[:,-1]

dn = test_data["dn"]
senior = test_data["senior"]
topo = test_data["topo"]

dn = np.hstack([np.zeros((dn.shape[0],1)),dn])
senior = np.hstack([np.ones((senior.shape[0],1)),senior])
topo = np.hstack([2 * np.ones((topo.shape[0],1)),topo])

Xy_test = np.concatenate([dn,senior,topo],axis=0)
X_test, y_test = Xy_test[:, :-1], Xy_test[:,-1]

In [3]:
dn = np.load("OOD_DoNothing_obs_data.npy")
senior = np.load("OOD_Senior_original_95_obs_data.npy")
topo = np.load("OOD_Topo_Agent_95_2_obs_data.npy")

dn = np.hstack([np.zeros((dn.shape[0],1)),dn])
senior = np.hstack([np.ones((senior.shape[0],1)),senior])
topo = np.hstack([2 * np.ones((topo.shape[0],1)),topo])


Xy_test_ood = np.concatenate([dn, senior, topo])
X_test_ood, y_test_ood = Xy_test_ood[:,:-1], Xy_test_ood[:,-1]
print(X_test_ood.shape, y_test_ood.shape)



(6036, 4296) (6036,)


# Model input

In [4]:
#lgb_model.fit(X_train, y_train)
with open('./models/lightgbm_new.pkl', 'rb') as f:
    lgb_model_new = pickle.load(f)

 
with open('./models/rf_new.pkl', 'rb') as f:
    rf = pickle.load(f)

with open('./models/xg_best_new.pkl', 'rb') as f:
    xg = pickle.load(f)

In [5]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
# acc, balanced_acc, f1_score, binary_acc, ood_balanced_acc
def calculate_metrics(model, X,y):
    results = {}
    pred = model.predict(X)
    results["accuracy"] = accuracy_score(y, pred)
    results["balanced_accuracy"] = balanced_accuracy_score(y, pred)
    results["f1_micro"] = f1_score(y, pred, average="micro")
    #results["f1_macro"] = f1_score(y, pred, average="macro")

    y_pred_prob = model.predict_proba(X)[:, [0, 3, 2, 1]]
    surv = y_pred_prob[:,0]
    fail = np.sum(y_pred_prob[:,1:4], axis=1)
    y_pred_binary = (surv < fail).astype(int)
    
    y_binary = y.copy()
    y_binary[y_binary > 0] = 1


    
    results["binary_accuracy"] = accuracy_score(y_binary, y_pred_binary)

    return results
    

## lightgbm

In [6]:
test_results = calculate_metrics(lgb_model_new, X_test, y_test)
ood_results = calculate_metrics(lgb_model_new, X_test_ood, y_test_ood)
test_results["ood_balanced_accuracy"] = ood_results["balanced_accuracy"]
test_results["ood_binary_accuracy"] = ood_results["binary_accuracy"]

df_lightgbm = pd.DataFrame([test_results])
#df.columns = ["train", "val", "test", "ood"]
#lgb_model_new.index = ["lightgbm"]
df_lightgbm
df_lightgbm.index = ["LGBM"]

In [7]:
df_lightgbm.round(2)

Unnamed: 0,accuracy,balanced_accuracy,f1_micro,binary_accuracy,ood_balanced_accuracy,ood_binary_accuracy
LGBM,0.82,0.76,0.82,0.87,0.76,0.87


## RF

In [8]:
#train_results = calculate_metrics(lgb_model, X_train, y_train)
#val_results = calculate_metrics(lgb_model, X_val, y_val)
test_results = calculate_metrics(rf, X_test, y_test)
ood_results = calculate_metrics(rf, X_test_ood, y_test_ood) 

In [10]:
test_results["ood_balanced_accuracy"] = ood_results["balanced_accuracy"]
test_results["ood_binary_accuracy"] = ood_results["binary_accuracy"]

df_rf = pd.DataFrame([test_results])
#df.columns = ["train", "val", "test", "ood"]
df_rf.index = ["RF"]
df_rf.round(2)

Unnamed: 0,accuracy,balanced_accuracy,f1_micro,binary_accuracy,ood_balanced_accuracy,ood_binary_accuracy
RF,0.73,0.62,0.73,0.82,0.61,0.82


## XGBoost

In [12]:
import xgboost as xgb
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical = True)
dval_clf = xgb.DMatrix(X_val, y_val, enable_categorical = True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical = True)
dtest_ood_clf = xgb.DMatrix(X_test_ood, y_test_ood, enable_categorical = True)



In [13]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
# acc, balanced_acc, f1_score, binary_acc, ood_balanced_acc
def calculate_metrics_xg(model, X,y):
    results = {}
    pred_probs = model.predict(X)
    pred = pred_probs.argmax(axis=1)

    results["accuracy"] = accuracy_score(y, pred)
    results["balanced_accuracy"] = balanced_accuracy_score(y, pred)
    results["f1_micro"] = f1_score(y, pred, average="micro")
    #results["f1_macro"] = f1_score(y, pred, average="macro")

    

    # binary 
    y_binary = y.copy()
    pred_binary = pred.copy()
    y_binary[y_binary > 0] = 1
    pred_binary[pred_binary > 0] = 1

   
    results["binary_accuracy"] = accuracy_score(y_binary, pred_binary)

    return results
    

In [14]:
#train_results = calculate_metrics_xg(xg, dtrain_clf, y_train)
#val_results = calculate_metrics_xg(xg, dval_clf, y_val)
test_results = calculate_metrics_xg(xg, dtest_clf, y_test)
ood_results = calculate_metrics_xg(xg, dtest_ood_clf, y_test_ood) 

In [16]:
test_results["ood_balanced_accuracy"] = ood_results["balanced_accuracy"]
test_results["ood_binary_accuracy"] = ood_results["binary_accuracy"]

df_xg = pd.DataFrame([test_results])
#df_xg.columns = ["train", "val", "test", "ood"]
df_xg.index = ["XGBoost"]
df_xg.round(2)

Unnamed: 0,accuracy,balanced_accuracy,f1_micro,binary_accuracy,ood_balanced_accuracy,ood_binary_accuracy
XGBoost,0.8,0.73,0.8,0.83,0.73,0.83


In [22]:
df = pd.concat([df_rf, df_xg, df_lightgbm])
df.round(2)

Unnamed: 0,accuracy,balanced_accuracy,f1_micro,binary_accuracy,ood_balanced_accuracy,ood_binary_accuracy
RF,0.73,0.62,0.73,0.82,0.61,0.82
XGBoost,0.8,0.73,0.8,0.83,0.73,0.83
LGBM,0.82,0.76,0.82,0.87,0.76,0.87
