In [1]:
import pandas as pd 
import numpy as np
import joblib
import time
import sklearn.metrics
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

## Load dataset generated by CovaS

In [2]:
def calculate_macro_tpr_fpr(voting_cm):
    num_classes = voting_cm.shape[0]
    tpr_list = []
    fpr_list = []

    for i in range(num_classes):
        TP = voting_cm[i, i]
        FN = np.sum(voting_cm[i, :]) - TP
        FP = np.sum(voting_cm[:, i]) - TP
        TN = np.sum(voting_cm) - (TP + FN + FP)

        TPR = TP / (TP + FN) if (TP + FN) > 0 else 0
        FPR = FP / (FP + TN) if (FP + TN) > 0 else 0

        tpr_list.append(TPR)
        fpr_list.append(FPR)

    macro_tpr = np.mean(tpr_list)
    macro_fpr = np.mean(fpr_list)

    return macro_tpr, macro_fpr

train = pd.read_csv('/home/vvhoang/new/full_code/DS/train_shap_52.csv')
test = pd.read_csv('/home/vvhoang/new/full_code/DS/test_shap_52.csv')

X_train = train.drop(['Label'], axis=1)
y_train = train['Label']
X_test = test.drop(['Label'], axis=1)
y_test = test['Label']
y_test = pd.Series(y_test)
y_train = pd.Series(y_train)


## XGBoost

In [3]:
xgb_params = {
    'device': 'cuda',
    'max_depth': 128,
    'n_estimators': 5000,
    'objective':"multi:softmax", 
    'num_class':len(y_train.unique())  ,
    'booster': 'gbtree',
    'learning_rate': 0.1,
    'eval_metric': 'auc',
    'verbosity': 0
}


print("XGBClassifier Starting")
xgb_model = XGBClassifier(**xgb_params)
# xgb_model = joblib.load('./models/framework_xgb_best.pkl')
xgb_model.fit(X_train,y_train)
joblib.dump(xgb_model, './models/framework_xgb_best.pkl')
xgb_start_time = time.time()
xgb_prediction = xgb_model.predict(X_test)
xgb_end_time = time.time()
xgb_time = xgb_end_time - xgb_start_time
print("XGBClassifier Finished")

xgb_acc = sklearn.metrics.accuracy_score(xgb_prediction, y_test)
xgb_precision = sklearn.metrics.precision_score(xgb_prediction, y_test, average='micro')
xgb_f1 = sklearn.metrics.f1_score(xgb_prediction, y_test, average='micro')
xgb_recall = sklearn.metrics.recall_score(xgb_prediction, y_test, average='micro')
xgb_cm = sklearn.metrics.confusion_matrix(xgb_prediction, y_test)
xgb_fp = xgb_cm[0, 1]
print("XGBoost report:")
print("XGBoost Time:", xgb_time)
print("XGBoost Accuracy:", xgb_acc)
print("XGBoost Precision:", xgb_precision)
print("XGBoost F1:", xgb_f1)
print("XGBoost Recall:", xgb_recall)
print("XGBoost FP", xgb_fp)
print("XGBoost CM:", xgb_cm)
xgb_tpr, xgb_fpr = calculate_macro_tpr_fpr(xgb_cm)
print(f'XGBoost Macro-average TPR: {xgb_tpr}')
print(f'XGBoost Macro-average FPR: {xgb_fpr}')

XGBClassifier Starting
XGBClassifier Finished
XGBoost report:
XGBoost Time: 0.148360013961792
XGBoost Accuracy: 0.8427021696252466
XGBoost Precision: 0.8427021696252466
XGBoost F1: 0.8427021696252466
XGBoost Recall: 0.8427021696252466
XGBoost FP 0
XGBoost CM: [[169   0   0   0   0   0   0   0   0   0   0   0]
 [  0 169   0   0   0   0   0   1   0   0   0   0]
 [  0   0 169   0   0   0   0  42   0  10   0   0]
 [  0   0   0 169   0   0   0   3   0   0   0   0]
 [  0   0   0   0 169   1   0   2   0   0   0   0]
 [  0   0   0   0   0 168   0   0   0   0   0   0]
 [  0   0   0   0   0   0 169   1   0   0   0   0]
 [  0   0   0   0   0   0   0  55  40   5   0   0]
 [  0   0   0   0   0   0   0  49 129   0   0   0]
 [  0   0   0   0   0   0   0   6   0  92  87   0]
 [  0   0   0   0   0   0   0  10   0  62  82   0]
 [  0   0   0   0   0   0   0   0   0   0   0 169]]
XGBoost Macro-average TPR: 0.8352117822023984
XGBoost Macro-average FPR: 0.014124362800583922


## ExtraTree

In [4]:
et_params = {
    "n_estimators": 100,
    "max_leaf_nodes": 15000,
    "n_jobs": -1,
    "random_state": 0,
    "bootstrap": True,
    "criterion": "entropy"
}

print("ExtraTreesClassifier Starting")
et_model = ExtraTreesClassifier(**et_params)
# et_model = joblib.load('./models/framework_et_best.pkl')
et_model.fit(X=X_train, y=y_train)
joblib.dump(et_model, './models/framework_et_best.pkl')
et_start_time = time.time()
et_prediction = et_model.predict(X_test)
et_end_time = time.time()
et_time = et_end_time - et_start_time
print("ExtraTreesClassifier Finished")

et_acc = sklearn.metrics.accuracy_score(et_prediction, y_test)
et_precision = sklearn.metrics.precision_score(et_prediction, y_test, average='micro')
et_f1 = sklearn.metrics.f1_score(et_prediction, y_test, average='micro')
et_recall = sklearn.metrics.recall_score(et_prediction, y_test, average='micro')
et_cm = sklearn.metrics.confusion_matrix(et_prediction, y_test)
et_fp = et_cm[0, 1]
print("ExtraTrees report:")
print("ExtraTrees Time:", et_end_time - et_start_time)
print("ExtraTrees Accuracy:", et_acc)
print("ExtraTrees Precision:", et_precision)
print("ExtraTrees F1:", et_f1)
print("ExtraTrees Recall:", et_recall)
print("ExtraTrees FP:", et_fp)
print("ExtraTrees CM:\n", et_cm)
et_tpr, et_fpr = calculate_macro_tpr_fpr(et_cm)
print(f'XGBoost Macro-average TPR: {et_tpr}')
print(f'XGBoost Macro-average FPR: {et_fpr}')

ExtraTreesClassifier Starting
ExtraTreesClassifier Finished
ExtraTrees report:
ExtraTrees Time: 0.10711264610290527
ExtraTrees Accuracy: 0.8550295857988166
ExtraTrees Precision: 0.8550295857988166
ExtraTrees F1: 0.8550295857988166
ExtraTrees Recall: 0.8550295857988166
ExtraTrees FP: 0
ExtraTrees CM:
 [[169   0   0   0   0   0   0   0   0   0   0   0]
 [  0 169   0   0   0   0   0   1   0   0   0   0]
 [  0   0 169   0   0   0   0  27   0  11   0   0]
 [  0   0   0 169   0   0   0   0   0   0   0   0]
 [  0   0   0   0 169   0   0   0   0   0   0   0]
 [  0   0   0   0   0 169   0   0   0   0   0   0]
 [  0   0   0   0   0   0 169   0   0   0   0   0]
 [  0   0   0   0   0   0   0  83  37   5   0   0]
 [  0   0   0   0   0   0   0  36 132   0   0   0]
 [  0   0   0   0   0   0   0   9   0  94  96   0]
 [  0   0   0   0   0   0   0  13   0  59  73   0]
 [  0   0   0   0   0   0   0   0   0   0   0 169]]
XGBoost Macro-average TPR: 0.8530055948711128
XGBoost Macro-average FPR: 0.0130893970

## RandomForest

In [5]:
rf_params = {
    "n_estimators": 900,
    "max_leaf_nodes": 15000,
    "n_jobs": -1,
    "random_state": 0,
    "bootstrap": True,
    "criterion": "entropy"
}

print("RandomForestClassifier Starting")
rf_model = RandomForestClassifier(**rf_params)
# rf_model = joblib.load('./models/framework_rf_best.pkl')
rf_model.fit(X=X_train, y=y_train)
joblib.dump(rf_model, './models/framework_rf_best.pkl')
rf_start_time = time.time()
rf_prediction = rf_model.predict(X_test)
rf_end_time = time.time()
rf_time = rf_end_time - rf_start_time
print("RandomForestClassifier Finished")

rf_acc = sklearn.metrics.accuracy_score(rf_prediction, y_test)
rf_precision = sklearn.metrics.precision_score(rf_prediction, y_test, average='micro')
rf_f1 = sklearn.metrics.f1_score(rf_prediction, y_test, average='micro')
rf_recall = sklearn.metrics.recall_score(rf_prediction, y_test, average='micro')
rf_cm = sklearn.metrics.confusion_matrix(rf_prediction, y_test)
rf_fp = rf_cm[0, 1]
print("RandomForest report:")
print("RandomForest Time:", rf_end_time - rf_start_time)
print("RandomForest Accuracy:", rf_acc)
print("RandomForest Precision:", rf_precision)
print("RandomForest F1:", rf_f1)
print("RandomForest Recall:", rf_recall)
print("RandomForest FP:", rf_fp)
print("RandomForest CM:", rf_cm)
rf_tpr, rf_fpr = calculate_macro_tpr_fpr(rf_cm)
print(f'XGBoost Macro-average TPR: {rf_tpr}')
print(f'XGBoost Macro-average FPR: {rf_fpr}')

RandomForestClassifier Starting
RandomForestClassifier Finished
RandomForest report:
RandomForest Time: 0.3634300231933594
RandomForest Accuracy: 0.8451676528599605
RandomForest Precision: 0.8451676528599605
RandomForest F1: 0.8451676528599605
RandomForest Recall: 0.8451676528599605
RandomForest FP: 0
RandomForest CM: [[169   0   0   0   0   0   0   1   0   0   0   0]
 [  0 169   0   0   0   0   0   1   0   0   0   0]
 [  0   0 169   0   0   0   0  36   0  12   0   0]
 [  0   0   0 169   0   0   0   1   0   0   0   0]
 [  0   0   0   0 169   1   0   3   0   0   0   0]
 [  0   0   0   0   0 168   0   0   0   0   0   0]
 [  0   0   0   0   0   0 169   3   0   0   0   0]
 [  0   0   0   0   0   0   0  71  42   5   1   0]
 [  0   0   0   0   0   0   0  31 127   0   0   0]
 [  0   0   0   0   0   0   0  10   0  97 100   0]
 [  0   0   0   0   0   0   0  12   0  55  68   0]
 [  0   0   0   0   0   0   0   0   0   0   0 169]]
XGBoost Macro-average TPR: 0.8411108665069027
XGBoost Macro-average