In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from mlxtend.evaluate import PredefinedHoldoutSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
X_full = pd.read_csv('New_Train_Features_Full.csv')
X_test_full = pd.read_csv('New_Test_Features_Full.csv')
y = pd.read_csv('Train_Target.csv').iloc[:,1]

X_full_no = pd.read_csv('New_Train_Features.csv')
X_test_full_no = pd.read_csv('New_Test_Features.csv')

X_no_manual = pd.read_csv('New_Train_Features_No_Selected.csv')
X_test_no_manual = pd.read_csv('New_Test_Features_No_Selected.csv')

X_RFE = pd.read_csv('New_Train_Features_RFE.csv')
X_RFE_test = pd.read_csv('New_Test_Features_RFE.csv')

X_LASSO = pd.read_csv('New_Train_Features_LASSO.csv')
X_LASSO_test = pd.read_csv('New_Test_Features_LASSO.csv')

X_manual = pd.read_csv('New_Train_Features_Selected.csv')
X_manual_test = pd.read_csv('New_Test_Features_Selected.csv')

# Full Dataset:

In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_full, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [None]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [None]:
params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 20, None],
    "max_features": ["sqrt", "log2"],
    "criterion": ["gini", "entropy"],
    "class_weight": ["balanced", "balanced_subsample", {0: 1, 1: 5}],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}


rf = RandomForestClassifier(random_state = 2025)

In [None]:
np.random.seed(2025)
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
rf_grid.fit(X_temp, y_temp)
print('Best Parameters:', rf_grid.best_params_)

Best Parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}


In [None]:
np.random.seed(2025)
rf_model = RandomForestClassifier(class_weight = 'balanced', criterion = 'entropy', max_depth = 20, max_features = 'log2', min_samples_leaf = 2, min_samples_split = 10, n_estimators = 50, random_state = 2025)
rf_model.fit(X_temp, y_temp)
y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96        36
           1       0.80      0.89      0.84         9

    accuracy                           0.93        45
   macro avg       0.89      0.92      0.90        45
weighted avg       0.94      0.93      0.93        45



In [None]:
np.random.seed(2025)
rf_model.fit(X_full, y)
y_pred_rf = rf_model.predict(X_test_full)

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_rf})

with open('RF_Full_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_Full_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# RFE Dataset:

In [None]:
np.random.seed(2025)
X_train_rfe, X_temp_rfe, y_train_rfe, y_temp_rfe = train_test_split(X_RFE, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val_rfe, X_test_rfe, y_val_rfe, y_test_rfe = train_test_split(X_temp_rfe, y_temp_rfe, test_size = 0.5, random_state = 2025, stratify = y_temp_rfe)

In [None]:
np.random.seed(2025)
train_rfe_ind, val_rfe_ind = train_test_split(np.arange(X_temp_rfe.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp_rfe)
split_rfe = PredefinedHoldoutSplit(valid_indices = val_rfe_ind)

In [None]:
np.random.seed(2025)
rf_grid_rfe = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split_rfe, n_jobs = -1, scoring = 'f1_macro')
rf_grid_rfe.fit(X_temp_rfe, y_temp_rfe)
print('Best Parameters:', rf_grid_rfe.best_params_)

Best Parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}


In [None]:
np.random.seed(2025)
rf_model_rfe = RandomForestClassifier(class_weight = 'balanced', criterion = 'gini', max_depth = None, max_features = 'log2', min_samples_leaf = 2, min_samples_split = 10, n_estimators = 50, random_state = 2025)
rf_model_rfe.fit(X_temp_rfe, y_temp_rfe)
y_pred_rfe = rf_model_rfe.predict(X_test_rfe)

print(classification_report(y_test, y_pred_rfe))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        36
           1       0.89      0.89      0.89         9

    accuracy                           0.96        45
   macro avg       0.93      0.93      0.93        45
weighted avg       0.96      0.96      0.96        45



In [None]:
np.random.seed(2025)
rf_model_rfe.fit(X_RFE, y)
y_pred_rfe_rf = rf_model_rfe.predict(X_RFE_test)

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_rfe_rf})

with open('RF_RFE_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_RFE_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# LASSO:

In [None]:
np.random.seed(2025)
X_train_lasso, X_temp_lasso, y_train_lasso, y_temp_lasso = train_test_split(X_LASSO, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val_lasso, X_test_lasso, y_val_lasso, y_test_lasso = train_test_split(X_temp_lasso, y_temp_lasso, test_size = 0.5, random_state = 2025, stratify = y_temp_lasso)

In [None]:
np.random.seed(2025)
train_lasso_ind, val_lasso_ind = train_test_split(np.arange(X_temp_lasso.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp_lasso)
split_lasso = PredefinedHoldoutSplit(valid_indices = val_lasso_ind)

In [None]:
np.random.seed(2025)
rf_grid_lasso = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split_lasso, n_jobs = -1, scoring = 'f1_macro')
rf_grid_lasso.fit(X_temp_lasso, y_temp_lasso)
print('Best Parameters:', rf_grid_lasso.best_params_)

Best Parameters: {'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}


In [None]:
np.random.seed(2025)
rf_model_lasso = RandomForestClassifier(class_weight = 'balanced', criterion = 'entropy', max_depth = 20, max_features = 'log2', min_samples_leaf = 2, min_samples_split = 10, n_estimators = 50, random_state = 2025)
rf_model_lasso.fit(X_temp_lasso, y_temp_lasso)
y_pred_lasso = rf_model_lasso.predict(X_test_lasso)

print(classification_report(y_test, y_pred_lasso))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        36
           1       1.00      0.78      0.88         9

    accuracy                           0.96        45
   macro avg       0.97      0.89      0.92        45
weighted avg       0.96      0.96      0.95        45



In [None]:
np.random.seed(2025)
rf_model_lasso.fit(X_LASSO, y)
y_pred_lasso_rf = rf_model_lasso.predict(X_LASSO_test)

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_lasso_rf})

with open('RF_LASSO_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_LASSO_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Full - Without Standardization




In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_full_no, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [None]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [None]:
np.random.seed(2025)
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
rf_grid.fit(X_temp, y_temp)
print('Best Parameters:', rf_grid.best_params_)

Best Parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [None]:
np.random.seed(2025)
rf_model = RandomForestClassifier(class_weight = 'balanced', criterion = 'entropy', max_depth = 5, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 2, n_estimators = 50, random_state = 2025)
rf_model.fit(X_temp, y_temp)
y_pred_rf_no = rf_model.predict(X_test)

print(classification_report(y_test, y_pred_rf_no))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00         9

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [None]:
np.random.seed(2025)
rf_model.fit(X_full_no, y)
y_pred_rf_no = rf_model.predict(X_test_full_no)

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_rf_no})

with open('RF_Full_No_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_Full_No_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Full - Without Standardization - Manual



In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_no_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [None]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [None]:
np.random.seed(2025)
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
rf_grid.fit(X_temp, y_temp)
print('Best Parameters:', rf_grid.best_params_)

Best Parameters: {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}


In [None]:
np.random.seed(2025)
rf_model = RandomForestClassifier(class_weight = 'balanced_subsample', criterion = 'entropy', max_depth = 20, max_features = 'log2', min_samples_leaf = 2, min_samples_split = 2, n_estimators = 50, random_state = 2025)
rf_model.fit(X_temp, y_temp)
y_pred_manual = rf_model.predict(X_test)

print(classification_report(y_test, y_pred_manual))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       1.00      0.89      0.94         9

    accuracy                           0.98        45
   macro avg       0.99      0.94      0.96        45
weighted avg       0.98      0.98      0.98        45



In [None]:
np.random.seed(2025)
rf_model.fit(X_no_manual, y)
y_pred_rf_no_manual = rf_model.predict(X_test_no_manual)

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_rf_no_manual})

with open('RF_No_Manual_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_No_Manual_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Manual Selection:

In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [None]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [None]:
np.random.seed(2025)
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
rf_grid.fit(X_temp, y_temp)
print('Best Parameters:', rf_grid.best_params_)

Best Parameters: {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 50}


In [None]:
np.random.seed(2025)
rf_model = RandomForestClassifier(class_weight = 'balanced_subsample', criterion = 'entropy', max_depth = 20, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 10, n_estimators = 50, random_state = 2025)
rf_model.fit(X_temp, y_temp)
y_pred_manual = rf_model.predict(X_test)

print(classification_report(y_test, y_pred_manual))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90        36
           1       0.60      0.67      0.63         9

    accuracy                           0.84        45
   macro avg       0.76      0.78      0.77        45
weighted avg       0.85      0.84      0.85        45



In [None]:
np.random.seed(2025)
rf_model.fit(X_manual, y)
y_pred_rf_manual = rf_model.predict(X_manual_test)

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_rf_manual})

with open('RF_Manual_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_Manual_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# SMOTE FOR BEST RF:

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

smote = SMOTE(random_state = 2025)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

In [None]:
np.random.seed(2025)
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
rf_grid.fit(X_temp, y_temp)
print('Best Parameters:', rf_grid.best_params_)

Best Parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
np.random.seed(2025)
rf_model_smote = RandomForestClassifier(class_weight = 'balanced', criterion = 'gini', max_depth = 10, max_features = 'log2', min_samples_leaf = 2, min_samples_split = 2, n_estimators = 100, random_state = 2025)
rf_model_smote.fit(X_resampled, y_resampled)
y_pred_smote = rf_model_smote.predict(X_test)

print(classification_report(y_test, y_pred_smote))
y_pred_smote_rf = rf_model_smote.predict(X_manual_test)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       1.00      0.89      0.94         9

    accuracy                           0.98        45
   macro avg       0.99      0.94      0.96        45
weighted avg       0.98      0.98      0.98        45



In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_smote_rf})

with open('RF_SMOTE_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_SMOTE_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# GHOST:


In [None]:
!pip install ghostml

Collecting ghostml
  Downloading ghostml-0.3.0-py3-none-any.whl.metadata (4.1 kB)
Downloading ghostml-0.3.0-py3-none-any.whl (6.5 kB)
Installing collected packages: ghostml
Successfully installed ghostml-0.3.0


In [None]:
import ghostml
from sklearn import metrics

def calc_metrics(labels_test, test_probs, threshold = 0.5):
  scores = [1 if x>=threshold else 0 for x in test_probs]
  auc = metrics.roc_auc_score(labels_test, test_probs)
  kappa = metrics.cohen_kappa_score(labels_test, scores)
  confusion = metrics.confusion_matrix(labels_test, scores, labels = list(set(labels_test)))

  print('Threshold: %.2f, kappa: %.3f, AUC test set: %.3f'%(threshold, kappa, auc))
  print(confusion)
  print(metrics.classification_report(labels_test, scores))

  return

thresholds = np.round(np.arange(0.01, 0.75,0.01),2)

In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

rf_ghost = RandomForestClassifier(class_weight = 'balanced_subsample', criterion = 'entropy', max_depth = 20, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 10, n_estimators = 50, random_state = 2025, oob_score = True)
rf_ghost.fit(X_temp, y_temp)
rf_test_probs = rf_ghost.predict_proba(X_test)[:,1]
calc_metrics(y_test, rf_test_probs, threshold = 0.5)

Threshold: 0.50, kappa: 0.533, AUC test set: 0.917
[[32  4]
 [ 3  6]]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        36
           1       0.60      0.67      0.63         9

    accuracy                           0.84        45
   macro avg       0.76      0.78      0.77        45
weighted avg       0.85      0.84      0.85        45



In [None]:
np.random.seed(2025)
rf_temp_probs = rf_ghost.predict_proba(X_temp)[:,1]
rf_threshold = ghostml.optimize_threshold_from_predictions(y_temp, rf_temp_probs, thresholds, ThOpt_metrics = 'Kappa')

np.random.seed(2025)
calc_metrics(y_test, rf_test_probs, threshold = rf_threshold)
rf_GHOST_test = [1 if x >= rf_threshold else 0 for x in rf_test_probs]

Threshold: 0.54, kappa: 0.697, AUC test set: 0.917
[[35  1]
 [ 3  6]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.95        36
           1       0.86      0.67      0.75         9

    accuracy                           0.91        45
   macro avg       0.89      0.82      0.85        45
weighted avg       0.91      0.91      0.91        45



In [None]:
np.random.seed(2025)
rf_ghost.fit(X_manual, y)
rf_probs = rf_ghost.predict_proba(X_manual_test)[:,1]
rf_pred_ghost = [1 if x >= rf_threshold else 0 for x in rf_probs]

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': rf_pred_ghost})

with open('RF_GHOST_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_GHOST_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_full, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

rf_ghost = RandomForestClassifier(class_weight = 'balanced_subsample', criterion = 'entropy', max_depth = 20, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 10, n_estimators = 50, random_state = 2025, oob_score = True)
rf_ghost.fit(X_temp, y_temp)
rf_test_probs = rf_ghost.predict_proba(X_test)[:,1]
calc_metrics(y_test, rf_test_probs, threshold = 0.5)

Threshold: 0.50, kappa: 0.744, AUC test set: 0.954
[[33  3]
 [ 1  8]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.94        36
           1       0.73      0.89      0.80         9

    accuracy                           0.91        45
   macro avg       0.85      0.90      0.87        45
weighted avg       0.92      0.91      0.91        45



In [None]:
np.random.seed(2025)
rf_temp_probs = rf_ghost.predict_proba(X_temp)[:,1]
rf_threshold = ghostml.optimize_threshold_from_predictions(y_temp, rf_temp_probs, thresholds, ThOpt_metrics = 'Kappa')

np.random.seed(2025)
calc_metrics(y_test, rf_test_probs, threshold = rf_threshold)
rf_GHOST_test = [1 if x >= rf_threshold else 0 for x in rf_test_probs]

Threshold: 0.41, kappa: 0.667, AUC test set: 0.954
[[30  6]
 [ 0  9]]
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        36
           1       0.60      1.00      0.75         9

    accuracy                           0.87        45
   macro avg       0.80      0.92      0.83        45
weighted avg       0.92      0.87      0.88        45



In [None]:
np.random.seed(2025)
rf_ghost.fit(X_full, y)
rf_probs = rf_ghost.predict_proba(X_test_full)[:,1]
rf_pred_ghost = [1 if x >= rf_threshold else 0 for x in rf_probs]

In [None]:
from google.colab import files

pred_rf = pd.DataFrame({'ID': range(1, 92), 'Placement': rf_pred_ghost})

with open('RF_GHOST_FULL_Cov.csv', 'w') as file:
    pred_rf.to_csv(file, index = False, header = True)

files.download('RF_GHOST_FULL_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>