In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from mlxtend.evaluate import PredefinedHoldoutSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [6]:
X_full = pd.read_csv('New_Train_Features_Full.csv')
X_test_full = pd.read_csv('New_Test_Features_Full.csv')
y = pd.read_csv('Train_Target.csv').iloc[:,1]

X_full_no = pd.read_csv('New_Train_Features.csv')
X_test_full_no = pd.read_csv('New_Test_Features.csv')

X_no_manual = pd.read_csv('New_Train_Features_No_Selected.csv')
X_test_no_manual = pd.read_csv('New_Test_Features_No_Selected.csv')

X_RFE = pd.read_csv('New_Train_Features_RFE.csv')
X_RFE_test = pd.read_csv('New_Test_Features_RFE.csv')

X_LASSO = pd.read_csv('New_Train_Features_LASSO.csv')
X_LASSO_test = pd.read_csv('New_Test_Features_LASSO.csv')

X_manual = pd.read_csv('New_Train_Features_Selected.csv')
X_manual_test = pd.read_csv('New_Test_Features_Selected.csv')

# Full Dataset:

In [17]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_full, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [18]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [19]:
params = {
    "max_depth": [10, 20, None],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [5, 10],
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2"],
    "class_weight": [{0: 1, 1: 5}, None]
}

dt = DecisionTreeClassifier(random_state = 2025)

In [20]:
np.random.seed(2025)
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state = 2025), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
dt_grid.fit(X_temp, y_temp)
print('Best Parameters:', dt_grid.best_params_)

Best Parameters: {'class_weight': {0: 1, 1: 5}, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 5}


In [21]:
np.random.seed(2025)
dt_model = DecisionTreeClassifier(class_weight = {0: 1, 1: 5}, criterion = 'gini', max_depth = 10, max_features = 'log2', min_samples_leaf = 10, min_samples_split = 5, random_state = 2025)
dt_model.fit(X_temp, y_temp)
y_pred = dt_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.69      0.81        36
           1       0.42      0.89      0.57         9

    accuracy                           0.73        45
   macro avg       0.69      0.79      0.69        45
weighted avg       0.85      0.73      0.76        45



In [22]:
np.random.seed(2025)
dt_model.fit(X_full, y)
y_pred_dt = dt_model.predict(X_test_full)

In [23]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_dt})

with open('DT_Full_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_Full_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# RFE Dataset:

In [24]:
np.random.seed(2025)
X_train_rfe, X_temp_rfe, y_train_rfe, y_temp_rfe = train_test_split(X_RFE, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val_rfe, X_test_rfe, y_val_rfe, y_test_rfe = train_test_split(X_temp_rfe, y_temp_rfe, test_size = 0.5, random_state = 2025, stratify = y_temp_rfe)

In [25]:
np.random.seed(2025)
train_rfe_ind, val_rfe_ind = train_test_split(np.arange(X_temp_rfe.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp_rfe)
split_rfe = PredefinedHoldoutSplit(valid_indices = val_rfe_ind)

In [26]:
np.random.seed(2025)
dt_grid_rfe = GridSearchCV(DecisionTreeClassifier(random_state = 2025), param_grid = params, cv = split_rfe, n_jobs = -1, scoring = 'f1_macro')
dt_grid_rfe.fit(X_temp_rfe, y_temp_rfe)
print('Best Parameters:', dt_grid_rfe.best_params_)

Best Parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5}


In [27]:
np.random.seed(2025)
dt_model_rfe = DecisionTreeClassifier(class_weight = None, criterion = 'gini', max_depth = 10, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 5, random_state = 2025)
dt_model_rfe.fit(X_temp_rfe, y_temp_rfe)
y_pred_rfe = dt_model_rfe.predict(X_test_rfe)

print(classification_report(y_test, y_pred_rfe))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        36
           1       0.43      0.33      0.38         9

    accuracy                           0.78        45
   macro avg       0.64      0.61      0.62        45
weighted avg       0.76      0.78      0.77        45



In [28]:
np.random.seed(2025)
dt_model_rfe.fit(X_RFE, y)
y_pred_rfe_dt = dt_model_rfe.predict(X_RFE_test)

In [29]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_rfe_dt})

with open('DT_RFE_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_RFE_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# LASSO:

In [30]:
np.random.seed(2025)
X_train_lasso, X_temp_lasso, y_train_lasso, y_temp_lasso = train_test_split(X_LASSO, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val_lasso, X_test_lasso, y_val_lasso, y_test_lasso = train_test_split(X_temp_lasso, y_temp_lasso, test_size = 0.5, random_state = 2025, stratify = y_temp_lasso)

In [31]:
np.random.seed(2025)
train_lasso_ind, val_lasso_ind = train_test_split(np.arange(X_temp_lasso.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp_lasso)
split_lasso = PredefinedHoldoutSplit(valid_indices = val_lasso_ind)

In [32]:
np.random.seed(2025)
dt_grid_lasso = GridSearchCV(DecisionTreeClassifier(random_state = 2025), param_grid = params, cv = split_lasso, n_jobs = -1, scoring = 'f1_macro')
dt_grid_lasso.fit(X_temp_lasso, y_temp_lasso)
print('Best Parameters:', dt_grid_lasso.best_params_)

Best Parameters: {'class_weight': {0: 1, 1: 5}, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 5}


In [33]:
np.random.seed(2025)
dt_model_lasso = DecisionTreeClassifier(class_weight = {0: 1, 1: 5}, criterion = 'gini', max_depth = 10, max_features = 'log2', min_samples_leaf = 10, min_samples_split = 5, random_state = 2025)
dt_model_lasso.fit(X_temp_lasso, y_temp_lasso)
y_pred_lasso = dt_model_lasso.predict(X_test_lasso)

print(classification_report(y_test, y_pred_lasso))

              precision    recall  f1-score   support

           0       0.95      0.53      0.68        36
           1       0.32      0.89      0.47         9

    accuracy                           0.60        45
   macro avg       0.64      0.71      0.57        45
weighted avg       0.82      0.60      0.64        45



In [34]:
np.random.seed(2025)
dt_model_lasso.fit(X_LASSO, y)
y_pred_lasso_dt = dt_model_lasso.predict(X_LASSO_test)

In [35]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_lasso_dt})

with open('DT_LASSO_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_LASSO_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Full - Without Standardization




In [36]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_full_no, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [37]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [38]:
np.random.seed(2025)
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state = 2025), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
dt_grid.fit(X_temp, y_temp)
print('Best Parameters:', dt_grid.best_params_)

Best Parameters: {'class_weight': {0: 1, 1: 5}, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5}


In [39]:
np.random.seed(2025)
dt_model = DecisionTreeClassifier(class_weight = {0: 1, 1: 5}, criterion = 'gini', max_depth = 10, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 5, random_state = 2025)
dt_model.fit(X_temp, y_temp)
y_pred_dt_no = dt_model.predict(X_test)

print(classification_report(y_test, y_pred_dt_no))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88        36
           1       0.53      1.00      0.69         9

    accuracy                           0.82        45
   macro avg       0.76      0.89      0.78        45
weighted avg       0.91      0.82      0.84        45



In [40]:
np.random.seed(2025)
dt_model.fit(X_full_no, y)
y_pred_dt_no = dt_model.predict(X_test_full_no)

In [41]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_dt_no})

with open('DT_Full_No_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_Full_No_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Full - Without Standardization - Manual



In [42]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_no_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [43]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [44]:
np.random.seed(2025)
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state = 2025), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
dt_grid.fit(X_temp, y_temp)
print('Best Parameters:', dt_grid.best_params_)

Best Parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 5}


In [45]:
np.random.seed(2025)
dt_model = DecisionTreeClassifier(class_weight =  None, criterion = 'gini', max_depth = 10, max_features = 'log2', min_samples_leaf = 5, min_samples_split = 5, random_state = 2025)
dt_model.fit(X_temp, y_temp)
y_pred_manual = dt_model.predict(X_test)

print(classification_report(y_test, y_pred_manual))

              precision    recall  f1-score   support

           0       0.91      0.83      0.87        36
           1       0.50      0.67      0.57         9

    accuracy                           0.80        45
   macro avg       0.70      0.75      0.72        45
weighted avg       0.83      0.80      0.81        45



In [46]:
np.random.seed(2025)
dt_model.fit(X_no_manual, y)
y_pred_dt_no_manual = dt_model.predict(X_test_no_manual)

In [47]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_dt_no_manual})

with open('DT_No_Manual_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_No_Manual_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Manual Selection:

In [48]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

In [49]:
np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [50]:
np.random.seed(2025)
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state = 2025), param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
dt_grid.fit(X_temp, y_temp)
print('Best Parameters:', dt_grid.best_params_)

Best Parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5}


In [51]:
np.random.seed(2025)
dt_model = DecisionTreeClassifier(class_weight = None, criterion = 'gini', max_depth = 10, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 5, random_state = 2025)
dt_model.fit(X_temp, y_temp)
y_pred_manual = dt_model.predict(X_test)

print(classification_report(y_test, y_pred_manual))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90        36
           1       0.62      0.56      0.59         9

    accuracy                           0.84        45
   macro avg       0.76      0.74      0.75        45
weighted avg       0.84      0.84      0.84        45



In [52]:
np.random.seed(2025)
dt_model.fit(X_manual, y)
y_pred_dt_manual = dt_model.predict(X_manual_test)

In [53]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_dt_manual})

with open('DT_Manual_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_Manual_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# SMOTE FOR BEST DT:

In [54]:
from imblearn.over_sampling import SMOTE

In [58]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_no_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

smote = SMOTE(random_state = 2025)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

In [59]:
np.random.seed(2025)
dt_model_smote = DecisionTreeClassifier(class_weight =  None, criterion = 'gini', max_depth = 10, max_features = 'log2', min_samples_leaf = 5, min_samples_split = 5, random_state = 2025)
dt_model_smote.fit(X_resampled, y_resampled)
y_pred_smote = dt_model_smote.predict(X_test)

print(classification_report(y_test, y_pred_smote))
y_pred_smote_dt = dt_model_smote.predict(X_test_no_manual)

              precision    recall  f1-score   support

           0       0.97      0.89      0.93        36
           1       0.67      0.89      0.76         9

    accuracy                           0.89        45
   macro avg       0.82      0.89      0.84        45
weighted avg       0.91      0.89      0.89        45



In [60]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_smote_dt})

with open('DT_SMOTE_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_SMOTE_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Hillinger Distance

In [61]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree._criterion import Criterion

class HellingerDistanceCriterion(Criterion):
    def __init__(self, n_outputs, n_classes):
        self.n_outputs = n_outputs
        self.n_classes = n_classes

    def node_impurity(self):
        return np.sum(np.sqrt(self.sum_left * self.sum_right))

    def children_impurity(self):
        return np.sum(np.sqrt(self.sum_left)) + np.sum(np.sqrt(self.sum_right))

    def __call__(self, y, sample_weight):
        return np.sum(np.sqrt(y))

In [62]:
hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))

In [63]:
np.random.seed(2025)
X_train, X_temp, y_train, y_temp = train_test_split(X_no_manual, y, test_size = 0.3, random_state = 2025, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 2025, stratify = y_temp)

np.random.seed(2025)
train_ind, val_ind = train_test_split(np.arange(X_temp.shape[0]), test_size = 0.3, random_state = 2025, stratify = y_temp)
split = PredefinedHoldoutSplit(valid_indices = val_ind)

In [69]:
params = {
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [5, 10],
    "max_features": ["sqrt", "log2"],
    "class_weight": [{0: 1, 1: 5}, None]
}

dt = DecisionTreeClassifier(criterion = hdc, random_state = 2025)

In [72]:
np.random.seed(2025)
dt_grid = GridSearchCV(dt, param_grid = params, cv = split, n_jobs = -1, scoring = 'f1_macro')
dt_grid.fit(X_temp, y_temp)
print('Best Parameters:', dt_grid.best_params_)

Best Parameters: {'class_weight': {0: 1, 1: 5}, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5}


In [73]:
np.random.seed(2025)
hddt_model = DecisionTreeClassifier(class_weight = {0: 1, 1: 5}, criterion = hdc, max_depth = 5, max_features = 'sqrt', min_samples_leaf = 5, min_samples_split = 5, random_state = 2025)
hddt_model.fit(X_temp, y_temp)
y_pred_hddt = hddt_model.predict(X_test)

print(classification_report(y_test, y_pred_hddt))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89        36
           1       0.00      0.00      0.00         9

    accuracy                           0.80        45
   macro avg       0.40      0.50      0.44        45
weighted avg       0.64      0.80      0.71        45



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [75]:
np.random.seed(2025)
hddt_model.fit(X_no_manual, y)
y_pred_hddt = hddt_model.predict(X_test_no_manual)

In [76]:
from google.colab import files

pred_dt = pd.DataFrame({'ID': range(1, 92), 'Placement': y_pred_hddt})

with open('DT_HDDT_Cov.csv', 'w') as file:
    pred_dt.to_csv(file, index = False, header = True)

files.download('DT_HDDT_Cov.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>