# Data Loading
Load the pre-split dataset from `data.npz` into the train/test feature matrices (`X_train`, `X_test`) and the matching label arrays (`y_train`, `y_test`).

In [14]:
import numpy as np
  
# Read the four pre-split arrays out of the archive. For an .npz file np.load
# returns an NpzFile that keeps the underlying file open, so the arrays are
# pulled out inside a `with` block to make sure the handle is closed afterwards.
with np.load('data.npz') as data:
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']

# SKLearn Imports and Data Preprocessing
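Importing the necessary scikit-learn packages and standardizing the features. The scaler is fit on the training set only and then applied to the test set. (The data is already split 80/20 into train and test sets.)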

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit the scaler on the training features only, then push both splits through
# the same fitted transformation so the test set is scaled with training statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Default Decision Tree Classifier
A paper on bankruptcy prediction that we read used Random Forest and XGBoost. While creating this notebook, Decision Tree, Random Forest, and XGBoost were all tested with their default settings. The default XGBoost and Decision Tree returned similar scores, so the Decision Tree algorithm was chosen as the first path of exploration, in the hope that it would leave more room for optimization through different hyperparameters and custom ensembling. These were the scores of each default algorithm:
- Decision Tree:
    - Accuracy: 0.9589
    - Precision: 0.3860
    - Recall: 0.5116
    - F1: 0.4400
- Random Forest:
    - Accuracy: 0.9692
    - Precision: 0.5294
    - Recall: 0.2093
    - F1: 0.3000
- XGBoost:
    - Accuracy: 0.9721
    - Precision: 0.6087
    - Recall: 0.3256
    - F1: 0.4242

In [16]:
# Drop singleton dimensions from the label arrays before they are handed to
# the estimator and the metric functions.
y_train, y_test = y_train.squeeze(), y_test.squeeze()

# Baseline decision tree: only the seed is set, everything else is left at
# the library defaults.
dt_model = DecisionTreeClassifier(random_state=69).fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

# Score the test-set predictions with each metric, in the order the names are unpacked.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    metric(y_test, dt_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Decision Tree Accuracy: {dt_accuracy:.4f}",
    f"Decision Tree Precision: {dt_precision:.4f}",
    f"Decision Tree Recall: {dt_recall:.4f}",
    f"Decision Tree F1 Score: {dt_f1:.4f}",
    sep="\n",
)

Decision Tree Accuracy: 0.9589
Decision Tree Precision: 0.3860
Decision Tree Recall: 0.5116
Decision Tree F1 Score: 0.4400


In [17]:
# Baseline random forest: 100 trees with a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=69).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Score the test-set predictions with each metric, in the order the names are unpacked.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    metric(y_test, rf_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Random Forest Accuracy: {rf_accuracy:.4f}",
    f"Random Forest Precision: {rf_precision:.4f}",
    f"Random Forest Recall: {rf_recall:.4f}",
    f"Random Forest F1 Score: {rf_f1:.4f}",
    sep="\n",
)

Random Forest Accuracy: 0.9692
Random Forest Precision: 0.5294
Random Forest Recall: 0.2093
Random Forest F1 Score: 0.3000


In [18]:
import xgboost as xgb

# Baseline XGBoost classifier with the binary logistic objective and a fixed seed.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=69)
xgb_model.fit(X_train, y_train)

xgb_y_pred = xgb_model.predict(X_test)

# Score the test-set predictions with each metric, in the order the names are unpacked.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
    metric(y_test, xgb_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"XGBoost Accuracy: {xgb_accuracy:.4f}",
    f"XGBoost Precision: {xgb_precision:.4f}",
    f"XGBoost Recall: {xgb_recall:.4f}",
    f"XGBoost F1 Score: {xgb_f1:.4f}",
    sep="\n",
)

XGBoost Accuracy: 0.9721
XGBoost Precision: 0.6087
XGBoost Recall: 0.3256
XGBoost F1 Score: 0.4242


# Grid Search
This searches for the best overall model over the parameters max_depth, min_samples_split, min_samples_leaf, and criterion. (4 × 3 × 3 × 2 = 72 parameter combinations in total, each evaluated with cross-validation.)

## Best Hyperparameters Based on Testing with Cross-Validation (cv=5)
- Criterion='gini'
- max_depth=None
- min_samples_leaf=5
- min_samples_split=5

In [19]:
# NOTE: the whole grid search below is commented out, so this cell does nothing
# when the notebook is run. As written, the grid spans 4 * 3 * 3 * 2 = 72
# parameter combinations, each scored by F1 under 5-fold cross-validation, and
# the fitted search object is then evaluated on the test split.
# NOTE(review): the markdown above reports min_samples_split=5 as the best
# value, but 5 is not among the values searched here ([2, 10, 20]) -- confirm
# which grid actually produced the reported result.
# param_grid = {
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 10, 20],
#     'min_samples_leaf': [1, 5, 10],
#     'criterion': ['gini', 'entropy']
# }

# grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='f1', verbose=1)

# best_model = grid_search.fit(X_train, y_train)

# dt_y_pred = best_model.predict(X_test)

# dt_accuracy = accuracy_score(y_test, dt_y_pred)
# dt_precision = precision_score(y_test, dt_y_pred)
# dt_recall = recall_score(y_test, dt_y_pred)
# dt_f1 = f1_score(y_test, dt_y_pred)

# print(f"Best Decision Tree Parameters: {best_model.best_params_}")
# print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
# print(f"Decision Tree Precision: {dt_precision:.4f}")
# print(f"Decision Tree Recall: {dt_recall:.4f}")
# print(f"Decision Tree F1 Score: {dt_f1:.4f}")

# Further Testing
After further testing, these hyperparameters are the best for the Decision Tree. (min_samples_split=3 is the only one different from default)

In [42]:
# Tuned decision tree: every hyperparameter is spelled out explicitly so the
# chosen configuration is visible in one place.
dt_model = DecisionTreeClassifier(
    random_state=69,
    max_depth=None,
    criterion='gini',
    min_samples_split=3,
    min_samples_leaf=1,
).fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

# Score the test-set predictions with each metric, in the order the names are unpacked.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    metric(y_test, dt_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Decision Tree Accuracy: {dt_accuracy:.4f}",
    f"Decision Tree Precision: {dt_precision:.4f}",
    f"Decision Tree Recall: {dt_recall:.4f}",
    f"Decision Tree F1 Score: {dt_f1:.4f}",
    sep="\n",
)

Decision Tree Accuracy: 0.9597
Decision Tree Precision: 0.3966
Decision Tree Recall: 0.5349
Decision Tree F1 Score: 0.4554
