# Data Loading
Nothing much in this cell after "data.npz" was created.

In [1]:
import numpy as np
  
data = np.load('data.npz')
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

# SKLearn Imports and Data Preprocessing
Importing necessary SKLearn packages and preprocessing the data (Scaling).

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Regular Model
Just the normal Logistic Regression model and its scores.

In [3]:
y_train = y_train.squeeze()
y_test = y_test.squeeze()

logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Default Accuracy: {accuracy:.4f}")
print(f"Default Precision: {precision:.4f}")
print(f"Default Recall: {recall:.4f}")
print(f"Default F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

#coefficients = logistic_model.coef_
#print("Coefficients:", coefficients)

# y_probs = logistic_model.predict_proba(X_test)[:, 1]

# thresholds = [0.3, 0.4, 0.5, 0.6]
# for thresh in thresholds:
#     y_pred_thresh = np.where(y_probs > thresh, 1, 0)
#     print(f"Threshold: {thresh}")
#     print(f"Accuracy: {accuracy_score(y_test, y_pred_thresh):.4f}")
#     print(f"Precision: {precision_score(y_test, y_pred_thresh):.4f}")
#     print(f"Recall: {recall_score(y_test, y_pred_thresh):.4f}")
#     print(f"F1 Score: {f1_score(y_test, y_pred_thresh):.4f}")

Default Accuracy: 0.9699
Default Precision: 0.5500
Default Recall: 0.2558
Default F1 Score: 0.3492

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1321
           1       0.55      0.26      0.35        43

    accuracy                           0.97      1364
   macro avg       0.76      0.62      0.67      1364
weighted avg       0.96      0.97      0.96      1364



# Model With Class Weight and Max Iterations (To Optimize for Recall)
A Logistic Regression model optimized for Recall. There is a tradeoff in precision. (Lots of false positives)

In [4]:
logistic_model = LogisticRegression(class_weight='balanced', max_iter=500)

logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Default Accuracy: {accuracy:.4f}")
print(f"Default Precision: {precision:.4f}")
print(f"Default Recall: {recall:.4f}")
print(f"Default F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# y_probs = logistic_model.predict_proba(X_test)[:, 1]

# thresholds = [0.3, 0.4, 0.5, 0.6]
# for thresh in thresholds:
#     y_pred_thresh = np.where(y_probs > thresh, 1, 0)
#     print(f"Threshold: {thresh}")
#     print(f"Accuracy: {accuracy_score(y_test, y_pred_thresh):.4f}")
#     print(f"Precision: {precision_score(y_test, y_pred_thresh):.4f}")
#     print(f"Recall: {recall_score(y_test, y_pred_thresh):.4f}")
#     print(f"F1 Score: {f1_score(y_test, y_pred_thresh):.4f}")

def custom_score(y_true, y_pred):
    recall_weight = 0.75
    accuracy_weight = 0.25
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    return recall * recall_weight + accuracy * accuracy_weight

final_score = custom_score(y_test, y_pred)

print(f"Final custom score on test set: {final_score:.4f}")

Default Accuracy: 0.8710
Default Precision: 0.1692
Default Recall: 0.7907
Default F1 Score: 0.2787

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.87      0.93      1321
           1       0.17      0.79      0.28        43

    accuracy                           0.87      1364
   macro avg       0.58      0.83      0.60      1364
weighted avg       0.97      0.87      0.91      1364

Final custom score on test set: 0.8108


# Grid Search
This will get the best recall score for the regularization strength (C) and the max number of iterations (max_iter) on the Logistic Regression Model below. There will be Convergence Warnings due to non-convergence.

## Best Hyperparameters Based on Testing with Cross-Validation (cv=5)
- Recall: 
    - class_weight='balanced'
    - C=0.01
    - max_iter=100
- Accuracy:
    - class_weight='None'
    - C=0.01
    - max_iter=100
- F1:
    - class_weight='None'
    - C=10
    - max_iter=500
- Precision:
    - class_weight='None'
    - C=0.01
    - max_iter=100

In [135]:
# grid = {'C': [0.01, 0.1, 1, 10], 'max_iter': [100, 500, 1000]}

# search = GridSearchCV(LogisticRegression(class_weight='balanced'), grid, scoring='recall', cv=5)
# search.fit(X_train, y_train)

# print(f"Best parameters: {search.best_params_}")
# print(f"Best recall from CV: {search.best_score_:.4f}")

# scores = ['accuracy', 'f1', 'precision']
# for score in scores:
#     search = GridSearchCV(LogisticRegression(), grid, scoring=score, cv=5)
#     search.fit(X_train, y_train)

#     print(f"Best parameters: {search.best_params_}")
#     print(f"Best {score} from CV: {search.best_score_:.4f}")

# Ensemble Method
This method has returned the best results so far across the board.

## Best Hyperparameters for each Model
Many calues of C, class_weight, and max_iter were tested for each model in the ensemble.
- Recall:
    - C=0.1
    - class_weight='balanced'
    - max_iter=100
- Accuracy:
    - C=0.1
    - class_weight=None
    - max_iter=100

## Best Parameters in the Ensemble
Different values of voting and weights were tried to maximize recall, precision, and accuracy. There is an inverse relationship between precision and recall because recall goes up when more positives get predicted and precision goes down because a lot of these positives are false positives.

In [5]:
model_recall = LogisticRegression(C=0.1, class_weight='balanced', max_iter=100)

model_accuracy = LogisticRegression(C=0.1, class_weight=None, max_iter=100)

ensemble_model = VotingClassifier(estimators=[
    ('recall_opt', model_recall),
    ('accuracy_opt', model_accuracy)
], voting='soft', weights=[1,1])

ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1 = f1_score(y_test, y_pred_ensemble)

print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")
print(f"Ensemble Precision: {ensemble_precision:.4f}")
print(f"Ensemble Recall: {ensemble_recall:.4f}")
print(f"Ensemble F1 Score: {ensemble_f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Ensemble Accuracy: 0.9567
Ensemble Precision: 0.3788
Ensemble Recall: 0.5814
Ensemble F1 Score: 0.4587

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.87      0.93      1321
           1       0.17      0.79      0.28        43

    accuracy                           0.87      1364
   macro avg       0.58      0.83      0.60      1364
weighted avg       0.97      0.87      0.91      1364



# Most Important Features
This will print out the most significant features (based on their weights in 100 randomly shuffled LogisticRegression models)

In [137]:
data = np.load('data.npz')
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = y_train.squeeze()
y_test = y_test.squeeze()

num_models = 5
feature_per_model = 19
total_features = X_train.shape[1]
assert total_features == 95, "The total number of features is not 95 as expected."

indices = np.arange(total_features)

important_features_dict = {}
model_results = {}
final_features = {index: {'count': 0} for index in indices}

for run in range(0, 100):
    np.random.shuffle(indices)
    feature_groups = np.array_split(indices, num_models)
    important_features = [] 

    for i, features in enumerate(feature_groups, start=1):
        X_train_subset = X_train[:, features]
        X_test_subset = X_test[:, features]

        model = LogisticRegression()
        model.fit(X_train_subset, y_train)

        y_pred = model.predict(X_test_subset)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        model_results[f'Model_{run}.{i}'] = {
            'Features': features,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'Coefficients': model.coef_
        }

        # print(f"Model {run}.{i} Results:")
        # print(f"Features: {features}")
        # print(f"Accuracy: {accuracy:.4f}")
        # print(f"Precision: {precision:.4f}")
        # print(f"Recall: {recall:.4f}")
        # print(f"F1 Score: {f1:.4f}")
        # print(f"Coefficients: {model.coef_}\n")
        for x in range(len(model.coef_[0])):
            if model.coef_[0][x] > 0.5 or model.coef_[0][x] < -0.5:
                #print(f"Feature index with high/low weight: {features[x]}")
                important_features.append(features[x])
                final_features[features[x]]['count'] += 1

    important_features_dict[f'Run_{run}'] = {'Features': important_features}

# Create Different Logistic Regression Models on Different Number of Features
feature_data['count] is basically the relevancy score a feature has in making a classification. The higher the count, the more often you will find that feature having a high weight in a logistic regression model that is being trained on a subset of the full feature set. This means that features with count equal to 100 will have a high weight in all 100 model iterations that this feature can be found in. Ultimately, this count is meaningless because the logistic regression models trained on all features have the best performance on the test set. This information will be used in our other models on this bankruptcy data as well.

In [138]:
threshold = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]
#print(final_features) # type: ignore

for thresh in threshold:
    new_feature_set = []
    for feature_index, feature_data in final_features.items():
        if feature_data['count'] >= thresh:
            #print(f"Feature index {feature_index} has a significant count of {feature_data['count']}")
            new_feature_set.append(feature_index)

    X_train_selected = X_train[:, new_feature_set]
    X_test_selected = X_test[:, new_feature_set]

    model_selected = LogisticRegression()
    model_selected.fit(X_train_selected, y_train)

    # Predict on the test set
    y_pred_selected = model_selected.predict(X_test_selected)

    # Calculate and print the evaluation metrics
    accuracy_selected = accuracy_score(y_test, y_pred_selected)
    precision_selected = precision_score(y_test, y_pred_selected)
    recall_selected = recall_score(y_test, y_pred_selected)
    f1_selected = f1_score(y_test, y_pred_selected)

    print(f"Results for Logistic Regression model trained on selected features with significance count of {thresh} or more:")
    print(f"Accuracy: {accuracy_selected:.4f}")
    print(f"Precision: {precision_selected:.4f}")
    print(f"Recall: {recall_selected:.4f}")
    print(f"F1 Score: {f1_selected:.4f}")

Results for Logistic Regression model trained on selected features with significance count of 0 or more:
Accuracy: 0.9699
Precision: 0.5500
Recall: 0.2558
F1 Score: 0.3492
Results for Logistic Regression model trained on selected features with significance count of 5 or more:
Accuracy: 0.9670
Precision: 0.4500
Recall: 0.2093
F1 Score: 0.2857
Results for Logistic Regression model trained on selected features with significance count of 10 or more:
Accuracy: 0.9655
Precision: 0.4000
Recall: 0.1860
F1 Score: 0.2540
Results for Logistic Regression model trained on selected features with significance count of 20 or more:
Accuracy: 0.9677
Precision: 0.4706
Recall: 0.1860
F1 Score: 0.2667
Results for Logistic Regression model trained on selected features with significance count of 30 or more:
Accuracy: 0.9677
Precision: 0.4706
Recall: 0.1860
F1 Score: 0.2667
Results for Logistic Regression model trained on selected features with significance count of 40 or more:
Accuracy: 0.9663
Precision: 0.4