# Data Loading
This cell only loads the pre-split train/test arrays from the previously created "data.npz".

In [3]:
import numpy as np
  
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file handle open. Reading the arrays inside a `with` block guarantees the
# handle is closed once the four splits have been pulled into memory.
with np.load('data.npz') as data:
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']

# SKLearn Imports and Data Preprocessing
Importing necessary SKLearn packages and preprocessing the data (Scaling).

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never influences the statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Regular Model
Just the normal Logistic Regression model and its scores.

In [6]:
# Drop any length-1 axes from the label arrays before fitting and scoring.
y_train, y_test = y_train.squeeze(), y_test.squeeze()

# Baseline: LogisticRegression with its constructor defaults.
logistic_model = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# Evaluate the test-set predictions with each metric, in reporting order.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Default Accuracy: {accuracy:.4f}",
    f"Default Precision: {precision:.4f}",
    f"Default Recall: {recall:.4f}",
    f"Default F1 Score: {f1:.4f}",
    sep="\n",
)

#coefficients = logistic_model.coef_
#print("Coefficients:", coefficients)

# y_probs = logistic_model.predict_proba(X_test)[:, 1]

# thresholds = [0.3, 0.4, 0.5, 0.6]
# for thresh in thresholds:
#     y_pred_thresh = np.where(y_probs > thresh, 1, 0)
#     print(f"Threshold: {thresh}")
#     print(f"Accuracy: {accuracy_score(y_test, y_pred_thresh):.4f}")
#     print(f"Precision: {precision_score(y_test, y_pred_thresh):.4f}")
#     print(f"Recall: {recall_score(y_test, y_pred_thresh):.4f}")
#     print(f"F1 Score: {f1_score(y_test, y_pred_thresh):.4f}")

Default Accuracy: 0.9699
Default Precision: 0.5500
Default Recall: 0.2558
Default F1 Score: 0.3492


# Model With Class Weight and Max Iterations (To Optimize for Recall)
A Logistic Regression model optimized for recall. The tradeoff is lower precision, because the model now produces many false positives.

In [8]:
# Recall-oriented variant: balanced class weights and a higher iteration cap.
# This rebinds `logistic_model` and `y_pred` from the baseline cell.
logistic_model = LogisticRegression(class_weight='balanced', max_iter=500).fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# Evaluate the test-set predictions with each metric, in reporting order.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# NOTE(review): the labels still read "Default" although this is the
# class-weighted model; kept unchanged so the recorded output stays valid.
print(
    f"Default Accuracy: {accuracy:.4f}",
    f"Default Precision: {precision:.4f}",
    f"Default Recall: {recall:.4f}",
    f"Default F1 Score: {f1:.4f}",
    sep="\n",
)

# y_probs = logistic_model.predict_proba(X_test)[:, 1]

# thresholds = [0.3, 0.4, 0.5, 0.6]
# for thresh in thresholds:
#     y_pred_thresh = np.where(y_probs > thresh, 1, 0)
#     print(f"Threshold: {thresh}")
#     print(f"Accuracy: {accuracy_score(y_test, y_pred_thresh):.4f}")
#     print(f"Precision: {precision_score(y_test, y_pred_thresh):.4f}")
#     print(f"Recall: {recall_score(y_test, y_pred_thresh):.4f}")
#     print(f"F1 Score: {f1_score(y_test, y_pred_thresh):.4f}")

def custom_score(y_true, y_pred, recall_weight=0.75, accuracy_weight=0.25):
    """Blend recall and accuracy into a single weighted score.

    Args:
        y_true: Ground-truth labels, passed straight to the sklearn metrics.
        y_pred: Predicted labels, passed straight to the sklearn metrics.
        recall_weight: Multiplier applied to the recall term (default 0.75).
        accuracy_weight: Multiplier applied to the accuracy term (default 0.25).

    Returns:
        ``recall * recall_weight + accuracy * accuracy_weight``. With the
        default weights this is the same value the function produced when
        the weights were hard-coded.
    """
    recall_value = recall_score(y_true, y_pred)
    accuracy_value = accuracy_score(y_true, y_pred)
    return recall_value * recall_weight + accuracy_value * accuracy_weight


# Score the predictions of the most recently fitted model with the default
# 0.75 / 0.25 recall-to-accuracy weighting.
final_score = custom_score(y_test, y_pred)

print(f"Final custom score on test set: {final_score:.4f}")

Default Accuracy: 0.8710
Default Precision: 0.1692
Default Recall: 0.7907
Default F1 Score: 0.2787
Final custom score on test set: 0.8108


# Grid Search
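The grid search below finds the regularization strength (C) and the maximum number of iterations (max_iter) that give the best cross-validated recall for the Logistic Regression model, and then repeats the search for accuracy, F1, and precision. Expect ConvergenceWarnings whenever the solver does not converge within max_iter.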

## Best Hyperparameters Based on Testing with Cross-Validation (cv=5)
- Recall: 
    - class_weight='balanced'
    - C=0.01
    - max_iter=100
- Accuracy:
    - class_weight=None
    - C=0.01
    - max_iter=100
- F1:
    - class_weight=None
    - C=10
    - max_iter=500
- Precision:
    - class_weight=None
    - C=0.01
    - max_iter=100

In [9]:
# grid = {'C': [0.01, 0.1, 1, 10], 'max_iter': [100, 500, 1000]}

# search = GridSearchCV(LogisticRegression(class_weight='balanced'), grid, scoring='recall', cv=5)
# search.fit(X_train, y_train)

# print(f"Best parameters: {search.best_params_}")
# print(f"Best recall from CV: {search.best_score_:.4f}")

# scores = ['accuracy', 'f1', 'precision']
# for score in scores:
#     search = GridSearchCV(LogisticRegression(), grid, scoring=score, cv=5)
#     search.fit(X_train, y_train)

#     print(f"Best parameters: {search.best_params_}")
#     print(f"Best {score} from CV: {search.best_score_:.4f}")

# Ensemble Method
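This method has returned the best overall balance so far: it has the highest F1 score, while its accuracy, precision, and recall each fall between those of the two single models above.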

## Best Hyperparameters for each Model
Many values of C, class_weight, and max_iter were tested for each model in the ensemble.
- Recall:
    - C=0.1
    - class_weight='balanced'
    - max_iter=100
- Accuracy:
    - C=0.1
    - class_weight=None
    - max_iter=100

## Best Parameters in the Ensemble
Different values of voting and weights were tried to maximize recall, precision, and accuracy. Precision and recall trade off against each other: predicting more positives raises recall, but many of those extra positives are false positives, which lowers precision.

In [52]:
# Two logistic-regression members that share C and max_iter and differ only
# in class weighting: one balanced, one unweighted.
model_recall = LogisticRegression(C=0.1, class_weight='balanced', max_iter=100)
model_accuracy = LogisticRegression(C=0.1, class_weight=None, max_iter=100)

# Combine both members with soft voting and equal weights, then fit.
ensemble_model = VotingClassifier(
    estimators=[('recall_opt', model_recall), ('accuracy_opt', model_accuracy)],
    voting='soft',
    weights=[1, 1],
).fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate the ensemble's test-set predictions with each metric, in reporting order.
ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1 = (
    metric(y_test, y_pred_ensemble)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Ensemble Accuracy: {ensemble_accuracy:.4f}",
    f"Ensemble Precision: {ensemble_precision:.4f}",
    f"Ensemble Recall: {ensemble_recall:.4f}",
    f"Ensemble F1 Score: {ensemble_f1:.4f}",
    sep="\n",
)

Ensemble Accuracy: 0.9567
Ensemble Precision: 0.3788
Ensemble Recall: 0.5814
Ensemble F1 Score: 0.4587
