In [1]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:

# Load preprocessed data
X_train, y_train = joblib.load('../data/X_train_y_train.pkl')
X_test, y_test = joblib.load('../data/X_test_y_test.pkl')


In [3]:

# ----------------------------
# Model 1: Logistic Regression
# ----------------------------
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)

log_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

grid_log = GridSearchCV(log_reg, log_params, cv=5, scoring='f1', n_jobs=-1)
grid_log.fit(X_train, y_train)

print("🔍 Best Logistic Regression Params:", grid_log.best_params_)
print("\n📈 Logistic Regression Classification Report:")
print(classification_report(y_test, grid_log.predict(X_test)))
print("ROC-AUC:", roc_auc_score(y_test, grid_log.predict_proba(X_test)[:, 1]))

# Save model
joblib.dump(grid_log.best_estimator_, '../models/logistic_model.pkl')

🔍 Best Logistic Regression Params: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}

📈 Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3403
           1       0.86      0.87      0.87       173

    accuracy                           0.99      3576
   macro avg       0.93      0.93      0.93      3576
weighted avg       0.99      0.99      0.99      3576

ROC-AUC: 0.9886890010344492


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['../models/logistic_model.pkl']

In [4]:

# ----------------------------
# Model 2: Multinomial Naive Bayes
# ----------------------------
# Naive Bayes works well for text data; sparse matrix expected
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

print("\n📈 Naive Bayes Classification Report:")
print(classification_report(y_test, nb_model.predict(X_test)))
print("ROC-AUC:", roc_auc_score(y_test, nb_model.predict_proba(X_test)[:, 1]))

# Save model
joblib.dump(nb_model, '../models/naive_bayes_model.pkl')


📈 Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3403
           1       0.65      0.55      0.60       173

    accuracy                           0.96      3576
   macro avg       0.81      0.77      0.79      3576
weighted avg       0.96      0.96      0.96      3576

ROC-AUC: 0.9259561862280647


['../models/naive_bayes_model.pkl']

In [5]:
# ----------------------------
# Model 3: Random Forest
# ----------------------------
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}

grid_rf = GridSearchCV(rf_model, rf_params, cv=3, scoring='f1', n_jobs=-1)
grid_rf.fit(X_train, y_train)

print("\n📈 Random Forest Classification Report:")
print(classification_report(y_test, grid_rf.predict(X_test)))
print("ROC-AUC:", roc_auc_score(y_test, grid_rf.predict_proba(X_test)[:, 1]))

# Save model
joblib.dump(grid_rf.best_estimator_, '../models/random_forest_model.pkl')

print("✅ All models trained and saved successfully.")



📈 Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       0.96      0.62      0.75       173

    accuracy                           0.98      3576
   macro avg       0.97      0.81      0.87      3576
weighted avg       0.98      0.98      0.98      3576

ROC-AUC: 0.9903791112568135
✅ All models trained and saved successfully.
