In [1]:
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 1. Read the pre-processed dataset and 2. recode the target in one step:
#    every FraudResult value equal to 22 becomes 1, all other values are kept.
#    NOTE(review): why 22 appears in the label is not visible here - confirm
#    against the preprocessing step that produced this CSV.
processed_file_path = "../data/processed_credit_scoring_data.csv"
df = pd.read_csv(processed_file_path).assign(
    FraudResult=lambda frame: frame["FraudResult"].replace({22: 1})
)

# 3. Separate target and features.
#    Despite its name, this list also contains the target (FraudResult) and
#    Risk_Label (presumably a second label column - verify), so neither of
#    them ends up in the feature matrix.
non_numeric_cols = [
    "TransactionId",
    "BatchId",
    "AccountId",
    "SubscriptionId",
    "CustomerId",
    "TransactionStartTime",
    "FraudResult",
    "Risk_Label",
]
y = df["FraudResult"]  # Target variable
X = df.drop(columns=non_numeric_cols)

# 4. Hold out 20% of the rows for testing; the split is seeded and stratified
#    on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 5. Train and evaluate models
# Seed every stochastic estimator with the same value used for the train/test
# split, so that Restart Kernel -> Run All reproduces the reported metrics and
# the selected hyperparameters instead of drifting from run to run.
RANDOM_STATE = 42


def evaluate_predictions(y_true, y_pred, y_score):
    """Compute the binary-classification metrics reported in this notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions (the output of ``predict``).
    y_score : array-like
        Score of the positive class (column 1 of ``predict_proba``).

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall", "F1 Score" and "ROC-AUC" to
        the corresponding scikit-learn metric value. Precision, recall and F1
        are computed for the positive class (label 1).
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, pos_label=1),
        "Recall": recall_score(y_true, y_pred, pos_label=1),
        "F1 Score": f1_score(y_true, y_pred, pos_label=1),
        "ROC-AUC": roc_auc_score(y_true, y_score),
    }


# Baseline candidates. LogisticRegression is left unseeded: with its default
# solver the fit is deterministic, so only the tree-based models need a seed.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
}

evaluation_results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    evaluation_results[name] = evaluate_predictions(y_test, y_pred, y_pred_proba)

# Display results (one row per model, one column per metric)
results_df = pd.DataFrame(evaluation_results).T
print(results_df)

# 6. Hyperparameter tuning for Random Forest
# 3 x 3 x 3 = 27 parameter combinations, each fitted on 3 CV folds.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
}

rf = RandomForestClassifier(random_state=RANDOM_STATE)
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="roc_auc", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best parameters for Random Forest: {grid_search.best_params_}")

# Evaluate the best model (refit on the full training set by GridSearchCV)
# on the held-out test split, using the same metrics as the baselines.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]
best_rf_metrics = evaluate_predictions(y_test, y_pred, y_pred_proba)

print("Best Random Forest Metrics:")
print(best_rf_metrics)
# Where the tuned model is persisted (directory and full file path)
MODEL_DIR = "../saved_models"
model_path = os.path.join(MODEL_DIR, "best_random_forest.joblib")

# Ensure the target folder exists; no error is raised if it is already there
os.makedirs(MODEL_DIR, exist_ok=True)

# Serialise the best Random Forest estimator with joblib
joblib.dump(best_rf, model_path)

print(f"✅ Best Random Forest model saved at: {model_path}")

Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
                     Accuracy  Precision    Recall  F1 Score   ROC-AUC
Logistic Regression  0.998171   0.576923  0.384615  0.461538  0.988602
Decision Tree        0.999373   0.909091  0.769231  0.833333  0.884577
Random Forest        0.999530   0.941176  0.820513  0.876712  0.999874
Gradient Boosting    0.999425   0.937500  0.769231  0.845070  0.871664
Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
Best Random Forest Metrics:
{'Accuracy': 0.9994773428108503, 'Precision': 0.9142857142857143, 'Recall': 0.8205128205128205, 'F1 Score': 0.8648648648648649, 'ROC-AUC': np.float64(0.9998549685362297)}
✅ Best Random Forest model saved at: ../saved_models/best_random_forest.joblib
