In [1]:
import sys
# Add the sibling ../src directory to the module search path so that modules
# stored there can be imported from this notebook.
sys.path.append('../src')

In [8]:
# Diagnostic: list the string-typed (object dtype) columns present in the
# training features. X_train is created by the train/test split cell, which sits
# further down the notebook, so on a fresh kernel this cell can run before the
# variable exists; report that instead of raising NameError.
if "X_train" in globals():
    print(X_train.select_dtypes(include='object').columns)
else:
    print("X_train is not defined yet - run the data loading/split cell first.")


Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'ProductId', 'TransactionStartTime'],
      dtype='object')


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed feature table from disk.
df = pd.read_csv("../data/processed/X_transformed_named.csv")

# Identifier, currency and timestamp columns that must not be used as features.
# errors="ignore" skips any of these names that are absent from the CSV.
drop_cols = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'ProductId',
    'TransactionStartTime',
]
df = df.drop(columns=drop_cols, errors="ignore")

# Separate the label from the features; pop() removes the column from df,
# so what is left in df is the feature matrix.
y = df.pop("is_high_risk")
X = df

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels aligned with ``X_test``.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1" and
        "ROC_AUC" (in that order), each mapped to the corresponding
        scikit-learn metric computed on the test data.
    """
    predicted_labels = model.predict(X_test)
    # ROC AUC is computed from scores rather than hard labels: take the second
    # column (index 1) of predict_proba - presumably the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics that compare hard label predictions against the truth.
    label_metrics = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1": f1_score,
    }
    scores = {
        metric_name: metric_fn(y_test, predicted_labels)
        for metric_name, metric_fn in label_metrics.items()
    }
    scores["ROC_AUC"] = roc_auc_score(y_test, positive_scores)
    return scores

# Logistic Regression
# Tune regularisation strength and solver with a seeded randomized search:
# 4 of the 8 possible (C, solver) combinations are sampled, each scored by
# ROC AUC under 3-fold cross-validation on the training split.
lr = LogisticRegression(max_iter=5000)
lr_params = dict(
    C=[0.01, 0.1, 1, 10],
    solver=["liblinear", "lbfgs"],
)
lr_search = RandomizedSearchCV(
    estimator=lr,
    param_distributions=lr_params,
    n_iter=4,
    cv=3,
    scoring='roc_auc',
    random_state=42,
)
lr_search.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
lr_metrics = evaluate_model(lr_search.best_estimator_, X_test, y_test)
print("🔎 Logistic Regression:", lr_metrics)


🔎 Logistic Regression: {'Accuracy': 0.9053990487639157, 'Precision': 0.7041666666666667, 'Recall': 0.3069936421435059, 'F1': 0.4275774826059456, 'ROC_AUC': 0.911310484918994}


In [4]:
import mlflow.sklearn

# Record the tuned logistic regression as a single MLflow run: the selected
# hyperparameters, the held-out test metrics, and the fitted estimator itself.
with mlflow.start_run(run_name="logistic_regression_model"):
    mlflow.log_params(lr_search.best_params_)
    mlflow.log_metrics(lr_metrics)
    mlflow.sklearn.log_model(
        sk_model=lr_search.best_estimator_,
        artifact_path="model",
    )
    




In [5]:
# Random Forest
# Seed the forest itself, not only the search: without random_state every fit
# draws different bootstrap samples and feature subsets, so the tuned model and
# its metrics would change from run to run. 42 matches the seed used for the
# train/test split and for both randomized searches.
rf = RandomForestClassifier(random_state=42)
rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# Sample 5 hyperparameter combinations, each scored by ROC AUC under 3-fold
# cross-validation on the training split.
rf_search = RandomizedSearchCV(rf, rf_params, n_iter=5, cv=3, scoring='roc_auc', random_state=42)
rf_search.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out test split.
# NOTE(review): a previously recorded run of this cell reported near-perfect
# test metrics (ROC AUC ~ 0.9998) - worth confirming that no feature leaks the
# target before trusting these numbers.
rf_metrics = evaluate_model(rf_search.best_estimator_, X_test, y_test)
print("🌲 Random Forest:", rf_metrics)

# Save: record the selected hyperparameters, test metrics and fitted model
# as one MLflow run.
with mlflow.start_run(run_name="random_forest_model"):
    mlflow.log_params(rf_search.best_params_)
    mlflow.log_metrics(rf_metrics)
    mlflow.sklearn.log_model(rf_search.best_estimator_, "model")




🌲 Random Forest: {'Accuracy': 0.9963413996759526, 'Precision': 0.9819168173598554, 'Recall': 0.9863760217983651, 'F1': 0.9841413683733575, 'ROC_AUC': 0.9997649405765164}




In [7]:
import os

import joblib

# Persist both tuned estimators as pickles under ../models. The directory is
# created first because joblib.dump does not create missing parent directories
# and would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs("../models", exist_ok=True)
joblib.dump(lr_search.best_estimator_, "../models/logistic_model.pkl")
joblib.dump(rf_search.best_estimator_, "../models/rf_model.pkl")

['../models/rf_model.pkl']

In [8]:

## Model Comparison and Registration
# --------------------------------

# Build a table with one row per model and one column per metric.
metrics_by_model = {
    "Logistic Regression": lr_metrics,
    "Random Forest": rf_metrics,
}
model_comparison = pd.DataFrame(metrics_by_model).T

print("\nModel Performance Comparison:")
print(model_comparison)

# The winner is the row with the highest ROC AUC (swap the column name to rank
# by a different metric). Anything that is not the logistic regression is
# treated as the random forest.
best_model_name = model_comparison['ROC_AUC'].idxmax()
lr_wins = best_model_name == "Logistic Regression"
winning_search, winning_metrics = (
    (lr_search, lr_metrics) if lr_wins else (rf_search, rf_metrics)
)
best_model = winning_search.best_estimator_

print(f"\nBest model: {best_model_name}")

## Register Best Model in MLflow
# ------------------------------

# A dedicated run holds the winner's hyperparameters, metrics and model, and
# registers the model under the name "FraudDetectionModel".
with mlflow.start_run(run_name="best_model_registration") as run:
    # Hyperparameters and test metrics of the winning model
    mlflow.log_params(winning_search.best_params_)
    mlflow.log_metrics(winning_metrics)

    # Log the fitted estimator and register it in the model registry
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="fraud_detection_model",
        registered_model_name="FraudDetectionModel"
    )

    # Free-text note attached to the run
    mlflow.set_tag(
        "mlflow.note.content",
        f"Best performing {best_model_name} model for fraud detection",
    )

print("\n✅ Best model registered in MLflow Model Registry")

# Keep a local pickle of the winning estimator as well
joblib.dump(best_model, "../models/best_model.pkl")
print("Best model saved locally at ../models/best_model.pkl")




Model Performance Comparison:
                     Accuracy  Precision    Recall        F1   ROC_AUC
Logistic Regression  0.905399   0.704167  0.306994  0.427577  0.911310
Random Forest        0.996341   0.981917  0.986376  0.984141  0.999765

Best model: Random Forest


Registered model 'FraudDetectionModel' already exists. Creating a new version of this model...
Created version '2' of model 'FraudDetectionModel'.



✅ Best model registered in MLflow Model Registry
Best model saved locally at ../models/best_model.pkl
