In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import IsolationForest, GradientBoostingClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                           precision_score, recall_score, f1_score, accuracy_score)
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the feature CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('processeddataset/final_feature_paySim.csv')
df.head()

In [None]:
# Separate the two classes, then draw as many non-fraud rows as there are
# fraud rows (fixed seed) so the two groups have equal size.
fraud = df.loc[df['isFraud'] == 1]
non_fraud = df.loc[df['isFraud'] == 0].sample(n=len(fraud), random_state=42)

In [None]:
# Stack the fraud rows on top of the sampled non-fraud rows and renumber the
# index from 0. NOTE: the X / y cell further down selects from `df`, not from
# this balanced frame.
new_df = pd.concat([fraud, non_fraud], ignore_index=True)
new_df.head()

In [None]:
# Columns used as model inputs; the label column `isFraud` is kept separately
# in `y`. Both are taken from the full `df` (not the balanced `new_df`).
feature_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'hour',
    'is_weekend',
    'high_risk_hour',
    'high_risk_type',
    'large_amount_flag',
    'zero_balance_orig',
    'zero_balance_dest',
    'balance_ratio_orig',
    'balance_ratio_dest',
    'cust_avg_amt',
    'cust_std_amt',
    'cust_txn_count',
]
X = df.loc[:, feature_columns]
y = df.loc[:, 'isFraud']

In [None]:
# Show how many rows carry each label value, and the mean of `y`
# (the share of fraud rows, assuming labels are 0/1 — the cells above filter on 0 and 1).
print("Original class distribution:", Counter(y))
print(f"Fraud rate: {y.mean():.6f}")

In [None]:
# 70/30 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Display the shapes of the four split parts, in split order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
def calculate_metrics(y_true, y_pred, y_scores, model_name):
    """Compute, print and return binary-classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be 0 (negative) / 1 (positive).
    y_pred : array-like
        Predicted labels using the same 0/1 encoding.
    y_scores : array-like
        Continuous scores passed to ``roc_auc_score``; larger values must mean
        "more likely positive" for the AUC to be meaningful.
    model_name : str
        Label used in the printed summary and stored in the result.

    Returns
    -------
    dict
        Scalar metrics, the 2x2 confusion matrix, its four cells as plain
        Python ints, the classification report (as a dict), and the
        ``y_pred`` / ``y_scores`` inputs under ``'predictions'`` / ``'scores'``.

    Notes
    -----
    ``roc_auc`` is NaN when ``y_true`` holds fewer than two distinct labels,
    because the ROC curve is undefined in that case.
    """
    # Basic metrics. zero_division=0 makes the "no positive predictions /
    # no positive labels" case an explicit 0 instead of a warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # ROC-AUC needs both classes in y_true; report NaN rather than crash.
    if len(np.unique(y_true)) < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_true, y_scores)

    # Pin the label order so the matrix is always 2x2, even when a class is
    # absent from this particular sample; otherwise the unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Plain ints (not numpy.int64) so callers that filter on int/float keep them.
    tn, fp, fn, tp = (int(cell) for cell in cm.ravel())

    # Business metrics
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
    true_positive_rate = recall  # Same as recall

    # Classification report
    cr = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    results = {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'false_positive_rate': false_positive_rate,
        'false_negative_rate': false_negative_rate,
        'true_positive_rate': true_positive_rate,
        'true_positives': tp,
        'false_positives': fp,
        'true_negatives': tn,
        'false_negatives': fn,
        'classification_report': cr,
        'predictions': y_pred,
        'scores': y_scores
    }

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"False Negative Rate: {false_negative_rate:.4f}")
    print(f"Confusion Matrix:\n{cm}")

    return results


In [None]:
def log_model_to_mlflow(model, metrics, model_name, params, input_example=None):
    """Log parameters, metrics, a confusion-matrix plot and the model to MLflow.

    Logging goes to whichever MLflow run is active when this is called.

    Parameters
    ----------
    model : fitted scikit-learn estimator
        Logged through ``mlflow.sklearn.log_model``.
    metrics : dict
        Result dictionary with the keys read below (``'accuracy'``,
        ``'precision'``, ..., ``'confusion_matrix'``, ``'classification_report'``),
        i.e. the structure returned by ``calculate_metrics``.
    model_name : str
        Used in the plot title and, with spaces replaced by underscores, in the
        artifact names.
    params : dict
        Parameter name -> value pairs to record.
    input_example : optional
        Example input stored with the model. When omitted, the first five rows
        of the notebook-level ``X_train`` are used, as before.
    """
    scalar_metric_keys = (
        "accuracy",
        "precision",
        "recall",
        "f1_score",
        "roc_auc",
        "false_positive_rate",
        "false_negative_rate",
        "true_positive_rate",
        "true_positives",
        "false_positives",
        "true_negatives",
        "false_negatives",
    )
    artifact_stub = model_name.replace(' ', '_')

    # Log parameters and scalar metrics in one call each.
    mlflow.log_params(params)
    mlflow.log_metrics({key: float(metrics[key]) for key in scalar_metric_keys})

    # Log classification report as JSON
    mlflow.log_dict(metrics['classification_report'], "classification_report.json")

    # Log confusion matrix as plot; the figure is closed even if plotting or
    # logging raises, so repeated calls do not accumulate open figures.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(metrics['confusion_matrix'], annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - {model_name}')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')
        fig.tight_layout()
        mlflow.log_figure(fig, f"confusion_matrix_{artifact_stub}.png")
    finally:
        plt.close(fig)

    # Log model
    if input_example is None:
        # Backward-compatible default: relies on the notebook-level X_train.
        input_example = X_train.head(5)
    mlflow.sklearn.log_model(model, name=f"model_{artifact_stub}",
                             input_example=input_example)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
mlflow.set_experiment("Isolation_Forest_Hyperparameter_Tuning_USing_RandomizedSearchCV")
def randomized_search_tuning(X_train, X_test, y_test, n_iter=5):
    # Define parameter distribution
    param_dist = {
        'n_estimators': randint(50, 300),
        'contamination': uniform(0.001, 0.1),  # 0.1% to 10%
        'max_samples': uniform(0.5, 0.5),  # 0.5 to 1.0
        'max_features': uniform(0.5, 0.5)   # 0.5 to 1.0
    }
    
    # Create Isolation Forest model
    iso_forest = IsolationForest(random_state=42, verbose=0)
    
    # Custom scorer for anomaly detection
    from sklearn.metrics import make_scorer, f1_score
    scorer = make_scorer(f1_score)
    
    with mlflow.start_run(run_name="RandomizedSearch_IF"):
        # Perform randomized search
        random_search = RandomizedSearchCV(
            estimator=iso_forest,
            param_distributions=param_dist,
            n_iter=n_iter,
            scoring=scorer,
            cv=3,
            random_state=42,
            n_jobs=-1,
            verbose=1
        )
        
        # Fit the model
        random_search.fit(X_train)
        
        # Log best parameters
        mlflow.log_params(random_search.best_params_)
        
        # Get best model
        best_model = random_search.best_estimator_
        
        # Evaluate on test set
        y_pred = best_model.predict(X_test) # type: ignore
        y_scores = best_model.decision_function(X_test) # type: ignore
        y_pred_binary = (y_pred == -1).astype(int)
        
        # Calculate and log metrics
        test_results = calculate_metrics(y_test, y_pred_binary, y_scores, "Best Isolation Forest")
        for metric_name, metric_value in test_results.items():
            if isinstance(metric_value, (int, float)):
                mlflow.log_metric(f"test_{metric_name}", metric_value)
        
        # Log the best model
        mlflow.sklearn.log_model(best_model, "best_model") # type: ignore
        
        # Log search results
        mlflow.log_metric("best_cv_score", random_search.best_score_)
        
        return best_model, random_search.best_params_, test_results

In [None]:
best_model, best_params_, test_results = randomized_search_tuning(X_train, X_test, y_test)