In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc, f1_score, confusion_matrix
import joblib
import os
from imblearn.over_sampling import SMOTE

In [2]:
# Locations of the input data and of every artefact this notebook writes
DATA_DIR = os.path.join("../", "data")
OUTPUT_DIR = os.path.join("../", "outputs")
MODEL_DIR, METRICS_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("models", "metrics")
)

# Both artefact folders must exist before anything is saved into them
for _artifact_dir in (MODEL_DIR, METRICS_DIR):
    os.makedirs(_artifact_dir, exist_ok=True)

In [3]:
# The two processed transaction tables are read from the outputs folder
fraud_data, creditcard_data = (
    pd.read_csv(os.path.join(OUTPUT_DIR, csv_name))
    for csv_name in ("processed_fraud_data.csv", "processed_creditcard_data.csv")
)

# The IP-range -> country lookup table is read from the data folder
ip_data = pd.read_csv(os.path.join(DATA_DIR, "IpAddress_to_Country.csv"))

In [4]:
def prepare_data(df, dataset_name):
    """Split a labelled frame into train/test parts and oversample the training part.

    Args:
        df: DataFrame containing the feature columns plus a ``class`` target column.
        dataset_name: Label used only in the printed class-distribution report.

    Returns:
        Tuple ``(X_train_resampled, X_test, y_train_resampled, y_test)``. The
        test part is returned exactly as produced by the split; only the
        training part goes through SMOTE.
    """
    features = df.drop(columns=['class'])
    target = df['class']

    # Stratified 80/20 hold-out with a fixed seed
    split_parts = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target)
    X_train, X_test, y_train, y_test = split_parts

    # The oversampler is fitted on the training part only
    oversampler = SMOTE(random_state=42)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)

    class_shares = pd.Series(y_balanced).value_counts(normalize=True)
    print(f"\nClass distribution after SMOTE in {dataset_name} (training):\n", class_shares)

    return X_balanced, X_test, y_balanced, y_test

In [5]:
def _ip_to_country(ip_values, ip_data):
    """Map numeric IP addresses to country names with a sorted range lookup.

    Args:
        ip_values: Array-like of numeric IP addresses.
        ip_data: DataFrame with ``lower_bound_ip_address``,
            ``upper_bound_ip_address`` and ``country`` columns.

    Returns:
        Object ndarray, one entry per input address, holding the country of the
        range containing it, or ``'Unknown'`` when no range matches.

    Note:
        Assumes the IP ranges do not overlap; for each address only the range
        with the greatest lower bound not exceeding it is checked.
    """
    ranges = ip_data.sort_values('lower_bound_ip_address', kind='stable')
    lower = ranges['lower_bound_ip_address'].to_numpy(dtype='float64')
    upper = ranges['upper_bound_ip_address'].to_numpy(dtype='float64')
    names = ranges['country'].to_numpy(dtype=object)

    ips = np.asarray(ip_values, dtype='float64')
    countries = np.full(ips.shape, 'Unknown', dtype=object)
    if lower.size == 0:
        return countries

    # Index of the last range whose lower bound is <= ip (-1 when there is none)
    pos = np.searchsorted(lower, ips, side='right') - 1
    # Clip so the upper-bound lookup stays in range; `pos >= 0` masks the clipped slots
    safe_pos = np.clip(pos, 0, lower.size - 1)
    matched = (pos >= 0) & (ips <= upper[safe_pos])
    countries[matched] = names[safe_pos[matched]]
    return countries


def preprocess_fraud_data(fraud_data, ip_data):
    """Turn the fraud transaction table into a numeric, model-ready frame.

    Works on a copy, so the caller's ``fraud_data`` is left untouched.

    Steps: parse the signup/purchase timestamps, derive ``time_since_signup``
    (hours), ``purchase_hour`` and ``purchase_day``, attach a ``country`` from
    the IP-range table, one-hot encode ``source``/``browser``/``sex``/``country``
    (first level dropped), drop the timestamp and identifier columns, and fill
    remaining missing values with column means.

    Args:
        fraud_data: DataFrame with ``signup_time``, ``purchase_time``,
            ``user_id``, ``device_id``, ``ip_address``, ``source``, ``browser``
            and ``sex`` columns (any other columns are passed through).
        ip_data: DataFrame with ``lower_bound_ip_address``,
            ``upper_bound_ip_address`` and ``country`` columns.

    Returns:
        A new DataFrame with the engineered and encoded features.
    """
    # Copy first: every assignment below would otherwise modify the caller's frame
    fraud_data = fraud_data.copy()

    # Convert timestamps to datetime
    fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
    fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

    # Create time-based features
    signup_to_purchase = fraud_data['purchase_time'] - fraud_data['signup_time']
    fraud_data['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600.0  # hours
    fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
    fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.dayofweek

    # Vectorised range lookup instead of scanning every IP range per transaction
    fraud_data['country'] = _ip_to_country(fraud_data['ip_address'], ip_data)

    # Encode categorical variables
    categorical_cols = ['source', 'browser', 'sex', 'country']
    fraud_data = pd.get_dummies(
        fraud_data, columns=categorical_cols, drop_first=True)

    # Drop non-numeric columns
    fraud_data = fraud_data.drop(
        columns=['signup_time', 'purchase_time', 'user_id', 'device_id', 'ip_address'])

    # Handle missing values; numeric_only keeps this from raising if a
    # non-numeric column is ever passed through
    fraud_data = fraud_data.fillna(fraud_data.mean(numeric_only=True))

    return fraud_data

In [6]:
def train_and_evaluate(X_train, X_test, y_train, y_test, dataset_name):
    """Fit a logistic regression and a random forest, then score and persist both.

    For each model this fits on the training data, predicts on the test data,
    computes the area under the precision-recall curve and the F1 score, saves
    and shows a confusion-matrix heatmap, and dumps the fitted model with joblib.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        dataset_name: Prefix used in plot titles and in every output file name.

    Returns:
        DataFrame with one row per model and columns ``model``, ``AUC-PR`` and
        ``F1-Score``; the same table is written to
        ``<dataset_name>_metrics.csv`` in ``METRICS_DIR``.
    """
    candidates = {
        'logistic': LogisticRegression(random_state=42, max_iter=1000),
        'random_forest': RandomForestClassifier(random_state=42, n_estimators=100),
    }

    score_rows = []

    for label, estimator in candidates.items():
        estimator.fit(X_train, y_train)

        # Positive-class probabilities feed the PR curve; hard labels feed F1 and the matrix
        positive_scores = estimator.predict_proba(X_test)[:, 1]
        predicted_labels = estimator.predict(X_test)

        pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
        pr_area = auc(pr_recall, pr_precision)
        f1_value = f1_score(y_test, predicted_labels)

        # Confusion-matrix heatmap: saved to disk, displayed, then released
        matrix = confusion_matrix(y_test, predicted_labels)
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f"Confusion Matrix - {label} ({dataset_name})")
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        figure_path = os.path.join(
            METRICS_DIR, f"{dataset_name}_{label}_confusion_matrix.png")
        fig.savefig(figure_path)
        plt.show()
        plt.close(fig)

        score_rows.append({'model': label, 'AUC-PR': pr_area, 'F1-Score': f1_value})

        # Persist the fitted estimator
        model_path = os.path.join(MODEL_DIR, f"{dataset_name}_{label}_model.pkl")
        joblib.dump(estimator, model_path)

    # One CSV per dataset holding the metrics of every model
    results_df = pd.DataFrame(score_rows)
    metrics_path = os.path.join(METRICS_DIR, f"{dataset_name}_metrics.csv")
    results_df.to_csv(metrics_path, index=False)

    return results_df

In [None]:
def justify_model_selection(fraud_results, creditcard_results):
    """Print both metric tables, then save and print a model-choice rationale.

    Args:
        fraud_results: Metrics table for the fraud dataset; only printed.
        creditcard_results: Metrics table for the credit-card dataset; only printed.

    Note:
        The rationale is a fixed template written to
        ``model_selection_justification.txt`` in ``METRICS_DIR``; it is not
        derived from the metric values passed in.
    """
    print("\nModel Performance Comparison:")
    for heading, table in (
        ("\nFraud_Data Metrics:\n", fraud_results),
        ("\ncreditcard Metrics:\n", creditcard_results),
    ):
        print(heading, table)

    # Template rationale (adjust the wording once the metrics have been reviewed)
    rationale = """
    Model Selection Justification:
    - Logistic Regression: Provides interpretable results, suitable for explaining fraud predictions to stakeholders. However, it may underperform on complex patterns due to its linear nature.
    - Random Forest: Captures non-linear relationships and interactions between features, likely performing better on imbalanced data due to its ensemble nature.
    Based on AUC-PR and F1-Score, Random Forest is selected as the best model if it shows higher performance, balancing false positives (to avoid customer inconvenience) and false negatives (to minimize financial loss). Logistic Regression is preferred if interpretability is prioritized for business needs.
    """
    report_path = os.path.join(METRICS_DIR, "model_selection_justification.txt")
    with open(report_path, "w") as report_file:
        report_file.write(rationale)
    print(rationale)

: 

In [None]:
# Bind the engineered frame to a new name: rebinding `fraud_data` to its own
# processed form made this cell fail on a second run, because the timestamp
# columns that preprocessing reads are dropped from the frame it returns.
fraud_features = preprocess_fraud_data(fraud_data, ip_data)

# Stratified split + SMOTE on the training part, once per dataset
fraud_X_train, fraud_X_test, fraud_y_train, fraud_y_test = prepare_data(
    fraud_features, "Fraud_Data")
creditcard_X_train, creditcard_X_test, creditcard_y_train, creditcard_y_test = prepare_data(
    creditcard_data, "creditcard")

# Train and evaluate models
fraud_results = train_and_evaluate(
    fraud_X_train, fraud_X_test, fraud_y_train, fraud_y_test, "fraud_data")
creditcard_results = train_and_evaluate(
    creditcard_X_train, creditcard_X_test, creditcard_y_train, creditcard_y_test, "creditcard")

# Justify model selection
justify_model_selection(fraud_results, creditcard_results)