In [None]:
# 📦 Import Required Libraries
import os
import warnings

import mlflow
import mlflow.sklearn
import mysql.connector
import numpy as np
import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

warnings.filterwarnings('ignore')

# 🔧 MLflow Configuration
# Point the MLflow client at the tracking server and select the experiment
# that the runs started later in this notebook are recorded under.
MLFLOW_TRACKING_URI = "http://localhost:5002"
MLFLOW_EXPERIMENT_NAME = "fraud-detection-training"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Read the settings back from MLflow so the printout shows what is in effect.
active_tracking_uri = mlflow.get_tracking_uri()
active_experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

print("✅ Libraries imported successfully!")
print(f"🎯 MLflow tracking URI: {active_tracking_uri}")
print(f"📊 Current experiment: {active_experiment}")


In [None]:
# 💾 Data Loading Functions
def load_data_from_postgres():
    """Load transaction data from PostgreSQL, falling back to synthetic data.

    Connects with psycopg2 and runs a query that generates 10,000 random
    transactions on the server (``generate_series`` / ``random()``) and labels
    each one with a rule-based ``is_fraud`` flag.

    Connection settings default to the local development values and can be
    overridden with the ``POSTGRES_HOST``, ``POSTGRES_PORT``, ``POSTGRES_DB``,
    ``POSTGRES_USER`` and ``POSTGRES_PASSWORD`` environment variables, so real
    credentials never have to be written into the notebook.

    Returns:
        pandas.DataFrame: on success, the query result with columns
        ``transaction_id``, ``customer_id``, ``merchant_id``, ``amount``,
        ``merchant_category``, ``timestamp`` and ``is_fraud``. If connecting
        or querying fails for any reason, the error is printed and the result
        of ``generate_synthetic_data()`` is returned instead; that frame has
        ``hour`` / ``day_of_week`` columns rather than ``timestamp``.
    """
    # Load sample transaction data (you can modify this query)
    query = """
        SELECT 
            transaction_id,
            customer_id,
            merchant_id,
            amount,
            merchant_category,
            timestamp,
            CASE 
                WHEN amount > 1000 AND merchant_category = 'online' THEN 1 
                WHEN amount > 5000 THEN 1
                ELSE 0 
            END as is_fraud
        FROM (
            SELECT 
                'txn_' || generate_series(1, 10000) as transaction_id,
                'cust_' || (random() * 1000)::int as customer_id,
                'merch_' || (random() * 500)::int as merchant_id,
                (random() * 10000)::numeric(10,2) as amount,
                (ARRAY['retail', 'online', 'grocery', 'gas', 'restaurant'])[floor(random() * 5 + 1)] as merchant_category,
                NOW() - (random() * interval '30 days') as timestamp
        ) t
        """

    try:
        conn = psycopg2.connect(
            host=os.environ.get("POSTGRES_HOST", "localhost"),
            port=int(os.environ.get("POSTGRES_PORT", "5432")),
            database=os.environ.get("POSTGRES_DB", "bigdata"),
            user=os.environ.get("POSTGRES_USER", "bigdata_user"),
            # Development default only; set POSTGRES_PASSWORD for anything else.
            password=os.environ.get("POSTGRES_PASSWORD", "bigdata_pass"),
        )
        try:
            df = pd.read_sql(query, conn)
        finally:
            # Release the connection even when the query itself fails;
            # previously a failing read_sql left it open.
            conn.close()

        print(f"✅ Loaded {len(df)} transactions from PostgreSQL")
        return df

    except Exception as e:
        # Deliberate best-effort: the notebook stays runnable without a
        # database by switching to locally generated data.
        print(f"❌ Error loading from PostgreSQL: {e}")
        return generate_synthetic_data()

def generate_synthetic_data():
    """Build a reproducible synthetic transaction table with fraud labels.

    Seeds NumPy's global random state with 42 and draws 10,000 transactions.
    ``is_fraud`` is 1 when any of these rules holds, else 0:

    * amount above 1000 at an ``online`` merchant
    * amount above 5000
    * amount above 500 with ``hour`` below 6

    Returns:
        pandas.DataFrame: columns ``transaction_id``, ``customer_id``,
        ``merchant_id``, ``amount``, ``merchant_category``, ``hour``,
        ``day_of_week`` and ``is_fraud``.
    """
    np.random.seed(42)
    n_samples = 10000
    categories = ['retail', 'online', 'grocery', 'gas', 'restaurant']

    # The draws below consume the seeded random stream in a fixed order;
    # reordering them would change the generated data.
    transaction_ids = [f'txn_{i}' for i in range(n_samples)]
    customer_ids = [f'cust_{np.random.randint(1, 1000)}' for _ in range(n_samples)]
    merchant_ids = [f'merch_{np.random.randint(1, 500)}' for _ in range(n_samples)]
    amounts = np.random.exponential(100, n_samples)
    merchant_categories = np.random.choice(categories, n_samples)
    hours = np.random.randint(0, 24, n_samples)
    weekdays = np.random.randint(0, 7, n_samples)

    synthetic = pd.DataFrame({
        'transaction_id': transaction_ids,
        'customer_id': customer_ids,
        'merchant_id': merchant_ids,
        'amount': amounts,
        'merchant_category': merchant_categories,
        'hour': hours,
        'day_of_week': weekdays,
    })

    # Create fraud labels based on business rules
    large_online = synthetic['amount'].gt(1000) & synthetic['merchant_category'].eq('online')
    very_large = synthetic['amount'].gt(5000)
    early_hours_large = synthetic['hour'].lt(6) & synthetic['amount'].gt(500)
    synthetic['is_fraud'] = (large_online | very_large | early_hours_large).astype(int)

    fraud_rate = synthetic['is_fraud'].mean()
    print(f"✅ Generated {len(synthetic)} synthetic transactions")
    print(f"📊 Fraud rate: {fraud_rate:.2%}")
    return synthetic

# Load the data
print("🔄 Loading transaction data...")
df = load_data_from_postgres()

# The PostgreSQL query and the synthetic fallback return different schemas,
# so check here for the columns every later cell relies on instead of
# failing further down with a less obvious error.
required_columns = {'customer_id', 'merchant_id', 'amount', 'merchant_category', 'is_fraud'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Loaded data is missing columns: {sorted(missing_columns)}"
assert len(df) > 0, "Loaded data contains no transactions"

df.head()


In [None]:
# 🔧 Feature Engineering Pipeline
def create_features(df):
    """Create features for fraud detection.

    Works on a copy of ``df``; the input frame is not modified.

    Features added:
        * ``amount_log``, ``amount_zscore``, ``is_high_amount`` (amount above
          the frame's 95th percentile).
        * ``hour`` / ``day_of_week`` derived from ``timestamp`` when that
          column exists; otherwise existing ``hour`` / ``day_of_week``
          columns are used as they are.
        * ``is_weekend`` (``day_of_week >= 5``) and ``is_night``
          (``hour < 6`` or ``hour > 22``) whenever ``hour`` and
          ``day_of_week`` are available from either source.
        * ``merchant_category_encoded`` via a freshly fitted ``LabelEncoder``.
        * ``customer_frequency`` / ``merchant_frequency`` (occurrence counts
          within the frame) and the matching ``is_new_*`` flags.

    Note:
        Mean, standard deviation, quantile and frequency counts are computed
        over whatever frame is passed in. If that frame includes rows later
        used for evaluation, those statistics see the evaluation rows too.

    Args:
        df: Transactions with at least ``amount``, ``merchant_category``,
            ``customer_id`` and ``merchant_id`` columns, plus either
            ``timestamp`` or ``hour`` / ``day_of_week`` for the time features.

    Returns:
        tuple: ``(features_frame, fitted_label_encoder)``.
    """
    df = df.copy()

    # Amount-based features
    df['amount_log'] = np.log1p(df['amount'])
    amount_std = df['amount'].std()
    if pd.notna(amount_std) and amount_std > 0:
        df['amount_zscore'] = (df['amount'] - df['amount'].mean()) / amount_std
    else:
        # Constant amounts or a single row: the z-score is undefined, so use
        # 0 rather than propagating NaN/inf into the model inputs.
        df['amount_zscore'] = 0.0
    df['is_high_amount'] = (df['amount'] > df['amount'].quantile(0.95)).astype(int)

    # Time-based features: derive hour / day_of_week from the timestamp when
    # there is one.
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek

    # The weekend / night flags only need hour and day_of_week. Computing them
    # outside the timestamp branch covers data that ships those two columns
    # directly (the synthetic fallback), which previously ended up without
    # is_weekend / is_night.
    if 'hour' in df.columns and 'day_of_week' in df.columns:
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df['is_night'] = ((df['hour'] < 6) | (df['hour'] > 22)).astype(int)

    # Merchant category encoding
    le_merchant = LabelEncoder()
    df['merchant_category_encoded'] = le_merchant.fit_transform(df['merchant_category'])

    # Customer frequency features (simplified)
    customer_counts = df['customer_id'].value_counts()
    df['customer_frequency'] = df['customer_id'].map(customer_counts)
    df['is_new_customer'] = (df['customer_frequency'] == 1).astype(int)

    # Merchant frequency features
    merchant_counts = df['merchant_id'].value_counts()
    df['merchant_frequency'] = df['merchant_id'].map(merchant_counts)
    df['is_new_merchant'] = (df['merchant_frequency'] == 1).astype(int)

    print("✅ Feature engineering completed")
    print(f"📊 Total features created: {len([col for col in df.columns if col not in ['transaction_id', 'customer_id', 'merchant_id', 'timestamp', 'is_fraud']])}")

    return df, le_merchant

# Apply feature engineering
print("🔄 Creating features...")
df_features, merchant_encoder = create_features(df)

# Select features for modeling, grouped by the kind of signal they carry.
amount_features = ['amount', 'amount_log', 'amount_zscore', 'is_high_amount']
time_features = ['hour', 'day_of_week', 'is_weekend', 'is_night']
category_features = ['merchant_category_encoded']
customer_features = ['customer_frequency', 'is_new_customer']
merchant_features = ['merchant_frequency', 'is_new_merchant']

feature_columns = (
    amount_features
    + time_features
    + category_features
    + customer_features
    + merchant_features
)

# Model inputs and the binary target.
X = df_features.loc[:, feature_columns]
y = df_features.loc[:, 'is_fraud']

target_counts = y.value_counts().to_dict()
print(f"📊 Feature matrix shape: {X.shape}")
print(f"📊 Target distribution: {target_counts}")


In [None]:
# 🎯 Model Training with MLflow Experiments
def train_and_log_model(model, model_name, X_train, X_test, y_train, y_test, scaler=None):
    """Train a model, evaluate it on the test split and log everything to MLflow.

    All logging happens inside one MLflow run named
    ``f"{model_name}_experiment"``: the model's parameters, whether scaling
    was applied, test-set metrics, 5-fold cross-validated weighted F1 on the
    training data, the fitted model (wrapped in a scaler+model pipeline when a
    scaler is used) and, for models exposing ``feature_importances_``, a
    ``feature_importance.csv`` artifact.

    Note:
        Precision, recall and F1 use ``average='weighted'``, so they are
        weighted by class support and mostly reflect the majority class when
        the target is imbalanced.

    Args:
        model: Unfitted estimator with ``fit`` / ``predict``.
        model_name: Label used for the MLflow run name and the console line.
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Matching target vectors.
        scaler: Optional transformer fitted on ``X_train`` and applied to both
            splits before training. ``None`` disables scaling.

    Returns:
        tuple: ``(fitted_model, metrics)`` where ``metrics`` has the keys
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and ``auc``
        (``None`` when the model has no ``predict_proba``).
    """
    from sklearn.pipeline import Pipeline

    use_scaler = scaler is not None

    with mlflow.start_run(run_name=f"{model_name}_experiment"):
        # Log parameters
        if hasattr(model, 'get_params'):
            mlflow.log_params(model.get_params())

        # Scale features if needed
        if use_scaler:
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            # Record the scaler that was actually passed in.
            mlflow.log_param("feature_scaling", type(scaler).__name__)
        else:
            X_train_scaled = X_train
            X_test_scaled = X_test
            mlflow.log_param("feature_scaling", "None")

        # Train model
        model.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Always bound, so the summary below never touches an undefined name.
        auc = None
        if y_pred_proba is not None:
            auc = roc_auc_score(y_test, y_pred_proba)
            mlflow.log_metric("auc", auc)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        # The fitted scaler and model bundled together; this is what gets
        # logged for models that need scaling.
        pipeline = Pipeline([('scaler', scaler), ('model', model)]) if use_scaler else None

        # Cross-validation. With a scaler, cross-validate the whole pipeline
        # on the raw training data so scaling statistics are re-estimated
        # inside each fold; scaling once up front lets every fold see
        # statistics from its own validation part. cross_val_score clones the
        # estimator, so the fitted objects above are left untouched.
        cv_estimator = pipeline if use_scaler else model
        cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='f1_weighted')
        mlflow.log_metric("cv_f1_mean", cv_scores.mean())
        mlflow.log_metric("cv_f1_std", cv_scores.std())

        # Log model
        if use_scaler:
            mlflow.sklearn.log_model(pipeline, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

        # Log feature importance if available
        if hasattr(model, 'feature_importances_'):
            # Label importances with the columns of the data this model was
            # trained on; fall back to the notebook-level list only when the
            # training data carries no column names.
            feature_names = list(X_train.columns) if hasattr(X_train, 'columns') else feature_columns
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            # Save feature importance as artifact
            feature_importance.to_csv("feature_importance.csv", index=False)
            mlflow.log_artifact("feature_importance.csv")

        print(f"✅ {model_name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, AUC: {auc if auc is not None else 'N/A'}")

        return model, {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'auc': auc
        }

# Split the data
# Hold out 20% of the rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"📊 Training set: {X_train.shape}, Test set: {X_test.shape}")

# Initialize models to compare
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
}

# Train all models
results = {}
print("\n🚀 Training models with MLflow tracking...")

for model_name, model in models.items():
    print(f"\n🔄 Training {model_name}...")

    # Only LogisticRegression is trained on standardized features.
    scaler = None
    if model_name == 'LogisticRegression':
        scaler = StandardScaler()

    trained_model, metrics = train_and_log_model(
        model, model_name, X_train, X_test, y_train, y_test, scaler
    )
    results[model_name] = metrics

print("\n✅ All models trained and logged to MLflow!")


In [None]:
# 📊 Model Comparison and Selection
import matplotlib.pyplot as plt

# Compare results
# One row per model, one column per metric. Casting to float turns a missing
# AUC (None) into NaN and keeps every column numeric, so idxmax, round and
# plotting behave the same whether or not all models report an AUC.
results_df = pd.DataFrame(results).T.astype(float)
print("🏆 Model Performance Comparison:")
print(results_df.round(4))

# Find best model
best_model_name = results_df['f1_score'].idxmax()
print(f"\n🥇 Best model: {best_model_name} (F1: {results_df.loc[best_model_name, 'f1_score']:.4f})")

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Metrics comparison
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1_score']
results_df[metrics_to_plot].plot(kind='bar', ax=axes[0])
axes[0].set_title('Model Performance Metrics')
axes[0].set_ylabel('Score')
axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
axes[0].tick_params(axis='x', rotation=45)

# AUC comparison
auc_data = results_df['auc'].dropna()
if not auc_data.empty:
    # Plotting an empty series raises, so skip the bars when no model has an AUC.
    auc_data.plot(kind='bar', ax=axes[1], color='orange')
axes[1].set_title('AUC Scores')
axes[1].set_ylabel('AUC')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Log the comparison chart to MLflow
with mlflow.start_run(run_name="model_comparison"):
    mlflow.log_artifact('model_comparison.png')

    # Log the best model metrics
    for metric, value in results_df.loc[best_model_name].items():
        # Missing values are NaN inside the frame, never None, so an
        # `is not None` test would let them through.
        if pd.notna(value):
            mlflow.log_metric(f"best_{metric}", value)

    mlflow.log_param("best_model", best_model_name)

print("✅ Model comparison logged to MLflow")


In [None]:
# 🚀 Model Registration and Deployment
# Register the best model in MLflow Model Registry
client = mlflow.tracking.MlflowClient()

# Get the run with the best model
experiment = mlflow.get_experiment_by_name("fraud-detection-training")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Fail with a clear message when no run carries an f1_score metric, rather
# than with a KeyError or an idxmax error on an all-NaN column.
if 'metrics.f1_score' not in runs.columns or runs['metrics.f1_score'].dropna().empty:
    raise RuntimeError(
        "No runs with an 'f1_score' metric found in experiment 'fraud-detection-training'"
    )

# Find the run with the best F1 score
best_run = runs.loc[runs['metrics.f1_score'].idxmax()]
best_run_id = best_run['run_id']

# The search is not limited to runs started in this session, so the winner
# may come from an earlier execution of the notebook. Take the algorithm name
# from the winning run's own name ("<algorithm>_experiment") so the registry
# metadata describes the model actually registered; fall back to this
# session's best model when the run name is unavailable.
run_name_suffix = "_experiment"
best_run_name = best_run.get('tags.mlflow.runName')
if isinstance(best_run_name, str) and best_run_name.endswith(run_name_suffix):
    registered_algorithm = best_run_name[:-len(run_name_suffix)]
else:
    registered_algorithm = best_model_name

print(f"🎯 Best run ID: {best_run_id}")
print(f"🏆 Best F1 score: {best_run['metrics.f1_score']:.4f}")

# Register the model
model_name = "fraud_detection_model"
model_version = mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/model",
    name=model_name
)

print(f"✅ Model registered as '{model_name}' version {model_version.version}")

# Transition to Production stage
# NOTE(review): recent MLflow releases deprecate model stages in favour of
# aliases — confirm against the installed version before relying on this.
client.transition_model_version_stage(
    name=model_name,
    version=model_version.version,
    stage="Production"
)

print(f"🚀 Model version {model_version.version} moved to Production stage")

# Add model description and tags
# The models are fit on the training split only, so report its size rather
# than the size of the full data set.
client.update_model_version(
    name=model_name,
    version=model_version.version,
    description=f"Fraud detection model using {registered_algorithm} algorithm. "
               f"Trained on {len(X_train)} transactions with F1 score: {best_run['metrics.f1_score']:.4f}"
)

client.set_model_version_tag(
    name=model_name,
    version=model_version.version,
    key="algorithm",
    value=registered_algorithm
)

client.set_model_version_tag(
    name=model_name,
    version=model_version.version,
    key="data_version",
    value="v1.0"
)

print("✅ Model metadata updated")


In [None]:
# 🔍 Test Production Model Loading
# Test loading the registered model
loaded_model = mlflow.sklearn.load_model(f"models:/{model_name}/Production")

# Test prediction on a sample: a one-row frame, so the model gets 2-D input.
sample_transaction = X_test.iloc[0:1]
prediction = loaded_model.predict(sample_transaction)
prediction_proba = loaded_model.predict_proba(sample_transaction)

print("🧪 Testing Production Model:")
print(f"📊 Sample features: {sample_transaction.iloc[0].to_dict()}")
print(f"🎯 Prediction: {'FRAUD' if prediction[0] == 1 else 'LEGITIMATE'}")
print(f"📈 Fraud probability: {prediction_proba[0][1]:.4f}")

# Test the API endpoint (if running)
import requests
import json

try:
    # Test API endpoint
    api_url = "http://localhost:5001/api/v1/predict"

    # Create a test transaction with required fields
    test_transaction = {
        "transaction_id": "test_001",
        "customer_id": "cust_123",
        "merchant_id": "merch_456", 
        "amount": 1500.00,
        "merchant_category": "online",
        "timestamp": "2025-01-04T15:30:00Z"
    }

    # Bound the wait: without a timeout an unresponsive service would block
    # this cell indefinitely instead of reaching the fallback message.
    response = requests.post(api_url, json=test_transaction, timeout=10)

    if response.status_code == 200:
        api_result = response.json()
        print(f"\n🌐 API Test Result:")
        print(f"🎯 Fraud Probability: {api_result['fraud_probability']}")
        print(f"📊 Risk Level: {api_result['risk_level']}")
        print(f"⚡ Model Version: {api_result['model_version']}")
    else:
        print(f"❌ API Error: {response.status_code} - {response.text}")

except Exception as e:
    # Best-effort check: the API is optional here, so any failure (connection
    # refused, timeout, unexpected payload) is reported without stopping the
    # notebook.
    print(f"⚠️  Could not test API: {e}")
    print("💡 Make sure the fraud detection API is running on localhost:5001")

print("\n🎉 Training pipeline completed successfully!")
print("🔗 Next steps:")
print("1. 📊 Visit MLflow UI: http://localhost:5002")
print("2. 🔍 Check experiment runs and model registry")
print("3. 📈 Compare model performance metrics")
print("4. 🚀 Deploy the best model to production")
