In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                           precision_score, recall_score, f1_score, roc_auc_score,
                           roc_curve, precision_recall_curve, auc, average_precision_score)

# Import imbalanced-learn libraries
try:
    from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
    from imblearn.under_sampling import RandomUnderSampler, TomekLinks
    from imblearn.combine import SMOTEENN, SMOTETomek
    IMBLEARN_AVAILABLE = True
    print("✓ imbalanced-learn library available")
except ImportError:
    print("⚠️ imbalanced-learn not available. Install with: pip install imbalanced-learn")
    IMBLEARN_AVAILABLE = False

from collections import Counter
import time

class FraudDetectionPipeline:
    """
    Complete pipeline for credit card fraud detection
    """
    
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.scaler = None
        self.models = {}
        self.results = []
        np.random.seed(random_state)
    
    def load_data(self, file_path=None):
        """
        Load credit card fraud dataset
        
        For real data, download from:
        https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
        """
        if file_path:
            print(f"Loading data from {file_path}...")
            data = pd.read_csv(file_path)
            print(f"Loaded {len(data):,} transactions")
        else:
            print("Creating synthetic fraud detection dataset...")
            data = self._create_synthetic_dataset()
        
        return data
    
    def _create_synthetic_dataset(self, n_samples=50000, fraud_rate=0.002):
        """Create realistic synthetic fraud dataset"""
        n_fraud = int(n_samples * fraud_rate)
        n_normal = n_samples - n_fraud
        
        print(f"Generating {n_samples:,} transactions:")
        print(f"  Normal: {n_normal:,} ({(1-fraud_rate)*100:.1f}%)")
        print(f"  Fraud: {n_fraud:,} ({fraud_rate*100:.3f}%)")
        
        # Normal transactions
        normal_data = {
            'Time': np.random.exponential(3600, n_normal),
            'V1': np.random.normal(-0.3, 1.0, n_normal),
            'V2': np.random.normal(0.1, 1.1, n_normal),
            'V3': np.random.normal(-0.2, 1.2, n_normal),
            'V4': np.random.normal(0.2, 1.0, n_normal),
            'V5': np.random.normal(-0.1, 1.1, n_normal),
            'V6': np.random.normal(0.0, 1.0, n_normal),
            'V7': np.random.normal(-0.1, 1.0, n_normal),
            'V8': np.random.normal(0.0, 1.1, n_normal),
            'V9': np.random.normal(-0.2, 1.1, n_normal),
            'V10': np.random.normal(-0.1, 1.0, n_normal),
            'Amount': np.random.lognormal(3.0, 1.5, n_normal).clip(0, 10000),
            'Class': np.zeros(n_normal)
        }
        
        # Fraudulent transactions (different patterns)
        fraud_data = {
            'Time': np.random.exponential(1200, n_fraud),
            'V1': np.random.normal(2.5, 1.2, n_fraud),
            'V2': np.random.normal(-2.0, 1.3, n_fraud),
            'V3': np.random.normal(1.8, 1.1, n_fraud),
            'V4': np.random.normal(-2.2, 1.4, n_fraud),
            'V5': np.random.normal(1.5, 1.0, n_fraud),
            'V6': np.random.normal(-1.4, 1.2, n_fraud),
            'V7': np.random.normal(1.0, 1.0, n_fraud),
            'V8': np.random.normal(-1.1, 1.3, n_fraud),
            'V9': np.random.normal(1.3, 1.1, n_fraud),
            'V10': np.random.normal(-1.6, 1.2, n_fraud),
            'Amount': np.random.lognormal(4.5, 2.5, n_fraud).clip(0, 25000),
            'Class': np.ones(n_fraud)
        }
        
        # Combine and shuffle data
        all_data = {}
        for key in normal_data.keys():
            all_data[key] = np.concatenate([normal_data[key], fraud_data[key]])
        
        df = pd.DataFrame(all_data)
        return df.sample(frac=1).reset_index(drop=True)
    
    def explore_data(self, data):
        """Comprehensive data exploration"""
        print("\n" + "="*60)
        print("DATA EXPLORATION")
        print("="*60)
        
        print(f"Dataset shape: {data.shape}")
        print(f"Memory usage: {data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        
        # Class distribution
        class_counts = data['Class'].value_counts()
        fraud_rate = class_counts[1] / len(data)
        
        print(f"\nClass Distribution:")
        print(f"  Normal (0): {class_counts[0]:,} ({class_counts[0]/len(data)*100:.2f}%)")
        print(f"  Fraud (1): {class_counts[1]:,} ({fraud_rate*100:.3f}%)")
        print(f"  Imbalance ratio: {class_counts[0]/class_counts[1]:.0f}:1")
        
        # Statistical summary
        print(f"\nTransaction Amount Analysis:")
        normal_amounts = data[data['Class'] == 0]['Amount']
        fraud_amounts = data[data['Class'] == 1]['Amount']
        
        print(f"Normal - Mean: ${normal_amounts.mean():.2f}, Median: ${normal_amounts.median():.2f}")
        print(f"Fraud - Mean: ${fraud_amounts.mean():.2f}, Median: ${fraud_amounts.median():.2f}")
        
        # Check for missing values
        missing = data.isnull().sum().sum()
        print(f"\nMissing values: {missing}")
        
        return data
    
    def preprocess_data(self, data):
        """Data preprocessing with feature engineering"""
        print("\n" + "="*60)
        print("DATA PREPROCESSING")
        print("="*60)
        
        # Separate features and target
        X = data.drop('Class', axis=1)
        y = data['Class']
        
        # Feature engineering
        X = self._feature_engineering(X)
        
        # Train-test split (stratified to maintain class balance)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state, stratify=y
        )
        
        print(f"Data split: {X_train.shape[0]:,} train, {X_test.shape[0]:,} test")
        print(f"Train fraud rate: {y_train.mean():.4f}")
        print(f"Test fraud rate: {y_test.mean():.4f}")
        
        # Feature scaling with RobustScaler (less sensitive to outliers)
        self.scaler = RobustScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        # Convert back to DataFrame for easier handling
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)
        
        print(f"✓ Feature scaling completed")
        print(f"✓ Final feature count: {X.shape[1]}")
        
        return X_train_scaled, X_test_scaled, y_train, y_test
    
    def _feature_engineering(self, X):
        """Advanced feature engineering for fraud detection"""
        X_new = X.copy()
        
        # PCA component statistics
        pca_cols = [col for col in X.columns if col.startswith('V')]
        if len(pca_cols) > 0:
            X_new['PCA_sum'] = X[pca_cols].sum(axis=1)
            X_new['PCA_std'] = X[pca_cols].std(axis=1)
            X_new['PCA_max'] = X[pca_cols].max(axis=1)
            X_new['PCA_min'] = X[pca_cols].min(axis=1)
            X_new['PCA_range'] = X_new['PCA_max'] - X_new['PCA_min']
        
        # Time-based features
        if 'Time' in X.columns:
            X_new['Hour'] = (X['Time'] % 86400) // 3600
            X_new['is_night'] = ((X_new['Hour'] >= 22) | (X_new['Hour'] <= 6)).astype(int)
            X_new['is_weekend'] = ((X_new['Hour'] % 7) >= 5).astype(int)
        
        # Amount-based features
        if 'Amount' in X.columns:
            X_new['Amount_log'] = np.log1p(X['Amount'])
            # Amount percentiles
            percentiles = X['Amount'].quantile([0.25, 0.75, 0.9, 0.95, 0.99])
            X_new['Amount_very_low'] = (X['Amount'] <= percentiles[0.25]).astype(int)
            X_new['Amount_high'] = (X['Amount'] > percentiles[0.75]).astype(int)
            X_new['Amount_very_high'] = (X['Amount'] > percentiles[0.95]).astype(int)
        
        print(f"Feature engineering: {X.shape[1]} -> {X_new.shape[1]} features")
        return X_new
    
    def handle_imbalanced_data(self, X_train, y_train):
        """Apply techniques to handle imbalanced dataset"""
        print("\n" + "="*60)
        print("HANDLING IMBALANCED DATA")
        print("="*60)
        
        print(f"Original distribution: {Counter(y_train)}")
        
        datasets = {'Original': (X_train, y_train)}
        
        # Apply SMOTE if available
        if IMBLEARN_AVAILABLE:
            smote = SMOTE(random_state=self.random_state, k_neighbors=5)
            X_smote, y_smote = smote.fit_resample(X_train, y_train)
            datasets['SMOTE'] = (X_smote, y_smote)
            print(f"SMOTE distribution: {Counter(y_smote)}")
            
            # Borderline SMOTE (more conservative)
            borderline_smote = BorderlineSMOTE(random_state=self.random_state)
            X_borderline, y_borderline = borderline_smote.fit_resample(X_train, y_train)
            datasets['Borderline_SMOTE'] = (X_borderline, y_borderline)
            print(f"Borderline SMOTE distribution: {Counter(y_borderline)}")
        
        # Undersampling
        X_under, y_under = self._random_undersample(X_train, y_train, ratio=3.0)
        datasets['Undersampled'] = (X_under, y_under)
        print(f"Undersampled distribution: {Counter(y_under)}")
        
        return datasets
    
    def _random_undersample(self, X, y, ratio=2.0):
        """Random undersampling of majority class"""
        minority_class = y.value_counts().idxmin()
        majority_class = y.value_counts().idxmax()
        
        minority_indices = y[y == minority_class].index
        majority_indices = y[y == majority_class].index
        
        n_minority = len(minority_indices)
        n_majority_target = int(n_minority * ratio)
        
        # Random sample from majority class
        majority_sampled = np.random.choice(
            majority_indices, 
            min(n_majority_target, len(majority_indices)), 
            replace=False
        )
        
        selected_indices = list(minority_indices) + list(majority_sampled)
        return X.loc[selected_indices], y.loc[selected_indices]
    
    def train_models(self, datasets, X_test, y_test):
        """Train multiple models on different datasets"""
        print("\n" + "="*60)
        print("MODEL TRAINING & EVALUATION")
        print("="*60)
        
        # Define models to test
        models = {
            'Logistic Regression': LogisticRegression(random_state=self.random_state, max_iter=1000),
            'Random Forest': RandomForestClassifier(random_state=self.random_state, n_estimators=100),
            'Gradient Boosting': GradientBoostingClassifier(random_state=self.random_state, n_estimators=100),
            'SVM': SVC(random_state=self.random_state, probability=True)
        }
        
        # Class-weighted versions for comparison
        weighted_models = {
            'Weighted LR': LogisticRegression(random_state=self.random_state, max_iter=1000, class_weight='balanced'),
            'Weighted RF': RandomForestClassifier(random_state=self.random_state, n_estimators=100, class_weight='balanced'),
        }
        
        all_models = {**models, **weighted_models}
        results = []
        
        # Train models on each dataset
        for dataset_name, (X_train, y_train) in datasets.items():
            print(f"\n--- {dataset_name} Dataset ({len(X_train):,} samples) ---")
            
            for model_name, model in all_models.items():
                try:
                    start_time = time.time()
                    
                    # Train model
                    model.fit(X_train, y_train)
                    
                    # Predict on test set
                    y_pred = model.predict(X_test)
                    y_pred_proba = model.predict_proba(X_test)[:, 1]
                    
                    train_time = time.time() - start_time
                    
                    # Evaluate
                    metrics = self._evaluate_model(y_test, y_pred, y_pred_proba)
                    
                    result = {
                        'Dataset': dataset_name,
                        'Model': model_name,
                        'Train_Time': train_time,
                        **metrics
                    }
                    results.append(result)
                    
                    print(f"  {model_name:15s}: F1={metrics['F1']:.3f}, "
                          f"Precision={metrics['Precision']:.3f}, "
                          f"Recall={metrics['Recall']:.3f}")
                    
                except Exception as e:
                    print(f"  {model_name:15s}: ERROR - {str(e)}")
        
        self.results = pd.DataFrame(results)
        return self.results
    
    def _evaluate_model(self, y_true, y_pred, y_pred_proba):
        """Comprehensive model evaluation for fraud detection"""
        
        # Basic classification metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        
        # ROC and PR AUC
        try:
            roc_auc = roc_auc_score(y_true, y_pred_proba)
            pr_auc = average_precision_score(y_true, y_pred_proba)
        except:
            roc_auc, pr_auc = 0.0, 0.0
        
        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        if cm.size == 4:
            tn, fp, fn, tp = cm.ravel()
        else:
            tn, fp, fn, tp = 0, 0, 0, 0
        
        # Cost-sensitive analysis (fraud detection specific)
        cost_fp = 50     # Cost of blocking legitimate transaction
        cost_fn = 5000   # Cost of missing fraud (much higher impact)
        total_cost = fp * cost_fp + fn * cost_fn
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f2_score = (5 * precision * recall) / (4 * precision + recall) if (precision + recall) > 0 else 0
        
        return {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'F2': f2_score,
            'ROC_AUC': roc_auc,
            'PR_AUC': pr_auc,
            'Specificity': specificity,
            'Total_Cost': total_cost,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn
        }
    
    def analyze_results(self):
        """Comprehensive results analysis"""
        if self.results.empty:
            print("No results to analyze. Train models first.")
            return
        
        print("\n" + "="*60)
        print("RESULTS ANALYSIS")
        print("="*60)
        
        # Best models by different metrics
        metrics = ['F1', 'Precision', 'Recall', 'ROC_AUC', 'Total_Cost']
        
        print("🏆 BEST MODELS BY METRIC:")
        print("-" * 60)
        
        for metric in metrics:
            if metric == 'Total_Cost':
                best = self.results.loc[self.results[metric].idxmin()]
                print(f"{metric:12s}: {best['Model']} on {best['Dataset']} = ${best[metric]:,.0f}")
            else:
                best = self.results.loc[self.results[metric].idxmax()]
                print(f"{metric:12s}: {best['Model']} on {best['Dataset']} = {best[metric]:.3f}")
        
        # Top 5 models
        print("\n📊 TOP 5 MODELS BY F1-SCORE:")
        print("-" * 100)
        top5 = self.results.nlargest(5, 'F1')
        cols = ['Model', 'Dataset', 'Precision', 'Recall', 'F1', 'ROC_AUC', 'Total_Cost']
        for _, row in top5.iterrows():
            print(f"{row['Model']:20s} on {row['Dataset']:15s}: "
                  f"P={row['Precision']:.3f} R={row['Recall']:.3f} "
                  f"F1={row['F1']:.3f} AUC={row['ROC_AUC']:.3f} Cost=${row['Total_Cost']:,.0f}")
        
        # Dataset comparison
        print("\n📈 DATASET PERFORMANCE COMPARISON:")
        print("-" * 60)
        dataset_perf = self.results.groupby('Dataset').agg({
            'F1': ['mean', 'max'],
            'Precision': ['mean', 'max'],
            'Recall': ['mean', 'max'],
            'Total_Cost': ['mean', 'min']
        }).round(3)
        
        for dataset in self.results['Dataset'].unique():
            subset = self.results[self.results['Dataset'] == dataset]
            print(f"{dataset:15s}: Avg F1={subset['F1'].mean():.3f}, "
                  f"Max F1={subset['F1'].max():.3f}, "
                  f"Min Cost=${subset['Total_Cost'].min():,.0f}")
        
        # Business insights
        best_model = self.results.loc[self.results['F1'].idxmax()]
        print("\n💼 BUSINESS INSIGHTS:")
        print("-" * 60)
        print(f"• Best model: {best_model['Model']} on {best_model['Dataset']}")
        print(f"• Fraud detection rate (Recall): {best_model['Recall']:.1%}")
        print(f"• Precision (Accuracy when flagging fraud): {best_model['Precision']:.1%}")
        print(f"• False positive rate: {best_model['FP']/(best_model['FP']+best_model['TN']):.2%}")
        print(f"• Estimated daily cost: ${best_model['Total_Cost']:,.0f}")
        
        return self.results
    
    def generate_recommendations(self):
        """Generate actionable recommendations"""
        print("\n" + "="*60)
        print("RECOMMENDATIONS & NEXT STEPS")
        print("="*60)
        
        best = self.results.loc[self.results['F1'].idxmax()]
        
        print("🎯 DEPLOYMENT RECOMMENDATIONS:")
        print("-" * 40)
        print(f"1. Deploy: {best['Model']} trained on {best['Dataset']}")
        print(f"2. Expected performance: {best['Recall']:.1%} fraud detection rate")
        print(f"3. Monitor false positive rate: {best['FP']/(best['FP']+best['TN']):.2%}")
        print(f"4. Set up cost-sensitive thresholds for decision making")
        
        print("\n🔧 TECHNICAL IMPLEMENTATION:")
        print("-" * 40)
        print("1. Implement real-time scoring API")
        print("2. Set up model monitoring and drift detection")
        print("3. Create automated retraining pipeline")
        print("4. Implement A/B testing framework")
        print("5. Integrate with existing fraud systems")
        
        print("\n💡 IMPROVEMENT OPPORTUNITIES:")
        print("-" * 40)
        print("1. Ensemble methods for better robustness")
        print("2. Deep learning models (Neural Networks, Autoencoders)")
        print("3. Advanced feature engineering (customer behavior patterns)")
        print("4. Online learning for concept drift adaptation")
        print("5. Integration with external data sources")

def main():
    """
    Main function to run the complete fraud detection pipeline
    """
    print("🔒 CREDIT CARD FRAUD DETECTION SYSTEM")
    print("=" * 60)
    print("Technologies: Python, scikit-learn, imbalanced-learn")
    print("Dataset: Synthetic (replace with real creditcard.csv)")
    print("=" * 60)
    
    # Initialize pipeline
    pipeline = FraudDetectionPipeline(random_state=42)
    
    # Load data (replace None with 'creditcard.csv' for real data)
    data = pipeline.load_data(file_path=None)
    
    # Explore data
    pipeline.explore_data(data)
    
    # Preprocess data
    X_train, X_test, y_train, y_test = pipeline.preprocess_data(data)
    
    # Handle imbalanced data
    datasets = pipeline.handle_imbalanced_data(X_train, y_train)
    
    # Train models
    results = pipeline.train_models(datasets, X_test, y_test)
    
    # Analyze results
    pipeline.analyze_results()
    
    # Generate recommendations
    pipeline.generate_recommendations()
    
    print("\n✅ FRAUD DETECTION ANALYSIS COMPLETED!")
    return pipeline

# Example usage
if __name__ == "__main__":
    # Run the complete fraud detection pipeline
    fraud_pipeline = main()
    
    print(f"\n🚀 Pipeline ready for production deployment!")
    print(f"📊 Best F1-Score: {fraud_pipeline.results['F1'].max():.3f}")
    print(f"💰 Lowest Cost: ${fraud_pipeline.results['Total_Cost'].min():,.0f}")
    

✓ imbalanced-learn library available
🔒 CREDIT CARD FRAUD DETECTION SYSTEM
Technologies: Python, scikit-learn, imbalanced-learn
Dataset: Synthetic (replace with real creditcard.csv)
Creating synthetic fraud detection dataset...
Generating 50,000 transactions:
  Normal: 49,900 (99.8%)
  Fraud: 100 (0.200%)

DATA EXPLORATION
Dataset shape: (50000, 13)
Memory usage: 4.96 MB

Class Distribution:
  Normal (0): 49,900 (99.80%)
  Fraud (1): 100 (0.200%)
  Imbalance ratio: 499:1

Transaction Amount Analysis:
Normal - Mean: $62.55, Median: $20.08
Fraud - Mean: $807.03, Median: $41.24

Missing values: 0

DATA PREPROCESSING
Feature engineering: 12 -> 24 features
Data split: 40,000 train, 10,000 test
Train fraud rate: 0.0020
Test fraud rate: 0.0020
✓ Feature scaling completed
✓ Final feature count: 24

HANDLING IMBALANCED DATA
Original distribution: Counter({0.0: 39920, 1.0: 80})


  File "C:\Users\Aryan\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\Aryan\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Aryan\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\Aryan\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


SMOTE distribution: Counter({0.0: 39920, 1.0: 39920})
Borderline SMOTE distribution: Counter({0.0: 39920, 1.0: 39920})
Undersampled distribution: Counter({0.0: 240, 1.0: 80})

MODEL TRAINING & EVALUATION

--- Original Dataset (40,000 samples) ---
  Logistic Regression: F1=0.950, Precision=0.950, Recall=0.950
  Random Forest  : F1=0.889, Precision=1.000, Recall=0.800
  Gradient Boosting: F1=0.872, Precision=0.895, Recall=0.850
  SVM            : F1=0.889, Precision=1.000, Recall=0.800
  Weighted LR    : F1=0.792, Precision=0.679, Recall=0.950
  Weighted RF    : F1=0.889, Precision=1.000, Recall=0.800

--- SMOTE Dataset (79,840 samples) ---
  Logistic Regression: F1=0.826, Precision=0.731, Recall=0.950
  Random Forest  : F1=0.895, Precision=0.944, Recall=0.850
  Gradient Boosting: F1=0.826, Precision=0.731, Recall=0.950
  SVM            : F1=0.633, Precision=0.475, Recall=0.950
  Weighted LR    : F1=0.826, Precision=0.731, Recall=0.950
  Weighted RF    : F1=0.895, Precision=0.944, Recall