In [2]:
"""
Fraud Detection Machine Learning System

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    precision_recall_curve,
    f1_score
)
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import kagglehub
import warnings
warnings.filterwarnings('ignore')


class FraudDetectionModel:
    """End-to-end fraud detection pipeline.

    The pipeline loads a transaction dataset (local CSV or Kaggle download),
    makes sure a fraud label exists (generating a rule-based synthetic one if
    the dataset has none), preprocesses the features, optionally rebalances
    the training set with SMOTE, trains three classifiers, evaluates them on
    a held-out test split and saves comparison plots.

    NOTE: when synthetic labels are generated they are derived from feature
    columns that stay in the feature matrix, so the reported metrics then
    measure how well each model recovers the labelling rule rather than how
    well it detects real fraud.
    """

    # Kaggle dataset handle used when no local ``data_path`` is supplied.
    KAGGLE_DATASET = "valakhorasani/bank-transaction-dataset-for-fraud-detection"

    # Column names recognised as the fraud label, in priority order.
    _LABEL_CANDIDATES = ('isFraud', 'is_fraud', 'fraud', 'Fraud', 'Class')

    # Columns read by the synthetic labelling heuristics.
    _SYNTHETIC_LABEL_INPUTS = (
        'TransactionAmount',
        'LoginAttempts',
        'AccountBalance',
        'TransactionDuration',
    )

    def __init__(self, data_path=None, use_kaggle=True):
        """
        Initialize the fraud detection model

        Args:
            data_path (str, optional): Path to the CSV dataset. If None, downloads from Kaggle.
            use_kaggle (bool): If True and data_path is None, automatically downloads from Kaggle.
        """
        self.data_path = data_path
        self.use_kaggle = use_kaggle
        self.df = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = StandardScaler()
        # Fitted per-column encoders, kept so the same mapping can be reused later.
        self.label_encoders = {}
        # Feature column order used to build the (unnamed) scaled arrays.
        self.feature_names = None
        self.models = {}
        self.results = {}

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    def _find_label_column(self, columns):
        """Return the first recognised fraud-label name found in ``columns``, or None."""
        return next((name for name in self._LABEL_CANDIDATES if name in columns), None)

    def _print_fraud_distribution(self, label_col):
        """Print class counts and the positive-class percentage for ``label_col``."""
        print("\nFraud distribution:")
        print(self.df[label_col].value_counts())
        print(f"\nFraud percentage: {self.df[label_col].mean() * 100:.2f}%")

    def _print_train_class_counts(self):
        """Print how many non-fraud / fraud rows the training labels contain."""
        print(f"Non-fraud: {int((self.y_train == 0).sum())}")
        print(f"Fraud: {int((self.y_train == 1).sum())}")

    def _load_from_kaggle(self):
        """Download the Kaggle dataset and return its first CSV as a DataFrame.

        Raises:
            FileNotFoundError: If the downloaded dataset contains no CSV file.
        """
        import os

        # kagglehub downloads the whole dataset and returns the local directory.
        dataset_path = kagglehub.dataset_download(self.KAGGLE_DATASET)
        print(f"[OK] Dataset downloaded to: {dataset_path}")

        # Sort so the chosen file does not depend on directory listing order.
        csv_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.csv'))
        if not csv_files:
            raise FileNotFoundError("No CSV file found in the downloaded dataset")

        print(f"Loading CSV file: {csv_files[0]}")
        return pd.read_csv(os.path.join(dataset_path, csv_files[0]))

    # ------------------------------------------------------------------
    # Pipeline stages
    # ------------------------------------------------------------------
    def load_data(self):
        """Load the dataset, print a summary, and ensure a fraud label exists.

        The data comes from ``data_path`` when one was given; otherwise it is
        downloaded from Kaggle (if ``use_kaggle`` is true). If none of the
        recognised label columns is present, synthetic labels are generated
        via :meth:`generate_fraud_labels`.

        Returns:
            pandas.DataFrame: The loaded dataset (also stored in ``self.df``).

        Raises:
            ValueError: If no ``data_path`` was given and ``use_kaggle`` is False.
            Exception: Any error from the Kaggle download/read is re-raised
                after printing setup guidance.
        """
        print("Loading dataset...")

        if self.data_path is None and not self.use_kaggle:
            # Fail early with a clear message instead of handing None to read_csv.
            raise ValueError(
                "No data source available: provide data_path or set use_kaggle=True."
            )

        if self.data_path is None:
            print("No local dataset found.So Downloading from Kaggle...")
            try:
                self.df = self._load_from_kaggle()
                print("[OK] Dataset loaded successfully from Kaggle!")
            except Exception as e:
                print(f"Error downloading from Kaggle: {e}")
                print("\nPlease ensure you have:")
                print("1. Kaggle API credentials set up (~/.kaggle/kaggle.json)")
                print("2. Or provide a local data_path when creating the model")
                raise
        else:
            # Load from local file
            print(f"Loading from local file: {self.data_path}")
            self.df = pd.read_csv(self.data_path)

        print(f"\nDataset shape: {self.df.shape}")
        print("\nFirst few rows:")
        print(self.df.head())
        print("\nDataset info:")
        # DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
        self.df.info()
        print("\nMissing values:")
        print(self.df.isnull().sum())

        # Use the same label lookup as preprocess_data so both stages agree on
        # whether the dataset already carries a fraud label.
        label_col = self._find_label_column(self.df.columns)
        if label_col is not None:
            self._print_fraud_distribution(label_col)
        else:
            print("\n[INFO] No fraud label column found. Generating synthetic fraud labels...")
            self.generate_fraud_labels()

        return self.df

    def generate_fraud_labels(self):
        """Add a synthetic ``isFraud`` column to ``self.df``.

        A row is labelled fraud (1) when at least two of these hold:
        amount above the 90th percentile, three or more login attempts,
        account balance below the transaction amount, duration above the
        85th percentile.

        Raises:
            KeyError: If any column the heuristics need is missing.
        """
        print("\n" + "="*50)
        print("GENERATING SYNTHETIC FRAUD LABELS")
        print("="*50)

        missing = [c for c in self._SYNTHETIC_LABEL_INPUTS if c not in self.df.columns]
        if missing:
            raise KeyError(
                f"Cannot generate synthetic fraud labels; missing columns: {missing}"
            )

        amount = self.df['TransactionAmount']
        duration = self.df['TransactionDuration']

        high_amount_threshold = amount.quantile(0.90)
        long_duration_threshold = duration.quantile(0.85)

        fraud_indicators = [
            amount > high_amount_threshold,            # top 10% of amounts
            self.df['LoginAttempts'] >= 3,             # repeated login attempts
            self.df['AccountBalance'] < amount,        # balance lower than the transaction
            duration > long_duration_threshold,        # top 15% of durations
        ]

        # Summing boolean Series counts how many indicators fire per row.
        fraud_score = sum(fraud_indicators)
        self.df['isFraud'] = (fraud_score >= 2).astype(int)

        print("\nFraud labels generated based on suspicious patterns:")
        print(f"- High transaction amounts (> ${high_amount_threshold:.2f})")
        print("- Multiple login attempts (>= 3)")
        print("- Low account balance vs transaction amount")
        print(f"- Long transaction duration (> {long_duration_threshold} seconds)")
        self._print_fraud_distribution('isFraud')

    def preprocess_data(self):
        """Impute, encode, split and scale the dataset.

        Numeric NaNs are filled with the column median, every object-dtype
        column is label-encoded (after casting to ``str``), the data is split
        80/20 with stratification on the label, and features are standardised
        with a scaler fitted on the training split only.

        Populates ``X_train``, ``X_test``, ``y_train``, ``y_test``,
        ``label_encoders`` and ``feature_names``.

        Raises:
            ValueError: If no recognised fraud label column is present.
        """
        print("\n" + "="*50)
        print("PREPROCESSING DATA")
        print("="*50)

        # Work on a copy so self.df keeps the raw values.
        df_processed = self.df.copy()

        # Only numeric columns have a median; non-numeric NaNs are left for the
        # string cast below, which turns them into their own category.
        df_processed = df_processed.fillna(df_processed.median(numeric_only=True))

        categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
        print(f"\nCategorical columns found: {categorical_cols}")

        # NOTE(review): every object column is encoded, which may include
        # identifier-like columns - confirm whether those should be dropped.
        self.label_encoders = {}
        for col in categorical_cols:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col].astype(str))
            self.label_encoders[col] = le
            print(f"Encoded column: {col}")

        fraud_col = self._find_label_column(df_processed.columns)
        if fraud_col is None:
            raise ValueError("Could not find fraud label column. Please ensure dataset has 'isFraud' or similar column.")

        print(f"\nUsing '{fraud_col}' as target variable")

        X = df_processed.drop(columns=[fraud_col])
        y = df_processed[fraud_col]
        # Scaling returns plain arrays, so remember the column order here.
        self.feature_names = X.columns.tolist()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(f"\nTraining set size: {self.X_train.shape}")
        print(f"Test set size: {self.X_test.shape}")

        # Fit the scaler on the training split only, then apply it to the test split.
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

        print("\nData preprocessing completed!")

    def handle_imbalance(self):
        """Oversample the training split with SMOTE (the test split is untouched)."""
        print("\n" + "="*50)
        print("HANDLING CLASS IMBALANCE WITH SMOTE")
        print("="*50)

        print("\nOriginal training set distribution:")
        self._print_train_class_counts()

        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)

        print("\nAfter SMOTE:")
        self._print_train_class_counts()

    def train_models(self):
        """Fit Logistic Regression, Random Forest and XGBoost on the training split.

        Fitted estimators are stored in ``self.models`` keyed by display name.
        """
        print("\n" + "="*50)
        print("TRAINING MODELS")
        print("="*50)

        # Logistic Regression
        print("\n1. Training Logistic Regression...")
        lr = LogisticRegression(random_state=42, max_iter=1000)
        lr.fit(self.X_train, self.y_train)
        self.models['Logistic Regression'] = lr
        print("   [OK] Completed")

        # Random Forest
        print("\n2. Training Random Forest...")
        rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf.fit(self.X_train, self.y_train)
        self.models['Random Forest'] = rf
        print("   [OK] Completed")

        # XGBoost
        print("\n3. Training XGBoost...")
        xgb_model = xgb.XGBClassifier(
            n_estimators=100,
            random_state=42,
            eval_metric='logloss',
            use_label_encoder=False
        )
        xgb_model.fit(self.X_train, self.y_train)
        self.models['XGBoost'] = xgb_model
        print("   [OK] Completed")

    def evaluate_models(self):
        """Score every trained model on the test split.

        For each model this prints the classification report, confusion
        matrix, ROC-AUC and F1, and stores predictions, positive-class
        probabilities and those metrics in ``self.results[name]``.
        """
        print("\n" + "="*50)
        print("MODEL EVALUATION")
        print("="*50)

        for name, model in self.models.items():
            print(f"\n{'='*50}")
            print(f"{name}")
            print(f"{'='*50}")

            y_pred = model.predict(self.X_test)
            # Column 1 of predict_proba is the probability of the positive class.
            y_pred_proba = model.predict_proba(self.X_test)[:, 1]

            print("\nClassification Report:")
            print(classification_report(self.y_test, y_pred))

            print("\nConfusion Matrix:")
            cm = confusion_matrix(self.y_test, y_pred)
            print(cm)

            roc_auc = roc_auc_score(self.y_test, y_pred_proba)
            f1 = f1_score(self.y_test, y_pred)

            print(f"\nROC-AUC Score: {roc_auc:.4f}")
            print(f"F1 Score: {f1:.4f}")

            self.results[name] = {
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'roc_auc': roc_auc,
                'f1_score': f1,
                'confusion_matrix': cm
            }

    def _plot_roc_curves(self):
        """Save one ROC curve per evaluated model to 'roc_curves.png'."""
        fig = plt.figure(figsize=(10, 6))
        for name, results in self.results.items():
            fpr, tpr, _ = roc_curve(self.y_test, results['probabilities'])
            plt.plot(fpr, tpr, label=f"{name} (AUC = {results['roc_auc']:.4f})")

        plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves - Fraud Detection Models')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
        print("[OK] Saved ROC curves to 'roc_curves.png'")
        plt.close(fig)

    def _plot_confusion_matrices(self):
        """Save a row of confusion-matrix heatmaps to 'confusion_matrices.png'."""
        n_models = len(self.results)
        # One panel per evaluated model; squeeze=False keeps `axes` 2-D even
        # when there is a single model, so indexing is uniform.
        fig, axes = plt.subplots(
            1, n_models, figsize=(5 * n_models, 4), squeeze=False
        )
        for ax, (name, results) in zip(axes[0], self.results.items()):
            sns.heatmap(results['confusion_matrix'], annot=True, fmt='d',
                        cmap='Blues', ax=ax)
            ax.set_title(f'{name}\nConfusion Matrix')
            ax.set_ylabel('Actual')
            ax.set_xlabel('Predicted')

        plt.tight_layout()
        plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
        print("[OK] Saved confusion matrices to 'confusion_matrices.png'")
        plt.close(fig)

    def _plot_model_comparison(self):
        """Save a grouped bar chart of ROC-AUC and F1 to 'model_comparison.png'."""
        model_names = list(self.results.keys())
        roc_scores = [self.results[name]['roc_auc'] for name in model_names]
        f1_scores = [self.results[name]['f1_score'] for name in model_names]

        x = np.arange(len(model_names))
        width = 0.35

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.bar(x - width/2, roc_scores, width, label='ROC-AUC', alpha=0.8)
        ax.bar(x + width/2, f1_scores, width, label='F1-Score', alpha=0.8)

        ax.set_xlabel('Models')
        ax.set_ylabel('Scores')
        ax.set_title('Model Performance Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(model_names)
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
        print("[OK] Saved model comparison to 'model_comparison.png'")
        plt.close(fig)

    def plot_results(self):
        """Write ROC, confusion-matrix and comparison plots to PNG files.

        Files are written to the current working directory. Nothing is
        plotted when no model has been evaluated yet.
        """
        print("\n" + "="*50)
        print("GENERATING VISUALIZATIONS")
        print("="*50)

        if not self.results:
            print("[INFO] No evaluation results available; skipping plots.")
            return

        self._plot_roc_curves()
        self._plot_confusion_matrices()
        self._plot_model_comparison()

    def get_best_model(self):
        """Return the model with the highest ROC-AUC score.

        Returns:
            tuple: ``(best_model_name, best_model)``.

        Raises:
            ValueError: If no evaluation results are available.
        """
        if not self.results:
            raise ValueError(
                "No evaluation results available; run evaluate_models() first."
            )

        best_model_name = max(self.results, key=lambda x: self.results[x]['roc_auc'])
        best_model = self.models[best_model_name]
        best_score = self.results[best_model_name]['roc_auc']

        print("\n" + "="*50)
        print("BEST MODEL")
        print("="*50)
        print(f"\nBest Model: {best_model_name}")
        print(f"ROC-AUC Score: {best_score:.4f}")
        print(f"F1 Score: {self.results[best_model_name]['f1_score']:.4f}")

        return best_model_name, best_model

    def run_pipeline(self, use_smote=True):
        """Run every stage: load, preprocess, rebalance, train, evaluate, plot.

        Args:
            use_smote (bool): When True, oversample the training split with
                SMOTE before training.
        """
        print("\n" + "="*50)
        print("FRAUD DETECTION ML PIPELINE")
        print("="*50)

        self.load_data()
        self.preprocess_data()

        if use_smote:
            self.handle_imbalance()

        self.train_models()
        self.evaluate_models()
        self.plot_results()
        self.get_best_model()

        print("\n" + "="*50)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*50)
        print("\nGenerated files:")
        print("  - roc_curves.png")
        print("  - confusion_matrices.png")
        print("  - model_comparison.png")


def main():
    """Print the application banner and run the full pipeline with SMOTE enabled."""
    rule = "=" * 60
    for banner_line in (rule, "BANK TRANSACTION FRAUD DETECTION SYSTEM", rule):
        print(banner_line)

    # Constructed without arguments, the model has no data_path and therefore
    # fetches the dataset from Kaggle. For a local CSV use
    # FraudDetectionModel(data_path="your_file.csv") instead.
    detector = FraudDetectionModel()
    detector.run_pipeline(use_smote=True)


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

BANK TRANSACTION FRAUD DETECTION SYSTEM

FRAUD DETECTION ML PIPELINE
Loading dataset...
No local dataset found.So Downloading from Kaggle...
[OK] Dataset downloaded to: C:\Users\Abhi\.cache\kagglehub\datasets\valakhorasani\bank-transaction-dataset-for-fraud-detection\versions\4
Loading CSV file: bank_transactions_data_2.csv
[OK] Dataset loaded successfully from Kaggle!

Dataset shape: (2512, 16)

First few rows:
  TransactionID AccountID  TransactionAmount      TransactionDate  \
0      TX000001   AC00128              14.09  2023-04-11 16:29:14   
1      TX000002   AC00455             376.24  2023-06-27 16:44:19   
2      TX000003   AC00019             126.29  2023-07-10 18:16:08   
3      TX000004   AC00070             184.50  2023-05-05 16:32:11   
4      TX000005   AC00411              13.45  2023-10-16 17:51:24   

  TransactionType   Location DeviceID      IP Address MerchantID Channel  \
0           Debit  San Diego  D000380  162.198.218.92       M015     ATM   
1           Debit