In [None]:
# Ensure required packages are installed
import sys
import subprocess

# Helper to install missing packages
def install_package(pkg):
    """Install one or more pip requirements into the running interpreter's environment.

    Args:
        pkg: Either a string or a list/tuple of tokens. A string is tokenised
            with shell-style rules, so ``'catboost'`` and
            ``'torch torchvision --index-url https://...'`` both work. A
            version specifier that contains spaces must be quoted inside the
            string (``"'numpy >= 1.20'"``) or passed as a list element.

    Raises:
        ValueError: If ``pkg`` yields no tokens.
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Local import keeps this helper self-contained in the notebook cell.
    import shlex

    # The command is executed as an argv list (no shell), so every requirement
    # and option must be its own element; a single 'a b --opt v' element would
    # be rejected by pip as an invalid requirement.
    tokens = shlex.split(pkg) if isinstance(pkg, str) else [str(tok) for tok in pkg]
    if not tokens:
        raise ValueError("install_package() needs at least one package name")

    # sys.executable targets the kernel's own environment rather than whatever
    # 'pip' happens to be first on PATH.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *tokens])

# Check for GPU availability
def check_gpu_availability():
    """Probe the environment for PyTorch/CUDA and CuPy.

    Returns:
        dict: Always contains ``'torch_cuda'`` and ``'cupy'``. When
        ``torch.cuda.is_available()`` is true it also contains
        ``'cuda_devices'`` (device count) and ``'cuda_device_name'``
        (name of device 0).
    """
    report = {}

    # PyTorch: an ImportError anywhere in this probe means "no CUDA via torch".
    try:
        import torch
        cuda_ready = torch.cuda.is_available()
        report['torch_cuda'] = cuda_ready
        if cuda_ready:
            report['cuda_devices'] = torch.cuda.device_count()
            report['cuda_device_name'] = torch.cuda.get_device_name(0)
    except ImportError:
        report['torch_cuda'] = False

    # CuPy (used by cuML): only importability is checked.
    try:
        import cupy
        cupy_ready = True
    except ImportError:
        cupy_ready = False
    report['cupy'] = cupy_ready

    return report


In [None]:
# Install GPU packages if available
def setup_gpu_packages():
    """Probe for the GPU ML stack and pip-install whatever is missing.

    Every install is best-effort: a failing pip call is reported and recorded
    as ``False`` instead of aborting the remaining steps.

    Returns:
        dict: Status flags. ``'torch'`` is present only when a PyTorch install
        was attempted (True = CUDA wheel installed, False = fell back to the
        default wheel or failed). ``'cuml'``, ``'xgb_gpu'``, ``'lgb_gpu'`` and
        ``'catboost'`` are always present.
    """
    def _try_install(spec):
        """Run ``install_package(spec)``; return True on success, False on failure."""
        try:
            install_package(spec)
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # interrupt a long-running install.
            print(f"  pip install {spec!r} failed: {exc}")
            return False
        return True

    gpu_info = check_gpu_availability()
    packages_installed = {}

    # PyTorch: (re)install when torch is missing OR present without CUDA.
    if not gpu_info.get('torch_cuda', False):
        print("Installing PyTorch with CUDA support...")
        if _try_install('torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118'):
            packages_installed['torch'] = True
        else:
            print("Failed to install PyTorch with CUDA. Installing CPU version...")
            _try_install('torch torchvision torchaudio')
            packages_installed['torch'] = False

    # cuML (RAPIDS)
    try:
        import cuml
        packages_installed['cuml'] = True
    except ImportError:
        print("Installing cuML for GPU acceleration...")
        packages_installed['cuml'] = _try_install('cuml-cu11')
        if not packages_installed['cuml']:
            print("cuML installation failed. Will use CPU alternatives.")

    # XGBoost
    try:
        import xgboost as xgb
        # NOTE(review): this is a string heuristic on the default parameter
        # names (e.g. a 'gpu_id' key), not a probe of an actual device.
        packages_installed['xgb_gpu'] = 'gpu' in str(xgb.XGBClassifier().get_params())
    except ImportError:
        print("Installing XGBoost with GPU support...")
        if _try_install('xgboost[gpu]'):
            packages_installed['xgb_gpu'] = True
        else:
            _try_install('xgboost')
            packages_installed['xgb_gpu'] = False

    # LightGBM: an importable package is recorded as True; whether that build
    # has GPU support is not checked here.
    try:
        import lightgbm
        packages_installed['lgb_gpu'] = True
    except ImportError:
        print("Installing LightGBM with GPU support...")
        if _try_install('lightgbm --install-option=--gpu'):
            packages_installed['lgb_gpu'] = True
        else:
            _try_install('lightgbm')
            packages_installed['lgb_gpu'] = False

    # CatBoost (GPU support is selected at model-construction time)
    try:
        import catboost
        packages_installed['catboost'] = True
    except ImportError:
        print("Installing CatBoost...")
        # Record the real outcome instead of assuming the install succeeded.
        packages_installed['catboost'] = _try_install('catboost')

    return packages_installed

# Setup packages: runs the probe/install step when this cell executes.
# `gpu_packages` is read later by main() for its status report.
gpu_packages = setup_gpu_packages()

Installing CatBoost...


In [None]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, make_scorer
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

# GPU-specific imports
# Each optional dependency is probed in its own try/except and summarised in a
# module-level *_AVAILABLE flag that later cells branch on.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    # Despite its name, this flag records CUDA availability, not merely that
    # torch imported: it is False when torch is installed without a usable GPU.
    TORCH_AVAILABLE = torch.cuda.is_available()
    print(f"PyTorch CUDA available: {TORCH_AVAILABLE}")
except ImportError:
    # NOTE(review): GPUNeuralNetwork below subclasses nn.Module at definition
    # time, so that cell will still fail if torch could not be imported.
    TORCH_AVAILABLE = False
    print("PyTorch not available")

# cuML: GPU random forest, scaler and splitter (aliased with a `cu` prefix).
try:
    import cuml
    from cuml.ensemble import RandomForestClassifier as cuRF
    from cuml.preprocessing import StandardScaler as cuScaler
    from cuml.model_selection import train_test_split as cu_train_test_split
    CUML_AVAILABLE = True
    print("cuML available for GPU acceleration")
except ImportError:
    CUML_AVAILABLE = False
    print("cuML not available - using CPU alternatives")

# Gradient-boosting libraries: these flags only mean "importable".
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False

try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    LGB_AVAILABLE = False

try:
    import catboost as cb
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False

# Imbalanced learning imports
# Unlike the optional GPU libraries above, imbalanced-learn is installed on
# demand and then imported again, so the names below are always defined (or
# the cell raises).
try:
    from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
    from imblearn.combine import SMOTETomek
    from imblearn.pipeline import Pipeline as ImbPipeline
except ModuleNotFoundError:
    print("imbalanced-learn not found. Installing...")
    install_package('imbalanced-learn')
    from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
    from imblearn.combine import SMOTETomek
    from imblearn.pipeline import Pipeline as ImbPipeline

PyTorch CUDA available: True
cuML available for GPU acceleration


In [None]:
# ============================================================================
# GPU-Accelerated Fraud Detection - Production Ready
# ============================================================================

class GPUNeuralNetwork(nn.Module):
    """Feed-forward binary classifier used for fraud detection.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; the head is
    a single-unit Linear followed by Sigmoid, so ``forward`` returns values in
    [0, 1] with shape ``(batch, 1)``.
    """
    def __init__(self, input_size, hidden_sizes=[512, 256, 128], dropout_rate=0.3):
        super().__init__()

        # Consecutive pairs of `dims` give the (in, out) width of every hidden stage.
        dims = [input_size, *hidden_sizes]
        modules = []
        for n_in, n_out in zip(dims, dims[1:]):
            modules += [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]

        # Output head: one probability per sample.
        modules += [nn.Linear(dims[-1], 1), nn.Sigmoid()]

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.network(x)

class FraudDetectorGPU:
    """Scale features, train a ``GPUNeuralNetwork`` and expose sklearn-style predictions.

    Attributes are set by ``fit``: ``scaler`` (cuML StandardScaler when
    ``CUML_AVAILABLE`` is true, otherwise sklearn ``RobustScaler``) and
    ``model`` (the trained network on ``device``).
    """

    def __init__(self, device=None):
        """Pick the compute device; defaults to CUDA when torch reports it available."""
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device
        print(f"Using device: {self.device}")

        self.model = None
        self.scaler = None

    def _to_tensor(self, data):
        """Convert array-like ``data`` to a float32 tensor on ``self.device``."""
        if hasattr(data, 'to_numpy'):
            # pandas Series/DataFrame; presumably also cuDF objects - TODO confirm
            data = data.to_numpy()
        elif hasattr(data, 'get') and not isinstance(data, np.ndarray):
            # presumably a CuPy array (device -> host copy) - TODO confirm
            data = data.get()
        host = np.asarray(data, dtype=np.float32)
        return torch.as_tensor(host, device=self.device)

    def fit(self, X, y, epochs=100, batch_size=1024, learning_rate=0.001):
        """Train the neural network.

        Args:
            X: 2-D feature matrix (DataFrame or array-like).
            y: Binary labels (Series, array or list), one per row of ``X``.
            epochs: Number of passes over the data.
            batch_size: Mini-batch size.
            learning_rate: Adam learning rate.

        Returns:
            FraudDetectorGPU: ``self``, to allow call chaining.
        """
        # Scale features
        if CUML_AVAILABLE:
            self.scaler = cuScaler()
        else:
            self.scaler = RobustScaler()

        X_scaled = self.scaler.fit_transform(X)

        # Convert to tensors; labels become a column so they match the
        # (batch, 1) output of the network.
        X_tensor = self._to_tensor(X_scaled)
        y_tensor = self._to_tensor(y).reshape(-1, 1)

        # Create model (width taken from the tensor so X need not expose .shape)
        self.model = GPUNeuralNetwork(X_tensor.shape[1]).to(self.device)

        # Loss and optimizer
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

        # Training loop
        dataset = TensorDataset(X_tensor, y_tensor)
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode, so drop the trailing batch when it would have size 1.
        n_samples = len(dataset)
        drop_last = n_samples > batch_size and n_samples % batch_size == 1
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                drop_last=drop_last)

        self.model.train()
        for epoch in range(epochs):
            epoch_loss = 0.0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                predictions = self.model(batch_X)
                loss = criterion(predictions, batch_y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(dataloader):.4f}')

        return self

    def predict_proba(self, X):
        """Predict class probabilities.

        Returns:
            numpy.ndarray: shape ``(n_samples, 2)``; column 0 is P(class 0),
            column 1 is P(class 1).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.model is None or self.scaler is None:
            raise RuntimeError("FraudDetectorGPU is not fitted yet; call fit() first")

        self.model.eval()
        X_tensor = self._to_tensor(self.scaler.transform(X))

        with torch.no_grad():
            positive = self.model(X_tensor).cpu().numpy().ravel()

        # Return probabilities for both classes
        return np.column_stack([1 - positive, positive])

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where P(class 1) >= ``threshold``."""
        probs = self.predict_proba(X)[:, 1]
        return (probs >= threshold).astype(int)

In [None]:
def load_or_generate_data(path='Fraud.csv'):
    """Read the transactions CSV at ``path``, or synthesise a stand-in dataset.

    When ``path`` does not exist, a 200,000-row synthetic frame is generated
    with NumPy's global RNG seeded to 42 (so repeated calls give the same
    frame). It has columns step, type, amount, nameOrig, oldbalanceOrg,
    newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest, isFraud and
    isFlaggedFraud.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        pass
    else:
        print("✓ Loaded real dataset")
        return frame

    print("⚠️  Sample dataset created")
    np.random.seed(42)
    n_rows = 200000

    # The order of the random draws below determines the generated values.
    step = np.random.randint(1, 745, n_rows)
    kind = np.random.choice(
        ['CASH-IN', 'CASH-OUT', 'DEBIT', 'PAYMENT', 'TRANSFER'],
        n_rows, p=[0.2, 0.15, 0.1, 0.35, 0.2]
    )
    amount = np.clip(np.random.lognormal(5, 2, n_rows), 0.01, 1e6)
    orig_before = np.clip(np.random.lognormal(7, 1.5, n_rows), 0, 5e5)

    # Base fraud rate per transaction type, scaled up for risky circumstances.
    base_rate = np.select(
        [kind == 'TRANSFER', kind == 'CASH-OUT'],
        [0.008, 0.004],
        default=0.001,
    )
    top_amount = amount > np.percentile(amount, 95)
    small_hours = (step % 24) < 6
    on_weekend = ((step // 24) % 7) >= 5
    risk_multiplier = 1 + top_amount * 3 + small_hours * 2 + on_weekend * 1.5
    fraud_label = np.random.binomial(1, base_rate * risk_multiplier)

    # Originator balance: debited for outgoing types, credited otherwise, floored at 0.
    outgoing = np.isin(kind, ['CASH-OUT', 'TRANSFER', 'PAYMENT'])
    signed_amount = np.where(outgoing, -amount, amount)
    orig_after = np.clip(orig_before + signed_amount, 0, None)

    # Destination balances: merchants always carry zero balances.
    merchant_flag = np.random.choice([0, 1], n_rows, p=[0.7, 0.3])
    dest_draw = np.clip(np.random.lognormal(6, 1.2, n_rows), 0, None)
    dest_before = np.where(merchant_flag, 0, dest_draw)
    credited = np.isin(kind, ['TRANSFER', 'PAYMENT'])
    dest_after = np.where(
        merchant_flag, 0,
        np.where(credited, dest_before + amount, dest_before)
    )

    orig_names = [f'C{i}' for i in range(n_rows)]
    dest_names = [
        f'M{i}' if flag else f'C{i + n_rows}'
        for i, flag in enumerate(merchant_flag)
    ]

    frame = pd.DataFrame({
        'step': step,
        'type': kind,
        'amount': amount,
        'nameOrig': orig_names,
        'oldbalanceOrg': orig_before,
        'newbalanceOrig': orig_after,
        'nameDest': dest_names,
        'oldbalanceDest': dest_before,
        'newbalanceDest': dest_after,
        'isFraud': fraud_label
    })
    large_transfer = frame['type'].eq('TRANSFER') & frame['amount'].gt(200000)
    frame['isFlaggedFraud'] = large_transfer.astype(int)
    return frame

def preprocess(df):
    """Fill numeric gaps, add merchant flags and zero merchant destination balances.

    The input frame is never modified; a new frame is returned.

    Args:
        df: Transactions frame with at least ``nameOrig``, ``nameDest``,
            ``oldbalanceDest`` and ``newbalanceDest`` columns.

    Returns:
        pandas.DataFrame: Copy of ``df`` with missing numeric values replaced
        by 0 and two added 0/1 columns, ``is_merchant_dest`` and
        ``is_merchant_orig`` (1 when the name starts with ``'M'``).
    """
    # Work on a copy so re-running the cell (or reusing the raw frame) is safe.
    out = df.copy()

    # Only numeric columns are zero-filled: writing 0 into the name columns
    # would turn them into mixed str/int data and break the string tests below.
    numeric_cols = out.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        out[numeric_cols] = out[numeric_cols].fillna(0)

    # Merchant flags; a missing name counts as "not a merchant" (na=False)
    # instead of producing NaN, which cannot be cast to int.
    out['is_merchant_dest'] = out['nameDest'].str.startswith('M', na=False).astype(int)
    out['is_merchant_orig'] = out['nameOrig'].str.startswith('M', na=False).astype(int)

    # Zero balances for merchant destinations
    merchant_rows = out['is_merchant_dest'] == 1
    out.loc[merchant_rows, ['oldbalanceDest', 'newbalanceDest']] = 0
    return out

def engineer_features(df):
    """Add time, amount, balance-ratio, velocity and type-code columns to ``df``.

    The columns are written onto the frame that was passed in, and that same
    frame is returned.
    """
    # --- time of day / day of week, derived from the hourly `step` counter ---
    step = df['step']
    hour = step % 24
    weekday = (step // 24) % 7
    df['hour_of_day'] = hour
    df['day_of_week'] = weekday
    df['is_weekend'] = weekday.ge(5).astype(int)
    df['is_night'] = (hour.ge(22) | hour.le(6)).astype(int)

    # --- amount-based flags ---
    amount = df['amount']
    p95 = amount.quantile(0.95)
    df['amount_log'] = np.log1p(amount)
    df['is_large_amount'] = amount.gt(p95).astype(int)
    df['is_round_amount'] = (amount % 1000).eq(0).astype(int)

    # --- balance relative to amount (+1 keeps the denominator non-zero) ---
    scale = amount + 1
    df['orig_amt_ratio'] = df['oldbalanceOrg'] / scale
    df['dest_amt_ratio'] = df['oldbalanceDest'] / scale

    # --- velocity: steps since the same originator's previous transaction ---
    # The diff is computed on a sorted copy; assigning it back aligns on the
    # index, so each row receives its own gap. First transactions get 999.
    ordered = df.sort_values(['nameOrig', 'step'])
    gap = ordered.groupby('nameOrig')['step'].diff()
    df['time_diff'] = gap.fillna(999)
    df['rapid_txn'] = df['time_diff'].le(2).astype(int)

    # --- integer code for the transaction type ---
    type_encoder = LabelEncoder()
    df['type_enc'] = type_encoder.fit_transform(df['type'])
    return df

def feature_selection(df, k=20, random_state=42):
    """Select features by majority vote of three selectors.

    The selectors are an ANOVA F-test (``SelectKBest``), a median-importance
    cut on an ExtraTrees model (``SelectFromModel``) and recursive feature
    elimination (``RFE``). A feature is kept when at least two of them pick it.

    Args:
        df: Frame containing the ``isFraud`` target plus candidate features.
            ``isFraud``, ``nameOrig``, ``nameDest`` and ``type`` are excluded
            from the candidates.
        k: Number of features requested from the ANOVA and RFE selectors;
            clamped to the number of candidate features.
        random_state: Seed for the ExtraTrees models, making the selection
            repeatable.

    Returns:
        list[str]: Selected feature names, in the column order of ``df``.

    Raises:
        ValueError: If no candidate feature columns remain.
    """
    from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel
    from sklearn.ensemble import ExtraTreesClassifier

    target = df['isFraud']
    features = [c for c in df.columns if c not in ['isFraud','nameOrig','nameDest','type']]
    if not features:
        raise ValueError("feature_selection() found no candidate feature columns")
    X = df[features]

    # SelectKBest rejects k > n_features, so clamp the request.
    k = max(1, min(int(k), len(features)))

    # ANOVA F-test
    anova_selector = SelectKBest(f_classif, k=k).fit(X, target)
    anova_features = set(X.columns[anova_selector.get_support()])

    # Tree-based selection
    tree_selector = ExtraTreesClassifier(
        n_estimators=100, class_weight='balanced', n_jobs=-1,
        random_state=random_state
    )
    tree_selector.fit(X, target)
    tree_model = SelectFromModel(tree_selector, prefit=True, threshold='median')
    tree_features = set(X.columns[tree_model.get_support()])

    # RFE selection
    rfe_selector = RFE(
        ExtraTreesClassifier(
            n_estimators=50, class_weight='balanced', n_jobs=-1,
            random_state=random_state
        ),
        n_features_to_select=k
    ).fit(X, target)
    rfe_features = set(X.columns[rfe_selector.support_])

    # Combine selections (majority vote): keep features chosen by >= 2 selectors.
    # Iterating `features` gives a deterministic, column-ordered result instead
    # of the arbitrary ordering of a set.
    selector_votes = (anova_features, tree_features, rfe_features)
    return [
        name for name in features
        if sum(name in picked for picked in selector_votes) >= 2
    ]

def get_gpu_models(use_gpu=None):
    """Build the dictionary of (unfitted) tree models to train.

    Estimator constructors do not touch the GPU, so a missing device only
    shows up when ``fit`` is called. GPU availability is therefore decided up
    front rather than by catching constructor errors.

    Args:
        use_gpu: True to request GPU variants, False for CPU variants. When
            None (default) GPU variants are used if ``TORCH_AVAILABLE`` or
            ``CUML_AVAILABLE`` is true.

    Returns:
        dict: name -> estimator. GPU variants are keyed ``'XGB_GPU'``,
        ``'LGB_GPU'``, ``'CAT_GPU'`` and ``'cuRF'``; CPU variants are keyed
        ``'XGB'``, ``'LGB'`` and ``'CAT'``.
    """
    if use_gpu is None:
        # TORCH_AVAILABLE records torch.cuda.is_available(); an importable
        # cuML presumably also implies a CUDA environment - TODO confirm.
        use_gpu = bool(TORCH_AVAILABLE or CUML_AVAILABLE)

    models = {}

    def _register(key, label, make_gpu, make_cpu):
        """Store the GPU variant under '<key>_GPU', else the CPU one under '<key>'."""
        if use_gpu:
            try:
                models[f'{key}_GPU'] = make_gpu()
                print(f"✓ {label} GPU enabled")
                return
            except Exception as exc:
                print(f"⚠️  {label} GPU failed, using CPU")
                print(f"    reason: {exc}")
        else:
            print(f"{label}: no GPU detected, using CPU")
        models[key] = make_cpu()

    # XGBoost with GPU
    if XGB_AVAILABLE:
        try:
            xgb_major = int(str(xgb.__version__).split('.')[0])
        except (AttributeError, ValueError):
            xgb_major = 0  # unknown version: keep the legacy arguments

        xgb_common = {'random_state': 42, 'eval_metric': 'logloss'}
        if xgb_major >= 2:
            # XGBoost 2.0 selects the GPU with device='cuda' + tree_method='hist'
            # and no longer accepts use_label_encoder.
            xgb_gpu = {'tree_method': 'hist', 'device': 'cuda'}
        else:
            xgb_common['use_label_encoder'] = False
            xgb_gpu = {'tree_method': 'gpu_hist', 'gpu_id': 0}

        _register(
            'XGB', 'XGBoost',
            lambda: xgb.XGBClassifier(**xgb_gpu, **xgb_common),
            lambda: xgb.XGBClassifier(**xgb_common),
        )

    # LightGBM with GPU
    if LGB_AVAILABLE:
        _register(
            'LGB', 'LightGBM',
            lambda: lgb.LGBMClassifier(device='gpu', random_state=42, verbosity=-1),
            lambda: lgb.LGBMClassifier(random_state=42, verbosity=-1),
        )

    # CatBoost with GPU
    if CATBOOST_AVAILABLE:
        _register(
            'CAT', 'CatBoost',
            lambda: cb.CatBoostClassifier(task_type='GPU', random_state=42, verbose=False),
            lambda: cb.CatBoostClassifier(random_state=42, verbose=False),
        )

    # cuML Random Forest (GPU-only, so skipped when GPU use is disabled)
    if CUML_AVAILABLE and use_gpu:
        models['cuRF'] = cuRF(
            n_estimators=100,
            random_state=42
        )
        print("✓ cuML Random Forest enabled")

    return models


In [None]:
def main():
    """Run the full pipeline: load, featurise, balance, train, evaluate and save.

    Steps:
        1. Load (or synthesise) the data, preprocess it and engineer features.
        2. Select features.
        3. Stratified 80/20 split, then SMOTE on the training part only.
        4. Train/evaluate the neural network (only when ``TORCH_AVAILABLE``).
        5. Train/evaluate each model returned by ``get_gpu_models``.
        6. Soft-voting ensemble of the non-cuML models (needs at least two).
        7. Pick the probability threshold that maximises F1 for the network.
        8. Save the network weights, its scaler and the tree models to the
           current working directory.

    Progress and metrics are printed; nothing is returned.
    """
    print("=== GPU-Accelerated Fraud Detection System ===")
    print(f"GPU Packages Status: {gpu_packages}")

    # Load and preprocess data
    print("\n1. Loading and preprocessing data...")
    df = load_or_generate_data()
    df = preprocess(df)
    df = engineer_features(df)

    # Feature selection
    # NOTE(review): features are engineered and selected on the full dataset
    # before the train/test split, so the test rows influence both steps.
    print("\n2. Selecting features...")
    selected_features = feature_selection(df)
    print(f"Selected {len(selected_features)} features")

    # Prepare data
    X = df[selected_features].fillna(0)
    y = df['isFraud']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Handle class imbalance (training split only; the test split keeps its
    # natural class ratio)
    print("\n3. Handling class imbalance...")
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    print(f"Balanced training set size: {len(y_train_balanced)}")

    # GPU Neural Network
    if TORCH_AVAILABLE:
        print("\n4. Training GPU Neural Network...")
        gpu_nn = FraudDetectorGPU()
        gpu_nn.fit(X_train_balanced, y_train_balanced, epochs=50, batch_size=2048)

        # Evaluate Neural Network
        nn_probs = gpu_nn.predict_proba(X_test)[:, 1]
        nn_auc = roc_auc_score(y_test, nn_probs)
        print(f"Neural Network Test AUC: {nn_auc:.4f}")

    # GPU Tree-based models
    print("\n5. Training GPU Tree Models...")
    gpu_models = get_gpu_models()
    trained_models = {}

    for name, model in gpu_models.items():
        # A model that fails to train is reported and skipped so the
        # remaining models still run.
        try:
            print(f"Training {name}...")
            model.fit(X_train_balanced, y_train_balanced)

            # Evaluate
            if hasattr(model, 'predict_proba'):
                probs = model.predict_proba(X_test)[:, 1]
            else:
                # For cuML models that might not have predict_proba:
                # hard labels are scored instead of probabilities.
                preds = model.predict(X_test)
                probs = preds

            auc = roc_auc_score(y_test, probs)
            print(f"{name} Test AUC: {auc:.4f}")
            trained_models[name] = model

        except Exception as e:
            print(f"Failed to train {name}: {str(e)}")

    # Ensemble (if we have multiple models)
    if len(trained_models) > 1:
        print("\n6. Creating ensemble...")
        # Filter models that work with VotingClassifier
        sklearn_models = {}
        for name, model in trained_models.items():
            if hasattr(model, 'predict_proba') and 'cu' not in name.lower():
                sklearn_models[name] = model

        if len(sklearn_models) >= 2:
            ensemble = VotingClassifier(
                estimators=list(sklearn_models.items()),
                voting='soft',
                n_jobs=-1
            )

            # Note: Ensemble needs to be retrained on CPU data
            scaler = RobustScaler()
            X_train_scaled = scaler.fit_transform(X_train_balanced)
            X_test_scaled = scaler.transform(X_test)

            ensemble.fit(X_train_scaled, y_train_balanced)
            ensemble_probs = ensemble.predict_proba(X_test_scaled)[:, 1]
            ensemble_auc = roc_auc_score(y_test, ensemble_probs)
            print(f"Ensemble Test AUC: {ensemble_auc:.4f}")

    # Find optimal threshold
    # NOTE(review): the threshold is tuned on the test split, so the reported
    # F1 is optimistic.
    print("\n7. Finding optimal threshold...")
    if TORCH_AVAILABLE:
        precision, recall, thresholds = precision_recall_curve(y_test, nn_probs)
        # precision/recall carry one extra trailing point (precision=1,
        # recall=0) that has no threshold; drop it so every F1 index is a
        # valid index into `thresholds`.
        precision, recall = precision[:-1], recall[:-1]
        pr_sum = precision + recall
        # F1 is undefined where precision + recall == 0; score those points 0
        # instead of producing NaN from 0/0.
        f1_scores = np.divide(
            2 * precision * recall, pr_sum,
            out=np.zeros_like(pr_sum, dtype=float),
            where=pr_sum > 0
        )
        optimal_idx = int(np.argmax(f1_scores))
        optimal_threshold = thresholds[optimal_idx]
        optimal_f1 = f1_scores[optimal_idx]
        print(f"Optimal threshold: {optimal_threshold:.4f}, F1-score: {optimal_f1:.4f}")

    # Save models
    print("\n8. Saving models...")
    import joblib

    if TORCH_AVAILABLE:
        torch.save(gpu_nn.model.state_dict(), 'gpu_fraud_model.pth')
        joblib.dump(gpu_nn.scaler, 'gpu_scaler.pkl')
        print("✓ Saved GPU neural network model")

    for name, model in trained_models.items():
        try:
            if 'cu' not in name.lower():  # Skip cuML models for joblib
                joblib.dump(model, f'{name.lower()}_model.pkl')
                print(f"✓ Saved {name} model")
        except Exception as exc:
            # Saving is best-effort, but the reason is reported and Ctrl-C is
            # no longer swallowed.
            print(f"⚠️  Could not save {name} model: {exc}")

    print("\n=== Training Complete ===")
    print("GPU acceleration status:")
    print(f"  - PyTorch CUDA: {TORCH_AVAILABLE}")
    print(f"  - cuML: {CUML_AVAILABLE}")
    # The three lines below echo the install-time flags from `gpu_packages`.
    print(f"  - XGBoost GPU: {gpu_packages.get('xgb_gpu', False)}")
    print(f"  - LightGBM GPU: {gpu_packages.get('lgb_gpu', False)}")
    print(f"  - CatBoost GPU: {gpu_packages.get('catboost', False)}")

# Entry point: in a notebook kernel (or when run as a script) __name__ is
# '__main__', so executing this cell starts the whole pipeline.
if __name__ == '__main__':
    main()

=== GPU-Accelerated Fraud Detection System ===
GPU Packages Status: {'cuml': True, 'xgb_gpu': False, 'lgb_gpu': True, 'catboost': True}

1. Loading and preprocessing data...
✓ Loaded real dataset

2. Selecting features...
Selected 20 features

3. Handling class imbalance...
