In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import optuna
import cuml
from cuml.ensemble import IsolationForest as cuIsolationForest
import shap

In [6]:
# Read the NewDatFrame.csv dataset from the Kaggle input directory into `df`.
df=pd.read_csv('/kaggle/input/newdataframe/NewDatFrame.csv')

In [7]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'accountNumber', 'customerId',
       'creditLimit', 'availableMoney', 'transactionDateTime',
       'transactionAmount', 'merchantName', 'acqCountry',
       'merchantCountryCode', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'currentExpDate', 'accountOpenDate',
       'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
       'transactionType', 'echoBuffer', 'currentBalance', 'merchantCity',
       'merchantState', 'merchantZip', 'cardPresent', 'posOnPremises',
       'recurringAuthInd', 'expirationDateKeyInMatch', 'merchant_loc',
       'merchant_lat', 'merchant_lon', 'prevTransactionTime', 'timeDelta',
       'txn_count_24h', 'txn_count_7d', 'amount_zscore', 'amount_to_avg_ratio',
       'amount_24h', 'prev_lat', 'prev_lon', 'distance_from_prev_txn',
       'speed_kmph', 'prev_country', 'different_country', 'cvv_match',
       'exp_date_match', 'is_foreign_transaction', 'amount_to_limit_ratio',
       'am

In [None]:
def preprocess_data(X_train, high_cardinality_threshold=20):
    """Turn a raw transaction feature frame into a frame ready for scaling.

    The input frame is not modified; a new frame is returned.

    Processing steps, in order:
      1. Drop identifier, raw date/time, raw location and metadata columns
         (columns that are absent are silently skipped).
      2. Encode each known categorical column that is present:
         * more than ``high_cardinality_threshold`` distinct values ->
           frequency encoding (each value replaced by its relative frequency
           in this frame);
         * otherwise -> one-hot encoding with the first level dropped.
      3. Replace every remaining missing value with 0.

    Note that all encodings are derived from the frame passed in, so two
    different frames may yield different columns or different frequency
    values for the same category.

    Args:
        X_train: pandas DataFrame of features.
        high_cardinality_threshold: distinct-value count above which a
            categorical column is frequency-encoded instead of one-hot
            encoded. Defaults to 20.

    Returns:
        A new pandas DataFrame with the columns dropped/encoded as described
        and no missing values.
    """
    columns_to_drop = [
        # Identifiers
        'Unnamed: 0.1', 'Unnamed: 0', 'accountNumber', 'customerId', 'cardLast4Digits', 'cardCVV', 'enteredCVV',

        # Date/time columns (already processed into derived features)
        'transactionDateTime', 'currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange',

        # Raw location data (already processed into distance/speed)
        'merchantCity', 'merchantState', 'merchantZip', 'merchant_loc', 'merchant_lat', 'merchant_lon',

        # Echo/buffer data (likely technical metadata)
        'echoBuffer',

        # Features derived from other features that are kept
        'prevTransactionTime'
    ]

    # drop() returns a new frame, so the caller's frame is left untouched.
    X_clean = X_train.drop(columns=columns_to_drop, errors='ignore')

    categorical_cols = ['merchantName', 'acqCountry', 'merchantCountryCode', 'posEntryMode',
                        'posConditionCode', 'merchantCategoryCode', 'transactionType', 'prev_country']

    for col in categorical_cols:
        if col not in X_clean.columns:
            continue

        if X_clean[col].nunique() > high_cardinality_threshold:
            # Frequency encoding keeps the column count fixed for
            # high-cardinality features. Missing values map to NaN here and
            # are zero-filled below.
            freq = X_clean[col].value_counts(normalize=True)
            X_clean[col] = X_clean[col].map(freq)
        else:
            # One-hot encode, dropping the first level.
            X_clean = pd.get_dummies(X_clean, columns=[col], prefix=col, drop_first=True)

    # Fill any missing values
    return X_clean.fillna(0)

In [None]:
# 2. Set up hyperparameter optimization with Optuna
def objective(trial, X_train_processed, X_val=None, y_val=None):
    """Optuna objective: fit an Isolation Forest with trial-suggested settings and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train_processed: feature matrix the model is fitted on.
        X_val: optional validation feature matrix.
        y_val: optional validation labels, passed to ``roc_auc_score`` as the
            ground truth for the anomaly scores.

    Returns:
        The ROC AUC of the negated ``score_samples`` output on the validation
        set when both ``X_val`` and ``y_val`` are given; otherwise the
        standard deviation of the negated ``score_samples`` output on the
        training data.
    """
    from sklearn.metrics import roc_auc_score

    # Define hyperparameters to optimize
    # NOTE(review): in scikit-learn, `contamination` only sets the decision
    # threshold and does not appear to change score_samples(), so it likely has
    # no influence on either return value below. It stays in the search space so
    # study.best_params keeps the same keys -- confirm before removing.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),
        'contamination': trial.suggest_float('contamination', 0.01, 0.2),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'n_jobs': -1,  # Use all CPU cores
        'random_state': 42,
        'verbose': 0
    }

    # The GPU estimator is built without the CPU-only n_jobs / verbose options.
    cuml_params = {key: value for key, value in params.items() if key not in ('n_jobs', 'verbose')}

    # Try the GPU implementation first. Both construction AND fitting are
    # guarded, so a GPU failure at either step falls back to the CPU model
    # instead of failing the trial.
    try:
        model = cuIsolationForest(**cuml_params)
        model.fit(X_train_processed)
    except Exception as e:
        print(f"Using CPU implementation due to: {e}")
        model = IsolationForest(**params)
        model.fit(X_train_processed)

    # score_samples is negated so that larger values mean "more anomalous".
    if X_val is not None and y_val is not None:
        scores = -model.score_samples(X_val)
        return roc_auc_score(y_val, scores)

    # Without a labelled validation set, use the spread of the training
    # anomaly scores as the value to maximise.
    scores = -model.score_samples(X_train_processed)
    return np.std(scores)

In [None]:
# 3. Main function to orchestrate the process
def train_isolation_forest(X_train, y_train, n_trials=50):
    """Tune, train and explain an Isolation Forest on the given data.

    Workflow:
      1. Encode the features with ``preprocess_data``.
      2. Make a stratified 80/20 train/validation split.
      3. Standardise features (scaler fitted on the training part only).
      4. Run an Optuna study maximising ``objective`` (validation ROC AUC).
      5. Refit a final model with the best hyperparameters on train + validation.
      6. Print SHAP-based feature importances (best effort; failures are reported
         and do not abort the function).

    Args:
        X_train: pandas DataFrame of raw features.
        y_train: labels aligned with ``X_train``; used for stratification and
            for the validation AUC.
        n_trials: number of Optuna trials to run. Defaults to 50.

    Returns:
        Tuple ``(final_model, best_params, param_importances, study)``.
    """
    from sklearn.model_selection import train_test_split

    # Encode the whole frame ONCE, before splitting. preprocess_data derives its
    # encodings (one-hot columns, frequency maps, high-cardinality decision) from
    # the frame it is handed, so encoding each split separately can give train
    # and validation different columns or different encodings, which breaks
    # scaler.transform or silently misaligns features. No labels are involved
    # in these encodings.
    print("Preprocessing data...")
    X_processed = preprocess_data(X_train)

    # Split data into train and validation sets
    X_train_processed, X_val_processed, _, y_val = train_test_split(
        X_processed, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Scale the data (statistics come from the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_processed)
    X_val_scaled = scaler.transform(X_val_processed)

    # Create an Optuna study for hyperparameter optimization
    print("Starting hyperparameter optimization...")
    study = optuna.create_study(direction='maximize', study_name='isolation_forest_optimization')

    # Optimize the hyperparameters
    study.optimize(
        lambda trial: objective(trial, X_train_scaled, X_val_scaled, y_val),
        n_trials=n_trials,
    )

    # Get the best hyperparameters
    best_params = study.best_params
    print(f"Best parameters: {best_params}")
    print(f"Best value (AUC): {study.best_value:.4f}")

    # Get the top 5 most important hyperparameters
    param_importances = optuna.importance.get_param_importances(study)
    print("\nTop 5 Hyperparameter Importances:")
    ranked = sorted(param_importances.items(), key=lambda item: item[1], reverse=True)
    for param, importance in ranked[:5]:
        print(f"{param}: {importance:.4f}")

    # Train the final model with the best hyperparameters on train + validation
    print("\nTraining final model with best parameters...")
    X_combined = np.vstack([X_train_scaled, X_val_scaled])

    # Work on a copy so the returned best_params is never altered.
    final_params = best_params.copy()
    final_params.pop('n_jobs', None)
    final_params.pop('verbose', None)

    # GPU first, CPU as fallback. Construction and fitting are both guarded, and
    # only Exception is caught so KeyboardInterrupt / SystemExit still propagate.
    try:
        final_model = cuIsolationForest(**final_params, random_state=42)
        final_model.fit(X_combined)
    except Exception as e:
        print(f"Using CPU implementation due to: {e}")
        final_model = IsolationForest(**final_params, n_jobs=-1, verbose=0, random_state=42)
        final_model.fit(X_combined)

    # Feature importance analysis
    try:
        # For feature importance, we use SHAP values if possible
        print("\nCalculating feature importance...")
        X_importance = pd.DataFrame(X_combined, columns=X_train_processed.columns)
        explainer = shap.Explainer(final_model, X_importance)
        shap_values = explainer(X_importance)

        # Get mean absolute SHAP value for each feature as importance
        feature_importance = pd.DataFrame({
            'Feature': X_train_processed.columns,
            'Importance': np.abs(shap_values.values).mean(axis=0)
        })
        feature_importance = feature_importance.sort_values('Importance', ascending=False)

        print("\nTop 10 Most Important Features:")
        print(feature_importance.head(10))
    except Exception as e:
        print(f"Could not calculate feature importance: {e}")

    return final_model, best_params, param_importances, study

In [None]:
# NOTE(review): X_train and y_train are not defined in any cell visible here; they
# must be created before this cell for it to run on a fresh kernel -- confirm where
# they come from (presumably a feature/label split of `df`).
# Runs the full tuning + training pipeline with 50 Optuna trials and unpacks its results.
model, best_params, param_importances, study = train_isolation_forest(X_train, y_train, n_trials=50)