In [None]:
"""
Fraud Detection with RandomForest (Clean & Realistic Pipeline)
=============================================================
Goal: realistic, generalizable performance — training F1 around 0.75–0.85, test F1 around 0.70–0.80.

This notebook:
- Handles categorical and numerical preprocessing correctly (no leakage)
- Balances classes using class weights (not global oversampling)
- Performs randomized hyperparameter search with StratifiedKFold
- Evaluates using F1, precision, recall, and ROC-AUC
- Searches for the best probability threshold for optimal F1
- Saves the best model for reuse

Target column: **IsFraud**
"""

# %% [markdown]
# ## 1. Setup and Imports
# Import required libraries and set configurations.

import os
import warnings

warnings.filterwarnings('ignore')

import joblib
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# %% [markdown]
# ## 2. Configuration
# Adjust these settings based on your dataset.

# Paths of the pre-split datasets. If either file is missing, the loading step
# falls back to the largest *.csv found in the working directory and splits it.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'
TARGET = 'IsFraud'  # name of the label column to predict
# Seed shared by the train/test split, the CV folds, the random search and the forest.
RANDOM_STATE = 42
# When set to a list of column names, it replaces the automatic categorical detection.
CATEGORICAL_OVERRIDE = None  # e.g. ['merchant','type'] if needed

# %% [markdown]
# ## 3. Data Loading
# Load and combine your train/test datasets or perform a split if only one CSV exists.

def _drop_unlabeled(frame, label):
    """Return ``frame`` without the rows whose TARGET value is missing.

    Unlabeled rows can be used neither for fitting nor for scoring
    (scikit-learn rejects NaN in ``y``), so they are removed up front and the
    number of removed rows is reported.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing the TARGET column.
    label : str
        Short dataset name, used only in the progress message.

    Returns
    -------
    pd.DataFrame
        The labeled rows of ``frame``; the original index is preserved.
    """
    unlabeled = frame[TARGET].isna()
    if unlabeled.any():
        print(f"Dropping {int(unlabeled.sum())} {label} rows with missing target values...")
    return frame.loc[~unlabeled]


# Decide the data source once, from the filesystem. Probing the kernel namespace
# later (`'train_df' in locals()`) would silently pick up a frame left over from
# an earlier run of the notebook.
use_presplit_files = os.path.exists(TRAIN_CSV) and os.path.exists(TEST_CSV)

if use_presplit_files:
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)
    print(f"Loaded {TRAIN_CSV} ({train_df.shape}) and {TEST_CSV} ({test_df.shape})")
    loaded_frames = {TRAIN_CSV: train_df, TEST_CSV: test_df}
    # Kept for inspection only; the model never sees the concatenated frame.
    df = pd.concat([train_df, test_df], ignore_index=True)
else:
    csvs = [f for f in os.listdir('.') if f.endswith('.csv')]
    if not csvs:
        raise FileNotFoundError('No CSVs found. Place train.csv/test.csv in the working folder or adjust the script.')
    # Fallback heuristic: treat the largest CSV in the folder as the full dataset.
    largest_csv = max(csvs, key=os.path.getsize)
    df = pd.read_csv(largest_csv)
    print(f"Loaded {largest_csv} as combined dataframe ({df.shape})")
    loaded_frames = {largest_csv: df}

# Validate every loaded file on its own: checking only the concatenation would
# let a test file without the target column through.
for source_name, frame in loaded_frames.items():
    if TARGET not in frame.columns:
        raise ValueError(f"Target column '{TARGET}' not found in {source_name} columns: {frame.columns.tolist()}")

if use_presplit_files:
    train_labeled = _drop_unlabeled(train_df, 'train')
    test_labeled = _drop_unlabeled(test_df, 'test')
    X_train = train_labeled.drop(columns=[TARGET])
    y_train = train_labeled[TARGET]
    X_test = test_labeled.drop(columns=[TARGET])
    y_test = test_labeled[TARGET]
else:
    # Unlabeled rows must go before the split: stratification needs a label per row.
    labeled = _drop_unlabeled(df, 'combined')
    X = labeled.drop(columns=[TARGET])
    y = labeled[TARGET]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
    print(f"Split data: X_train {X_train.shape}, X_test {X_test.shape}")

print('\nClass distribution (train):')
print(y_train.value_counts(normalize=True).rename('proportion'))
print('\nClass distribution (test):')
print(y_test.value_counts(normalize=True).rename('proportion'))

# %% [markdown]
# ## 4. Feature Preprocessing
# Detect categorical/numerical columns and build preprocessing pipeline.

# A text column whose number of distinct values exceeds this share of the
# training rows is treated as an identifier (UUID, reference, raw timestamp...).
# One-hot encoding such a column creates roughly one feature per row: it cannot
# match unseen rows (handle_unknown='ignore' zeroes them) and it explodes memory.
ID_LIKE_UNIQUE_RATIO = 0.5

if CATEGORICAL_OVERRIDE:
    # An explicit override is taken as-is (restricted to columns that exist).
    categorical_cols = [c for c in CATEGORICAL_OVERRIDE if c in X_train.columns]
else:
    text_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    # Integer columns with few distinct values are most likely codes, not quantities.
    low_cardinality_int_cols = [
        col for col in X_train.select_dtypes(include=['int64', 'int32']).columns
        if X_train[col].nunique() < 20
    ]
    id_like_cols = [
        col for col in text_cols
        if X_train[col].nunique() > ID_LIKE_UNIQUE_RATIO * len(X_train)
    ]
    categorical_cols = sorted(set(text_cols + low_cardinality_int_cols) - set(id_like_cols))

# Only genuinely numeric columns may reach the median imputer / scaler; anything
# else that is not categorical (identifiers, datetimes, unlisted text columns)
# would make the numeric pipeline fail at fit time.
numeric_cols = [
    c for c in X_train.select_dtypes(include=['number', 'bool']).columns
    if c not in categorical_cols
]

# Columns in neither list are dropped by the ColumnTransformer (remainder='drop').
ignored_cols = [c for c in X_train.columns if c not in categorical_cols and c not in numeric_cols]

print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols[:10]}{'...' if len(numeric_cols)>10 else ''}")
if ignored_cols:
    print(f"Ignored columns ({len(ignored_cols)}; identifier-like or non-numeric): {ignored_cols}")

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Sparse one-hot output keeps memory proportional to the number of rows rather
# than rows x categories; RandomForest accepts sparse input.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# %% [markdown]
# ## 5. Model and Hyperparameter Search
# Set up the RandomForest classifier with balanced class weights and search for optimal hyperparameters.

# Training rows without a label cannot be fitted on, so remove them first.
train_na_rows = y_train.index[y_train.isna().to_numpy()]
if not train_na_rows.empty:
    print(f"Dropping {len(train_na_rows)} rows with missing target values...")
    X_train = X_train.drop(index=train_na_rows)
    y_train = y_train.drop(index=train_na_rows)

# Class-weighted forest wrapped together with the preprocessing, so every CV
# fold fits its own imputers / scaler / encoder.
rf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=RANDOM_STATE)
pipeline = Pipeline([('preproc', preprocessor), ('clf', rf)])

# Search space: deliberately shallow, regularised trees.
param_dist = dict(
    clf__n_estimators=sp_randint(150, 500),
    clf__max_depth=sp_randint(6, 16),
    clf__min_samples_split=sp_randint(4, 12),
    clf__min_samples_leaf=sp_randint(2, 8),
    clf__max_features=['sqrt', 'log2', 0.3, 0.5],
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

rs = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=40,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE,
)

print('\nStarting RandomizedSearchCV...')
rs.fit(X_train, y_train)

print('\nBest parameters:')
print(rs.best_params_)
print(f'\nBest CV F1: {rs.best_score_:.4f}')

# %% [markdown]
# ## 6. Model Evaluation
# Evaluate model on the test set, optimize threshold for best F1, and report results.

best = rs.best_estimator_

# Choose the decision threshold on out-of-fold *training* predictions.
# Picking the F1-maximising threshold on the test set and then reporting that
# same test set would leak test information and inflate the reported metrics.
# cross_val_predict fits clones of `best`, so the fitted model is left untouched.
oof_proba = cross_val_predict(best, X_train, y_train, cv=cv, method='predict_proba')[:, 1]

thresholds = np.linspace(0.1, 0.9, 33)
metrics = []  # (threshold, f1, precision, recall), all measured out-of-fold
for t in thresholds:
    oof_pred_t = (oof_proba >= t).astype(int)
    f1 = f1_score(y_train, oof_pred_t, zero_division=0)
    precision = precision_score(y_train, oof_pred_t, zero_division=0)
    recall = recall_score(y_train, oof_pred_t, zero_division=0)
    metrics.append((t, f1, precision, recall))

# max() keeps the first (lowest) threshold when several tie on F1.
best_thresh, cv_f1, cv_prec, cv_rec = max(metrics, key=lambda x: x[1])
print(f"\nBest threshold (selected on out-of-fold train predictions): {best_thresh:.3f} | "
      f"CV F1: {cv_f1:.4f}, Precision: {cv_prec:.4f}, Recall: {cv_rec:.4f}")

# The test set is scored exactly once, with the threshold fixed beforehand.
y_pred_proba = best.predict_proba(X_test)[:, 1]
y_pred_best = (y_pred_proba >= best_thresh).astype(int)

best_f1 = f1_score(y_test, y_pred_best, zero_division=0)
best_prec = precision_score(y_test, y_pred_best, zero_division=0)
best_rec = recall_score(y_test, y_pred_best, zero_division=0)
print(f"Test @ {best_thresh:.3f} | F1: {best_f1:.4f}, Precision: {best_prec:.4f}, Recall: {best_rec:.4f}")

print('\nClassification Report:')
print(classification_report(y_test, y_pred_best, zero_division=0))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_best))
# ROC-AUC is threshold-free, so it is computed from the raw probabilities.
print('ROC-AUC:', roc_auc_score(y_test, y_pred_proba))

# %% [markdown]
# ## 7. Feature Importances
# Extract and visualize the most important features contributing to the prediction.

preproc = best.named_steps['preproc']
clf = best.named_steps['clf']

# Ask the fitted preprocessor for its output column names instead of rebuilding
# them by hand: the fitted transformers may emit a different set of columns than
# the input lists suggest (e.g. an imputer drops a column that was entirely
# missing during fit), which would misalign a manually built list.
# Names carry the transformer prefix, e.g. 'num__<column>' / 'cat__<column>_<category>'.
feature_names = list(preproc.get_feature_names_out())

importances = clf.feature_importances_
if len(importances) == len(feature_names):
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print('\nTop 30 Feature Importances:')
    display(fi.head(30))
else:
    # Defensive guard; should not trigger when names come from the fitted preprocessor.
    print('Feature importance alignment issue — skipping.')

# %% [markdown]
# ## 8. Save Best Model
# Save the tuned RandomForest model for later use.

# Persist the whole fitted pipeline (preprocessing + classifier) so it can be
# reloaded with joblib.load and applied directly to raw feature frames.
model_path = 'rf_best_model.joblib'
joblib.dump(value=rs.best_estimator_, filename=model_path)
print("Model saved as {}".format(model_path))

# %% [markdown]
# ## 9. Next Steps
# - If test F1 < 0.7, perform feature engineering (e.g., ratios, time-based features).
# - Try SMOTE *inside* cross-validation (never globally).
# - After RandomForest stabilizes, experiment with XGBoost or LightGBM.
# - Always evaluate on a separate unseen set to confirm generalization.


Loaded train.csv ((17524, 35)) and test.csv ((1000, 35))

Class distribution (train):
IsFraud
0.0    0.772681
1.0    0.227319
Name: proportion, dtype: float64

Class distribution (test):
IsFraud
0.0    0.67982
1.0    0.32018
Name: proportion, dtype: float64

Categorical columns (23): ['BankA_BIC', 'BankA_Name', 'Beneficiary_Address', 'Beneficiary_Country', 'Beneficiary_Name', 'Charges', 'CounterpartyRegion', 'Counterparty_BIC', 'Currency', 'Intermediary_BIC', 'LogTimestamp', 'MessageType', 'OrderingCustomer_Address', 'OrderingCustomer_Country', 'OrderingCustomer_Name', 'PurposeCode', 'RandomID', 'RemittanceInfo', 'Status', 'TxnDayOfWeek', 'TxnReference_TRN', 'UETR_UUID', 'ValueDate']
Numeric columns (11): ['Amount', 'OrderingCustomer_Account', 'Beneficiary_Account', 'IsHighRiskCountry', 'TxnHour', 'AmountInUSD', 'RelativeAmountToAvg', 'IsRepeatBeneficiary', 'TxnFrequencyLast7Days', 'TotalAmountLast7Days']...
Dropping 649 rows with missing target values...

Starting RandomizedSearchCV..