<a href="https://colab.research.google.com/github/AsmaaYassinDev/Behavioural-Anomaly-Detection-for-ATO-Fraud/blob/main/Final_Research_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Model Imports
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from xgboost import XGBClassifier # Import the new model
# Silence only UserWarning-category warnings; every other category still surfaces.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Starting Final Research Pipeline ---")
print("Objective: Run all experiments step-by-step on the full, original dataset.")

# --- Step 1: Load Full Original Data ---
# This is the file you downloaded from Kaggle (6.3 million rows)
file_path = '/content/drive/My Drive/Colab_Data/PS_20174392719_1491204439457_log.csv'
try:
    # Keep the try body to the single call that can fail.
    df = pd.read_csv(file_path)
except (OSError, ValueError) as e:
    # OSError covers a missing/unreadable file; pandas' parser and empty-data
    # errors (and decoding errors) derive from ValueError.
    print(f"Error: Could not load file '{file_path}'.")
    print(f"Reason: {e}")
    print("Please make sure you have uploaded the correct file from Kaggle to Colab.")
    # Stop the run with a non-zero status instead of calling the interactive
    # `exit()` helper, and keep the original exception chained for debugging.
    raise SystemExit(1) from e
else:
    print("\n--- Step 1: Load Data ---")
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 2: The "Critical Pivot" (The Proof) ---
print("\n--- Step 2: The 'Critical Pivot' (Proof) ---")
# Count how many transactions each originating account ('nameOrig') has, then
# measure the share of accounts that appear exactly once.
sender_counts = df['nameOrig'].value_counts()
total_senders = sender_counts.count()
one_time_senders = sender_counts[sender_counts == 1].count()
one_time_sender_ratio = one_time_senders / total_senders
pivot_report = (
    f"Total Senders: {total_senders}",
    f"Senders with only 1 transaction: {one_time_senders}",
    f"Result: {one_time_sender_ratio:.2%} of senders have no history.",
    "Conclusion: Methodology MUST pivot to profiling the recipient ('nameDest').",
)
print("\n".join(pivot_report))

# --- Step 3: Build Behavioral Profiles (The "Smarter" Way) ---
print("\n--- Step 3: Building Behavioral Profiles (from all 6.3M rows) ---")

# (a) Calculate basic statistics (Ratio, Senders)
print("   (a) Calculating cash-out ratio and unique senders...")
# Money received per account: TRANSFER / CASH_IN rows keyed by 'nameDest'.
df_received = df[df['type'].isin(['TRANSFER', 'CASH_IN'])]
total_received = df_received.groupby('nameDest')['amount'].sum()
unique_senders = df_received.groupby('nameDest')['nameOrig'].nunique()
# Money cashed out per account: CASH_OUT rows keyed by 'nameOrig'.
df_cashed_out = df[df['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Every account that shows up in any of the three aggregates.
all_user_ids = (
    total_received.index
    .union(total_cashed_out.index)
    .union(unique_senders.index)
)

# Align the aggregates on the full id set; an account missing from an
# aggregate contributes 0. This replaces a per-account Python loop with
# vectorised operations that produce the same columns.
received = total_received.reindex(all_user_ids, fill_value=0)
cashed_out = total_cashed_out.reindex(all_user_ids, fill_value=0)
senders = unique_senders.reindex(all_user_ids, fill_value=0)

# The 1e-6 term avoids division by zero; the ratio is capped at 1.0.
cash_out_ratio = (cashed_out / (received + 1e-6)).clip(upper=1.0)

final_profiles_basic = (
    pd.DataFrame({
        'dest_cash_out_ratio': cash_out_ratio,
        'dest_unique_senders': senders,
    })
    .rename_axis('user_id')
    .reset_index()
)

# (b) Calculate average time to cash-out (the "Time" feature)
print("   (b) Calculating average time to cash-out (avg_time_to_cash_out)...")
# Build both event logs as fresh frames (no in-place edits on a slice of `df`,
# which would raise SettingWithCopyWarning).
df_transfers = (
    df.loc[df['type'] == 'TRANSFER', ['step', 'nameDest']]
    .rename(columns={'nameDest': 'user_id'})
    .assign(tx_type='TRANSFER_IN')
)
df_cashouts = (
    df.loc[df['type'] == 'CASH_OUT', ['step', 'nameOrig']]
    .rename(columns={'nameOrig': 'user_id'})
    .assign(tx_type='CASH_OUT')
)

# One chronological log per account: incoming transfers and outgoing cash-outs.
user_log = pd.concat([df_transfers, df_cashouts]).sort_values(by=['user_id', 'step'])

# For each event, look up the same account's previous event (NaN for the first).
previous_event = user_log.groupby('user_id')[['step', 'tx_type']].shift(1)
user_log['prev_step'] = previous_event['step']
user_log['prev_type'] = previous_event['tx_type']
user_log['time_since_transfer'] = user_log['step'] - user_log['prev_step']

# Keep only cash-outs whose immediately preceding event was an incoming transfer.
is_pattern = (user_log['tx_type'] == 'CASH_OUT') & (user_log['prev_type'] == 'TRANSFER_IN')
pattern_times = user_log[is_pattern]

# Mean number of steps between the transfer and the following cash-out, per account.
avg_time_profile = pattern_times.groupby('user_id')['time_since_transfer'].mean().reset_index()
avg_time_profile.columns = ['user_id', 'avg_time_to_cash_out']

# (c) Aggregate final profiles
print("   (c) Aggregating final profiles...")
# Left-join the timing feature onto the basic profiles; accounts with no
# transfer-then-cash-out pattern get the sentinel value 999.
final_profiles = final_profiles_basic.merge(avg_time_profile, on='user_id', how='left')
final_profiles = final_profiles.fillna(value={'avg_time_to_cash_out': 999})
print("Full behavioral profiles created successfully.")

# --- Step 4: Create the "Smart Sample" for Training/Testing ---
print("\n--- Step 4: Creating the 'Smart Sample' ---")
# We do this step *after* building profiles to ensure our features are complete
# Collect every account that appears on either side of a fraudulent transaction.
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)
# All transactions (fraudulent or not) touching one of those accounts.
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]
# Random sample of up to 500,000 non-fraud transactions.
df_normal = df[df['isFraud'] == 0]
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)
# The two parts can overlap (a sampled normal row may also be in the lifecycle
# set). Remove the overlap by row identity (index label) rather than by
# comparing column values, so distinct transactions that happen to share all
# values are not collapsed.
# NOTE(review): relies on `df` having a unique row index — confirm if `df` is
# ever loaded with a custom index.
df_smart_sample = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_smart_sample[~df_smart_sample.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 5: Merge Features and Prepare Final Dataset ---
print("\n--- Step 5: Merging Features and Preparing Final Dataset ---")
# Attach the behavioral profile twice: first for the recipient account, then
# for the sending account. The second merge disambiguates the colliding
# profile columns with the '_dest' / '_orig' suffixes.
df_model_data = df_smart_sample.merge(
    final_profiles, left_on='nameDest', right_on='user_id', how='left'
)
df_model_data = df_model_data.merge(
    final_profiles, left_on='nameOrig', right_on='user_id', how='left',
    suffixes=('_dest', '_orig'),
)

# Fill NaNs created by merging (accounts without a profile row)
merge_nan_defaults = {
    'dest_cash_out_ratio_dest': 0,
    'dest_unique_senders_dest': 0,
    'avg_time_to_cash_out_dest': 999,
    'dest_cash_out_ratio_orig': 0,
    'dest_unique_senders_orig': 0,
    'avg_time_to_cash_out_orig': 999,
}
df_model_data = df_model_data.fillna(value=merge_nan_defaults)
# Drop any rows with missing labels
df_model_data = df_model_data.dropna(subset=['isFraud'])

# Encode the transaction type as integer category codes.
df_model_data = df_model_data.assign(
    type_encoded=df_model_data['type'].astype('category').cat.codes
)

# Define the features (X) and the label (y)
features_list = [
    'amount',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'avg_time_to_cash_out_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
    'avg_time_to_cash_out_orig',
    'type_encoded',
]

X_all_features = df_model_data[features_list]
y_all_labels = df_model_data['isFraud']

# Split into Train/Test first, so that the scaler never sees the test rows.
# Same test_size / random_state / stratify as before, so row assignment to the
# two partitions is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_all_features, y_all_labels,
    test_size=0.3, random_state=42, stratify=y_all_labels
)

# Scale features: learn mean/variance from the training partition only, then
# apply that same transform to the test partition (avoids test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility: all rows, scaled with the train-fitted scaler.
X_all_features_scaled = scaler.transform(X_all_features)
print(f"Data prepared. Training on {len(X_train)} rows, testing on {len(X_test)} rows.")

# --- Step 6: Run All Experiments ---
print("\n--- Step 6: Running All Experiments ---")

# F1 score of every experiment, keyed by model name.
final_results = {}

# --- Experiment 1: Isolation Forest (Unsupervised) ---
print("\nRunning Experiment 1: Isolation Forest...")
# Expected outlier share = fraud rate observed in the training labels.
contamination = y_train.mean()
model_if = IsolationForest(contamination=contamination, random_state=42)
model_if.fit(X_train)
predictions_if_raw = model_if.predict(X_test)
# The model flags outliers as -1; map those to the positive (fraud) class.
y_pred_if = (predictions_if_raw == -1).astype(int).tolist()
f1_if = f1_score(y_test, y_pred_if)
final_results['IsolationForest'] = f1_if
print(f"Isolation Forest F1-Score: {f1_if:.2%}")
# --- Experiment 2: Local Outlier Factor (LOF) (Unsupervised) ---
print("\nRunning Experiment 2: Local Outlier Factor (LOF)...")
# LOF is very slow, so it is scored on a stratified 10k subsample of the test
# set; only the "train" side of this helper split is used.
X_test_sample, _, y_test_sample, _ = train_test_split(
    X_test,
    y_test,
    train_size=10000,
    random_state=42,
    stratify=y_test,
)
# novelty=True allows calling predict() on data not seen during fit().
model_lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=contamination,
    novelty=True,
)
model_lof.fit(X_train)
predictions_lof_raw = model_lof.predict(X_test_sample)
# -1 marks an outlier; map it to the positive (fraud) class.
y_pred_lof = (predictions_lof_raw == -1).astype(int).tolist()
f1_lof = f1_score(y_test_sample, y_pred_lof)
final_results['LOF'] = f1_lof
print(f"LOF F1-Score (on 10k sample): {f1_lof:.2%}")

# --- Experiment 3: Autoencoder (Unsupervised) ---
print("\nRunning Experiment 3: Autoencoder...")
# Fix the Python / NumPy / TensorFlow seeds so this experiment is repeatable,
# in line with the random_state=42 used by the other experiments.
tf.keras.utils.set_random_seed(42)

# We must train the Autoencoder *only* on normal data
X_train_normal = X_train[np.asarray(y_train == 0)]
input_dim = X_train.shape[1]

# Symmetric bottleneck: input_dim -> input_dim//2 -> input_dim//4 -> input_dim//2 -> input_dim.
input_layer = Input(shape=(input_dim, ))
encoder = Dense(input_dim // 2, activation='relu')(input_layer)
encoder = Dense(input_dim // 4, activation='relu')(encoder)
decoder = Dense(input_dim // 2, activation='relu')(encoder)
decoder = Dense(input_dim, activation='linear')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# The test set is deliberately NOT passed as validation data: the training
# history is not used, so evaluating on it every epoch was wasted work and
# kept the held-out data inside the training loop.
autoencoder.fit(X_train_normal, X_train_normal, epochs=10, batch_size=32, verbose=0, shuffle=True)
print("Autoencoder training complete.")

# Calculate reconstruction error (per-row mean squared error) on normal training rows
train_predictions_ae = autoencoder.predict(X_train_normal)
train_mse = np.mean(np.square(X_train_normal - train_predictions_ae), axis=1)
# Set threshold at 99th percentile of normal error
threshold = np.quantile(train_mse, 0.99)

# Test rows that reconstruct worse than the threshold are flagged as fraud.
predictions_ae = autoencoder.predict(X_test)
test_mse = np.mean(np.square(X_test - predictions_ae), axis=1)
y_pred_ae = (test_mse > threshold).astype(int).tolist()
f1_ae = f1_score(y_test, y_pred_ae)
final_results['Autoencoder'] = f1_ae
print(f"Autoencoder F1-Score: {f1_ae:.2%}")


# --- Experiment 4: RandomForest (Supervised - The "Proof") ---
print("\nRunning Experiment 4: RandomForest (Supervised)...")
# class_weight='balanced' asks the forest to reweight classes for the imbalance.
model_rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)
# Train with labels, then score on the held-out split.
model_rf.fit(X_train, y_train)
predictions_rf = model_rf.predict(X_test)
f1_rf = f1_score(y_test, predictions_rf)
final_results['RandomForest (Supervised)'] = f1_rf
print(f"RandomForest F1-Score: {f1_rf:.2%}")

# --- Experiment 5: XGBoost (Supervised - The "Ultimate Proof") ---
print("\nRunning Experiment 5: XGBoost (Supervised)...")
# Weight the positive class by the negative/positive count ratio in the
# training labels, to handle the class imbalance (similar in spirit to
# class_weight='balanced' in the RandomForest experiment).
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
scale_pos_weight_value = negative_count / positive_count

xgb_settings = {
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'n_estimators': 100,
    'scale_pos_weight': scale_pos_weight_value,
}
model_xgb = XGBClassifier(**xgb_settings)

# Train with labels, predict the held-out split, and record the F1 score.
model_xgb.fit(X_train, y_train)
predictions_xgb = model_xgb.predict(X_test)
f1_xgb = f1_score(y_test, predictions_xgb)
final_results['XGBoost (Supervised)'] = f1_xgb
print(f"XGBoost F1-Score: {f1_xgb:.2%}")

# --- Step 7: Final Conclusion ---
# Print the F1 score recorded for each experiment, followed by the fixed
# concluding text of the report.
print("\n--- Step 7: Final Comparative Results ---")
print("This is the final 'story' of the research paper.")

heavy_rule = "======================================================="
light_rule = "-------------------------------------------------------"
# (label text printed before the score, key into final_results)
unsupervised_rows = (
    ("1. Isolation Forest (Unsupervised): ", 'IsolationForest'),
    ("2. LOF (Unsupervised):                ", 'LOF'),
    ("3. Autoencoder (Unsupervised):        ", 'Autoencoder'),
)
supervised_rows = (
    ("4. RandomForest (Supervised):         ", 'RandomForest (Supervised)'),
    ("5. XGBoost (Supervised):              ", 'XGBoost (Supervised)'),
)

print("\n" + heavy_rule)
print("           FINAL MODEL F1-SCORES           ")
print(heavy_rule)
for row_label, result_key in unsupervised_rows:
    print(f"{row_label}{final_results[result_key]:.2%}")
print(light_rule)
for row_label, result_key in supervised_rows:
    print(f"{row_label}{final_results[result_key]:.2%}")
print(heavy_rule)

conclusion_lines = (
    "\nConclusion:",
    "The Unsupervised models (IF, LOF, AE) all 'failed' (low F1-Scores),",
    "proving they are 'confused' by the Behavioral Mimicry (mules vs. merchants).",
    "\nThe Supervised model (RandomForest) 'succeeded' (high F1-Score),",
    "proving that our Behavioral Features ARE predictive and valuable.",
    "\nThis proves that the problem is too complex for simple unsupervised models",
    "and that future work requires more advanced methods (like GNNs).",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)

print("\n--- Full Research Pipeline Complete ---")

--- Starting Final Research Pipeline ---
Objective: Run all experiments step-by-step on the full, original dataset.

--- Step 1: Load Data ---
Successfully loaded the full file (6362620 rows).

--- Step 2: The 'Critical Pivot' (Proof) ---
Total Senders: 6353307
Senders with only 1 transaction: 6344009
Result: 99.85% of senders have no history.
Conclusion: Methodology MUST pivot to profiling the recipient ('nameDest').

--- Step 3: Building Behavioral Profiles (from all 6.3M rows) ---
   (a) Calculating cash-out ratio and unique senders...
   (b) Calculating average time to cash-out (avg_time_to_cash_out)...
   (c) Aggregating final profiles...
Full behavioral profiles created successfully.

--- Step 4: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 561154 rows.

--- Step 5: Merging Features and Preparing Final Dataset ---
Data prepared. Training on 392807 rows, testing on 168347 rows.

--- Step 6: Running All Experiments ---

Running Experiment 1: Isolation F