<a href="https://colab.research.google.com/github/AsmaaYassinDev/Behavioural-Anomaly-Detection-for-ATO-Fraud/blob/main/Hybrid_Model_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab filesystem so the dataset stored under
# "My Drive" can be read with an ordinary file path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import warnings

# Ignore warnings for cleaner output
warnings.filterwarnings('ignore')

print("--- Hybrid Model Execution Started (Autoencoder + RF) ---")

# --- 1. Load Data ---
# Ensure the file path matches your environment
file_path = '/content/drive/My Drive/Colab_Data/PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except (OSError, ValueError) as e:
    # OSError: file missing / unreadable (e.g. Drive not mounted).
    # ValueError: pandas' EmptyDataError / ParserError and decoding failures.
    print(f"Error loading data: {e}")
    # Re-raise rather than calling exit(): in a notebook kernel exit() does
    # not reliably stop the running cell (later statements would then fail
    # with a confusing NameError on `df`), and in a plain script it would
    # terminate with a success status. Propagating the original exception
    # stops execution right here and keeps the real traceback.
    raise
else:
    print(f"Data Loaded: {len(df)} rows")

# --- 2. Create Smart Sample & Features ---
# This part replicates the feature engineering steps from previous experiments
# to ensure consistency.

print("\n--- Building Behavioral Profiles (The 'Hard' Features) ---")
# Incoming money: TRANSFER / CASH_IN rows, aggregated by the receiving account.
df_received = df[df['type'].isin(['TRANSFER', 'CASH_IN'])]
# Outgoing cash: CASH_OUT rows, aggregated by the originating account.
df_cashed_out = df[df['type'] == 'CASH_OUT']

# Per-account aggregates, kept as Series indexed by account id. They are not
# converted to dicts: the profile table below is built by index alignment, so
# no per-account Python loop is needed.
total_received = df_received.groupby('nameDest')['amount'].sum()
unique_senders = df_received.groupby('nameDest')['nameOrig'].nunique()
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the three aggregates on account id. An account missing from one
# aggregate gets 0 there, exactly like a dict lookup with a default of 0.
profile_inputs = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('senders'),
    ],
    axis=1,
).fillna(0)

# dest_cash_out_ratio: How much of received money is cashed out? (Mules ~ 1.0)
# The 1e-6 term avoids division by zero for accounts that never received
# anything; the ratio is capped at 1.0.
cash_out_ratio = (
    profile_inputs['cashed_out'] / (profile_inputs['received'] + 1e-6)
).clip(upper=1.0)

# One row per account: id, cash-out ratio, number of distinct senders.
final_profiles = pd.DataFrame({
    'user_id': profile_inputs.index.to_numpy(),
    'dest_cash_out_ratio': cash_out_ratio.to_numpy(),
    'dest_unique_senders': profile_inputs['senders'].astype('int64').to_numpy(),
})
print("Behavioral profiles built.")

print("\n--- Creating Smart Sample ---")
# Collect every account id that appears on either side of a fraud-labelled row.
is_fraud_row = df['isFraud'].eq(1)
df_fraud = df.loc[is_fraud_row]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# Keep every transaction in which one of those accounts is sender or receiver.
sent_by_fraud_user = df['nameOrig'].isin(all_fraud_user_ids)
received_by_fraud_user = df['nameDest'].isin(all_fraud_user_ids)
df_fraud_lifecycle = df.loc[sent_by_fraud_user | received_by_fraud_user]

# Add a reproducible random sample of non-fraud rows (at most 500,000).
df_normal = df.loc[df['isFraud'].eq(0)]
normal_sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=normal_sample_size, random_state=42)

# Rows picked by both selections would appear twice after stacking; drop them.
stacked_rows = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_model_data = stacked_rows.drop_duplicates()
print(f"Smart Sample Created: {len(df_model_data)} rows")

# --- 3. Merge Features ---
print("\n--- Merging Features ---")
# Attach the behavioural profile twice: once for the receiving account
# (columns suffixed "_dest") and once for the sending account ("_orig").
# The suffixes are applied up front so each join adds uniquely named columns.
dest_profiles = final_profiles.add_suffix('_dest')
orig_profiles = final_profiles.add_suffix('_orig')
df_model_data = df_model_data.merge(
    dest_profiles, how='left', left_on='nameDest', right_on='user_id_dest'
)
df_model_data = df_model_data.merge(
    orig_profiles, how='left', left_on='nameOrig', right_on='user_id_orig'
)

# Accounts with no profile row come back as NaN from the left joins; use 0.
profile_feature_cols = [
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
]
df_model_data[profile_feature_cols] = df_model_data[profile_feature_cols].fillna(0)

# Structural Features (Arithmetic Diffs)
# Amount leaving the sender's balance and amount arriving on the receiver's.
df_model_data['balance_diff_orig'] = df_model_data['oldbalanceOrg'].sub(df_model_data['newbalanceOrig'])
df_model_data['balance_diff_dest'] = df_model_data['newbalanceDest'].sub(df_model_data['oldbalanceDest'])
# Integer code per transaction type (categories in sorted order).
df_model_data['type_encoded'] = pd.Categorical(df_model_data['type']).codes

# --- 5. Prepare Data for Hybrid Model ---
# Features that the Autoencoder will use to learn patterns
ae_features = [
    'amount', 'type_encoded', 'balance_diff_orig', 'balance_diff_dest',
    'dest_cash_out_ratio_dest', 'dest_unique_senders_dest'
]

# Rows without a label cannot be used for training or evaluation.
df_model_data = df_model_data.dropna(subset=['isFraud'])
X = df_model_data[ae_features]
y = df_model_data['isFraud']

# Split data BEFORE scaling. Fitting the scaler on the full matrix would let
# the test rows influence the mean/variance applied to the training rows
# (data leakage) and make the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Scale features (important for Neural Networks): statistics are learned from
# the training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix scaled with the train-fitted statistics; retained so
# any later code that refers to `X_scaled` keeps working.
X_scaled = scaler.transform(X)

# --- 6. Phase 1: Train Autoencoder (Feature Extractor) ---
print("\n--- Phase 1: Training Autoencoder ---")
# Pin the Python / NumPy / TensorFlow seeds so weight initialisation and batch
# shuffling are repeatable, in line with the random_state=42 used for the
# sampling, the train/test split and the Random Forest.
tf.keras.utils.set_random_seed(42)

# Train only on NORMAL transactions to learn the "normal" representation.
# The labels are converted to a plain array so the boolean mask is applied
# positionally to the (NumPy) feature matrix.
X_train_normal = X_train[np.asarray(y_train) == 0]

# Symmetric dense autoencoder: input -> 16 -> 8 (latent) -> 16 -> input.
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim, ))
encoder = Dense(16, activation="tanh")(input_layer)
encoder = Dense(8, activation="relu")(encoder) # Latent Space representation
decoder = Dense(16, activation='tanh')(encoder)
# Linear output because the targets are standardised, unbounded values.
decoder = Dense(input_dim, activation='linear')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train for a few epochs; input and target are the same matrix (reconstruction).
autoencoder.fit(X_train_normal, X_train_normal, epochs=5, batch_size=32, verbose=1)

# --- 7. Generate "Anomaly Score" Feature ---
print("\n--- Generating Reconstruction Error Features ---")


def _row_reconstruction_mse(original, reconstructed):
    """Return, for each row, the mean squared difference between the inputs."""
    return np.mean(np.power(original - reconstructed, 2), axis=1)


# Run both splits through the autoencoder, then score how badly each row was
# reconstructed.
train_preds = autoencoder.predict(X_train)
test_preds = autoencoder.predict(X_test)

train_mse = _row_reconstruction_mse(X_train, train_preds)
test_mse = _row_reconstruction_mse(X_test, test_preds)

# Append the reconstruction error as one extra, right-most feature column.
X_train_hybrid = np.concatenate([X_train, train_mse[:, np.newaxis]], axis=1)
X_test_hybrid = np.concatenate([X_test, test_mse[:, np.newaxis]], axis=1)

print(f"New Feature Set Shape: {X_train_hybrid.shape} (Added MSE column)")

# --- 8. Phase 2: Train Random Forest on Hybrid Features ---
print("\n--- Phase 2: Training Random Forest on Hybrid Features ---")
# The forest is fitted on the scaled features plus the autoencoder's
# reconstruction-error column; class weights are balanced against label counts.
rf_settings = {
    'n_estimators': 100,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train_hybrid, y_train)

# --- 9. Evaluate ---
# Score the held-out split with the fitted forest.
y_pred = rf_model.predict(X_test_hybrid)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report layout: heavy rule, title, heavy rule, metrics, light rule, matrix.
heavy_rule = "======================================================="
light_rule = "-------------------------------------------------------"

print("\n" + heavy_rule)
print("      RESULTS: Hybrid Model (AE + RF)")
print(heavy_rule)
for label, value in (
    ("Precision: ", precision),
    ("Recall:    ", recall),
    ("F1-Score:  ", f1),
):
    print(f"{label}{value:.2%}")
print(light_rule)
print("Confusion Matrix:")
print(cm)
print(heavy_rule)

--- Hybrid Model Execution Started (Autoencoder + RF) ---
Data Loaded: 6362620 rows

--- Building Behavioral Profiles (The 'Hard' Features) ---
Behavioral profiles built.

--- Creating Smart Sample ---
Smart Sample Created: 561154 rows

--- Merging Features ---

--- Phase 1: Training Autoencoder ---
Epoch 1/5
[1m12096/12096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - loss: 0.2995
Epoch 2/5
[1m12096/12096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - loss: 0.1605
Epoch 3/5
[1m12096/12096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - loss: 0.0964
Epoch 4/5
[1m12096/12096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - loss: 0.0785
Epoch 5/5
[1m12096/12096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step - loss: 0.0398

--- Generating Reconstruction Error Features ---
[1m12276/12276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1ms/step
[1m5261/5261[0m [32m━━━━━━━━━━━━━━━━━━━━