<a href="https://colab.research.google.com/github/AsmaaYassinDev/Behavioural-Anomaly-Detection-for-ATO-Fraud/blob/main/Autoencoder_Experiment_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset stored under "My Drive" can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

print("--- Autoencoder Execution Started (Unsupervised) ---")

# --- 1. Load Data ---
# Transaction log CSV stored on the mounted Google Drive.
file_path = '/content/drive/My Drive/Colab_Data/PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except Exception as e:
    # Report the problem, then re-raise so execution really stops here.
    # The bare exit() helper is intended for interactive sessions and is not
    # a dependable way to abort a notebook cell; continuing past this point
    # without `df` would only surface later as a confusing NameError.
    print(f"Error: {e}")
    raise
else:
    print(f"Data Loaded: {len(df)} rows")

# --- 2. Build Behavioral Profiles (Engineered Features) ---
print("\n--- Building Behavioral Profiles ---")

# Incoming money: TRANSFER and CASH_IN rows, aggregated per receiving account.
df_received = df[df['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum().to_dict()
unique_senders = received_by_dest['nameOrig'].nunique().to_dict()

# Outgoing cash: CASH_OUT rows, aggregated per originating account.
df_cashed_out = df[df['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum().to_dict()

# Every account that appears in at least one of the aggregates above.
all_user_ids = set(total_received) | set(total_cashed_out) | set(unique_senders)

# One profile row per account:
#   dest_cash_out_ratio  - cashed-out amount / received amount, capped at 1.0
#                          (the 1e-6 term avoids division by zero)
#   dest_unique_senders  - number of distinct accounts that sent money in
profiles_list = [
    {
        'user_id': account,
        'dest_cash_out_ratio': min(
            total_cashed_out.get(account, 0) / (total_received.get(account, 0) + 1e-6),
            1.0,
        ),
        'dest_unique_senders': unique_senders.get(account, 0),
    }
    for account in all_user_ids
]
final_profiles = pd.DataFrame(profiles_list)

# --- 3. Create Smart Sample ---
print("\n--- Creating Smart Sample ---")

# Accounts that appear on either side of any transaction flagged as fraud.
is_fraud_row = df['isFraud'] == 1
df_fraud = df.loc[is_fraud_row]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# Full transaction history ("lifecycle") of those accounts, fraud or not.
touches_fraud_account = (
    df['nameOrig'].isin(all_fraud_user_ids)
    | df['nameDest'].isin(all_fraud_user_ids)
)
df_fraud_lifecycle = df.loc[touches_fraud_account]

# Random sample of at most 500,000 non-fraud transactions (fixed seed).
df_normal = df.loc[df['isFraud'] == 0]
normal_sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=normal_sample_size, random_state=42)

# Combine both parts; rows selected by both are kept only once.
combined = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = combined.drop_duplicates()
print(f"Smart Sample Created: {len(df_smart_sample)} rows")

# --- 4. Merge Features ---
print("\n--- Merging Features with Transactions ---")

# Attach the profile of the receiving account, then of the sending account.
# The second merge collides with the profile columns added by the first, so
# pandas applies the suffixes: '_dest' to the existing (receiver) columns and
# '_orig' to the newly joined (sender) columns.
df_model_data = df_smart_sample.merge(
    final_profiles, how='left', left_on='nameDest', right_on='user_id'
)
df_model_data = df_model_data.merge(
    final_profiles,
    how='left',
    left_on='nameOrig',
    right_on='user_id',
    suffixes=('_dest', '_orig'),
)

# Accounts without a profile get neutral zero features.
profile_columns = [
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
]
df_model_data[profile_columns] = df_model_data[profile_columns].fillna(0)

# Hybrid Engineering: per-transaction balance movement on each side, plus an
# integer code for the transaction type.
df_model_data['balance_diff_orig'] = df_model_data['oldbalanceOrg'].sub(df_model_data['newbalanceOrig'])
df_model_data['balance_diff_dest'] = df_model_data['newbalanceDest'].sub(df_model_data['oldbalanceDest'])
df_model_data['type_encoded'] = df_model_data['type'].astype('category').cat.codes

# Select Features (Hybrid Set)
selected_features = [
    'amount',
    'type_encoded',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
    'balance_diff_orig',
    'balance_diff_dest',
]

# Rows without a label cannot be used for the stratified split or evaluation.
df_model_data = df_model_data.dropna(subset=['isFraud'])
y = df_model_data['isFraud']
X = df_model_data[selected_features]

# Split Data (Standard Stratified Split)
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage). The split itself depends only on y and random_state, so the
# same rows land in train/test as when splitting an already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Scale Features (Critical for Autoencoders)
# Learn the scaling statistics from the training fold only, then apply the
# same transformation to the test fold.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so any later code that reads X_scaled still works;
# it now uses the training-fold statistics.
X_scaled = scaler.transform(X)

# --- 5. Train Autoencoder ---
print("\n--- Training Autoencoder ---")

# The autoencoder is fitted on non-fraud rows (label 0) only, so that it
# learns to reconstruct "normal" behaviour and reconstructs fraud poorly.
is_normal = (y_train == 0)
X_train_normal = X_train[is_normal]

# Architecture: input -> 16 (tanh) -> 8 (relu) -> 16 (tanh) -> input (linear)
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))

# Encoder: compress down to the 8-unit bottleneck.
enc_hidden = Dense(16, activation="tanh")(input_layer)
encoder = Dense(8, activation="relu")(enc_hidden)

# Decoder: expand back; the linear output layer has one unit per input feature.
dec_hidden = Dense(16, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='linear')(dec_hidden)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(loss='mean_squared_error', optimizer='adam')

# Input and target are the same matrix (reconstruction objective).
# NOTE: validation_data is the held-out test matrix, which still contains
# fraud rows, so val_loss is only a monitoring signal here.
history = autoencoder.fit(
    X_train_normal,
    X_train_normal,
    validation_data=(X_test, X_test),
    batch_size=32,
    epochs=10,
    shuffle=True,
    verbose=1,
)

# --- 6. Detect Anomalies (Reconstruction Error) ---
print("\n--- Predicting Anomalies ---")

# Reconstruct the test set and measure, per transaction, the mean squared
# difference between the input features and their reconstruction.
predictions = autoencoder.predict(X_test)
squared_residuals = (X_test - predictions) ** 2
mse = squared_residuals.mean(axis=1)

# Threshold: flag the top fraction of errors, where that fraction equals the
# fraud rate observed in the training labels.
fraud_rate = y_train.mean()
threshold = np.quantile(mse, 1 - fraud_rate)

print(f"Reconstruction Error Threshold: {threshold:.4f}")

# Error strictly above the threshold -> Fraud (1), otherwise Normal (0).
y_pred = (mse > threshold).astype(int).tolist()

# --- 7. Evaluate Results ---
# Compare the thresholded predictions against the true test labels.
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Assemble the whole report first, then emit it with a single print call.
report_lines = [
    "\n=======================================================",
    "      RESULTS: Autoencoder (Unsupervised)",
    "=======================================================",
    f"Precision: {precision:.2%}",
    f"Recall:    {recall:.2%}",
    f"F1-Score:  {f1:.2%}",
    "-------------------------------------------------------",
    "Confusion Matrix:",
    str(cm),
    "=======================================================",
    "\n--- Execution Complete ---",
]
print("\n".join(report_lines))

--- Autoencoder Execution Started (Unsupervised) ---
Data Loaded: 6362620 rows

--- Building Behavioral Profiles ---

--- Creating Smart Sample ---
Smart Sample Created: 561154 rows

--- Merging Features with Transactions ---

--- Training Autoencoder ---
Epoch 1/10
12096/12096 ━━━━━━━━━━━━━━━━━━━━ 35s 3ms/step - loss: 0.3045 - val_loss: 0.4598
Epoch 2/10
12096/12096 ━━━━━━━━━━━━━━━━━━━━ 35s 3ms/step - loss: 0.1680 - val_loss: 0.3838
Epoch 3/10
12096/12096 ━━━━━━━━━━━━━━━━━━━━ 33s 3ms/step - loss: 0.1042 - val_loss: 0.3243
Epoch 4/10
12096/12096 ━━━━━━━━━━━━━━━━━━━━ 32s 3ms/step - loss: 0.1078 - val_loss: 0.3132
Epoch 5/10
12096/12096 ━━━━━━━━━━━━━━━━━━━━ 32s 3ms/step - loss: 0.0841 - val_loss: 0.2978
Epoch 6/10
[1m12096/12096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - loss: 0.0675 - val_loss: 0