<a href="https://colab.research.google.com/github/AsmaaYassinDev/Behavioural-Anomaly-Detection-for-ATO-Fraud/blob/main/unsupervised_baseline_model_IF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, confusion_matrix
import warnings

# Silence UserWarning-category warnings so they do not clutter the run log.
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Improved Isolation Forest) ---")
print("Objective: Identify Anomalous Accounts based on their behavior.")

# --- Load Data ---
# The transaction log is read from the mounted Google Drive folder.
file_path = '/content/drive/My Drive/Colab_Data/PS_20174392719_1491204439457_log.csv'
try:
    df = pd.read_csv(file_path)
except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    # Covers a missing/unreadable file (OSError) and malformed or empty CSV
    # content. Anything else is unexpected and propagates with its traceback.
    print(f"Error during data loading: {e}")
    # Stop the run with a non-zero status. The interactive `exit()` helper
    # would report success, and every later step needs `df` to exist.
    raise SystemExit(1) from e
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
# The sample is the union of:
#   (a) every transaction whose sender or receiver appears in at least one
#       transaction labelled isFraud == 1, and
#   (b) a reproducible random sample of up to 500,000 isFraud == 0 rows.
print("\n--- Step 1: Creating the 'Smart Sample' ---")
df_fraud = df[df['isFraud'] == 1]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)
df_fraud_lifecycle = df[
    df['nameOrig'].isin(all_fraud_user_ids) |
    df['nameDest'].isin(all_fraud_user_ids)
]

df_normal = df[df['isFraud'] == 0]
# Cap the sample size so .sample() cannot ask for more rows than exist.
sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# Both pieces are row subsets of `df`, so a transaction picked by both carries
# the same index label twice. De-duplicate on that label rather than on the
# column values: this removes exactly the doubly-selected rows and never drops
# two distinct transactions that merely look identical. Order is preserved
# (fraud lifecycle rows first, then the normal sample).
df_smart_sample = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = df_smart_sample[~df_smart_sample.index.duplicated(keep='first')]
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Build Behavioral Profiles ---
# One profile row per account seen either as a receiver of TRANSFER/CASH_IN
# transactions or as the originator of CASH_OUT transactions:
#   dest_cash_out_ratio  - total cashed out / total received, capped at 1.0
#   dest_unique_senders  - number of distinct accounts that sent it money
print("\n--- Step 2: Building Behavioral Profiles ---")
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']

received_by_dest = df_received.groupby('nameDest')['amount'].sum()
senders_by_dest = df_received.groupby('nameDest')['nameOrig'].nunique()
cashed_out_by_orig = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Align the three aggregates on the union of account ids; an account missing
# from an aggregate contributes 0 for it.
all_user_ids = received_by_dest.index.union(cashed_out_by_orig.index)
received = received_by_dest.reindex(all_user_ids, fill_value=0)
cashed_out = cashed_out_by_orig.reindex(all_user_ids, fill_value=0)
senders = senders_by_dest.reindex(all_user_ids, fill_value=0)

# The small epsilon avoids division by zero for accounts that received
# nothing; such accounts end up at the 1.0 cap if they cashed out at all.
ratio = (cashed_out / (received + 1e-6)).clip(upper=1.0)

# Built column-wise so the frame always has its three columns, even when no
# account qualifies (the later merge on 'user_id' needs the column to exist).
final_profiles = pd.DataFrame({
    'user_id': all_user_ids,
    'dest_cash_out_ratio': ratio.to_numpy(),
    'dest_unique_senders': senders.to_numpy(),
})
print("Behavioral profiles created successfully.")

# --- Step 3: Merge Features ---
print("\n--- Step 3: Merging Features with Transactions ---")
# Attach the receiver's profile first, then the sender's. On the second merge
# the profile columns collide, so pandas tags the receiver copy with '_dest'
# and the sender copy with '_orig'.
df_model_data = df_smart_sample.merge(
    final_profiles,
    how='left',
    left_on='nameDest',
    right_on='user_id',
)
df_model_data = df_model_data.merge(
    final_profiles,
    how='left',
    left_on='nameOrig',
    right_on='user_id',
    suffixes=('_dest', '_orig'),
)

# Accounts without a profile come out of the left joins as NaN; treat as 0.
profile_columns = [
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
]
df_model_data[profile_columns] = df_model_data[profile_columns].fillna(0)

# Engineered features: balance movement on each side of the transaction.
df_model_data['balance_diff_orig'] = df_model_data['oldbalanceOrg'].sub(
    df_model_data['newbalanceOrig']
)
df_model_data['balance_diff_dest'] = df_model_data['newbalanceDest'].sub(
    df_model_data['oldbalanceDest']
)

# Integer code per transaction type (codes follow the sorted category order).
df_model_data['type_encoded'] = pd.Categorical(df_model_data['type']).codes

# --- Step 4: Isolation Forest ---
# Fit an Isolation Forest on the standardized feature matrix and flag each
# transaction as anomalous (1) or normal (0) in 'anomaly_prediction'.
print("\n--- Step 4: Training Isolation Forest Model ---")
features = [
    'amount',
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
    'balance_diff_orig',
    'balance_diff_dest',
    'type_encoded',
]
X = df_model_data[features]
y_true = df_model_data['isFraud']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE: the expected outlier share is taken from the isFraud labels, so the
# model's decision threshold is informed by ground truth.
fraud_rate = y_true.mean()
print(f"Fraud (Contamination) rate in the Smart Sample: {fraud_rate:.2%}")

# IsolationForest accepts a float contamination only in (0, 0.5]. A sample
# with no fraud rows (rate 0) or mostly fraud rows would be rejected, so fall
# back to the estimator's 'auto' threshold in that case.
contamination = float(fraud_rate) if 0 < fraud_rate <= 0.5 else 'auto'

model = IsolationForest(
    n_estimators=300,
    max_samples=0.7,
    contamination=contamination,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_scaled)
predictions = model.predict(X_scaled)
# predict() returns -1 for outliers and 1 for inliers; map to 1/0 flags.
df_model_data['anomaly_prediction'] = (predictions == -1).astype('int64')
print("Model training complete.")

# --- Step 5: Results ---
print("\n--- Step 5: Identifying Anomalous Accounts ---")
# Every account that appears, as receiver or sender, on a flagged transaction.
flagged_rows = df_model_data[df_model_data['anomaly_prediction'] == 1]
flagged_receivers = flagged_rows['nameDest'].unique()
flagged_senders = flagged_rows['nameOrig'].unique()
anomalous_accounts = np.union1d(flagged_receivers, flagged_senders)
print(f"\n[Final Result]: The model found {len(anomalous_accounts)} 'anomalous' accounts.")
print("Sample of accounts the model considered 'anomalous':")
print(anomalous_accounts[:20])

# Ground truth for comparison: accounts involved in isFraud == 1 transactions.
print("\n--- For Comparison (The Truth) ---")
print(f"The number of 'real' fraudulent accounts (isFraud=1) was: {len(all_fraud_user_ids)}")
print("Sample of 'real' fraudulent accounts:")
print(all_fraud_user_ids[:20])

# Transaction-level agreement between the anomaly flags and the labels.
y_pred = df_model_data['anomaly_prediction']
f1 = f1_score(y_true, y_pred)
print(f"\nThe model's F1-Score (for confirmation): {f1:.2%}")

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(cm)

print("\n--- Improved Isolation Forest Execution Complete ---")


--- Execution Started (Improved Isolation Forest) ---
Objective: Identify Anomalous Accounts based on their behavior.
Successfully loaded the full file (6362620 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 561154 rows.

--- Step 2: Building Behavioral Profiles ---
Behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Training Isolation Forest Model ---
Fraud (Contamination) rate in the Smart Sample: 1.46%
Model training complete.

--- Step 5: Identifying Anomalous Accounts ---

[Final Result]: The model found 12877 'anomalous' accounts.
Sample of accounts the model considered 'anomalous':
['C1000331499' 'C1000407130' 'C1000484178' 'C1000628778' 'C1000868784'
 'C1000937208' 'C1001249070' 'C1001476563' 'C100148341' 'C1001742979'
 'C1001844551' 'C1001875185' 'C1002031672' 'C1002055980' 'C1002183297'
 'C1002446735' 'C1002698644' 'C1003015322' 'C1003280328' 'C1003377863']

--- For Compariso