<a href="https://colab.research.google.com/github/AsmaaYassinDev/Behavioural-Anomaly-Detection-for-ATO-Fraud/blob/main/Supervised_Fraud_Model_(RandomForest).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# --- (Change: We will use a Supervised model) ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
# --- (Change: We need to split the data) ---
from sklearn.model_selection import train_test_split
import warnings

# Ignore unimportant warnings
warnings.filterwarnings('ignore', category=UserWarning)

print("--- Execution Started (Successful Model: Supervised) ---")
print("Objective: Prove that 'Behavioral Features' succeed with a Supervised model.")

# --- Load Data ---
# Relative path: the CSV is resolved against the current working directory.
file_path = 'PS_20174392719_1491204439457_log24.csv'
try:
    df = pd.read_csv(file_path)
except (OSError, ValueError) as e:
    # OSError covers a missing or unreadable file; pandas' ParserError and
    # EmptyDataError, as well as decoding errors, derive from ValueError.
    print(f"Error during data loading: {e}")
    # Re-raise rather than calling exit(): exit() is a helper meant for the
    # interactive shell, and in a notebook it tears down the kernel while
    # reporting success. Re-raising stops this cell with a normal traceback
    # and guarantees nothing below runs without `df`.
    raise
else:
    print(f"Successfully loaded the full file ({len(df)} rows).")

# --- Step 1: Create the 'Smart Sample' ---
# The sample is the union of two parts:
#   1. every transaction whose sender or recipient also appears in at least
#      one fraud-labelled transaction, and
#   2. a reproducible random draw of up to 500,000 non-fraud transactions.
print("\n--- Step 1: Creating the 'Smart Sample' ---")

# Rows labelled as fraud, and the accounts on either side of them.
is_fraud_row = df['isFraud'] == 1
df_fraud = df[is_fraud_row]
fraud_dest_ids = df_fraud['nameDest'].unique()
fraud_orig_ids = df_fraud['nameOrig'].unique()
all_fraud_user_ids = np.union1d(fraud_dest_ids, fraud_orig_ids)

# Full transaction history of those accounts (as sender or as recipient).
sent_by_fraud_account = df['nameOrig'].isin(all_fraud_user_ids)
received_by_fraud_account = df['nameDest'].isin(all_fraud_user_ids)
df_fraud_lifecycle = df[sent_by_fraud_account | received_by_fraud_account]

# Random draw of non-fraud rows, capped at the number actually available.
df_normal = df[df['isFraud'] == 0]
sample_size = min(len(df_normal), 500000)
df_normal_sample = df_normal.sample(n=sample_size, random_state=42)

# A non-fraud row can be in both parts; keep only its first occurrence.
combined_rows = pd.concat([df_fraud_lifecycle, df_normal_sample])
df_smart_sample = combined_rows.drop_duplicates(keep='first')
print(f"The final 'Smart Sample' was created with {len(df_smart_sample)} rows.")

# --- Step 2: Building Behavioral Profiles (Strong Features) ---
# Builds one row per account with two features:
#   dest_cash_out_ratio  - total CASH_OUT amount sent by the account divided by
#                          the total TRANSFER/CASH_IN amount it received,
#                          capped at 1.0
#   dest_unique_senders  - number of distinct senders of those received
#                          TRANSFER/CASH_IN transactions
# Computed with vectorised pandas operations instead of a per-account Python
# loop; the values are the same, and `final_profiles` always has its three
# columns even when the sample contains no matching transactions.
print("\n--- Step 2: Building Behavioral Profiles ---")

# Money received, grouped by the receiving account.
df_received = df_smart_sample[df_smart_sample['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()

# Money cashed out, grouped by the account that initiated the cash-out.
df_cashed_out = df_smart_sample[df_smart_sample['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Every account that either received money or cashed out. `unique_senders`
# shares its index with `total_received`, so it adds no further ids.
all_user_ids = total_received.index.union(total_cashed_out.index)

# Align the three aggregates on the same account index; an account missing
# from an aggregate counts as 0 there.
received = total_received.reindex(all_user_ids, fill_value=0)
cashed_out = total_cashed_out.reindex(all_user_ids, fill_value=0)
senders = unique_senders.reindex(all_user_ids, fill_value=0)

# The 1e-6 term avoids division by zero for accounts that received nothing;
# the resulting huge ratio is then capped at 1.0.
cash_out_ratio = (cashed_out / (received + 1e-6)).clip(upper=1.0)

final_profiles = pd.DataFrame({
    'user_id': all_user_ids,
    'dest_cash_out_ratio': cash_out_ratio.to_numpy(),
    'dest_unique_senders': senders.to_numpy(),
})
print("Behavioral profiles created successfully.")

# --- Step 3: Merge Features with Transactions ---
# Each transaction gets the behavioral profile of its recipient and of its
# sender. Both joins are left joins, so every transaction row is kept.
print("\n--- Step 3: Merging Features with Transactions ---")

# First join: recipient profile (columns are still un-suffixed here).
df_model_data = df_smart_sample.merge(
    final_profiles,
    how='left',
    left_on='nameDest',
    right_on='user_id',
)
# Second join: sender profile. The profile columns now clash, so the ones
# from the first join get '_dest' and the new ones get '_orig'.
df_model_data = df_model_data.merge(
    final_profiles,
    how='left',
    left_on='nameOrig',
    right_on='user_id',
    suffixes=('_dest', '_orig'),
)

# Accounts without a profile produce NaN after the left joins; treat as 0.
for profile_col in (
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
):
    df_model_data[profile_col] = df_model_data[profile_col].fillna(0)

# --- Step 4: Prepare Train/Test Data (Supervised) ---
# Builds the feature matrix and label vector, splits them 70/30 with
# stratification on the label, then standardises the features.
print("\n--- Step 4: Preparing Train/Test Data ---")

# Transaction type as integer category codes.
df_model_data['type_encoded'] = df_model_data['type'].astype('category').cat.codes

# NOTE: the order of this list defines the column order of X and is reused
# later as the index for the feature-importance report.
features = [
    'amount',
    'dest_cash_out_ratio_dest', # Recipient's cash-out ratio
    'dest_unique_senders_dest', # Recipient's unique senders
    'dest_cash_out_ratio_orig', # Sender's cash-out ratio
    'dest_unique_senders_orig', # Sender's unique senders
    'type_encoded',
]

X = df_model_data[features]
y_true = df_model_data['isFraud'] # The "Correct Answer"

# (Change: We split the data 70% for training and 30% for testing)
# Split BEFORE scaling. The row assignment depends only on the labels, the
# row count and random_state, so it is the same split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_true, test_size=0.3, random_state=42, stratify=y_true
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Fitting on the full data would leak test-set mean/variance into
# training and make the "unseen data" evaluation optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
print(f"Data split into {len(X_train)} rows for training and {len(X_test)} rows for testing.")

# --- Step 5: Train Supervised RandomForest Model ---
print("\n--- Step 5: Training RandomForest Model (Supervised) ---")

# class_weight='balanced' re-weights the classes to compensate for the
# imbalance between fraud and non-fraud rows; random_state fixes the seed.
# fit() returns the fitted estimator, so construction and training chain.
model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out split (X_test), which was not used for fitting.
predictions = model.predict(X_test)
print("Model training complete.")

# --- Step 6: Evaluate (The Successful Result) ---
# Scores the held-out predictions and reports which features the forest
# relied on most.
print("\n--- Step 6: Evaluating the Model (New Result) ---")

# All four metrics compare the same (y_test, predictions) pair.
cm = confusion_matrix(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print("!!! Results on 'Test Data' (data the model has never seen before) !!!")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score (New Final Score): {f1:.2%}")
print("\nConfusion Matrix:")
print(cm)

# --- (The most important step: Why did the model succeed?) ---
print("\n--- Feature Importance (Why did the model succeed?) ---")
# Label each importance with its feature name, then list highest first.
feature_imp = pd.Series(model.feature_importances_, index=features)
feature_imp = feature_imp.sort_values(ascending=False)
print(feature_imp)

print("\n--- Successful Code Execution Complete ---")

--- Execution Started (Successful Model: Supervised) ---
Objective: Prove that 'Behavioral Features' succeed with a Supervised model.
Successfully loaded the full file (1048575 rows).

--- Step 1: Creating the 'Smart Sample' ---
The final 'Smart Sample' was created with 504964 rows.

--- Step 2: Building Behavioral Profiles ---
Behavioral profiles created successfully.

--- Step 3: Merging Features with Transactions ---

--- Step 4: Preparing Train/Test Data ---
Data split into 353474 rows for training and 151490 rows for testing.

--- Step 5: Training RandomForest Model (Supervised) ---
Model training complete.

--- Step 6: Evaluating the Model (New Result) ---
!!! Results on 'Test Data' (data the model has never seen before) !!!
Precision: 27.03%
Recall: 20.41%
F1-Score (New Final Score): 23.26%

Confusion Matrix:
[[150958    189]
 [   273     70]]

--- Feature Importance (Why did the model succeed?) ---
amount                      0.539584
type_encoded                0.234897
dest_u

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): this cell presumably only works inside a Google Colab
# runtime (the `google.colab` package) and prompts for authorisation —
# confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')