<a href="https://colab.research.google.com/github/AsmaaYassinDev/Behavioural-Anomaly-Detection-for-ATO-Fraud/blob/main/Supervised_Fraud_Model_(RandomForest)_Improved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read
# through the local filesystem (Colab-only; requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import warnings
# Silence every warning for the rest of the run to keep the cell output short.
warnings.filterwarnings("ignore")

print("--- Improved Random Forest Execution Started ---")

# --- Load the transaction log from the mounted Drive ---
# This reads the whole CSV; sampling happens later in the script.
file_path = '/content/drive/My Drive/Colab_Data/PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(file_path)
row_count = len(df)
print(f"Data Loaded: {row_count} rows")

# --- Build behavioral features ---
# Produces `final_profiles`, one row per account that either received money
# (TRANSFER / CASH_IN, keyed by nameDest) or originated a CASH_OUT (keyed by
# nameOrig), with columns:
#   user_id              - the account id
#   dest_cash_out_ratio  - total cashed out / total received, capped at 1.0
#   dest_unique_senders  - number of distinct nameOrig values that sent it money
# The three aggregates are aligned on the account id by pandas instead of
# being looked up one id at a time in a Python loop, which is slow when there
# are millions of accounts and produced a column-less frame when there were none.
print("\n--- Building Behavioral Profiles ---")
df_received = df[df['type'].isin(['TRANSFER', 'CASH_IN'])]
received_by_dest = df_received.groupby('nameDest')
total_received = received_by_dest['amount'].sum()
unique_senders = received_by_dest['nameOrig'].nunique()
df_cashed_out = df[df['type'] == 'CASH_OUT']
total_cashed_out = df_cashed_out.groupby('nameOrig')['amount'].sum()

# Outer-align the per-account series on the union of their ids. An account
# that is absent from one series has no activity of that kind, hence the 0.
profile_stats = pd.concat(
    [
        total_received.rename('received'),
        total_cashed_out.rename('cashed_out'),
        unique_senders.rename('senders'),
    ],
    axis=1,
).fillna(0)

# The 1e-6 avoids a division by zero for accounts that never received
# anything; the result is then capped at 1.0.
cash_out_ratio = (
    profile_stats['cashed_out'] / (profile_stats['received'] + 1e-6)
).clip(upper=1.0)

final_profiles = pd.DataFrame({
    'user_id': profile_stats.index,
    'dest_cash_out_ratio': cash_out_ratio.to_numpy(),
    # fillna turned the counts into floats; restore the integer dtype.
    'dest_unique_senders': profile_stats['senders'].astype('int64').to_numpy(),
})

# --- Create Smart Sample ---
# The sample is every transaction that touches an account seen in a fraudulent
# transaction, stacked with up to 500,000 randomly drawn non-fraud transactions.
print("\n--- Creating Smart Sample ---")
df_fraud = df[df['isFraud'] == 1]

# Any account appearing on either side of a fraudulent transaction.
all_fraud_user_ids = np.union1d(
    df_fraud['nameDest'].unique(),
    df_fraud['nameOrig'].unique(),
)

orig_is_fraud_user = df['nameOrig'].isin(all_fraud_user_ids)
dest_is_fraud_user = df['nameDest'].isin(all_fraud_user_ids)
df_fraud_lifecycle = df[orig_is_fraud_user | dest_is_fraud_user]

df_normal = df[df['isFraud'] == 0]
normal_sample_size = min(500000, len(df_normal))
df_normal_sample = df_normal.sample(n=normal_sample_size, random_state=42)

# The two parts can overlap, so identical rows are dropped after stacking.
sample_parts = [df_fraud_lifecycle, df_normal_sample]
df_smart_sample = pd.concat(sample_parts).drop_duplicates()
print(f"Smart Sample Created: {len(df_smart_sample)} rows")

# --- Merge Features ---
print("\n--- Merging Features with Transactions ---")
# Attach the profile of the receiving account first, then the profile of the
# sending account. The second merge collides with the profile columns added by
# the first, so the suffixes tag the first set "_dest" and the second "_orig".
df_model_data = df_smart_sample.merge(
    final_profiles,
    how='left',
    left_on='nameDest',
    right_on='user_id',
)
df_model_data = df_model_data.merge(
    final_profiles,
    how='left',
    left_on='nameOrig',
    right_on='user_id',
    suffixes=('_dest', '_orig'),
)

# Fill missing values: an account without a profile gets 0 for each profile feature.
profile_columns = [
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
]
df_model_data[profile_columns] = df_model_data[profile_columns].fillna(0)

# Feature engineering
# Balance movement on each side of the transaction: amount that left the
# originating account and amount that arrived in the destination account.
df_model_data['balance_diff_orig'] = df_model_data['oldbalanceOrg'] - df_model_data['newbalanceOrig']
df_model_data['balance_diff_dest'] = df_model_data['newbalanceDest'] - df_model_data['oldbalanceDest']
# Integer code per transaction type; the codes depend on which types occur in the sample.
df_model_data['type_encoded'] = df_model_data['type'].astype('category').cat.codes

# Time-based features
# NOTE(review): this treats `step` as an hour counter whose value 0 falls on
# hour 0 of weekday 0 - confirm against the dataset documentation.
# The flags are computed with vectorized comparisons rather than a Python
# lambda per row; values and dtype (int64) are the same.
hour = df_model_data['step'] % 24
day_of_week = (df_model_data['step'] // 24) % 7
df_model_data['hour'] = hour
df_model_data['is_night'] = ((hour < 6) | (hour > 22)).astype('int64')
df_model_data['is_working_hour'] = hour.between(8, 17).astype('int64')  # bounds inclusive
df_model_data['is_weekend'] = day_of_week.isin([5, 6]).astype('int64')

# --- Prepare Final Dataset ---
# Model inputs grouped by origin; the concatenation order is the column order of X.
transaction_features = [
    'amount',
    'type_encoded',
    'balance_diff_orig',
    'balance_diff_dest',
]
profile_features = [
    'dest_cash_out_ratio_dest',
    'dest_unique_senders_dest',
    'dest_cash_out_ratio_orig',
    'dest_unique_senders_orig',
]
time_features = [
    'hour',
    'is_night',
    'is_working_hour',
    'is_weekend',
]
selected_features = transaction_features + profile_features + time_features

# Keep only rows that carry a label.
has_label = df_model_data['isFraud'].notna()
df_model_data = df_model_data[has_label]
X = df_model_data[selected_features]
y = df_model_data['isFraud']

# Split data
# The split is done BEFORE scaling so the held-out rows never contribute to
# the scaler's mean/variance (fitting on the full matrix leaks test-set
# statistics into preprocessing). The row partition is the same as splitting
# the scaled matrix: it depends only on the row count, y and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Scale features
# Fit on the training partition only, then apply the same transform to the test partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix under the train-fitted scaler, kept so the name `X_scaled`
# still exists for any later code; it is not used for training here.
X_scaled = scaler.transform(X)
print(f"Training size: {len(X_train)}, Test size: {len(X_test)}")

# --- Train Random Forest ---
print("\n--- Training Random Forest Classifier ---")
# Fixed seed for repeatable runs; class weighting is set to 'balanced'.
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predict & Evaluate on the held-out partition
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Assemble the report first, then emit it in one go.
report_lines = [
    "\n--- Random Forest Results After Feature Selection ---",
    f"Precision: {precision:.2%}",
    f"Recall: {recall:.2%}",
    f"F1-Score: {f1:.2%}",
    "Confusion Matrix:",
]
print("\n".join(report_lines))
print(cm)

# Feature Importance
# Label each importance score with its feature name, then rank largest first.
importance = pd.Series(data=model.feature_importances_, index=selected_features)
importance = importance.sort_values(ascending=False)
print("\n--- Feature Importance ---")
print(importance)

--- Improved Random Forest Execution Started ---
Data Loaded: 6362620 rows

--- Building Behavioral Profiles ---

--- Creating Smart Sample ---
Smart Sample Created: 561154 rows

--- Merging Features with Transactions ---
Training size: 392807, Test size: 168347

--- Training Random Forest Classifier ---

--- Random Forest Results After Feature Selection ---
Precision: 92.30%
Recall: 81.21%
F1-Score: 86.40%
Confusion Matrix:
[[165716    167]
 [   463   2001]]

--- Feature Importance ---
balance_diff_orig    0.502474
balance_diff_dest    0.138697
type_encoded         0.130670
amount               0.129783
hour                 0.074846
is_night             0.018416
is_working_hour      0.003359
is_weekend           0.001755
dtype: float64
