In [9]:
import os
os.chdir('c:/Users/austi/Project/Advanced-Financial-Fraud-detection/notebooks')

# 📦 Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# 🧾 1. Load cleaned dataset
df = pd.read_csv('../data/paysim_cleaned.csv')  

# 🎯 2. Encode categorical & drop non-numeric columns
df['type'] = df['type'].astype('category').cat.codes  # encode 'type' if needed
X = df.drop(['isFraud', 'nameOrig', 'nameDest'], axis=1)  # drop string/ID cols
y = df['isFraud']


# 📊 3. Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 🧪 4. Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# 🔄 5. Simulate "new" data arrival from test set
X_test_part1, X_test_new, y_test_part1, y_test_new = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# 🧠 6. Train baseline model
baseline_model = RandomForestClassifier(n_estimators=100, random_state=42)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test_part1)

print("📉 Baseline Performance (before adaptation):")
print(classification_report(y_test_part1, y_pred_baseline, digits=4))
print("ROC AUC:", roc_auc_score(y_test_part1, y_pred_baseline))

# 🔁 7. Adaptive learning - update model with new data
X_adapted = pd.concat([pd.DataFrame(X_train), pd.DataFrame(X_test_new)], axis=0)
y_adapted = pd.concat([y_train, y_test_new], axis=0)

adapted_model = RandomForestClassifier(n_estimators=100, random_state=42)
adapted_model.fit(X_adapted, y_adapted)
y_pred_adapted = adapted_model.predict(X_test_part1)

print("\n📈 Adapted Model Performance (after learning from new data):")
print(classification_report(y_test_part1, y_pred_adapted, digits=4))
print("ROC AUC:", roc_auc_score(y_test_part1, y_pred_adapted))


📉 Baseline Performance (before adaptation):
              precision    recall  f1-score   support

           0     0.9998    1.0000    0.9999    157101
           1     0.9868    0.8054    0.8869       185

    accuracy                         0.9998    157286
   macro avg     0.9933    0.9027    0.9434    157286
weighted avg     0.9998    0.9998    0.9997    157286

ROC AUC: 0.9026963373708462

📈 Adapted Model Performance (after learning from new data):
              precision    recall  f1-score   support

           0     0.9998    1.0000    0.9999    157101
           1     0.9868    0.8108    0.8902       185

    accuracy                         0.9998    157286
   macro avg     0.9933    0.9054    0.9450    157286
weighted avg     0.9998    0.9998    0.9998    157286

ROC AUC: 0.9053990400735489
