# Anomaly Detection & Automated Risk Reporting

**Pipeline overview**
1. Load & split data
2. Feature engineering (`GaussianAnomalyPreprocessor`)
3. Train detector (`MultiClassGaussianAnomalyDetector`)
4. Evaluate: cross-validation, ROC-AUC, Precision-Recall, FPR
5. Build RAG knowledge base
6. Generate automated reports (DeepSeek via Ollama)

## 0. Setup

In [1]:
import sys
sys.path.insert(0, '..')   # make project root importable

import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split

import config
from detection  import GaussianAnomalyPreprocessor, MultiClassGaussianAnomalyDetector
from evaluation import evaluate, cross_validate_detector, per_class_report
from reporting  import AnomalyKnowledgeBase, AnomalyReportGenerator

## 1. Load data

In [2]:
# Read the raw transaction table; the location is relative to the notebook's
# working directory.
raw_csv = Path('..') / 'data' / 'AMLNet_Dataset.csv'
df = pd.read_csv(raw_csv)
print(df.shape)
df.head()

(1090172, 17)


Unnamed: 0,step,type,amount,category,nameOrig,nameDest,oldbalanceOrg,newbalanceOrig,isFraud,isMoneyLaundering,laundering_typology,metadata,fraud_probability,hour,day_of_week,day_of_month,month
0,0,DEBIT,298.842041,Other,C8083,C7053,455489.321571,455190.479531,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
1,0,DEBIT,93.087916,Recreation,C5575,C1117,229508.291214,229415.203298,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
2,0,EFTPOS,155.644864,Healthcare,C1549,C1423,202568.806856,202413.161992,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
3,0,BPAY,299.759073,Food,C7435,C6390,491560.600203,491260.841131,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
4,0,DEBIT,173.715615,Other,C8083,C5946,455190.479531,455016.763916,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2


## 2. Train / test split

In [3]:
# Separate the fraud label from the remaining columns.
y = df['isFraud']
X = df.drop(columns='isFraud')

# Random 80/20 hold-out with a fixed seed. Stratifying on the label
# (`stratify=y`, which would equalise the fraud rate across both splits) is
# deliberately left off.
# NOTE(review): train_test_split shuffles rows by default; if the rows are
# time-ordered, a chronological split may be the better choice — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

train_rate = y_train.mean()
test_rate = y_test.mean()
print(f'Train: {len(X_train):,}  |  Test: {len(X_test):,}')
print(f'Fraud rate - train: {train_rate:.3%}  |  test: {test_rate:.3%}')

Train: 872,137  |  Test: 218,035
Fraud rate - train: 0.162%  |  test: 0.150%


## 3. Feature engineering

In [4]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to both splits (train first, then test).
prep = GaussianAnomalyPreprocessor()
prep.fit(X_train)

X_train_gauss, X_test_gauss = (prep.transform(part) for part in (X_train, X_test))

print(f'Features: {X_train_gauss.columns.tolist()}')
X_train_gauss.describe().T

Features: ['log_amount', 'amount_zscore', 'hour_sin', 'hour_cos', 'hour_diff', 'is_weekend', 'category_freq', 'category_match', 'log_customer_tx_count', 'customer_avg_amount']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
log_amount,872137.0,0.302933,0.978851,-2.019143,-0.366744,0.0,0.633256,5.596892
amount_zscore,872137.0,1.387182,3.308428,-1.57139,-0.269312,0.0,0.730688,49.299085
hour_sin,872137.0,0.000297,0.500403,-0.707107,-0.5,0.0,0.5,0.707107
hour_cos,872137.0,-0.001305,0.499596,-0.707107,-0.5,0.0,0.5,0.707107
hour_diff,872137.0,0.004366,0.583526,-0.999574,-0.499386,0.0,0.500614,1.543021
is_weekend,872137.0,0.043037,0.20294,0.0,0.0,0.0,0.0,1.0
category_freq,872137.0,-0.097831,0.78297,-2.43724,-0.568658,0.0,0.431342,0.735842
category_match,872137.0,0.243934,0.429454,0.0,0.0,0.0,0.0,1.0
log_customer_tx_count,872137.0,-0.073716,0.970934,-5.308509,-0.567345,0.0,0.432655,2.37548
customer_avg_amount,872137.0,0.113639,1.176069,-3.008217,-0.482318,0.0,0.517682,56.686051


## 4. Train detector

In [5]:
# Carry the raw category label alongside the engineered features so the
# detector can split the data into per-class models. `.values` drops the
# index, so the labels are attached by position.
# assumes prep.transform keeps rows in the same order as its input — TODO confirm
train_df = X_train_gauss.assign(category=X_train['category'].values)
test_df = X_test_gauss.assign(category=X_test['category'].values)

detector = MultiClassGaussianAnomalyDetector(
    contamination=config.DETECTOR_CONTAMINATION,
    min_samples=config.DETECTOR_MIN_SAMPLES,
)

# Only the engineered columns are features; 'category' is the class key.
feature_names = X_train_gauss.columns.tolist()
detector.fit(df=train_df, class_col='category', feature_cols=feature_names)

fitted_classes = list(detector.class_models.keys())
print(f'Per-class models fitted: {fitted_classes}')

Per-class models fitted: ['Education', 'Food', 'Healthcare', 'Housing', 'Other', 'Recreation', 'Shell Company', 'Transport', 'Utilities']


## 5. Predict on test set

In [6]:
# Score the held-out rows, then attach the ground-truth labels by position.
# assumes detector.predict returns rows in the same order as test_df — TODO confirm
result_df = detector.predict(test_df, class_col='category')
result_df['isFraud'] = y_test.values

anomaly_mask = result_df['is_anomaly']
n_flagged = anomaly_mask.sum()
n_fraud = y_test.sum()
print(f"Anomalies detected : {n_flagged:,}")
print(f"Actual fraud       : {n_fraud:,}")
result_df[anomaly_mask].head()

Anomalies detected : 548
Actual fraud       : 328


Unnamed: 0,log_amount,amount_zscore,hour_sin,hour_cos,hour_diff,is_weekend,category_freq,category_match,log_customer_tx_count,customer_avg_amount,category,anomaly_score,threshold,is_anomaly,explanation,isFraud
823749,4.751734,24.769389,0.0,-0.707107,-0.848194,0.0,0.735842,1.0,-0.088719,12.620652,Other,25.298097,8.858425,True,"{'risk_factors': [{'feature': 'amount_zscore',...",1
68244,4.377729,40.38577,-0.683013,0.183013,0.130389,0.0,0.735842,1.0,-0.021846,4.723912,Other,35.110738,8.858425,True,"{'risk_factors': [{'feature': 'amount_zscore',...",1
234320,-1.434102,0.440175,0.612372,0.353553,0.214309,0.0,-0.26842,0.0,0.187408,10.641627,Transport,9.851396,8.858425,True,{'risk_factors': [{'feature': 'customer_avg_am...,0
417879,-0.302631,0.433313,0.5,0.5,0.506783,0.0,-2.008921,0.0,-2.264889,9.56663,Utilities,9.02942,8.858425,True,{'risk_factors': [{'feature': 'customer_avg_am...,0
758636,2.994592,13.265831,-0.707107,0.0,0.010607,0.0,-0.568658,0.0,-0.204911,0.744866,Recreation,26.165926,8.858425,True,"{'risk_factors': [{'feature': 'amount_zscore',...",1


## 6. Evaluation

In [7]:
# Overall detection metrics on the held-out split, shown as a one-column table.
metrics = evaluate(result_df, label_col='isFraud')
pd.Series(metrics, name='value').to_frame()

Unnamed: 0,value
roc_auc,0.976
average_precision,0.5429
fpr,0.0015
tpr,0.6646
precision,0.3978
recall,0.6646
f1,0.4977
n_anomalies_pred,548.0
n_anomalies_true,328.0


In [8]:
# Per-category breakdown: helps pinpoint which categories drag average
# precision down.
# NOTE(review): in the recorded output several classes show NaN metrics;
# presumably the scores are undefined when a class contains only one label
# value — confirm in evaluation.per_class_report.
class_report = per_class_report(
    result_df,
    class_col='category',
    label_col='isFraud',
)
class_report

Unnamed: 0_level_0,n,n_anomalies,roc_auc,average_precision,fpr,tpr,f1,anomaly_rate
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Shell Company,207,10,0.8873,0.5607,0.0051,0.4,0.5333,0.048
Other,47751,288,0.9813,0.7524,0.0007,0.6424,0.7298,0.006
Recreation,28582,23,0.9977,0.7729,0.0013,0.9565,0.5301,0.001
Cryptocurrency,2,2,,,,,,
Education,8612,0,,,,,,
Food,37041,0,,,,,,
Healthcare,13186,0,,,,,,
Housing,43493,0,,,,,,
Property Investment,5,5,,,,,,
Transport,32601,0,,,,,,


In [9]:
# Cross-validate the detector on the labelled training split. Per the original
# note, each fold fits the detector on normal records only.
# NOTE(review): `prep` was fitted on the whole training split before the folds
# are formed, so the engineered features in every fold already reflect
# statistics from all folds; the "no leakage" claim covers the detector fit
# only — confirm whether that is acceptable.
full_df = train_df.assign(isFraud=y_train.values)

cv_feature_cols = X_train_gauss.columns.tolist()
cv_results = cross_validate_detector(
    df=full_df,
    label_col='isFraud',
    class_col='category',
    feature_cols=cv_feature_cols,
)
cv_results

  Fold 1/3 — AUC=0.972  AP=0.716  FPR=0.002
  Fold 2/3 — AUC=0.972  AP=0.701  FPR=0.002
  Fold 3/3 — AUC=0.980  AP=0.690  FPR=0.002


Unnamed: 0,roc_auc,average_precision,fpr,tpr,precision,recall,f1,n_anomalies_pred,n_anomalies_true
1,0.9719,0.7161,0.0015,0.8013,0.4738,0.8013,0.5954,800.0,473.0
2,0.9721,0.7007,0.0015,0.7585,0.4447,0.7585,0.5607,805.0,472.0
3,0.9795,0.6899,0.0015,0.7797,0.4549,0.7797,0.5746,809.0,472.0
mean,0.9745,0.702233,0.0015,0.779833,0.4578,0.779833,0.5769,804.666667,472.333333
std,0.004331,0.013167,2.655742e-19,0.0214,0.014765,0.0214,0.017464,4.50925,0.57735


## 7. Save detector

In [10]:
# Persist the fitted detector and preprocessor so later runs can reload them.
# joblib.dump does not create missing parent directories, so make sure the
# models directory exists first; otherwise this cell fails on a fresh checkout.
# assumes config.MODELS_DIR is a pathlib.Path (it is combined with `/` below)
models_dir = config.MODELS_DIR
models_dir.mkdir(parents=True, exist_ok=True)

detector_path = models_dir / 'detector.joblib'
detector.save(str(detector_path))
print(f'Detector saved → {detector_path}')

# Also save the preprocessor: new data needs the same fitted `prep` applied
# before the detector can score it.
prep_path = models_dir / 'preprocessor.joblib'
joblib.dump(prep, str(prep_path))
print(f'Preprocessor saved → {prep_path}')

Detector saved → c:\Users\super\Desktop\Vasudeva\Xing\llm_reports\notebook\..\models\detector.joblib
Preprocessor saved → c:\Users\super\Desktop\Vasudeva\Xing\llm_reports\notebook\..\models\preprocessor.joblib


## 8. Build RAG knowledge base


In [12]:
# Hand-written historical cases used to seed the knowledge base.
# Each row is (case_id, category, description, risk_factors, resolution).
# NOTE(review): some categories here ('Electronics', 'Transfer', 'Travel') and
# several risk-factor names (e.g. 'amount_ratio', 'is_new_recipient') do not
# appear among the categories/features printed earlier in this notebook —
# confirm whether they are expected to match.
case_fields = ('case_id', 'category', 'description', 'risk_factors', 'resolution')
case_rows = [
    (
        'CASE-001',
        'Food',
        'Customer made large transfer at 3am, amount 15x higher than average, to a new recipient.',
        ['amount_ratio', 'hour_diff', 'is_new_recipient'],
        'Confirmed fraud. Account takeover. Customer reimbursed.',
    ),
    (
        'CASE-002',
        'Electronics',
        'High-value purchase from new device, amount 8x normal, category mismatch (customer usually spends on Food).',
        ['amount_zscore', 'category_match', 'customer_tx_count'],
        'Legitimate gift purchase. No action needed.',
    ),
    (
        'CASE-003',
        'Transfer',
        'Customer sent multiple round-number transfers (structuring) to different recipients in short time.',
        ['amount_round', 'tx_count_24h', 'unique_recipients_24h'],
        'Money laundering pattern. Report filed. Account frozen.',
    ),
    (
        'CASE-004',
        'Travel',
        'Weekend transaction from new customer with low history, amount well above typical.',
        ['is_weekend', 'customer_tx_count', 'amount_ratio'],
        'New customer testing card. Verified legitimate.',
    ),
    (
        'CASE-005',
        'Housing',
        'Customer drained account completely (99% of balance) via single transfer.',
        ['balance_drained', 'amount_ratio', 'is_new_recipient'],
        'Scam victim. Transaction reversed.',
    ),
]
historical_cases = [dict(zip(case_fields, row)) for row in case_rows]

# Build the knowledge base from the records and persist it to the configured directory.
kb = AnomalyKnowledgeBase()
kb.build_from_records(historical_cases)
kb.save(config.KNOWLEDGE_BASE_DIR)

## 9. Automated report generation

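Requires Ollama running locally with the report model pulled. The command below pulls `qwen2.5:1.5b`; if the report generator is configured for a different model (the pipeline overview mentions DeepSeek), pull that model instead: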
```bash
ollama serve
ollama pull qwen2.5:1.5b
```

In [18]:
# Report generator wired to the knowledge base `kb`.
# temperature is presumably the LLM sampling temperature — confirm in reporting.
reporter = AnomalyReportGenerator(knowledge_base=kb, temperature=0.2)

In [19]:
# Single anomaly explanation: take the first flagged row (a pandas Series)
# and ask the reporter to explain it.
# NOTE(review): the recorded run of this cell failed with
#   TypeError: string indices must be integers, not 'str'
# The cause is inside reporting.AnomalyReportGenerator, which is not visible
# here. Things to check:
#   - whether explain_anomaly expects something other than a Series row;
#   - whether the 'explanation' value reaches it as a dict or as a string;
#   - the knowledge-base cases list risk_factors as plain strings, while the
#     detector's explanation holds dicts keyed by 'feature' — code that does
#     item['feature'] on a knowledge-base entry would raise exactly this error.
sample_anomaly = result_df[result_df['is_anomaly']].iloc[0]
explanation = reporter.explain_anomaly(sample_anomaly)
print(explanation)

TypeError: string indices must be integers, not 'str'

In [None]:
# Executive summary + all individual reports
# NOTE(review): the period label is hard-coded; the sample rows shown at load
# time are dated February 2025 — confirm that '2025-Q3' is the intended label.
pipeline_options = {
    'category_col': 'category',
    'max_individual_reports': config.MAX_INDIVIDUAL_REPORTS,
    'period': '2025-Q3',
}
report = reporter.run_full_pipeline(result_df, **pipeline_options)

print('\n=== EXECUTIVE SUMMARY ===', report['summary'], sep='\n')

In [None]:
# Write the generated report as JSON under the configured reports directory.
# NOTE(review): nothing here creates config.REPORTS_DIR; confirm that
# reporter.save_report (or config) takes care of it.
report_path = config.REPORTS_DIR / 'risk_report_2025Q3.json'
reporter.save_report(report, report_path)
print('Report saved →', report_path)
print(f'Report saved → {report_path}')