# Anomaly Detection & Automated Risk Reporting

**Pipeline overview**
1. Load & split data
2. Feature engineering (`GaussianAnomalyPreprocessor`)
3. Train detector (`MultiClassGaussianAnomalyDetector`)
4. Evaluate: cross-validation, ROC-AUC, Precision-Recall, FPR
5. Build RAG knowledge base
6. Generate automated reports (DeepSeek via Ollama)

## 0. Setup

In [None]:
import sys
sys.path.insert(0, '..')   # make project root importable

import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split

import config
from detection  import GaussianAnomalyPreprocessor, MultiClassGaussianAnomalyDetector
from evaluation import evaluate, cross_validate_detector, per_class_report
from reporting  import AnomalyKnowledgeBase, AnomalyReportGenerator

## 1. Load data

In [None]:
# Load the raw AMLNet transaction table (path is relative to the notebook folder).
csv_path = '../data/AMLNet_Dataset.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first records as the cell output.
print(df.shape)
df.head()

(1090172, 17)


Unnamed: 0,step,type,amount,category,nameOrig,nameDest,oldbalanceOrg,newbalanceOrig,isFraud,isMoneyLaundering,laundering_typology,metadata,fraud_probability,hour,day_of_week,day_of_month,month
0,0,DEBIT,298.842041,Other,C8083,C7053,455489.321571,455190.479531,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
1,0,DEBIT,93.087916,Recreation,C5575,C1117,229508.291214,229415.203298,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
2,0,EFTPOS,155.644864,Healthcare,C1549,C1423,202568.806856,202413.161992,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
3,0,BPAY,299.759073,Food,C7435,C6390,491560.600203,491260.841131,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
4,0,DEBIT,173.715615,Other,C8083,C5946,455190.479531,455016.763916,0,0,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2


## 2. Train / test split

In [None]:
# Target is the binary fraud flag; every other column stays in X.
y = df['isFraud']
X = df.drop(columns='isFraud')

# NOTE(review): train_test_split shuffles rows by default, so this is a random
# 80/20 split, not a chronological one. Stratification is intentionally left
# disabled here (pass stratify=y to keep the fraud rate equal in both splits).
split_options = {'test_size': 0.2, 'random_state': 17}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity check: split sizes and the share of fraud labels in each split.
print(f'Train: {len(X_train):,}  |  Test: {len(X_test):,}')
print(f'Fraud rate - train: {y_train.mean():.3%}  |  test: {y_test.mean():.3%}')

Train: 872,137  |  Test: 218,035
Fraud rate - train: 0.162%  |  test: 0.150%


## 3. Feature engineering

In [None]:
# Fit the preprocessor on the training split only, then apply it to both splits
# (train first, test second) so the test data never influences the fit.
prep = GaussianAnomalyPreprocessor()
prep.fit(X_train)

X_train_gauss, X_test_gauss = (
    prep.transform(split) for split in (X_train, X_test)
)

# List the engineered feature names and show their summary statistics.
print('Features:', X_train_gauss.columns.tolist())
X_train_gauss.describe().T

Features: ['log_amount', 'amount_zscore', 'hour_sin', 'hour_cos', 'hour_diff', 'is_weekend', 'category_freq', 'category_match', 'log_customer_tx_count', 'customer_avg_amount']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
log_amount,872137.0,0.302933,0.978851,-2.019143,-0.366744,0.0,0.633256,5.596892
amount_zscore,872137.0,1.387182,3.308428,-1.57139,-0.269312,0.0,0.730688,49.299085
hour_sin,872137.0,0.000297,0.500403,-0.707107,-0.5,0.0,0.5,0.707107
hour_cos,872137.0,-0.001305,0.499596,-0.707107,-0.5,0.0,0.5,0.707107
hour_diff,872137.0,0.004366,0.583526,-0.999574,-0.499386,0.0,0.500614,1.543021
is_weekend,872137.0,0.043037,0.20294,0.0,0.0,0.0,0.0,1.0
category_freq,872137.0,-0.097831,0.78297,-2.43724,-0.568658,0.0,0.431342,0.735842
category_match,872137.0,0.243934,0.429454,0.0,0.0,0.0,0.0,1.0
log_customer_tx_count,872137.0,-0.073716,0.970934,-5.308509,-0.567345,0.0,0.432655,2.37548
customer_avg_amount,872137.0,0.113639,1.176069,-3.008217,-0.482318,0.0,0.517682,56.686051


## 4. Train detector

In [None]:
# Re-attach the raw category label to the engineered features so the detector
# can fit one model per class. `.values` assigns by position, so this relies on
# the transformed frames keeping the row order of X_train / X_test.
train_df = X_train_gauss.assign(category=X_train['category'].values)
test_df = X_test_gauss.assign(category=X_test['category'].values)

# Detector hyper-parameters come from the project config module.
detector = MultiClassGaussianAnomalyDetector(
    min_samples=config.DETECTOR_MIN_SAMPLES,
    contamination=config.DETECTOR_CONTAMINATION,
)

# Only the engineered columns are features; 'category' is the grouping key.
fit_arguments = {
    'df': train_df,
    'class_col': 'category',
    'feature_cols': X_train_gauss.columns.tolist(),
}
detector.fit(**fit_arguments)

print(f'Per-class models fitted: {list(detector.class_models.keys())}')

Per-class models fitted: ['Education', 'Food', 'Healthcare', 'Housing', 'Other', 'Recreation', 'Shell Company', 'Transport', 'Utilities']


## 5. Predict on test set

In [None]:
# Score the held-out split, then attach the ground-truth labels by position.
result_df = detector.predict(test_df, class_col='category')
result_df['isFraud'] = y_test.values

# Compare how many rows were flagged with how many are labelled as fraud.
print(f"Anomalies detected : {result_df['is_anomaly'].sum():,}")
print(f"Actual fraud       : {y_test.sum():,}")

# Preview the first flagged rows.
flagged_rows = result_df.loc[result_df['is_anomaly']]
flagged_rows.head()

Anomalies detected : 3,125
Actual fraud       : 328


Unnamed: 0,log_amount,amount_zscore,hour_sin,hour_cos,hour_diff,is_weekend,category_freq,category_match,log_customer_tx_count,customer_avg_amount,category,anomaly_score,threshold,is_anomaly,explanation,isFraud
1048553,-0.464033,-0.319352,-0.612372,-0.353553,-0.102877,1.0,0.0,1.0,-1.036248,-0.159693,Food,1319457.0,1009643.0,True,"{'risk_factors': [{'feature': 'is_weekend', 'd...",0
38323,-0.03894,-0.578529,0.683013,0.183013,0.130338,1.0,-0.568658,0.0,-0.868571,2.065414,Recreation,1000000.0,1000000.0,True,"{'risk_factors': [{'feature': 'is_weekend', 'd...",0
915413,-0.422176,-0.251986,0.683013,-0.183013,-0.266669,1.0,-0.26842,0.0,2.092811,-0.245566,Transport,1000087.0,1000087.0,True,"{'risk_factors': [{'feature': 'is_weekend', 'd...",0
369152,1.822142,5.521046,0.5,0.5,0.426215,1.0,0.431342,0.0,-2.455923,-0.498651,Housing,1000000.0,1000000.0,True,"{'risk_factors': [{'feature': 'is_weekend', 'd...",0
249310,-0.40574,-0.437019,0.0,-0.707107,-0.959769,1.0,-0.568658,0.0,1.768294,1.159761,Recreation,1000000.0,1000000.0,True,"{'risk_factors': [{'feature': 'is_weekend', 'd...",0


## 6. Evaluation

In [None]:
# Overall detection metrics for the test predictions, shown as a one-column table.
metrics = evaluate(result_df, label_col='isFraud')
pd.Series(metrics, name='value').to_frame()

Unnamed: 0,value
roc_auc,0.6683
average_precision,0.0023
fpr,0.0143
tpr,0.064
precision,0.0067
recall,0.064
f1,0.0122
n_anomalies_pred,3125.0
n_anomalies_true,328.0


In [None]:
# Break the metrics down by category to see which classes drive the poor
# average precision.
class_report = per_class_report(
    result_df,
    class_col='category',
    label_col='isFraud',
)
class_report

Unnamed: 0_level_0,n,n_anomalies,roc_auc,average_precision,fpr,tpr,f1,anomaly_rate
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Recreation,28582,23,0.9466,0.0173,0.0123,0.087,0.0107,0.001
Other,47751,288,0.9501,0.1098,0.015,0.059,0.0335,0.006
Shell Company,207,10,0.8701,0.3091,0.0102,0.1,0.1538,0.048
Cryptocurrency,2,2,,,,,,
Education,8612,0,,,,,,
Food,37041,0,,,,,,
Healthcare,13186,0,,,,,,
Housing,43493,0,,,,,,
Property Investment,5,5,,,,,,
Transport,32601,0,,,,,,


In [None]:
# Cross-validate on the training split with labels attached by position.
# NOTE(review): per the original author, each fold is fitted on normal records
# only (no leakage) — that behaviour lives in cross_validate_detector; confirm there.
full_df = train_df.assign(isFraud=y_train.values)

cv_arguments = {
    'df': full_df,
    'label_col': 'isFraud',
    'class_col': 'category',
    'feature_cols': X_train_gauss.columns.tolist(),
}
cv_results = cross_validate_detector(**cv_arguments)
cv_results

  Fold 1/5 — AUC=0.657  AP=0.002  FPR=0.015
  Fold 2/5 — AUC=0.651  AP=0.002  FPR=0.015
  Fold 3/5 — AUC=0.654  AP=0.002  FPR=0.015
  Fold 4/5 — AUC=0.670  AP=0.006  FPR=0.015
  Fold 5/5 — AUC=0.660  AP=0.003  FPR=0.015


Unnamed: 0,roc_auc,average_precision,fpr,tpr,precision,recall,f1,n_anomalies_pred,n_anomalies_true
1,0.6571,0.0024,0.0152,0.0599,0.0064,0.0599,0.0115,2664.0,284.0
2,0.6513,0.0024,0.0146,0.0528,0.0059,0.0528,0.0106,2556.0,284.0
3,0.6541,0.0023,0.0148,0.0389,0.0042,0.0389,0.0077,2592.0,283.0
4,0.6699,0.0061,0.0151,0.0777,0.0083,0.0777,0.015,2660.0,283.0
5,0.6602,0.0025,0.0153,0.0671,0.0071,0.0671,0.0128,2687.0,283.0
mean,0.65852,0.00314,0.015,0.05928,0.00638,0.05928,0.01152,2631.8,283.4
std,0.007176,0.001656,0.000292,0.014647,0.001516,0.014647,0.002701,55.246719,0.547723


## 7. Save detector

In [None]:
# Create the output directory if it is missing so that saving works on a fresh
# checkout (joblib.dump does not create parent directories).
Path(config.MODELS_DIR).mkdir(parents=True, exist_ok=True)

# Persist the fitted detector through its own save method.
detector_path = config.MODELS_DIR / 'detector.joblib'
detector.save(str(detector_path))
print(f'Detector saved → {detector_path}')

# Also save the fitted preprocessor next to the detector so the same
# feature transformation can be reloaded together with the model.
prep_path = config.MODELS_DIR / 'preprocessor.joblib'
joblib.dump(prep, str(prep_path))
print(f'Preprocessor saved → {prep_path}')

Detector saved → c:\Users\super\Desktop\Vasudeva\Xing\llm_reports\notebook\..\models\detector.joblib
Preprocessor saved → c:\Users\super\Desktop\Vasudeva\Xing\llm_reports\notebook\..\models\preprocessor.joblib


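## 8. Build RAG knowledge base

Building the knowledge base requests embeddings from a local Ollama server (`localhost:11434`), so start `ollama serve` before running this section.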


In [None]:
# Hand-written historical cases used to seed the retrieval knowledge base.
# Each row follows the order given in `case_fields`.
# NOTE(review): some categories here ('Electronics', 'Transfer', 'Travel') and
# several risk-factor names (e.g. 'amount_ratio', 'is_new_recipient') do not
# appear among the detector's categories / feature columns printed earlier —
# confirm whether retrieval depends on exact matches.
case_fields = ('case_id', 'category', 'description', 'risk_factors', 'resolution')
case_rows = [
    (
        'CASE-001',
        'Food',
        'Customer made large transfer at 3am, amount 15x higher than average, to a new recipient.',
        ['amount_ratio', 'hour_diff', 'is_new_recipient'],
        'Confirmed fraud. Account takeover. Customer reimbursed.',
    ),
    (
        'CASE-002',
        'Electronics',
        'High-value purchase from new device, amount 8x normal, category mismatch (customer usually spends on Food).',
        ['amount_zscore', 'category_match', 'customer_tx_count'],
        'Legitimate gift purchase. No action needed.',
    ),
    (
        'CASE-003',
        'Transfer',
        'Customer sent multiple round-number transfers (structuring) to different recipients in short time.',
        ['amount_round', 'tx_count_24h', 'unique_recipients_24h'],
        'Money laundering pattern. Report filed. Account frozen.',
    ),
    (
        'CASE-004',
        'Travel',
        'Weekend transaction from new customer with low history, amount well above typical.',
        ['is_weekend', 'customer_tx_count', 'amount_ratio'],
        'New customer testing card. Verified legitimate.',
    ),
    (
        'CASE-005',
        'Housing',
        'Customer drained account completely (99% of balance) via single transfer.',
        ['balance_drained', 'amount_ratio', 'is_new_recipient'],
        'Scam victim. Transaction reversed.',
    ),
]
historical_cases = [dict(zip(case_fields, row)) for row in case_rows]

# Build the knowledge base from the cases and persist it to the configured
# directory. The build step needs the local Ollama embeddings endpoint.
kb = AnomalyKnowledgeBase()
kb.build_from_records(historical_cases)
kb.save(config.KNOWLEDGE_BASE_DIR)

ValueError: Error raised by inference endpoint: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/embeddings (Caused by NewConnectionError("HTTPConnection(host='localhost', port=11434): Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it"))

## 9. Automated report generation

Requires the `kb` object built in section 8, and Ollama running locally with the DeepSeek model pulled:
```bash
ollama serve
ollama pull deepseek-r1:8b
```

In [None]:
# Report generator backed by the knowledge base built in the previous section.
reporter_options = {
    'knowledge_base': kb,
    'temperature': 0.2,
}
reporter = AnomalyReportGenerator(**reporter_options)

NameError: name 'kb' is not defined

In [None]:
# Explain one anomaly: take the first flagged row of the test predictions.
flagged = result_df.loc[result_df['is_anomaly']]
sample_anomaly = flagged.iloc[0]

explanation = reporter.explain_anomaly(sample_anomaly)
print(explanation)

In [None]:
# Generate the executive summary plus the individual anomaly reports.
# NOTE(review): the data preview shows timestamps in February 2025, while the
# period label below is '2025-Q3' — confirm the intended reporting period.
pipeline_options = {
    'category_col': 'category',
    'max_individual_reports': config.MAX_INDIVIDUAL_REPORTS,
    'period': '2025-Q3',
}
report = reporter.run_full_pipeline(result_df, **pipeline_options)

print('\n=== EXECUTIVE SUMMARY ===')
print(report['summary'])

In [None]:
# Save the report to JSON.
# Create the reports directory first so the save cannot fail on a fresh
# checkout where the folder does not exist yet (no-op when it already exists).
Path(config.REPORTS_DIR).mkdir(parents=True, exist_ok=True)

report_path = config.REPORTS_DIR / 'risk_report_2025Q3.json'
reporter.save_report(report, report_path)
print(f'Report saved → {report_path}')
print(f'Report saved → {report_path}')