# Notebook 04: Anomaly Detection & Fraud Analysis

## AADHAAR INTELLIGENCE SYSTEM - LENS 3

---

### Objective
Detect fraudulent patterns and anomalies in **REAL UIDAI Aadhaar Data** using:
- Enrollment pattern anomaly analysis
- Isolation Forest for anomaly detection
- DBSCAN clustering for fraud ring identification

### Data Sources (Real UIDAI Data)
- **Enrolment Data**: Age-wise enrollment by pincode/district
- **Demographic Data**: Demographic update patterns
- **Biometric Data**: Biometric update patterns

### Methods
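- **Isolation Forest**: Unsupervised anomaly detection with `contamination=0.02`, i.e. the 2% most isolated pincodes are labelled as anomalies
- **DBSCAN**: Density-based clustering of the high-risk pincodes (risk score > 70) in feature space, to surface groups with similar behaviour ("fraud rings")
- **Risk Scoring**: Multi-factor risk assessment — the Isolation Forest anomaly score (built from seven enrollment features) rescaled to a 0-100 risk score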

### Key Focus Areas
- Unusual enrollment spikes at specific pincodes
- Abnormal age distribution patterns
- Geographic anomaly clusters

In [None]:
# ============================================
# CELL 1: Import Libraries
# ============================================
# Brings in everything the later cells rely on: pandas/numpy for data handling,
# plotly for the interactive charts, scikit-learn for the anomaly models.

import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas / scikit-learn deprecation notices -- consider a narrower filter.
warnings.filterwarnings('ignore')

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
# Select plotly's "notebook" renderer as the default for displayed figures
pio.renderers.default = "notebook"

# Machine Learning
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# NOTE(review): PCA is not referenced by any of the cells visible in this notebook.
from sklearn.decomposition import PCA

# Confirmation banner with the wall-clock time of this run
print(" Libraries imported successfully")
print(f" Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

In [None]:
# ============================================
# CELL 2: Load Real UIDAI Datasets
# ============================================
# Reads every CSV under the enrolment / demographic / biometric folders,
# parses the `date` column and adds the per-row totals used by later cells.


def _read_csv_files(paths, source_dir):
    """Concatenate the CSV files in *paths* into a single DataFrame.

    Args:
        paths: list of CSV file paths to read.
        source_dir: directory the paths were collected from (only used in
            the error message).

    Returns:
        One DataFrame holding the rows of every file, with a fresh index.

    Raises:
        FileNotFoundError: if *paths* is empty. Without this check
            ``pd.concat`` fails with "No objects to concatenate", which
            does not tell the user which folder is missing its data.
    """
    if not paths:
        raise FileNotFoundError(f"No CSV files found in '{source_dir}'")
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


# Output directory
OUTPUT_DIR = '../outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "charts"), exist_ok=True)

# Data paths
DATA_DIR = '../data/'
ENROL_DIR = f"{DATA_DIR}enrolment/"
DEMO_DIR = f"{DATA_DIR}demographic/"
BIO_DIR = f"{DATA_DIR}biometric/"

print(" LOADING REAL UIDAI DATASETS")
print("="*60)

# File lists are sorted because glob returns paths in OS-dependent order;
# a stable row order keeps later 'first' aggregations reproducible.

# Load Enrolment Data
print("\n1️⃣ Loading Enrolment Data...")
enrol_files = sorted(glob.glob(f"{ENROL_DIR}*.csv"))
df_enrolment = _read_csv_files(enrol_files, ENROL_DIR)
print(f"    Loaded {len(df_enrolment):,} records from {len(enrol_files)} files")

# Load Demographic Data
print("\n2️⃣ Loading Demographic Data...")
demo_files = sorted(glob.glob(f"{DEMO_DIR}*.csv"))
df_demographic = _read_csv_files(demo_files, DEMO_DIR)
print(f"    Loaded {len(df_demographic):,} records from {len(demo_files)} files")

# Load Biometric Data
print("\n3️⃣ Loading Biometric Data...")
bio_files = sorted(glob.glob(f"{BIO_DIR}*.csv"))
df_biometric = _read_csv_files(bio_files, BIO_DIR)
print(f"    Loaded {len(df_biometric):,} records from {len(bio_files)} files")

# Parse dates (day-month-year). Values that do not match become NaT because of
# errors='coerce'; report how many so the loss is visible instead of silent.
for _label, _frame in (("Enrolment", df_enrolment),
                       ("Demographic", df_demographic),
                       ("Biometric", df_biometric)):
    _frame['date'] = pd.to_datetime(_frame['date'], format='%d-%m-%Y', errors='coerce')
    _missing_dates = int(_frame['date'].isna().sum())
    if _missing_dates:
        print(f"   ⚠️ {_label}: {_missing_dates:,} rows have a missing or unparseable date")

# Create total enrollment column
# NOTE(review): `+` yields NaN when any age column is NaN, whereas the
# demographic/biometric totals below use sum(axis=1), which skips NaN.
df_enrolment['total_enrolments'] = df_enrolment['age_0_5'] + df_enrolment['age_5_17'] + df_enrolment['age_18_greater']

# Create totals for demographic and biometric (sum of every matching age column)
demo_cols = [c for c in df_demographic.columns if 'demo_age' in c]
df_demographic['total_demo'] = df_demographic[demo_cols].sum(axis=1)

bio_cols = [c for c in df_biometric.columns if 'bio_age' in c]
df_biometric['total_bio'] = df_biometric[bio_cols].sum(axis=1)

print("\n" + "="*60)
print(" DATASETS LOADED SUCCESSFULLY!")
print(f"   Total Enrolment Records: {len(df_enrolment):,}")
print(f"   Total Demographic Records: {len(df_demographic):,}")
print(f"   Total Biometric Records: {len(df_biometric):,}")

In [None]:
# ============================================
# CELL 3: Feature Engineering for Anomaly Detection
# ============================================
# Collapses the enrolment records to one row per pincode and derives the
# ratios (age mix, volatility, update intensity) fed to the anomaly model.

print("\n FEATURE ENGINEERING FOR FRAUD DETECTION")
print("=" * 60)

# One row per pincode; named aggregation gives the flat column names directly.
# 'num_days' is the number of enrolment records seen for the pincode.
pincode_stats = (
    df_enrolment.groupby('pincode')
    .agg(
        total_enrolments=('total_enrolments', 'sum'),
        avg_daily_enrol=('total_enrolments', 'mean'),
        std_enrol=('total_enrolments', 'std'),
        num_days=('total_enrolments', 'count'),
        total_0_5=('age_0_5', 'sum'),
        total_5_17=('age_5_17', 'sum'),
        total_18_plus=('age_18_greater', 'sum'),
        state=('state', 'first'),
        district=('district', 'first'),
    )
    .reset_index()
)

# std is NaN for pincodes with a single record; treat that as "no spread"
pincode_stats['std_enrol'] = pincode_stats['std_enrol'].fillna(0)

# Share of each age band in the pincode's total (0/0 becomes 0)
enrol_total = pincode_stats['total_enrolments']
for pct_col, count_col in (('pct_0_5', 'total_0_5'),
                           ('pct_5_17', 'total_5_17'),
                           ('pct_18_plus', 'total_18_plus')):
    pincode_stats[pct_col] = (pincode_stats[count_col] / enrol_total * 100).fillna(0)

# Coefficient of variation in percent; undefined or infinite values collapse to 0
cv_raw = pincode_stats['std_enrol'] / pincode_stats['avg_daily_enrol'] * 100
pincode_stats['cv_enrol'] = cv_raw.fillna(0).replace([np.inf, -np.inf], 0)

# Attach demographic update volume (pincodes without updates get 0)
demo_by_pin = df_demographic.groupby('pincode', as_index=False)['total_demo'].sum()
pincode_stats = pincode_stats.merge(demo_by_pin, on='pincode', how='left')
pincode_stats['total_demo'] = pincode_stats['total_demo'].fillna(0)

# Attach biometric update volume (pincodes without updates get 0)
bio_by_pin = df_biometric.groupby('pincode', as_index=False)['total_bio'].sum()
pincode_stats = pincode_stats.merge(bio_by_pin, on='pincode', how='left')
pincode_stats['total_bio'] = pincode_stats['total_bio'].fillna(0)

# Updates (demographic + biometric) per 100 enrolments; undefined or infinite -> 0
all_updates = pincode_stats['total_demo'] + pincode_stats['total_bio']
update_ratio_raw = all_updates / pincode_stats['total_enrolments'] * 100
pincode_stats['update_ratio'] = update_ratio_raw.fillna(0).replace([np.inf, -np.inf], 0)

summary_cols = ['total_enrolments', 'avg_daily_enrol', 'cv_enrol',
                'pct_0_5', 'pct_18_plus', 'update_ratio']
print(f" Created {len(pincode_stats):,} pincode-level aggregations")
print(f"\n Feature Statistics:")
print(pincode_stats[summary_cols].describe().round(2))

In [None]:
# ============================================
# CELL 4: Isolation Forest Anomaly Detection
# ============================================
# Fits an Isolation Forest on the standardised pincode features and stores
# a label (-1 anomaly / 1 normal), a raw score and a 0-100 risk score.

print("\n ISOLATION FOREST ANOMALY DETECTION")
print("=" * 60)

# Features fed to the model
anomaly_features = [
    'avg_daily_enrol', 'std_enrol', 'cv_enrol',
    'pct_0_5', 'pct_5_17', 'pct_18_plus',
    'update_ratio',
]

# Infinite values are treated as missing, then every gap takes the column median
X = pincode_stats[anomaly_features].replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

# Standardise so that no single feature dominates by scale
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

CONTAMINATION = 0.02  # Expect 2% anomalies

iso_forest = IsolationForest(
    n_estimators=200,
    contamination=CONTAMINATION,
    random_state=42,
    n_jobs=-1,
)
iso_forest.fit(X_scaled)

# predict(): -1 = anomaly, 1 = normal.
# score_samples() is negated so that a higher value means "more anomalous".
pincode_stats['anomaly_label'] = iso_forest.predict(X_scaled)
pincode_stats['anomaly_score'] = np.negative(iso_forest.score_samples(X_scaled))

# Rescale the anomaly score onto a 0-100 risk scale
min_max_scaler = MinMaxScaler(feature_range=(0, 100))
scaled_scores = min_max_scaler.fit_transform(pincode_stats[['anomaly_score']])
pincode_stats['risk_score'] = scaled_scores.ravel()

# Headline numbers
n_anomalies = pincode_stats['anomaly_label'].eq(-1).sum()
anomaly_pct = n_anomalies / len(pincode_stats) * 100
risk_scores = pincode_stats['risk_score']

print(f"\n ANOMALY DETECTION RESULTS:")
print("-" * 50)
print(f"   Total Pincodes Analyzed: {len(pincode_stats):,}")
print(f"   Anomalies Detected: {n_anomalies:,} ({anomaly_pct:.1f}%)")
print(f"   Contamination Parameter: {CONTAMINATION*100:.0f}%")
print(f"   Average Risk Score: {risk_scores.mean():.1f}")
print(f"   Max Risk Score: {risk_scores.max():.1f}")

In [None]:
# ============================================
# CELL 5: Top Anomalous Pincodes
# ============================================
# Lists the 20 highest-risk pincodes flagged by the Isolation Forest and
# totals the enrolments recorded at all flagged pincodes.


def _clip_label(value, width):
    """Return *value* as text cut to *width* characters, or 'N/A' when missing."""
    return str(value)[:width] if pd.notna(value) else 'N/A'


print("\n TOP HIGH-RISK PINCODES")
print("=" * 60)

# Flagged rows only (label -1), riskiest first
flagged_mask = pincode_stats['anomaly_label'] == -1
anomalies = pincode_stats.loc[flagged_mask].sort_values('risk_score', ascending=False).copy()

table_rule = "-" * 120
print("\n TOP 20 SUSPICIOUS PINCODES:")
print(table_rule)
print(f"{'Rank':<5} {'Pincode':<10} {'State':<25} {'District':<20} {'Total Enrol':>12} {'CV%':>8} {'Risk':>8}")
print(table_rule)

rank = 0
for _, row in anomalies.head(20).iterrows():
    rank += 1
    state_name = _clip_label(row['state'], 24)
    district_name = _clip_label(row['district'], 19)
    print(f"{rank:<5} {row['pincode']:<10} {state_name:<25} {district_name:<20} {row['total_enrolments']:>12,.0f} {row['cv_enrol']:>8.1f} {row['risk_score']:>8.1f}")

# Enrolment volume recorded at the flagged pincodes
total_suspicious_enrolments = anomalies['total_enrolments'].sum()

print(f"\n SUSPICIOUS ACTIVITY SUMMARY:")
print(f"   Total Suspicious Pincodes: {len(anomalies):,}")
print(f"   Total Enrolments at Risk: {total_suspicious_enrolments:,.0f}")

In [None]:
# ============================================
# CELL 6: DBSCAN Fraud Ring Detection
# ============================================
# Clusters pincodes whose risk score exceeds 70; every DBSCAN cluster is
# reported as a "fraud ring" and label -1 marks unclustered noise points.

print("\n FRAUD RING DETECTION (DBSCAN CLUSTERING)")
print("=" * 60)

# Only pincodes with a risk score above 70 take part in the clustering
high_risk = pincode_stats.loc[pincode_stats['risk_score'] > 70].copy()

if len(high_risk) <= 10:
    # Too few points to cluster: mark everything as noise
    high_risk['fraud_cluster'] = -1
    n_clusters = 0
    print("⚠️ Not enough high-risk pincodes for cluster analysis")
else:
    # Features for clustering; NaN and +/-inf are zeroed before scaling
    cluster_features = ['cv_enrol', 'pct_0_5', 'pct_18_plus', 'update_ratio']
    X_cluster = np.nan_to_num(high_risk[cluster_features].values, nan=0, posinf=0, neginf=0)
    X_cluster_scaled = StandardScaler().fit_transform(X_cluster)

    dbscan = DBSCAN(eps=0.8, min_samples=3)
    high_risk['fraud_cluster'] = dbscan.fit_predict(X_cluster_scaled)

    # Cluster sizes, with the noise label (-1) left out of the ring count
    cluster_counts = high_risk['fraud_cluster'].value_counts()
    ring_sizes = cluster_counts.drop(index=-1, errors='ignore')
    n_clusters = len(ring_sizes)
    noise_points = (high_risk['fraud_cluster'] == -1).sum()

    print(f"\n CLUSTERING RESULTS:")
    print("-" * 50)
    print(f"   High-Risk Pincodes Analyzed: {len(high_risk):,}")
    print(f"   Fraud Rings Detected: {n_clusters}")
    print(f"   Noise Points (isolated anomalies): {noise_points}")

    # Profile the biggest ring, if any ring was found
    if n_clusters > 0:
        largest_cluster = ring_sizes.idxmax()
        largest_cluster_size = ring_sizes.loc[largest_cluster]

        fraud_ring = high_risk.loc[high_risk['fraud_cluster'] == largest_cluster]
        fraud_ring_states = fraud_ring['state'].value_counts()
        primary_state = fraud_ring_states.index[0] if len(fraud_ring_states) > 0 else 'Unknown'

        print(f"\n LARGEST FRAUD RING (Cluster {largest_cluster}):")
        print(f"   Pincodes Involved: {largest_cluster_size}")
        print(f"   Primary State: {primary_state}")
        print(f"   Avg CV%: {fraud_ring['cv_enrol'].mean():.1f}%")
        print(f"   Total Enrolments: {fraud_ring['total_enrolments'].sum():,.0f}")

In [None]:
# ============================================
# CELL 7: Risk Heatmap Visualization
# ============================================
# Averages the pincode risk scores per state and saves a horizontal bar
# chart of the 20 states with the highest average risk.


def _risk_band_colour(score):
    """Bar colour for an average risk score: red > 50, orange > 30, else green."""
    if score > 50:
        return '#D62828'
    if score > 30:
        return '#F77F00'
    return '#1B998B'


print("\n CREATING RISK HEATMAP VISUALIZATION")
print("=" * 60)

# Per-state roll-up, highest average risk first
state_risk_agg = (
    pincode_stats.groupby('state')
    .agg(**{
        'Pincodes': ('pincode', 'count'),
        'Avg Risk Score': ('risk_score', 'mean'),
        'Total Enrollments': ('total_enrolments', 'sum'),
        'Avg CV%': ('cv_enrol', 'mean'),
    })
    .reset_index()
    .rename(columns={'state': 'State'})
    .sort_values('Avg Risk Score', ascending=False)
)

# Top 20 states
state_risk_top = state_risk_agg.head(20)
top_scores = state_risk_top['Avg Risk Score']

risk_bars = go.Bar(
    x=top_scores,
    y=state_risk_top['State'],
    orientation='h',
    marker_color=[_risk_band_colour(score) for score in top_scores],
    text=[f"{score:.1f}" for score in top_scores],
    textposition='outside',
    name='Risk Score',
)

fig_risk = go.Figure()
fig_risk.add_trace(risk_bars)

chart_title = dict(
    text='<b>FRAUD RISK ANALYSIS BY STATE</b><br><sup> Red = High Risk |  Orange = Medium |  Green = Low</sup>',
    x=0.5,
)
fig_risk.update_layout(
    title=chart_title,
    xaxis_title='Average Risk Score',
    yaxis_title='State',
    height=600,
    template='plotly_white',
)

fig_risk.write_html(f"{OUTPUT_DIR}/charts/04_risk_by_state.html")
print(" Risk by state chart saved!")

In [None]:
# ============================================
# CELL 8: Age Distribution Anomalies
# ============================================
# Flags pincodes whose age mix falls outside fixed bounds and saves a
# scatter plot of the age mix coloured by risk score.

print("\n AGE DISTRIBUTION ANOMALY ANALYSIS")
print("="*60)

# Find pincodes with unusual age distributions
# Normal expectation: 0-5 ~10%, 5-17 ~20%, 18+ ~70%

# Bounds are named once so the flag and the "top 10" listing cannot drift apart
CHILD_PCT_HIGH = 30   # more than 30% aged 0-5 is treated as unusually high
CHILD_PCT_LOW = 1     # less than 1% aged 0-5 is treated as unusually low
ADULT_PCT_HIGH = 95   # more than 95% aged 18+ is treated as unusually high
ADULT_PCT_LOW = 40    # less than 40% aged 18+ is treated as unusually low
SAMPLE_SEED = 42      # same seed as the Isolation Forest, for repeatable charts

# Flag unusual patterns
pincode_stats['unusual_child_ratio'] = (
    (pincode_stats['pct_0_5'] > CHILD_PCT_HIGH) | (pincode_stats['pct_0_5'] < CHILD_PCT_LOW)
)
pincode_stats['unusual_adult_ratio'] = (
    (pincode_stats['pct_18_plus'] > ADULT_PCT_HIGH) | (pincode_stats['pct_18_plus'] < ADULT_PCT_LOW)
)

unusual_age = pincode_stats[(pincode_stats['unusual_child_ratio']) | (pincode_stats['unusual_adult_ratio'])]

print(f"\n AGE DISTRIBUTION ANOMALIES:")
print("-" * 50)
print(f"   Unusual Child Ratio (0-5): {pincode_stats['unusual_child_ratio'].sum():,} pincodes")
print(f"   Unusual Adult Ratio (18+): {pincode_stats['unusual_adult_ratio'].sum():,} pincodes")
print(f"   Total Flagged: {len(unusual_age):,} pincodes")

# Top unusual by child ratio
print("\n Top 10 Pincodes with Unusually High Child Ratio:")
high_child = pincode_stats[pincode_stats['pct_0_5'] > CHILD_PCT_HIGH].nlargest(10, 'pct_0_5')
for _, row in high_child.iterrows():
    # A missing state is NaN (a float) and cannot be sliced; fall back to
    # 'N/A' exactly as the top-20 table in cell 5 does.
    state_label = str(row['state'])[:20] if pd.notna(row['state']) else 'N/A'
    print(f"   {row['pincode']} ({state_label}): {row['pct_0_5']:.1f}% age 0-5")

# Scatter plot of age distributions (at most 5000 points; the sample is seeded
# so the saved chart is identical from run to run)
fig_age_dist = px.scatter(
    pincode_stats.sample(min(5000, len(pincode_stats)), random_state=SAMPLE_SEED),
    x='pct_0_5',
    y='pct_18_plus',
    color='risk_score',
    size='total_enrolments',
    hover_data=['pincode', 'state', 'district'],
    title='<b>AGE DISTRIBUTION PATTERNS</b><br><sup>Detecting Unusual Enrollment Demographics</sup>',
    labels={'pct_0_5': '% Age 0-5', 'pct_18_plus': '% Age 18+', 'risk_score': 'Risk Score'},
    color_continuous_scale='RdYlGn_r'
)

fig_age_dist.update_layout(template='plotly_white', height=500)
fig_age_dist.write_html(f"{OUTPUT_DIR}/charts/04_age_distribution.html")
print(" Age distribution chart saved!")

In [None]:
# ============================================
# CELL 9: Temporal Anomaly Patterns
# ============================================
# Totals enrolments per date, scores each day against the days before it
# and saves a line chart with the anomalous days marked.

print("\n⏰ TEMPORAL ANOMALY PATTERNS")
print("="*60)

ROLLING_WINDOW = 7   # rows (dates present in the data) per window
Z_THRESHOLD = 2      # |z| above this marks a day as anomalous

# Group by date to find anomalous days
daily_stats = df_enrolment.groupby('date').agg({
    'total_enrolments': 'sum',
    'pincode': 'nunique'
}).reset_index()

daily_stats.columns = ['date', 'daily_enrolments', 'active_pincodes']

daily_enrol = daily_stats['daily_enrolments']

# Trailing window INCLUDING the current day -- kept for the chart and the CSV
daily_stats['rolling_mean'] = daily_enrol.rolling(window=ROLLING_WINDOW, min_periods=1).mean()
daily_stats['rolling_std'] = daily_enrol.rolling(window=ROLLING_WINDOW, min_periods=1).std().fillna(0)

# Baseline from the PRECEDING days only. Scoring a day against a window that
# contains the day itself caps |z| at (n-1)/sqrt(n) (about 2.27 for n=7), so
# a threshold of 2 could almost never be exceeded and real spikes were masked.
# min_periods=2 because a sample standard deviation needs two observations.
# NOTE(review): the window counts rows, not calendar days -- gaps in the
# dates stretch it over a longer period.
prior_days = daily_enrol.shift(1).rolling(window=ROLLING_WINDOW, min_periods=2)
daily_stats['baseline_mean'] = prior_days.mean()
daily_stats['baseline_std'] = prior_days.std()

# Flag anomalous days (outside Z_THRESHOLD std of the baseline).
# Days without a usable baseline (too little history, or zero spread) get
# z = 0 and are therefore never flagged.
usable_std = daily_stats['baseline_std'].where(daily_stats['baseline_std'] > 0)
daily_stats['z_score'] = ((daily_enrol - daily_stats['baseline_mean']) / usable_std).fillna(0)
daily_stats['is_anomaly'] = daily_stats['z_score'].abs() > Z_THRESHOLD

anomaly_days = daily_stats[daily_stats['is_anomaly']]

print(f"\n TEMPORAL ANALYSIS:")
print("-" * 50)
print(f"   Total Days Analyzed: {len(daily_stats):,}")
print(f"   Anomalous Days Detected: {len(anomaly_days):,}")
print(f"   Average Daily Enrollments: {daily_stats['daily_enrolments'].mean():,.0f}")

if len(anomaly_days) > 0:
    print(f"\n TOP ANOMALOUS DAYS:")
    for _, row in anomaly_days.nlargest(5, 'z_score').iterrows():
        print(f"   {row['date'].strftime('%Y-%m-%d')}: {row['daily_enrolments']:,.0f} (Z={row['z_score']:.2f})")

# Visualize
fig_temporal = go.Figure()

fig_temporal.add_trace(go.Scatter(
    x=daily_stats['date'],
    y=daily_stats['daily_enrolments'],
    mode='lines',
    name='Daily Enrollments',
    line=dict(color='#3498db')
))

fig_temporal.add_trace(go.Scatter(
    x=daily_stats['date'],
    y=daily_stats['rolling_mean'],
    mode='lines',
    name='7-day Rolling Mean',
    line=dict(color='#2ecc71', dash='dash')
))

# Mark anomalies
if len(anomaly_days) > 0:
    fig_temporal.add_trace(go.Scatter(
        x=anomaly_days['date'],
        y=anomaly_days['daily_enrolments'],
        mode='markers',
        name='Anomalies',
        marker=dict(color='red', size=10, symbol='x')
    ))

fig_temporal.update_layout(
    title='<b>DAILY ENROLLMENT PATTERNS & ANOMALIES</b>',
    xaxis_title='Date',
    yaxis_title='Enrollments',
    template='plotly_white',
    height=400
)

fig_temporal.write_html(f"{OUTPUT_DIR}/charts/04_temporal_anomalies.html")
print(" Temporal anomaly chart saved!")

In [None]:
# ============================================
# CELL 10: State-Level Anomaly Summary
# ============================================
# Rolls the flagged pincodes up to state level and prints the ten states
# with the most anomalous pincodes.

print("\n STATE-LEVEL ANOMALY SUMMARY")
print("=" * 60)

# One row per state, states with the most flagged pincodes first
anomaly_by_state = (
    anomalies.groupby('state')
    .agg(**{
        'Anomalous Pincodes': ('pincode', 'count'),
        'Total Enrollments': ('total_enrolments', 'sum'),
        'Avg Risk': ('risk_score', 'mean'),
        'Avg CV%': ('cv_enrol', 'mean'),
    })
    .reset_index()
    .rename(columns={'state': 'State'})
    .sort_values('Anomalous Pincodes', ascending=False)
)

table_rule = "-" * 90
print("\n TOP STATES WITH ANOMALIES:")
print(table_rule)
print(f"{'State':<30} {'Anomalous Pincodes':>18} {'Total Enrollments':>18} {'Avg Risk':>12}")
print(table_rule)

for _, row in anomaly_by_state.head(10).iterrows():
    state_label = str(row['State'])[:29]
    print(f"{state_label:<30} {row['Anomalous Pincodes']:>18,} {row['Total Enrollments']:>18,.0f} {row['Avg Risk']:>12.1f}")

pincode_total = anomaly_by_state['Anomalous Pincodes'].sum()
enrolment_total = anomaly_by_state['Total Enrollments'].sum()

print(f"\n Summary:")
print(f"   States with Anomalies: {len(anomaly_by_state):,}")
print(f"   Total Anomalous Pincodes: {pincode_total:,}")
print(f"   Total Suspicious Enrollments: {enrolment_total:,.0f}")

In [None]:
# ============================================
# CELL 11: Fraud Prevention Value Calculation
# ============================================
# Turns the flagged enrolment volume into a rupee estimate using the fixed
# assumptions below. Every percentage shown in the output is rendered from
# its parameter, so the labels stay correct when an assumption is changed.

print("\n FRAUD PREVENTION VALUE ANALYSIS")
print("="*60)

# Parameters
AVG_FRAUD_VALUE = 20000  # ₹20,000 average fraud amount per case
DETECTION_RATE = 0.85    # 85% detection rate with system
FALSE_POSITIVE_COST = 500  # ₹500 cost per false positive investigation

# Indian numbering units used when printing rupee amounts
CRORE = 10_000_000
LAKH = 100_000
# Upper end of the projected annual range = 120% of the point estimate
PROJECTION_UPPER_FACTOR = 1.2

# Calculate fraud metrics
total_anomalous_enrolments = anomalies['total_enrolments'].sum()

# Estimate fraud value (assume 5% of anomalous enrollments are fraudulent)
fraud_rate = 0.05
estimated_fraud_cases = total_anomalous_enrolments * fraud_rate
potential_fraud_value = estimated_fraud_cases * AVG_FRAUD_VALUE
preventable_fraud = potential_fraud_value * DETECTION_RATE

# False positive cost (assume 20% false positive rate)
# NOTE(review): this is counted per flagged *pincode*, while the fraud
# estimate above is counted per *enrolment* -- confirm the units are intended.
false_positive_rate = 0.20
false_positive_investigations = len(anomalies) * false_positive_rate
false_positive_cost = false_positive_investigations * FALSE_POSITIVE_COST

# Net savings
net_savings = preventable_fraud - false_positive_cost

print("\n FRAUD PREVENTION METRICS:")
print("-" * 60)
print(f"   Anomalous Pincodes: {len(anomalies):,}")
print(f"   Suspicious Enrollments: {total_anomalous_enrolments:,.0f}")
print(f"   Estimated Fraud Cases ({fraud_rate:.0%}): {estimated_fraud_cases:,.0f}")
print(f"\n   Potential Fraud Value: ₹{potential_fraud_value/CRORE:.1f} Crore")
print(f"   Preventable ({DETECTION_RATE:.0%} detection): ₹{preventable_fraud/CRORE:.1f} Crore")
print(f"   False Positive Cost: ₹{false_positive_cost/LAKH:.2f} Lakhs")
print(f"\n    NET SAVINGS: ₹{net_savings/CRORE:.2f} Crore")

# Annualize: scale the savings from the observed date span to 365 days.
# Skipped when there is no data or the span is not a positive number of days.
if len(daily_stats) > 0:
    data_period_days = (daily_stats['date'].max() - daily_stats['date'].min()).days
    if data_period_days > 0:
        annual_factor = 365 / data_period_days
        annual_savings = net_savings * annual_factor
        print(f"\n    PROJECTED ANNUAL SAVINGS: ₹{annual_savings/CRORE:.0f}-{annual_savings*PROJECTION_UPPER_FACTOR/CRORE:.0f} Crore")

print("\n" + "="*60)

In [None]:
# ============================================
# CELL 12: Summary Dashboard
# ============================================
# Builds a 2x2 plotly dashboard (risk histogram, anomalies per state and two
# risk-coloured scatters) and saves it as an HTML file.

print("\n CREATING SUMMARY DASHBOARD")
print("="*60)

DASHBOARD_SAMPLE_SIZE = 3000  # cap on the points drawn in each scatter panel
DASHBOARD_SAMPLE_SEED = 42    # same seed as the Isolation Forest

# Create comprehensive anomaly dashboard
fig_dashboard = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Risk Score Distribution',
        'Anomalies by State (Top 10)',
        'CV% vs Total Enrollments',
        'Age Distribution vs Risk'
    ),
    specs=[[{"type": "histogram"}, {"type": "bar"}],
           [{"type": "scatter"}, {"type": "scatter"}]]
)

# Risk Score Distribution
fig_dashboard.add_trace(
    go.Histogram(x=pincode_stats['risk_score'], nbinsx=30, marker_color='#FF6B35'),
    row=1, col=1
)

# Anomalies by State
anomaly_state_counts = anomalies['state'].value_counts().head(10)
fig_dashboard.add_trace(
    go.Bar(x=anomaly_state_counts.values, y=anomaly_state_counts.index, 
           orientation='h', marker_color='#D62828'),
    row=1, col=2
)

# Both scatter panels share one sample. It is seeded so that re-running the
# notebook on the same data reproduces the saved dashboard exactly.
sample_data = pincode_stats.sample(
    min(DASHBOARD_SAMPLE_SIZE, len(pincode_stats)),
    random_state=DASHBOARD_SAMPLE_SEED,
)

# CV vs Total Enrollments
fig_dashboard.add_trace(
    go.Scatter(
        x=sample_data['cv_enrol'],
        y=sample_data['total_enrolments'],
        mode='markers',
        marker=dict(
            size=5,
            color=sample_data['risk_score'],
            colorscale='RdYlGn_r',
            showscale=False
        )
    ),
    row=2, col=1
)

# Age distribution vs Risk (this panel carries the shared colour bar)
fig_dashboard.add_trace(
    go.Scatter(
        x=sample_data['pct_0_5'],
        y=sample_data['pct_18_plus'],
        mode='markers',
        marker=dict(
            size=5,
            color=sample_data['risk_score'],
            colorscale='RdYlGn_r',
            showscale=True,
            colorbar=dict(title='Risk')
        )
    ),
    row=2, col=2
)

fig_dashboard.update_layout(
    title=dict(
        text='<b>FRAUD DETECTION DASHBOARD</b><br><sup>Isolation Forest Anomaly Detection Results</sup>',
        x=0.5
    ),
    height=700,
    showlegend=False,
    template='plotly_white'
)

fig_dashboard.write_html(f"{OUTPUT_DIR}/charts/04_anomaly_dashboard.html")
print(" Dashboard saved!")

In [None]:
# ============================================
# CELL 13: Save Results
# ============================================
# Writes every result table produced by the notebook to OUTPUT_DIR as CSV.

print("\n SAVING ANOMALY DETECTION RESULTS")
print("=" * 60)

# (table, file name, confirmation message) -- written in this order
csv_exports = [
    (pincode_stats, "04_pincode_risk_scores.csv", " Pincode risk scores saved"),
    (anomalies, "04_detected_anomalies.csv", " Detected anomalies saved"),
    (high_risk, "04_high_risk_pincodes.csv", " High risk pincodes saved"),
    (state_risk_agg, "04_state_risk_summary.csv", " State risk summary saved"),
    (anomaly_by_state, "04_anomalies_by_state.csv", " Anomalies by state saved"),
    (daily_stats, "04_daily_stats.csv", " Daily statistics saved"),
]

for table, file_name, done_message in csv_exports:
    table.to_csv(f"{OUTPUT_DIR}/{file_name}", index=False)
    print(done_message)

print(f"\n All outputs saved to: {OUTPUT_DIR}")

In [None]:
# ============================================
# CELL 14: Key Insights Summary
# ============================================
# Prints a recap of the figures computed in the earlier cells.

print("\n" + "="*70)
print(" ANOMALY DETECTION - KEY INSIGHTS")
print("="*70)

# Guard the percentage against an empty pincode table (division by zero)
total_pincodes = len(pincode_stats)
anomaly_share = len(anomalies) / total_pincodes * 100 if total_pincodes else 0.0

top_anomaly_state = anomaly_by_state.iloc[0]['State'] if len(anomaly_by_state) > 0 else 'N/A'

# min()/max() of an empty or all-missing date column is NaT, which cannot be
# formatted with strftime -- fall back to 'N/A' in that case
valid_dates = daily_stats['date'].dropna()
if valid_dates.empty:
    data_period = 'N/A'
else:
    data_period = f"{valid_dates.min().strftime('%Y-%m-%d')} to {valid_dates.max().strftime('%Y-%m-%d')}"

# The notebook writes 6 CSV files (cell 13) and 4 HTML charts (cells 7, 8, 9
# and 12); the counts below are kept in step with those cells.
print(f"""
 DETECTION SUMMARY:
   • Total Pincodes Analyzed: {total_pincodes:,}
   • Anomalies Detected: {len(anomalies):,} ({anomaly_share:.1f}%)
   • High-Risk Pincodes (Score>70): {len(high_risk):,}
   • Fraud Clusters Identified: {n_clusters}

 GEOGRAPHIC INSIGHTS:
   • States with Anomalies: {len(anomaly_by_state):,}
   • Top Anomaly State: {top_anomaly_state}

 TEMPORAL INSIGHTS:
   • Anomalous Days: {len(anomaly_days):,}
   • Data Period: {data_period}

 FRAUD PREVENTION:
   • Suspicious Enrollments: {total_anomalous_enrolments:,.0f}
   • Estimated Fraud Value: ₹{potential_fraud_value/10000000:.1f} Crore
   • Preventable Fraud: ₹{preventable_fraud/10000000:.1f} Crore

 OUTPUTS GENERATED:
   • 6 CSV files with analysis results
   • 4 Interactive HTML charts
""")

print("="*70)
print(" NOTEBOOK 04 COMPLETE!")
print("="*70)