# 2. Ghost District Detection (Anomaly Analysis)

**Hypothesis:** Naming mismatches across APIs create 'Ghost Districts': districts with high enrolment but zero (or anomalously few) recorded updates.

In [None]:
from scipy import stats
import pandas as pd

# Read the aggregated per-district profile table (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../analysis/results/district_profile.csv')

## 1. Calculate Update Intensity
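Normalized metric: updates per enrolment, computed as `total_updates / (total_enrol + 1)`. The `+ 1` keeps the ratio defined for districts with zero enrolments.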

In [None]:
# Updates per enrolment; the +1 in the denominator keeps the ratio finite
# when a district has zero enrolments.
df['update_intensity'] = df['total_updates'] / (df['total_enrol'] + 1)

# Standardise the intensity across all districts.
# nan_policy='omit' leaves missing intensities out of the mean/std, so a single
# NaN row no longer turns every z-score into NaN (the default 'propagate'
# behaviour). Rows whose intensity is missing still receive a NaN z-score.
# NOTE(review): whether the profile CSV can contain missing totals is not
# visible here -- confirm against the aggregation step.
df['z_score'] = stats.zscore(df['update_intensity'], nan_policy='omit')

## 2. Identify Anomalies (Ghost Districts)

In [None]:
# A district is only considered when its enrolment volume is substantial.
high_enrol = df['total_enrol'].gt(1000)

# Suspicious update activity: either no updates at all, or an update intensity
# more than two standard deviations below the mean.
no_updates = df['total_updates'].eq(0)
low_intensity = df['z_score'].lt(-2.0)

ghosts = df.loc[high_enrol & (no_updates | low_intensity)]
print(f'Ghost Districts Found: {len(ghosts)}')
ghosts[['state', 'district', 'total_enrol']].head()

## 3. Statistical Validation (Welch's T-Test)

In [None]:
# Every district that was not flagged as a ghost forms the comparison group.
normal = df[~df.index.isin(ghosts.index)]

# Drop missing intensities up front so a single NaN cannot turn the whole
# test result into NaN.
ghost_intensity = ghosts['update_intensity'].dropna()
normal_intensity = normal['update_intensity'].dropna()

# NOTE(review): the ghost group is selected partly on the z-score of
# update_intensity, and the same metric is tested here, so a small p-value is
# expected by construction -- treat it as descriptive, not confirmatory.
if min(len(ghost_intensity), len(normal_intensity)) < 2:
    # A variance estimate needs at least two observations per group; report
    # the reason instead of letting scipy return NaN without explanation.
    t_stat, p_val = float('nan'), float('nan')
    print('Not enough observations in one of the groups to run the t-test.')
else:
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_val = stats.ttest_ind(ghost_intensity, normal_intensity, equal_var=False)
print(f'P-Value: {p_val}')