In [1]:
import pandas as pd
from pathlib import Path


In [2]:
data_path = Path("../data/processed/concated_data.csv")
df = pd.read_csv(data_path)

df.head()


Unnamed: 0.1,Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,total_bio_updates
0,0,2025-03-01,Haryana,Mahendragarh,123029,280,577,857
1,1,2025-03-01,Bihar,Madhepura,852121,144,369,513
2,2,2025-03-01,Jammu and Kashmir,Punch,185101,643,1091,1734
3,3,2025-03-01,Bihar,Bhojpur,802158,256,980,1236
4,4,2025-03-01,Tamil Nadu,Madurai,625514,271,815,1086


In [3]:
# drop missing districts
df = df[df["district"].notna()].copy()

# normalize district names
df["district"] = df["district"].str.upper().str.strip()

# parse date safely
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# drop bad dates
df = df.dropna(subset=["date"])

df.shape


(1861108, 8)

In [4]:
daily = (
    df
    .groupby(["district", "date"], as_index=False)
    .agg(total_bio_updates=("total_bio_updates", "sum"))
)

daily.head()


Unnamed: 0,district,date,total_bio_updates
0,?,2025-11-17,1
1,ADILABAD,2025-03-01,12872
2,ADILABAD,2025-04-01,9333
3,ADILABAD,2025-05-01,10412
4,ADILABAD,2025-06-01,14734


In [5]:
district_stats = (
    daily
    .groupby("district")
    .agg(
        mean_updates=("total_bio_updates", "mean"),
        std_updates=("total_bio_updates", "std")
    )
    .reset_index()
)

district_stats.head()


Unnamed: 0,district,mean_updates,std_updates
0,?,1.0,
1,ADILABAD,1520.931034,2944.586838
2,AGAR MALWA,202.566265,516.212525
3,AGRA,2640.988764,6802.016797
4,AHILYANAGAR,2.125,1.807722


In [6]:
daily = daily.merge(
    district_stats,
    on="district",
    how="left"
)

daily.columns


Index(['district', 'date', 'total_bio_updates', 'mean_updates', 'std_updates'], dtype='object')

In [7]:
daily = daily.dropna(subset=["std_updates"])


In [8]:
daily["z_score"] = (
    (daily["total_bio_updates"] - daily["mean_updates"]) /
    daily["std_updates"]
)


In [9]:
daily["is_anomaly"] = daily["z_score"].abs() >= 2.5


In [10]:
district_summary = (
    daily
    .groupby("district")
    .agg(
        total_days=("date", "nunique"),
        anomaly_days=("is_anomaly", "sum"),
        avg_z_score=("z_score", "mean"),
        max_z_score=("z_score", "max")
    )
    .reset_index()
)

district_summary["anomaly_ratio"] = (
    district_summary["anomaly_days"] /
    district_summary["total_days"]
)

district_summary["risk_score"] = (
    district_summary["anomaly_ratio"] *
    district_summary["max_z_score"]
)

district_summary.head()


Unnamed: 0,district,total_days,anomaly_days,avg_z_score,max_z_score,anomaly_ratio,risk_score
0,ADILABAD,87,5,-7.656711e-18,5.265618,0.057471,0.302622
1,AGAR MALWA,83,5,2.675236e-18,4.684957,0.060241,0.282226
2,AGRA,89,5,1.1850700000000001e-17,5.457354,0.05618,0.306593
3,AHILYANAGAR,8,0,-2.775558e-17,1.5904,0.0,0.0
4,AHMADABAD,84,3,-1.982541e-17,6.823151,0.035714,0.243684


In [11]:
district_summary.to_csv(
    "../data/processed/district_risk_scores.csv",
    index=False
)
