In [1]:
from pathlib import Path

# Absolute path of the project root; the notebook is expected to run from
# notebooks/, so the root is the parent of the working directory.
PROJECT_ROOT = Path("..").resolve()

# Standard project sub-directories, all derived from the root.
DATA_DIR = PROJECT_ROOT.joinpath("data")
SHAPEFILE_DIR = DATA_DIR.joinpath("shapefile")
OUTPUT_DIR = PROJECT_ROOT.joinpath("outputs")

print("Project root:", PROJECT_ROOT)


Project root: D:\MSc. Data Science\SEM - 4\UDAI HACKATHON\UIDAI_Operational_Stress


In [2]:
import pandas as pd
import glob
import numpy as np

def _read_csv_group(pattern):
    """Read every CSV matching ``pattern`` and stack them into one DataFrame.

    Parameters
    ----------
    pattern : str
        Glob pattern, resolved relative to the current working directory
        unless it is absolute.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching files with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    # glob.glob returns matches in arbitrary order; sorting makes the row order
    # of the concatenated frame reproducible across machines and runs.
    paths = sorted(glob.glob(pattern))
    if not paths:
        # Without this guard pd.concat fails with an opaque
        # "No objects to concatenate" error.
        raise FileNotFoundError(f"No CSV files match pattern {pattern!r}")
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


# Load datasets (one frame per update type, each stitched from its CSV parts)
enrolment_full = _read_csv_group("api_data_aadhar_enrolment_*.csv")
demographic_full = _read_csv_group("api_data_aadhar_demographic_*.csv")
biometric_full = _read_csv_group("api_data_aadhar_biometric_*.csv")

# Standardize date
# NOTE(review): errors="coerce" turns every unparseable value into NaT without
# any message. If the CSVs store dates day-first (e.g. DD-MM-YYYY) - TODO
# confirm against the raw files - pass dayfirst=True or an explicit format=
# so that valid rows are not silently lost by the date filters further down.
for df in [enrolment_full, demographic_full, biometric_full]:
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

print("Loaded OK")


Loaded OK


In [3]:
# Analysis window; both end points are inclusive.
start = pd.Timestamp("2025-04-01")
end = pd.Timestamp("2025-10-31")

# Restrict every dataset to rows dated inside the window. Rows whose date is
# NaT never satisfy the range test and are therefore dropped here as well.
enrol_time = enrolment_full[enrolment_full["date"].between(start, end)]
demo_time = demographic_full[demographic_full["date"].between(start, end)]
bio_time = biometric_full[biometric_full["date"].between(start, end)]


In [4]:
# Distinct, non-null reporting dates of each dataset inside the window.
dates_per_source = [
    set(frame["date"].dropna().unique())
    for frame in (enrol_time, demo_time, bio_time)
]

# Only dates present in all three datasets are comparable across them.
common_dates = set.intersection(*dates_per_source)

print("Common reporting days:", len(common_dates))


Common reporting days: 14


In [5]:
# Keep, for every dataset, only the rows recorded on a shared reporting day.
enrol_geo, demo_geo, bio_geo = (
    frame[frame["date"].isin(common_dates)]
    for frame in (enrol_time, demo_time, bio_time)
)


In [6]:
def agg_by_district(df):
    """Collapse all age-bucket count columns into a single total per district.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``district`` column and at least one numeric column
        whose name contains ``"age"`` (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        One row per district with the columns ``district`` and ``total``,
        where ``total`` is the sum of every selected age column over all rows
        of that district.

    Raises
    ------
    ValueError
        If ``df`` has no numeric column whose name contains ``"age"``.
    """
    # The name test is a plain substring match, so it also hits unrelated
    # columns such as "village" or "percentage". Requiring a numeric dtype
    # keeps text columns out of the sum (they would otherwise break or
    # corrupt the row-wise total); non-numeric matches are skipped.
    age_cols = [
        col
        for col in df.columns
        if "age" in str(col).lower() and pd.api.types.is_numeric_dtype(df[col])
    ]
    if not age_cols:
        # An empty selection would otherwise yield a total of 0 for every
        # district without any hint that nothing was summed.
        raise ValueError("No numeric age columns found to aggregate")

    return (
        df.groupby("district")[age_cols]
          .sum()            # per-district sum of each age column
          .sum(axis=1)      # collapse the age columns into one number
          .reset_index(name="total")
    )

# District-level totals for each dataset; the generic "total" column is renamed
# so the three frames can later sit side by side without clashing.
enrol_dist, demo_dist, bio_dist = (
    agg_by_district(frame).rename(columns={"total": total_name})
    for frame, total_name in (
        (enrol_geo, "enrolment_total"),
        (demo_geo, "demographic_total"),
        (bio_geo, "biometric_total"),
    )
)


In [7]:
# NOTE: this cell used to repeat, verbatim, the `agg_by_district` definition and
# the three aggregation calls of the previous cell. The duplicate was removed
# so there is a single definition to maintain; `enrol_dist`, `demo_dist` and
# `bio_dist` are exactly the frames produced above. The cell now only validates
# them before they are merged.
for label, totals in (
    ("enrolment", enrol_dist),
    ("demographic", demo_dist),
    ("biometric", bio_dist),
):
    # One row per district is required, otherwise merging on "district"
    # would multiply rows.
    assert totals["district"].is_unique, f"duplicate districts in {label} totals"


In [8]:
# Minimum enrolment volume a district needs before its update-to-enrolment
# ratios are considered (smaller denominators give noisy ratios).
MIN_ENROLMENT_TOTAL = 100

# Inner joins: a district is kept only if it appears in all three aggregates.
# validate= guards the one-row-per-district assumption of the inputs.
geo_merged = (
    enrol_dist
    .merge(demo_dist, on="district", how="inner", validate="one_to_one")
    .merge(bio_dist, on="district", how="inner", validate="one_to_one")
)

# remove low-denominator noise
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
geo_filtered = geo_merged.loc[
    geo_merged["enrolment_total"] >= MIN_ENROLMENT_TOTAL
].copy()

print("Districts retained:", geo_filtered.shape[0])


Districts retained: 727


In [9]:
# Reference thresholds: median update volume divided by median enrolment
# volume across the retained districts (a ratio of medians, not the median of
# the per-district ratios).
enrol_median = geo_filtered["enrolment_total"].median()
demo_thresh = geo_filtered["demographic_total"].median() / enrol_median
bio_thresh = geo_filtered["biometric_total"].median() / enrol_median

# Per-district update-to-enrolment ratios, compared against the thresholds later.
enrol_totals = geo_filtered["enrolment_total"]
geo_filtered["demo_to_enrol_ratio"] = geo_filtered["demographic_total"] / enrol_totals
geo_filtered["bio_to_enrol_ratio"] = geo_filtered["biometric_total"] / enrol_totals

def classify_stress(row, demo_threshold=None, bio_threshold=None):
    """Label one district by which of its update ratios reach their threshold.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``"demo_to_enrol_ratio"`` and ``"bio_to_enrol_ratio"``.
    demo_threshold : float, optional
        Cut-off for the demographic ratio. Defaults to the notebook-level
        ``demo_thresh``.
    bio_threshold : float, optional
        Cut-off for the biometric ratio. Defaults to the notebook-level
        ``bio_thresh``.

    Returns
    -------
    str
        ``"High Demographic + High Biometric"``, ``"High Demographic Stress"``,
        ``"High Biometric Stress"`` or ``"Low Stress"``.

    Notes
    -----
    A ratio counts as high when it is greater than or equal to its threshold.
    A NaN ratio never compares as high, so such a row falls through to
    ``"Low Stress"``.
    """
    # Fall back to the globals computed earlier in the notebook so that
    # `geo_filtered.apply(classify_stress, axis=1)` keeps working unchanged.
    if demo_threshold is None:
        demo_threshold = demo_thresh
    if bio_threshold is None:
        bio_threshold = bio_thresh

    high_demo = row["demo_to_enrol_ratio"] >= demo_threshold
    high_bio = row["bio_to_enrol_ratio"] >= bio_threshold

    if high_demo and high_bio:
        return "High Demographic + High Biometric"
    if high_demo:
        return "High Demographic Stress"
    if high_bio:
        return "High Biometric Stress"
    return "Low Stress"

# Classify every district row by row and store the label next to its ratios.
stress_labels = geo_filtered.apply(classify_stress, axis=1)
geo_filtered["stress_type"] = stress_labels

# Number of districts in each stress category.
geo_filtered["stress_type"].value_counts()


stress_type
High Demographic + High Biometric    234
High Demographic Stress              198
Low Stress                           190
High Biometric Stress                105
Name: count, dtype: int64

In [10]:
# Total demographic and biometric update volume per stress category.
volume_cols = ["demographic_total", "biometric_total"]
stress_contrib = geo_filtered.groupby("stress_type")[volume_cols].sum()

# Express each category as a percentage of its column's overall total,
# so every column of the result adds up to 100.
column_totals = stress_contrib.sum()
stress_contrib_pct = (stress_contrib / column_totals) * 100

stress_contrib_pct


Unnamed: 0_level_0,demographic_total,biometric_total
stress_type,Unnamed: 1_level_1,Unnamed: 2_level_1
High Biometric Stress,7.588533,15.282052
High Demographic + High Biometric,34.029522,43.055061
High Demographic Stress,43.667721,23.516304
Low Stress,14.714224,18.146583
