In [19]:
### Scope Note (Hackathon Constraint)
# NOTE(review): the note below is a bare string expression in a code cell, so
# the notebook echoes its repr as the cell output instead of rendering the
# Markdown; a Markdown cell would display the formatting as intended.

'''Due to portal access limitations, this analysis uses **state-level aggregated data for Puducherry only**.

The system is designed to be **region-agnostic**:
- The same pipeline applies to districts or multiple states
- All metrics operate on region-month units

This submission demonstrates a **pilot-scale deployment**, focusing on
methodological rigor, forward-looking risk modeling, and decision readiness. '''



'Due to portal access limitations, this analysis uses **state-level aggregated data for Puducherry only**.\n\nThe system is designed to be **region-agnostic**:\n- The same pipeline applies to districts or multiple states\n- All metrics operate on region-month units\n\nThis submission demonstrates a **pilot-scale deployment**, focusing on\nmethodological rigor, forward-looking risk modeling, and decision readiness. '

In [20]:
import pandas as pd

# Read the two raw Puducherry extracts (enrolment first, biometric second).
# The paths are relative, so they resolve against the kernel's working directory.
enrol, bio = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/Enrollment_data_puducherry.csv",
        "../data/raw/Biometric_data_Puducherry.csv",
    )
)


In [21]:
from pathlib import Path

# Project root directory: the parent of the current working directory, made
# absolute by resolve().
# NOTE(review): this presumes the kernel is started in a first-level subfolder
# of the project (e.g. notebooks/) — confirm.
BASE_DIR = Path("..").resolve()


In [22]:
# The extracts hold Puducherry data only, so both frames get the same constant
# region_id; it is used as the region key when the frames are merged later.
for raw_frame in (enrol, bio):
    raw_frame["region_id"] = "Puducherry"


In [23]:
# List the distinct states present in the enrolment extract; a single value
# confirms the Puducherry-only scope.
states_present = enrol["state"].unique()
print(states_present)


['Puducherry']


In [24]:
# Validate the single-region assumption against the raw "state" column.
#
# The previous version renamed "State" -> "region_id", but the enrolment column
# is lower-case "state", so the rename matched nothing and was a silent no-op.
# (With the correct case it would have produced a second "region_id" column,
# because region_id is already assigned as a constant.) The only assertion
# checked that constant column, so it could never detect a second region.
for source_name, source_frame in (("enrol", enrol), ("bio", bio)):
    # Only enrol's "state" column is confirmed elsewhere in this notebook; skip
    # the raw check for a frame that lacks it instead of raising KeyError.
    if "state" in source_frame.columns:
        states_found = source_frame["state"].dropna().unique().tolist()
        assert len(states_found) == 1, (
            f"Expected exactly one region in {source_name}, found: {states_found}"
        )

# Original guard on the derived key, kept so the cell still fails on an empty
# enrolment frame or a missing region_id column.
assert enrol["region_id"].nunique() == 1, "More than one region found"


In [25]:
# Quick profile of the enrolment extract.
# A notebook cell only echoes its final expression, so the first two checks are
# displayed explicitly; previously they were evaluated and silently discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(enrol.head())        # first rows: column names and sample values
display(enrol.isna().sum())  # missing values per column
enrol["date"].nunique()      # number of distinct dates (echoed as the cell result)


88

In [26]:
# Join enrolment and biometric counts on (region_id, date).
#
# Each source is first collapsed to ONE row per key. If a date has several rows
# in both frames (for example one per district or pincode), merging the raw
# frames pairs every enrolment row with every biometric row for that date,
# multiplying the row count and inflating any later totals. When the keys are
# already unique the aggregation leaves the counts unchanged.
#
# Only the key and count columns are carried forward; the later cells visible
# in this notebook use nothing else from the merged frame.
merge_keys = ["region_id", "date"]
count_cols = [
    "age_0_5", "age_5_17", "age_18_greater",  # enrolment counts
    "bio_age_5_17", "bio_age_17_",            # biometric-update counts
]


def _totals_per_key(frame):
    """Sum the count columns present in ``frame`` for each (region_id, date).

    Row order follows the first appearance of each key (``sort=False``) and
    rows with a missing key are kept (``dropna=False``), so they still take
    part in the merge as before.
    """
    present = [col for col in count_cols if col in frame.columns]
    grouped = frame.groupby(merge_keys, as_index=False, sort=False, dropna=False)
    return grouped[present].sum()


df = _totals_per_key(enrol).merge(
    _totals_per_key(bio),
    on=merge_keys,
    how="inner",
    validate="one_to_one",  # fail loudly if a key is still duplicated
)


In [27]:
# Derive canonical enrolment & biometric totals: for every row, add up the
# per-band count columns of each source.
enrolment_cols = ["age_0_5", "age_5_17", "age_18_greater"]
biometric_cols = ["bio_age_5_17", "bio_age_17_"]

df["enrolments"] = df.loc[:, enrolment_cols].sum(axis="columns")
df["biometric_updates"] = df.loc[:, biometric_cols].sum(axis="columns")


In [35]:
# Standardize the date column: rename it to "month", parse it day-first, and
# store each value as its monthly period rendered as a string.
df = df.rename(columns={"date": "month"})
parsed_dates = pd.to_datetime(df["month"], dayfirst=True)
df["month"] = parsed_dates.dt.to_period("M").astype(str)



In [36]:
# Keep only the canonical analysis columns, one row per region-month.
#
# Month labels come from converting dates to monthly periods, so several rows
# can collapse onto the same (region_id, month). Summing them here makes the
# frame match the region-month unit the analysis is scoped to; when every key
# is already unique the values are unchanged. Row order follows the first
# appearance of each key (sort=False).
# Assumes the merged frame holds at most one row per (region_id, date);
# duplicated join rows would be summed too.
df = (
    df[["region_id", "month", "enrolments", "biometric_updates"]]
    .groupby(["region_id", "month"], as_index=False, sort=False)
    .sum()
)


In [37]:
# Write the merged frame to the processed-data folder.
# Create the folder first: to_csv does not create missing directories and would
# otherwise raise when data/processed/ is absent (e.g. on a fresh checkout).
merged_csv_path = BASE_DIR / "data" / "processed" / "aadhaar_monthly_merged.csv"
merged_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(merged_csv_path, index=False)
