# Data Cleaning & Monthly Aggregation

Objective:
- Clean date fields
- Create Month column
- Aggregate enrolment, demographic, and biometric data
- Create a single master dataset for analysis & ML


In [None]:
import pandas as pd
import numpy as np


In [None]:
# Helper function
def load_and_combine(files):
    """Read every CSV in ``files`` and stack the rows into one DataFrame.

    Args:
        files: Iterable of CSV paths, read in the order given.

    Returns:
        A single DataFrame holding the rows of all files, with a fresh
        0..n-1 index (the per-file indexes are discarded).
    """
    frames = list(map(pd.read_csv, files))
    return pd.concat(frames, ignore_index=True)

# File paths
# Each raw export is split into row-range chunks named
# "<dataset>_<start>_<end>.csv" inside a folder named after the dataset.
# The lists below enumerate those chunks in ascending row order.
def _chunk_paths(dataset, bounds):
    """Return chunk paths for ``dataset``, one per consecutive pair in ``bounds``."""
    stem = f"api_data_aadhar_{dataset}"
    return [
        f"../data/raw/{stem}/{stem}_{start}_{end}.csv"
        for start, end in zip(bounds, bounds[1:])
    ]


enrol_files = _chunk_paths(
    "enrolment", [0, 500000, 1000000, 1006029]
)

demo_files = _chunk_paths(
    "demographic", [0, 500000, 1000000, 1500000, 2000000, 2071700]
)

bio_files = _chunk_paths(
    "biometric", [0, 500000, 1000000, 1500000, 1861108]
)

# Load
# Read and stack the chunk files of each dataset into one DataFrame apiece.
enrol, demo, bio = (
    load_and_combine(paths)
    for paths in (enrol_files, demo_files, bio_files)
)

# Quick sanity check: (rows, columns) of each combined frame.
print(enrol.shape, demo.shape, bio.shape)


In [None]:
# Convert the raw 'date' column of every dataset to pandas datetimes.
# dayfirst=True makes ambiguous values parse as day/month, not month/day.
for frame in (enrol, demo, bio):
    frame['date'] = pd.to_datetime(frame['date'], dayfirst=True)


In [None]:
# Derive a 'month' key from each parsed date: truncate to a monthly period,
# then store it as a string so it can be grouped on and written to CSV.
for frame in (enrol, demo, bio):
    monthly_period = frame['date'].dt.to_period('M')
    frame['month'] = monthly_period.astype(str)


In [None]:
# Add a per-row total to each dataset by summing its age-bucket columns.
# Columns are added left to right with `+`, so a missing value in any bucket
# leaves that row's total missing as well.
total_specs = [
    (enrol, 'total_enrolments', ['age_0_5', 'age_5_17', 'age_18_greater']),
    (demo, 'total_demo_updates', ['demo_age_5_17', 'demo_age_17_']),
    (bio, 'total_bio_updates', ['bio_age_5_17', 'bio_age_17_']),
]

for frame, total_col, age_cols in total_specs:
    running_total = frame[age_cols[0]]
    for age_col in age_cols[1:]:
        running_total = running_total + frame[age_col]
    frame[total_col] = running_total


In [None]:
# Collapse each dataset to one row per (state, district, month), summing the
# per-row totals. as_index=False keeps the grouping keys as ordinary columns
# so the results can be merged on them later.
monthly_keys = ['state', 'district', 'month']

enrol_monthly = enrol.groupby(monthly_keys, as_index=False)['total_enrolments'].sum()

demo_monthly = demo.groupby(monthly_keys, as_index=False)['total_demo_updates'].sum()

bio_monthly = bio.groupby(monthly_keys, as_index=False)['total_bio_updates'].sum()


In [None]:
# Combine the three monthly tables into one master table keyed on
# (state, district, month).
#
# Outer joins are used so that a key present in ANY of the three tables is
# kept. A left join anchored on enrolments would silently drop every
# district-month that recorded demographic or biometric updates but no new
# enrolments, under-reporting update activity in the master dataset.
merge_keys = ['state', 'district', 'month']

master = enrol_monthly.merge(
    demo_monthly,
    on=merge_keys,
    how='outer'
).merge(
    bio_monthly,
    on=merge_keys,
    how='outer'
)

# A count that is missing after the join means the source table had no rows
# for that key, i.e. zero activity. Only the count columns are filled so the
# key columns are never overwritten with 0.
count_cols = ['total_enrolments', 'total_demo_updates', 'total_bio_updates']
master[count_cols] = master[count_cols].fillna(0)


In [None]:
# Drop rows whose state or district name consists only of digits
# (values are compared as strings, so numeric dtypes are caught too).
state_is_numeric = master['state'].astype(str).str.isdigit()
district_is_numeric = master['district'].astype(str).str.isdigit()

# Keep a row only when neither field is digit-only.
master_clean = master.loc[~(state_is_numeric | district_is_numeric)]

print("Before cleaning:", master.shape)
print("After cleaning:", master_clean.shape)



In [None]:
print(master_clean.shape)
# Preview up to 52 random rows. The size is capped at the row count because
# sample() raises ValueError when asked for more rows than exist.
master_clean.sample(min(52, len(master_clean)))


In [None]:
from pathlib import Path

# Destination of the cleaned master table, relative to the notebook's
# working directory.
output_path = Path("../data/processed/aadhaar_master_monthly.csv")

# Create the output folder if it is missing (e.g. on a fresh checkout);
# otherwise to_csv fails with OSError after the whole pipeline has run.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information, so it is not written.
master_clean.to_csv(
    output_path,
    index=False
)

print("Saved cleaned master dataset to data/processed/")



## Pipeline Readiness

- Cleaned master dataset can be reused across analysis,
  forecasting, and dashboards
- Enables faster experimentation and reproducibility
