In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Root folder of the downloaded Aadhaar API exports. Every CSV path below is
# built from this single value, so running the notebook on another machine
# means editing one line instead of twelve.
DATA_DIR = "C:/Desktop/For_hackathons/NDH/datasets"


def _read_chunk(dataset, start, stop):
    """Load one CSV chunk of an Aadhaar API export.

    Parameters
    ----------
    dataset : str
        Export name as used in the folder/file names: "biometric",
        "demographic" or "enrolment".
    start, stop : int
        The two numbers that close the chunk's file name (presumably the
        row offsets covered by that chunk).

    Returns
    -------
    pandas.DataFrame
        Parsed contents of
        ``{DATA_DIR}/api_data_aadhar_{dataset}/api_data_aadhar_{dataset}_{start}_{stop}.csv``.
    """
    stem = f"api_data_aadhar_{dataset}"
    return pd.read_csv(f"{DATA_DIR}/{stem}/{stem}_{start}_{stop}.csv")


# Biometric export: four chunks.
biometric1 = _read_chunk("biometric", 0, 500000)
biometric2 = _read_chunk("biometric", 500000, 1000000)
biometric3 = _read_chunk("biometric", 1000000, 1500000)
biometric4 = _read_chunk("biometric", 1500000, 1861108)

# Demographic export: five chunks.
demo1 = _read_chunk("demographic", 0, 500000)
demo2 = _read_chunk("demographic", 500000, 1000000)
demo3 = _read_chunk("demographic", 1000000, 1500000)
demo4 = _read_chunk("demographic", 1500000, 2000000)
demo5 = _read_chunk("demographic", 2000000, 2071700)

# Enrolment export: three chunks.
enroll1 = _read_chunk("enrolment", 0, 500000)
enroll2 = _read_chunk("enrolment", 500000, 1000000)
enroll3 = _read_chunk("enrolment", 1000000, 1006029)

In [5]:
# Stack the per-file chunks of each export into one frame. ignore_index=True
# discards the chunk-local row labels and renumbers the result from 0.
biometric = pd.concat(
    (biometric1, biometric2, biometric3, biometric4),
    ignore_index=True,
)
demographic = pd.concat(
    (demo1, demo2, demo3, demo4, demo5),
    ignore_index=True,
)
enrolment = pd.concat(
    (enroll1, enroll2, enroll3),
    ignore_index=True,
)

In [16]:
# renaming columns
#
# Each mapping is old column name -> short name used by the rest of the
# notebook. DataFrame.rename ignores keys that are not present, so re-running
# this cell after the columns were already renamed is a no-op.
bio_column_names = {"bio_age_5_17": "bio_5_17", "bio_age_17_": "bio_17_plus"}
demo_column_names = {"demo_age_5_17": "demo_5_17", "demo_age_17_": "demo_17_plus"}
enroll_column_names = {
    "age_0_5": "enroll_0_5",
    "age_5_17": "enroll_5_17",
    "age_18_greater": "enroll_18_plus",
}

biometric = biometric.rename(columns=bio_column_names)
demographic = demographic.rename(columns=demo_column_names)
enrolment = enrolment.rename(columns=enroll_column_names)

In [None]:
# normalizing state names

# Spelling variant -> canonical state / union-territory name.
# normalize_state() consults this map *after* title-casing its input, so only
# Title Case keys ("Jammu And Kashmir", "Nct Of Delhi", ...) can match there.
# The keys spelled with a lower-case "and" are never hit by that function;
# they are kept so any direct use of the map keeps working.
state_fix_map = {
    "Andaman And Nicobar Islands": "Andaman and Nicobar Islands",
    "Andaman and Nicobar Islands": "Andaman and Nicobar Islands",
    "Andaman & Nicobar Islands": "Andaman and Nicobar Islands",

    # The two former union territories are both folded into the combined name.
    "Dadra And Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadra & Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadra and Nagar Haveli and Daman and Diu": "Dadra and Nagar Haveli and Daman and Diu",

    "Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Daman & Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Daman and Diu": "Dadra and Nagar Haveli and Daman and Diu",

    "Jammu & Kashmir": "Jammu and Kashmir",
    "Jammu And Kashmir": "Jammu and Kashmir",
    "Jammu and Kashmir": "Jammu and Kashmir",

    "Nct Of Delhi": "NCT of Delhi",
    "Delhi": "NCT of Delhi",

    "Westbengal": "West Bengal",
    "West Bengal": "West Bengal",
}


def normalize_state(s):
    """Return a canonical spelling for one state-name cell.

    Steps: turn non-breaking spaces and hyphens into plain spaces, trim and
    collapse runs of whitespace, Title Case the result, then replace known
    variants via ``state_fix_map``. Names without a map entry are returned
    in their cleaned Title Case form.

    Parameters
    ----------
    s : object
        A cell value from a ``state`` column. Missing values (anything
        ``pd.isna`` reports as missing) are returned unchanged. Other
        non-string values (e.g. a number in a mixed-type column) are
        converted with ``str()`` first instead of raising AttributeError.

    Returns
    -------
    str or the original missing value
    """
    if pd.isna(s):
        return s
    if not isinstance(s, str):
        # Keep the key column uniformly string-typed rather than crashing
        # on a stray numeric cell.
        s = str(s)
    s = s.replace("\u00A0", " ").replace("-", " ")
    # split()/join both trims the ends and collapses inner whitespace runs.
    s = " ".join(s.split()).title()
    return state_fix_map.get(s, s)

# Clean the geographic key columns of all three frames in place.
for df in (biometric, demographic, enrolment):
    df["state"] = df["state"].map(normalize_state)
    # Districts only get edge-whitespace trimming and Title Case; no alias
    # map is applied to them.
    trimmed_district = df["district"].str.strip()
    df["district"] = trimmed_district.str.title()


In [None]:
# The three exports are joined on their shared time + geography identifiers.
# Inner joins keep only key combinations that occur in all three frames.
# NOTE(review): key uniqueness is not checked here; if a frame holds several
# rows for the same (date, state, district, pincode), the join emits every
# pairing of them -- confirm that is intended.
join_keys = ["date", "state", "district", "pincode"]

merged = (
    biometric
    .merge(demographic, how="inner", on=join_keys)
    .merge(enrolment, how="inner", on=join_keys)
)

print("Merged dataset shape:", merged.shape)

Merged dataset shape: (763916, 11)


In [18]:
# Row-wise totals across the age buckets of each dataset; these represent
# the actual operational load. A missing bucket makes the total missing too.
merged["total_bio"] = merged["bio_5_17"].add(merged["bio_17_plus"])
merged["total_demo"] = merged["demo_5_17"].add(merged["demo_17_plus"])
merged["total_enroll"] = (
    merged["enroll_0_5"]
    .add(merged["enroll_5_17"])
    .add(merged["enroll_18_plus"])
)

# Update volume relative to new enrolments. The denominator is shifted by 1,
# presumably so rows with zero enrolments do not divide by zero.
enroll_plus_one = merged["total_enroll"].add(1)
merged["biometric_ratio"] = merged["total_bio"].div(enroll_plus_one)
merged["demo_ratio"] = merged["total_demo"].div(enroll_plus_one)

In [25]:
# Standardise each total against the mean and standard deviation of the whole
# merged table (Series.std uses pandas' default ddof=1). A large |z| marks a
# row whose activity is far from the overall distribution.
for total_col, z_col in (("total_bio", "z_bio"), ("total_demo", "z_demo")):
    totals = merged[total_col]
    merged[z_col] = (totals - totals.mean()) / totals.std()

# threshold: |z| > 3 indicates strong anomaly
z_threshold = 3
# A row is flagged when either of its two z-scores exceeds the threshold.
is_anomalous = merged[["z_bio", "z_demo"]].abs().gt(z_threshold).any(axis=1)
anomalies = merged[is_anomalous]

print("Total anomalies detected:", anomalies.shape[0])

Total anomalies detected: 2163
