In [1]:
import os
import pandas as pd
import numpy as np

# Display configuration: never truncate columns, show at most 100 rows.
# set_option accepts alternating (option, value) pairs in a single call.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", 100,
)

In [2]:
def load_and_concat_csvs(folder_path):
    """Read every ``.csv`` file directly inside ``folder_path`` into one frame.

    Files are read in sorted path order so the row order of the result is
    reproducible (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only entries whose name ends with ``.csv`` are
        read; sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    csv_paths = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith(".csv")
    )

    # pd.concat([]) would raise a ValueError that does not say which folder
    # was empty; fail early with the path in the message instead.
    if not csv_paths:
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = [pd.read_csv(path) for path in csv_paths]
    return pd.concat(frames, ignore_index=True)

In [3]:
import os

# The project root is taken to be the parent of the current working
# directory; the raw data lives in its "dataset" sub-folder.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATASET_PATH = os.path.join(PROJECT_ROOT, "dataset")

# Echo both locations so a wrong working directory is obvious immediately.
print("Project root:", PROJECT_ROOT)
print("Dataset path:", DATASET_PATH)

Project root: e:\Ajinkya STUDY\UIDAI
Dataset path: e:\Ajinkya STUDY\UIDAI\dataset


In [4]:
# NOTE(review): BASE_PATH is not referenced by any cell visible here; it is
# kept because later cells may still use it.
BASE_PATH = "dataset"

# One sub-folder of DATASET_PATH per data source.
enrolment_path, demographic_path, biometric_path = (
    os.path.join(DATASET_PATH, folder)
    for folder in (
        "api_data_aadhar_enrolment",
        "api_data_aadhar_demographic",
        "api_data_aadhar_biometric",
    )
)

# Load each source into its own frame (all CSVs in the folder, stacked).
enrolment_df, demographic_df, biometric_df = map(
    load_and_concat_csvs,
    (enrolment_path, demographic_path, biometric_path),
)

In [5]:
# Report (rows, columns) for each raw frame, then preview the enrolment data.
for label, frame in (
    ("Enrolment:", enrolment_df),
    ("Demographic:", demographic_df),
    ("Biometric:", biometric_df),
):
    print(label, frame.shape)

enrolment_df.head(n=5)

Enrolment: (1006029, 7)
Demographic: (2071700, 6)
Biometric: (1861108, 6)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [6]:
def standardise_columns(df):
    """Normalise column labels: trim surrounding whitespace and lower-case.

    The labels are replaced on the frame that is passed in, and that same
    frame object is returned.
    """
    trimmed = df.columns.str.strip()
    df.columns = trimmed.str.lower()
    return df

# Apply the same column-label normalisation to all three sources.
enrolment_df, demographic_df, biometric_df = (
    standardise_columns(frame)
    for frame in (enrolment_df, demographic_df, biometric_df)
)

In [7]:
def clean_keys(df):
    """Normalise the join-key columns ``state``, ``district`` and ``pincode``.

    Each key is converted to a string with surrounding whitespace removed.
    Missing values are kept as missing: a plain ``astype(str)`` would turn
    them into the literal strings ``"nan"`` / ``"None"``, which would then
    behave like real keys in ``groupby`` and ``merge``.

    The columns are replaced on the frame that is passed in, and that same
    frame object is returned.

    Raises
    ------
    KeyError
        If any of the three key columns is absent.
    """
    for key in ("state", "district", "pincode"):
        raw = df[key]
        cleaned = raw.astype(str).str.strip()
        # Restore missing entries that astype(str) rendered as text.
        df[key] = cleaned.mask(raw.isna())
    return df

# Normalise the state/district/pincode keys of all three sources alike, so
# the later group-by and merge steps compare like with like.
enrolment_df, demographic_df, biometric_df = (
    clean_keys(frame)
    for frame in (enrolment_df, demographic_df, biometric_df)
)

In [8]:
# Count rows whose (state, district, pincode) key occurs more than once.
# keep=False flags every member of a repeated key, not only the repeats.
enrolment_df.duplicated(["state", "district", "pincode"], keep=False).sum()

np.int64(1004421)

In [9]:
# Collapse enrolment to one row per (state, district, pincode) by summing
# the three age-band counts; columns not listed here are dropped.
enrolment_df = enrolment_df.groupby(
    ["state", "district", "pincode"], as_index=False
)[["age_0_5", "age_5_17", "age_18_greater"]].sum()

In [10]:
# Collapse demographic updates to one row per (state, district, pincode) by
# summing both age-band counts; columns not listed here are dropped.
demographic_df = demographic_df.groupby(
    ["state", "district", "pincode"], as_index=False
)[["demo_age_5_17", "demo_age_17_"]].sum()

In [11]:
# Collapse biometric updates to one row per (state, district, pincode) by
# summing both age-band counts; columns not listed here are dropped.
biometric_df = biometric_df.groupby(
    ["state", "district", "pincode"], as_index=False
)[["bio_age_5_17", "bio_age_17_"]].sum()

In [12]:
# Shapes after aggregation: one row per distinct key in each frame.
for frame in (enrolment_df, demographic_df, biometric_df):
    print(frame.shape)

(28982, 6)
(31495, 5)
(31295, 5)


In [13]:
# Build the master table by left-joining the demographic and biometric
# aggregates onto the enrolment aggregate. A left join keeps exactly the
# keys present in enrolment_df; keys that occur only in the other two
# frames are not carried over.
#
# validate="one_to_one" makes pandas raise MergeError if the key is not
# unique on either side, so an un-aggregated frame cannot silently
# multiply rows in the result.
master_df = (
    enrolment_df
    .merge(
        demographic_df,
        on=["state", "district", "pincode"],
        how="left",
        validate="one_to_one",
    )
    .merge(
        biometric_df,
        on=["state", "district", "pincode"],
        how="left",
        validate="one_to_one",
    )
)

In [14]:
# Count columns contributed by the demographic and biometric frames. Keys
# with no match in the left joins have NaN here; treat those as zero.
update_cols = ["demo_age_5_17", "demo_age_17_", "bio_age_5_17", "bio_age_17_"]

master_df[update_cols] = master_df.loc[:, update_cols].fillna(value=0)

In [19]:
# Preview the first five rows of the merged table.
master_df.head(n=5)

Unnamed: 0,state,district,pincode,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,100000,100000,100000,0,1,217,0.0,2.0,0.0,0.0
1,Andaman & Nicobar Islands,Andamans,744101,8,1,0,4.0,299.0,177.0,1147.0
2,Andaman & Nicobar Islands,Andamans,744103,24,1,0,1.0,147.0,63.0,152.0
3,Andaman & Nicobar Islands,Andamans,744105,22,0,0,1.0,135.0,43.0,157.0
4,Andaman & Nicobar Islands,Andamans,744106,3,2,0,0.0,66.0,37.0,125.0
