In [1]:
import os
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [2]:
def load_and_concat_csvs(folder_path):
    all_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith(".csv")
    ]
    
    df_list = []
    for file in all_files:
        df = pd.read_csv(file)
        df_list.append(df)
    
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

In [3]:
import os

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATASET_PATH = os.path.join(PROJECT_ROOT, "dataset")

print("Project root:", PROJECT_ROOT)
print("Dataset path:", DATASET_PATH)

Project root: e:\Ajinkya STUDY\UIDAI
Dataset path: e:\Ajinkya STUDY\UIDAI\dataset


In [4]:
BASE_PATH = "dataset"

enrolment_path = os.path.join(DATASET_PATH, "api_data_aadhar_enrolment")
demographic_path = os.path.join(DATASET_PATH, "api_data_aadhar_demographic")
biometric_path = os.path.join(DATASET_PATH, "api_data_aadhar_biometric")

enrolment_df = load_and_concat_csvs(enrolment_path)
demographic_df = load_and_concat_csvs(demographic_path)
biometric_df = load_and_concat_csvs(biometric_path)

In [5]:
print("Enrolment:", enrolment_df.shape)
print("Demographic:", demographic_df.shape)
print("Biometric:", biometric_df.shape)

enrolment_df.head()

Enrolment: (1006029, 7)
Demographic: (2071700, 6)
Biometric: (1861108, 6)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [6]:
def standardise_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
    )
    return df

enrolment_df = standardise_columns(enrolment_df)
demographic_df = standardise_columns(demographic_df)
biometric_df = standardise_columns(biometric_df)

In [7]:
def clean_keys(df):
    df["state"] = df["state"].astype(str).str.strip()
    df["district"] = df["district"].astype(str).str.strip()
    df["pincode"] = df["pincode"].astype(str).str.strip()
    return df

enrolment_df = clean_keys(enrolment_df)
demographic_df = clean_keys(demographic_df)
biometric_df = clean_keys(biometric_df)

In [8]:
def drop_invalid_rows(df):
    df = df.dropna(subset=["state", "district", "pincode"])
    return df

enrolment_df = drop_invalid_rows(enrolment_df)
demographic_df = drop_invalid_rows(demographic_df)
biometric_df = drop_invalid_rows(biometric_df)

In [9]:
def remove_negative_counts(df, count_columns):
    for col in count_columns:
        df = df[df[col] >= 0]
    return df

In [10]:
enrolment_df = remove_negative_counts(
    enrolment_df,
    ["age_0_5", "age_5_17", "age_18_greater"]
)

demographic_df = remove_negative_counts(
    demographic_df,
    ["demo_age_5_17", "demo_age_17_"]
)

biometric_df = remove_negative_counts(
    biometric_df,
    ["bio_age_5_17", "bio_age_17_"]
)

In [11]:
enrolment_df = enrolment_df.drop_duplicates(
    subset=["state", "district", "pincode"]
)

demographic_df = demographic_df.drop_duplicates(
    subset=["state", "district", "pincode"]
)

biometric_df = biometric_df.drop_duplicates(
    subset=["state", "district", "pincode"]
)

In [12]:
print("Enrolment:", enrolment_df.shape)
print("Demographic:", demographic_df.shape)
print("Biometric:", biometric_df.shape)

enrolment_df.head()

Enrolment: (28982, 7)
Demographic: (31495, 6)
Biometric: (31295, 6)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [13]:
demographic_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [14]:
biometric_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [15]:
print(enrolment_df.columns.tolist())
print(demographic_df.columns.tolist())
print(biometric_df.columns.tolist())

['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [16]:
master_df = enrolment_df.merge(
    demographic_df,
    on=["state", "district", "pincode"],
    how="left"
)

In [18]:
master_df = master_df.merge(
    biometric_df,
    on=["state", "district", "pincode"],
    how="left"
)

In [19]:
update_columns = [
    "demo_age_5_17",
    "demo_age_17_",
    "bio_age_5_17",
    "bio_age_17_"
]

master_df[update_columns] = master_df[update_columns].fillna(0)

In [20]:
numeric_columns = [
    "age_0_5", "age_5_17", "age_18_greater",
    "demo_age_5_17", "demo_age_17_",
    "bio_age_5_17", "bio_age_17_"
]

master_df[numeric_columns] = master_df[numeric_columns].astype(int)

In [21]:
master_df.isna().sum()

date_x              0
state               0
district            0
pincode             0
age_0_5             0
age_5_17            0
age_18_greater      0
date_y            537
demo_age_5_17       0
demo_age_17_        0
date              585
bio_age_5_17        0
bio_age_17_         0
dtype: int64

In [23]:
os.makedirs("outputs", exist_ok=True)
master_df.to_csv("outputs/master_dataset_phase2.csv", index=False)