# UIDAI Data Hackathon 2026  
## Notebook 01: Data Merging & Consolidation

This notebook consolidates multiple CSV files (API chunks) for each Aadhaar dataset
into a single dataframe per dataset. This ensures schema consistency and reproducibility.


In [1]:
import pandas as pd
import glob
import os

pd.set_option("display.max_columns", None)

In [2]:
BASE_PATH = "/Users/aaronrao/Desktop/projects/UIDAI_Aadhaar_Data_Insights/data"

BIOMETRIC_PATH = os.path.join(BASE_PATH, "biometric")
DEMOGRAPHIC_PATH = os.path.join(BASE_PATH, "demographic")
ENROLMENT_PATH = os.path.join(BASE_PATH, "enrolment")

PROCESSED_PATH = os.path.join(BASE_PATH, "processed")
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [3]:
def merge_csv_files(folder_path):
    """
    Merges all CSV files in a given folder into a single pandas DataFrame.
    """
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    
    df_list = []
    for file in csv_files:
        print(f"Reading: {os.path.basename(file)}")
        df = pd.read_csv(file)
        df_list.append(df)
    
    merged_df = pd.concat(df_list, ignore_index=True)
    return merged_df

In [4]:
biometric_df = merge_csv_files(BIOMETRIC_PATH)

print("Biometric Dataset Shape:", biometric_df.shape)
biometric_df.head()

Reading: api_data_aadhar_biometric_0_500000.csv
Reading: api_data_aadhar_biometric_1000000_1500000.csv
Reading: api_data_aadhar_biometric_1500000_1861108.csv
Reading: api_data_aadhar_biometric_500000_1000000.csv
Biometric Dataset Shape: (1861108, 6)


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [5]:
demographic_df = merge_csv_files(DEMOGRAPHIC_PATH)

print("Demographic Dataset Shape:", demographic_df.shape)
demographic_df.head()

Reading: api_data_aadhar_demographic_0_500000.csv
Reading: api_data_aadhar_demographic_1000000_1500000.csv
Reading: api_data_aadhar_demographic_1500000_2000000.csv
Reading: api_data_aadhar_demographic_2000000_2071700.csv
Reading: api_data_aadhar_demographic_500000_1000000.csv
Demographic Dataset Shape: (2071700, 6)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [6]:
enrolment_df = merge_csv_files(ENROLMENT_PATH)

print("Enrolment Dataset Shape:", enrolment_df.shape)
enrolment_df.head()

Reading: api_data_aadhar_enrolment_0_500000.csv
Reading: api_data_aadhar_enrolment_1000000_1006029.csv
Reading: api_data_aadhar_enrolment_500000_1000000.csv
Enrolment Dataset Shape: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [7]:
print("Biometric Columns:", biometric_df.columns.tolist())
print("Demographic Columns:", demographic_df.columns.tolist())
print("Enrolment Columns:", enrolment_df.columns.tolist())

Biometric Columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']
Demographic Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']
Enrolment Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


In [8]:
biometric_df.to_csv(
    os.path.join(PROCESSED_PATH, "biometric_merged.csv"),
    index=False
)

demographic_df.to_csv(
    os.path.join(PROCESSED_PATH, "demographic_merged.csv"),
    index=False
)

enrolment_df.to_csv(
    os.path.join(PROCESSED_PATH, "enrolment_merged.csv"),
    index=False
)

print("Merged datasets saved successfully in 'data/processed/'")

Merged datasets saved successfully in 'data/processed/'


### Summary
- Successfully consolidated all API-chunked CSV files for each dataset
- Verified schema consistency across files
- Stored merged datasets for downstream cleaning and analysis

Next notebook: **02_data_cleaning.ipynb**
