In [4]:
import pandas as pd
from pathlib import Path

BASE_PATH = Path("../data/raw")


In [5]:
def load_csv_folder(folder_path):
    dfs = []
    for csv_file in folder_path.glob("*.csv"):
        df = pd.read_csv(csv_file)
        df["__source_file__"] = csv_file.name
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)


In [6]:
enrolment_path = BASE_PATH / "api_data_aadhar_enrolment"
enrolment_df = load_csv_folder(enrolment_path)

enrolment_df.head()


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,__source_file__
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37,api_data_aadhar_enrolment_0_500000.csv
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39,api_data_aadhar_enrolment_0_500000.csv
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,api_data_aadhar_enrolment_0_500000.csv
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15,api_data_aadhar_enrolment_0_500000.csv
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21,api_data_aadhar_enrolment_0_500000.csv


In [7]:
enrolment_df.columns
enrolment_df.info()
enrolment_df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   date             1006029 non-null  object
 1   state            1006029 non-null  object
 2   district         1006029 non-null  object
 3   pincode          1006029 non-null  int64 
 4   age_0_5          1006029 non-null  int64 
 5   age_5_17         1006029 non-null  int64 
 6   age_18_greater   1006029 non-null  int64 
 7   __source_file__  1006029 non-null  object
dtypes: int64(4), object(4)
memory usage: 61.4+ MB


date               0
state              0
district           0
pincode            0
age_0_5            0
age_5_17           0
age_18_greater     0
__source_file__    0
dtype: int64

In [8]:
demographic_path = BASE_PATH / "api_data_aadhar_demographic"
demographic_df = load_csv_folder(demographic_path)

demographic_df.head()


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,__source_file__
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529,api_data_aadhar_demographic_0_500000.csv
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375,api_data_aadhar_demographic_0_500000.csv
2,01-03-2025,Gujarat,Rajkot,360006,65,765,api_data_aadhar_demographic_0_500000.csv
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314,api_data_aadhar_demographic_0_500000.csv
4,01-03-2025,Rajasthan,Udaipur,313801,45,785,api_data_aadhar_demographic_0_500000.csv


In [9]:
demographic_df.columns
demographic_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 7 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   date             object
 1   state            object
 2   district         object
 3   pincode          int64 
 4   demo_age_5_17    int64 
 5   demo_age_17_     int64 
 6   __source_file__  object
dtypes: int64(3), object(4)
memory usage: 110.6+ MB


In [10]:
biometric_path = BASE_PATH / "api_data_aadhar_biometric"
biometric_df = load_csv_folder(biometric_path)

biometric_df.head()



Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,__source_file__
0,01-03-2025,Haryana,Mahendragarh,123029,280,577,api_data_aadhar_biometric_0_500000.csv
1,01-03-2025,Bihar,Madhepura,852121,144,369,api_data_aadhar_biometric_0_500000.csv
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091,api_data_aadhar_biometric_0_500000.csv
3,01-03-2025,Bihar,Bhojpur,802158,256,980,api_data_aadhar_biometric_0_500000.csv
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815,api_data_aadhar_biometric_0_500000.csv


In [12]:
biometric_df.columns
biometric_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 7 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   date             object
 1   state            object
 2   district         object
 3   pincode          int64 
 4   bio_age_5_17     int64 
 5   bio_age_17_      int64 
 6   __source_file__  object
dtypes: int64(3), object(4)
memory usage: 99.4+ MB


In [20]:
print("Enrolment columns:", enrolment_df.columns.tolist())
print("Demographic columns:", demographic_df.columns.tolist())
print("Biometric columns:", biometric_df.columns.tolist())

def find_state_column(df):
    for col in df.columns:
        if "state" in col.lower():
            return col
    return None

enrol_state_col = find_state_column(enrolment_df)
demo_state_col = find_state_column(demographic_df)
bio_state_col = find_state_column(biometric_df)

enrol_state_col, demo_state_col, bio_state_col


def find_date_column(df):
    for col in df.columns:
        if "date" in col.lower():
            return col
    return None

enrol_date_col = find_date_column(enrolment_df)
enrol_date_col

print("Enrolment states:", enrolment_df[enrol_state_col].nunique())
print("Demographic states:", demographic_df[demo_state_col].nunique())
print("Biometric states:", biometric_df[bio_state_col].nunique())

def find_date_column(df):
    for col in df.columns:
        if "date" in col.lower():
            return col
    return None

enrol_date_col = find_date_column(enrolment_df)
enrol_date_col

if enrol_date_col:
    print(
        enrolment_df[enrol_date_col].min(),
        enrolment_df[enrol_date_col].max()
    )


Enrolment columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater', '__source_file__']
Demographic columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_', '__source_file__']
Biometric columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_', '__source_file__']
Enrolment states: 55
Demographic states: 65
Biometric states: 57
01-04-2025 31-12-2025
