# UIDAI Data Hackathon 2026  
## Notebook 02: Data Cleaning & Preprocessing

This notebook performs data cleaning, validation, and feature engineering
on the merged Aadhaar datasets to prepare them for analysis.


In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option("display.max_columns", None)

In [2]:
BASE_PATH = "/Users/aaronrao/Desktop/projects/UIDAI_Aadhaar_Data_Insights/data/processed"

BIO_PATH = os.path.join(BASE_PATH, "biometric_merged.csv")
DEMO_PATH = os.path.join(BASE_PATH, "demographic_merged.csv")
ENROL_PATH = os.path.join(BASE_PATH, "enrolment_merged.csv")

In [3]:
biometric_df = pd.read_csv(BIO_PATH)
demographic_df = pd.read_csv(DEMO_PATH)
enrolment_df = pd.read_csv(ENROL_PATH)

print("Biometric:", biometric_df.shape)
print("Demographic:", demographic_df.shape)
print("Enrolment:", enrolment_df.shape)

Biometric: (1861108, 6)
Demographic: (2071700, 6)
Enrolment: (1006029, 7)


### Initial Data Inspection
We examine data types, missing values, and basic statistics
to understand data quality and consistency.


In [4]:
def inspect_df(df, name):
    print(f"\n{name} INFO")
    display(df.info())
    print("\nMissing Values:")
    display(df.isnull().sum())

inspect_df(biometric_df, "Biometric")
inspect_df(demographic_df, "Demographic")
inspect_df(enrolment_df, "Enrolment")


Biometric INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB


None


Missing Values:


date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64


Demographic INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   date           object
 1   state          object
 2   district       object
 3   pincode        int64 
 4   demo_age_5_17  int64 
 5   demo_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 94.8+ MB


None


Missing Values:


date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64


Enrolment INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   date            1006029 non-null  object
 1   state           1006029 non-null  object
 2   district        1006029 non-null  object
 3   pincode         1006029 non-null  int64 
 4   age_0_5         1006029 non-null  int64 
 5   age_5_17        1006029 non-null  int64 
 6   age_18_greater  1006029 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 53.7+ MB


None


Missing Values:


date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

In [5]:
for df in [biometric_df, demographic_df, enrolment_df]:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

In [6]:
# Replace negative values (if any) with NaN
numeric_cols_bio = ['bio_age_5_17', 'bio_age_17_']
numeric_cols_demo = ['demo_age_5_17', 'demo_age_17_']
numeric_cols_enrol = ['age_0_5', 'age_5_17', 'age_18_greater']

for col in numeric_cols_bio:
    biometric_df.loc[biometric_df[col] < 0, col] = np.nan

for col in numeric_cols_demo:
    demographic_df.loc[demographic_df[col] < 0, col] = np.nan

for col in numeric_cols_enrol:
    enrolment_df.loc[enrolment_df[col] < 0, col] = np.nan


In [7]:
biometric_df[numeric_cols_bio] = biometric_df[numeric_cols_bio].fillna(0)
demographic_df[numeric_cols_demo] = demographic_df[numeric_cols_demo].fillna(0)
enrolment_df[numeric_cols_enrol] = enrolment_df[numeric_cols_enrol].fillna(0)

In [8]:
biometric_df['total_biometric_updates'] = (
    biometric_df['bio_age_5_17'] + biometric_df['bio_age_17_']
)

demographic_df['total_demographic_updates'] = (
    demographic_df['demo_age_5_17'] + demographic_df['demo_age_17_']
)

enrolment_df['total_enrolments'] = (
    enrolment_df['age_0_5'] +
    enrolment_df['age_5_17'] +
    enrolment_df['age_18_greater']
)

In [9]:
text_cols = ['state', 'district']

for df in [biometric_df, demographic_df, enrolment_df]:
    for col in text_cols:
        df[col] = df[col].astype(str).str.strip().str.title()

In [10]:
biometric_df.describe()
demographic_df.describe()
enrolment_df.describe()

Unnamed: 0,date,pincode,age_0_5,age_5_17,age_18_greater,year,month,total_enrolments
count,323791,1006029.0,1006029.0,1006029.0,1006029.0,323791.0,323791.0,1006029.0
mean,2025-06-29 23:52:26.640888576,518641.5,3.525709,1.710074,0.1673441,2025.0,6.648276,5.403127
min,2025-01-04 00:00:00,100000.0,0.0,0.0,0.0,2025.0,1.0,1.0
25%,2025-03-11 00:00:00,363641.0,1.0,0.0,0.0,2025.0,3.0,1.0
50%,2025-07-11 00:00:00,517417.0,2.0,0.0,0.0,2025.0,7.0,2.0
75%,2025-10-09 00:00:00,700104.0,3.0,1.0,0.0,2025.0,10.0,5.0
max,2025-12-11 00:00:00,855456.0,2688.0,1812.0,855.0,2025.0,12.0,3965.0
std,,205636.0,17.53851,14.36963,3.220525,0.0,3.57105,31.58275


In [11]:
biometric_df.to_csv(os.path.join(BASE_PATH, "biometric_cleaned.csv"), index=False)
demographic_df.to_csv(os.path.join(BASE_PATH, "demographic_cleaned.csv"), index=False)
enrolment_df.to_csv(os.path.join(BASE_PATH, "enrolment_cleaned.csv"), index=False)

print("Cleaned datasets saved successfully.")

Cleaned datasets saved successfully.


### Summary
- Standardized date and text fields
- Handled missing and invalid values
- Engineered aggregate features for analysis
- Prepared datasets for exploratory and cross-dataset analysis

Next notebook: **03_exploratory_analysis.ipynb**
