# Data Understanding

Objective:
- Load all three Aadhaar datasets
- Understand columns and structure
- Identify common keys for merging
- Lock project scope


In [None]:
import pandas as pd
import numpy as np


In [None]:
# Each raw export is split into CSV chunks whose file names end in
# "<start>_<stop>", advancing in steps of 500000; the last chunk of a dataset
# stops at the upper bound passed to _chunk_paths below.
_RAW_DATA_DIR = "../data/raw"
_ROWS_PER_CHUNK = 500_000


def _chunk_paths(dataset, upper_bound):
    """Return the chunk CSV paths of *dataset* in ascending offset order."""
    stem = f"api_data_aadhar_{dataset}"
    paths = []
    for start in range(0, upper_bound, _ROWS_PER_CHUNK):
        # The final chunk is shorter, so clamp its stop value to the bound.
        stop = min(start + _ROWS_PER_CHUNK, upper_bound)
        paths.append(f"{_RAW_DATA_DIR}/{stem}/{stem}_{start}_{stop}.csv")
    return paths


# Enrolment dataset files (3 chunks, offsets 0 .. 1006029)
enrol_files = _chunk_paths("enrolment", 1_006_029)

# Demographic update dataset files (5 chunks, offsets 0 .. 2071700)
demo_files = _chunk_paths("demographic", 2_071_700)

# Biometric update dataset files (4 chunks, offsets 0 .. 1861108)
bio_files = _chunk_paths("biometric", 1_861_108)


In [None]:
def load_and_combine(files):
    """Read several CSV chunks and stack them into one DataFrame.

    Args:
        files: Iterable of CSV sources (anything ``pd.read_csv`` accepts),
            read in the order given.

    Returns:
        A single DataFrame containing the rows of every chunk in input
        order, re-indexed from 0.

    Raises:
        ValueError: If ``files`` yields no entries.
    """
    frames = [pd.read_csv(path) for path in files]
    if not frames:
        # pd.concat would raise ValueError here too, but with the opaque
        # message "No objects to concatenate"; name the real problem instead.
        raise ValueError("load_and_combine() needs at least one CSV file to load")
    # ignore_index=True replaces the per-chunk indexes (each restarting at 0)
    # with one continuous index over the combined rows.
    return pd.concat(frames, ignore_index=True)


In [None]:
# Build one combined DataFrame per dataset from its list of chunk files.
enrol, demo, bio = map(load_and_combine, (enrol_files, demo_files, bio_files))

# Report the (rows, columns) shape of each combined frame.
for shape_label, frame in (
    ("Enrolment shape:", enrol),
    ("Demographic shape:", demo),
    ("Biometric shape:", bio),
):
    print(shape_label, frame.shape)


In [None]:
# Preview the leading rows of every dataset.
# A bare expression is echoed only when it is the last statement of a notebook
# cell (and never in a plain script), so the enrolment and demographic previews
# were being discarded. Print each preview explicitly instead.
for preview_label, frame in (
    ("Enrolment", enrol),
    ("Demographic", demo),
    ("Biometric", bio),
):
    print(f"{preview_label} preview:")
    print(frame.head(), end="\n\n")


In [None]:
# List the column labels of each dataset so shared keys can be spotted.
for heading, frame in (
    ("ENROLMENT COLUMNS:\n", enrol),
    ("\nDEMOGRAPHIC UPDATE COLUMNS:\n", demo),
    ("\nBIOMETRIC UPDATE COLUMNS:\n", bio),
):
    print(heading, frame.columns)


In [None]:
# Show the dtype of every column, one dataset after another.
print(enrol.dtypes)
for frame in (demo, bio):
    # The leading "\n" separates this listing from the previous one.
    print("\n", frame.dtypes)


In [None]:
# Flag columns whose name contains "date" or "time" (case-insensitive).
# Only the column name is inspected, not the values it holds.
for message, frame in (
    ("Enrolment date column:", enrol),
    ("Demographic date column:", demo),
    ("Biometric date column:", bio),
):
    for col in frame.columns:
        lowered = col.lower()
        if any(token in lowered for token in ('date', 'time')):
            print(message, col)


In [None]:
# Report the smallest and largest 'date' value found in each dataset.
# NOTE(review): the loader in this file calls pd.read_csv without any date
# parsing, so these values presumably compare as strings -- confirm the stored
# format sorts chronologically, or convert with pd.to_datetime first.
for range_label, frame in (
    ("Enrolment date range:", enrol),
    ("Demographic date range:", demo),
    ("Biometric date range:", bio),
):
    dates = frame['date']
    print(range_label, dates.min(), "to", dates.max())


In [None]:
# Count missing values per column in every dataset.
# A bare expression is echoed only when it is the last statement of a notebook
# cell (and never in a plain script), so the enrolment and demographic counts
# were being discarded. Print each result explicitly instead.
# NOTE(review): .head() keeps only the first five columns' counts -- drop it
# if every column should be checked.
for na_label, frame in (
    ("Enrolment", enrol),
    ("Demographic", demo),
    ("Biometric", bio),
):
    print(f"{na_label} missing values per column:")
    print(frame.isna().sum().head(), end="\n\n")


## Data Quality Observations

- Geographic fields contain numeric placeholder values.
- The three datasets show minor inconsistencies with one another.
- Both issues require validation during preprocessing.

These issues are addressed in the Day 2 cleaning steps.
