In [5]:
import pandas as pd
import os

# Load all biometric CSV files
# Every file in the folder whose name ends with '.csv' is read and the frames
# are stacked into a single DataFrame.
biometric_folder = 'api_data_aadhar_biometric'
biometric_files = []  # (file name, DataFrame) pairs, in load order


def _chunk_sort_key(file_name):
    """Sort key that orders files by the integers embedded in their names.

    A plain string sort places "..._1000000_1500000.csv" ahead of
    "..._500000_1000000.csv". Comparing the numeric tokens of the name
    instead loads the chunks in ascending offset order. The file name itself
    is the tie-breaker (and the whole key for names without numeric tokens).
    """
    stem = os.path.splitext(file_name)[0]
    numeric_tokens = [int(token) for token in stem.split('_') if token.isdecimal()]
    return numeric_tokens, file_name


csv_names = sorted(
    (name for name in os.listdir(biometric_folder) if name.endswith('.csv')),
    key=_chunk_sort_key,
)
if not csv_names:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # raise the same exception type with a message that names the folder.
    raise ValueError(f"No .csv files found in '{biometric_folder}'")

for file in csv_names:
    file_path = os.path.join(biometric_folder, file)
    df = pd.read_csv(file_path)
    biometric_files.append((file, df))
    print(f"Loaded: {file} - Shape: {df.shape}")

# Concatenate all dataframes; ignore_index=True renumbers rows 0..n-1 so the
# per-file indices do not repeat in the combined frame.
biometric_data = pd.concat([frame for _, frame in biometric_files], ignore_index=True)
print(f"\nTotal Biometric Data - Shape: {biometric_data.shape}")
print(f"\nColumns: {biometric_data.columns.tolist()}")
print(f"\nFirst few rows:\n{biometric_data.head()}")

Loaded: api_data_aadhar_biometric_0_500000.csv - Shape: (500000, 6)
Loaded: api_data_aadhar_biometric_1000000_1500000.csv - Shape: (500000, 6)
Loaded: api_data_aadhar_biometric_1500000_1861108.csv - Shape: (361108, 6)
Loaded: api_data_aadhar_biometric_500000_1000000.csv - Shape: (500000, 6)

Total Biometric Data - Shape: (1861108, 6)

Columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

First few rows:
         date              state      district  pincode  bio_age_5_17  \
0  01-03-2025            Haryana  Mahendragarh   123029           280   
1  01-03-2025              Bihar     Madhepura   852121           144   
2  01-03-2025  Jammu and Kashmir         Punch   185101           643   
3  01-03-2025              Bihar       Bhojpur   802158           256   
4  01-03-2025         Tamil Nadu       Madurai   625514           271   

   bio_age_17_  
0          577  
1          369  
2         1091  
3          980  
4          815  


In [6]:
# Data Cleaning for Biometric Data
# Steps: report duplicates/missing values, drop exact duplicate rows, then
# impute any remaining missing values column by column.

print("Before cleaning:")
print(f"Shape: {biometric_data.shape}")
print(f"Duplicates: {biometric_data.duplicated().sum()}")
print(f"Missing values:\n{biometric_data.isnull().sum()}\n")

# Remove duplicates (rows identical in every column); the first occurrence is
# kept and the index is renumbered.
biometric_data = biometric_data.drop_duplicates().reset_index(drop=True)
print(f"After removing duplicates: {biometric_data.shape}\n")

# Fill NA values
# For floating-point columns: use median
# For all other columns: use mode (most frequent value)
# Note: a NumPy integer column cannot hold missing values, so a numeric column
# that reaches this loop with gaps is a float column (or a nullable dtype).
# Nullable integer columns take the mode branch, because a fractional median
# could not be stored in them.
for col in biometric_data.columns:
    column = biometric_data[col]
    if not column.isnull().any():
        continue

    if pd.api.types.is_float_dtype(column):
        # Use median for numeric columns.
        # The result is assigned back to the frame: calling
        # fillna(..., inplace=True) on the `biometric_data[col]` selection is
        # a chained in-place update, which warns on pandas 2.x and leaves the
        # frame unchanged under Copy-on-Write.
        biometric_data[col] = column.fillna(column.median())
        print(f"Filled {col} with median")
    else:
        # Use mode for categorical columns; mode() is empty when the column
        # has no non-missing values, in which case "Unknown" is used.
        modes = column.mode()
        mode_val = modes.iloc[0] if not modes.empty else "Unknown"
        biometric_data[col] = column.fillna(mode_val)
        print(f"Filled {col} with mode: {mode_val}")

print("\nAfter filling NA values:")
print(f"Missing values remaining: {biometric_data.isnull().sum().sum()}")
print(f"Final shape: {biometric_data.shape}")
print(f"\nFirst few rows:\n{biometric_data.head()}")

Before cleaning:
Shape: (1861108, 6)
Duplicates: 94896
Missing values:
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64

After removing duplicates: (1766212, 6)


After filling NA values:
Missing values remaining: 0
Final shape: (1766212, 6)

First few rows:
         date              state      district  pincode  bio_age_5_17  \
0  01-03-2025            Haryana  Mahendragarh   123029           280   
1  01-03-2025              Bihar     Madhepura   852121           144   
2  01-03-2025  Jammu and Kashmir         Punch   185101           643   
3  01-03-2025              Bihar       Bhojpur   802158           256   
4  01-03-2025         Tamil Nadu       Madurai   625514           271   

   bio_age_17_  
0          577  
1          369  
2         1091  
3          980  
4          815  


In [7]:
# Preview the first rows of biometric_data; as the cell's last expression the
# result is displayed by the notebook as the cell output.
biometric_data.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815
