In [5]:
import pandas as pd
import os

# Load all enrolment CSV files
enrolment_folder = 'api_data_aadhar_enrolment'
enrolment_files = []

for file in sorted(os.listdir(enrolment_folder)):
    if file.endswith('.csv'):
        file_path = os.path.join(enrolment_folder, file)
        df = pd.read_csv(file_path)
        enrolment_files.append((file, df))
        print(f"Loaded: {file} - Shape: {df.shape}")

# Concatenate all dataframes
enrolment_data = pd.concat([df for _, df in enrolment_files], ignore_index=True)
print(f"\nTotal Enrolment Data - Shape: {enrolment_data.shape}")
print(f"\nColumns: {enrolment_data.columns.tolist()}")
print(f"\nFirst few rows:\n{enrolment_data.head()}")

Loaded: api_data_aadhar_enrolment_0_500000.csv - Shape: (500000, 7)
Loaded: api_data_aadhar_enrolment_1000000_1006029.csv - Shape: (6029, 7)
Loaded: api_data_aadhar_enrolment_500000_1000000.csv - Shape: (500000, 7)

Total Enrolment Data - Shape: (1006029, 7)

Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

First few rows:
         date          state          district  pincode  age_0_5  age_5_17  \
0  02-03-2025      Meghalaya  East Khasi Hills   793121       11        61   
1  09-03-2025      Karnataka   Bengaluru Urban   560043       14        33   
2  09-03-2025  Uttar Pradesh      Kanpur Nagar   208001       29        82   
3  09-03-2025  Uttar Pradesh           Aligarh   202133       62        29   
4  09-03-2025      Karnataka   Bengaluru Urban   560016       14        16   

   age_18_greater  
0              37  
1              39  
2              12  
3              15  
4              21  


In [6]:
# Data Cleaning for Enrolment Data

print("Before cleaning:")
print(f"Shape: {enrolment_data.shape}")
print(f"Duplicates: {enrolment_data.duplicated().sum()}")
print(f"Missing values:\n{enrolment_data.isnull().sum()}\n")

# Remove duplicates
enrolment_data = enrolment_data.drop_duplicates().reset_index(drop=True)
print(f"After removing duplicates: {enrolment_data.shape}\n")

# Fill NA values
# For numeric columns: use median
# For categorical columns: use mode (most frequent value)
for col in enrolment_data.columns:
    if enrolment_data[col].isnull().sum() > 0:
        if enrolment_data[col].dtype in ['int64', 'float64']:
            # Use median for numeric columns
            enrolment_data[col].fillna(enrolment_data[col].median(), inplace=True)
            print(f"Filled {col} with median")
        else:
            # Use mode for categorical columns
            mode_val = enrolment_data[col].mode()[0] if not enrolment_data[col].mode().empty else "Unknown"
            enrolment_data[col].fillna(mode_val, inplace=True)
            print(f"Filled {col} with mode: {mode_val}")

print(f"\nAfter filling NA values:")
print(f"Missing values remaining: {enrolment_data.isnull().sum().sum()}")
print(f"Final shape: {enrolment_data.shape}")
print(f"\nFirst few rows:\n{enrolment_data.head()}")

Before cleaning:
Shape: (1006029, 7)
Duplicates: 22957
Missing values:
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

After removing duplicates: (983072, 7)


After filling NA values:
Missing values remaining: 0
Final shape: (983072, 7)

First few rows:
         date          state          district  pincode  age_0_5  age_5_17  \
0  02-03-2025      Meghalaya  East Khasi Hills   793121       11        61   
1  09-03-2025      Karnataka   Bengaluru Urban   560043       14        33   
2  09-03-2025  Uttar Pradesh      Kanpur Nagar   208001       29        82   
3  09-03-2025  Uttar Pradesh           Aligarh   202133       62        29   
4  09-03-2025      Karnataka   Bengaluru Urban   560016       14        16   

   age_18_greater  
0              37  
1              39  
2              12  
3              15  
4              21  


In [8]:
enrolment_data.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21
