In [1]:
import pandas as pd
import os

# Load all demographic CSV files
demographic_folder = 'api_data_aadhar_demographic'
demographic_files = []

for file in sorted(os.listdir(demographic_folder)):
    if file.endswith('.csv'):
        file_path = os.path.join(demographic_folder, file)
        df = pd.read_csv(file_path)
        demographic_files.append((file, df))
        print(f"Loaded: {file} - Shape: {df.shape}")

# Concatenate all dataframes
demographic_data = pd.concat([df for _, df in demographic_files], ignore_index=True)
print(f"\nTotal Demographic Data - Shape: {demographic_data.shape}")
print(f"\nColumns: {demographic_data.columns.tolist()}")
print(f"\nFirst few rows:\n{demographic_data.head()}")

Loaded: api_data_aadhar_demographic_0_500000.csv - Shape: (500000, 6)
Loaded: api_data_aadhar_demographic_1000000_1500000.csv - Shape: (500000, 6)
Loaded: api_data_aadhar_demographic_1500000_2000000.csv - Shape: (500000, 6)
Loaded: api_data_aadhar_demographic_2000000_2071700.csv - Shape: (71700, 6)
Loaded: api_data_aadhar_demographic_500000_1000000.csv - Shape: (500000, 6)

Total Demographic Data - Shape: (2071700, 6)

Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

First few rows:
         date           state    district  pincode  demo_age_5_17  \
0  01-03-2025   Uttar Pradesh   Gorakhpur   273213             49   
1  01-03-2025  Andhra Pradesh    Chittoor   517132             22   
2  01-03-2025         Gujarat      Rajkot   360006             65   
3  01-03-2025  Andhra Pradesh  Srikakulam   532484             24   
4  01-03-2025       Rajasthan     Udaipur   313801             45   

   demo_age_17_  
0           529  
1           375  
2       

(2071700, 6)

In [4]:
# Data Cleaning for Demographic Data

print("Before cleaning:")
print(f"Shape: {demographic_data.shape}")
print(f"Duplicates: {demographic_data.duplicated().sum()}")
print(f"Missing values:\n{demographic_data.isnull().sum()}\n")

# Remove duplicates
demographic_data = demographic_data.drop_duplicates().reset_index(drop=True)
print(f"After removing duplicates: {demographic_data.shape}\n")

# Fill NA values
# For numeric columns: use median
# For categorical columns: use mode (most frequent value)
for col in demographic_data.columns:
    if demographic_data[col].isnull().sum() > 0:
        if demographic_data[col].dtype in ['int64', 'float64']:
            # Use median for numeric columns
            demographic_data[col].fillna(demographic_data[col].median(), inplace=True)
            print(f"Filled {col} with median")
        else:
            # Use mode for categorical columns
            mode_val = demographic_data[col].mode()[0] if not demographic_data[col].mode().empty else "Unknown"
            demographic_data[col].fillna(mode_val, inplace=True)
            print(f"Filled {col} with mode: {mode_val}")

print(f"\nAfter filling NA values:")
print(f"Missing values remaining: {demographic_data.isnull().sum().sum()}")
print(f"Final shape: {demographic_data.shape}")
print(f"\nFirst few rows:\n{demographic_data.head()}")

Before cleaning:
Shape: (2071700, 6)
Duplicates: 473601
Missing values:
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

After removing duplicates: (1598099, 6)


After filling NA values:
Missing values remaining: 0
Final shape: (1598099, 6)

First few rows:
         date           state    district  pincode  demo_age_5_17  \
0  01-03-2025   Uttar Pradesh   Gorakhpur   273213             49   
1  01-03-2025  Andhra Pradesh    Chittoor   517132             22   
2  01-03-2025         Gujarat      Rajkot   360006             65   
3  01-03-2025  Andhra Pradesh  Srikakulam   532484             24   
4  01-03-2025       Rajasthan     Udaipur   313801             45   

   demo_age_17_  
0           529  
1           375  
2           765  
3           314  
4           785  


In [5]:
demographic_data.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785
