In [1]:
import pandas as pd 
import numpy as np 

In [None]:
# Read the merged CSV into the working DataFrame `df` used by every later cell.
df = pd.read_csv(filepath_or_buffer='working_datasets/merged_data.csv')

In [12]:
# Data-quality snapshot: row count, number of fully duplicated rows, and
# per-column null counts, emitted as newline-separated reports.
print(
    f"Total records: {df.shape[0]}",
    f"Duplicates: {df.duplicated().sum()}",
    f"Missing values:\n{df.isna().sum()}",
    sep="\n",
)

Total records: 1861108
Duplicates: 94896
Missing values:
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64


In [13]:
# Remove rows that repeat an earlier row across all columns; the first
# occurrence of each duplicated row is the one that is kept.
df = df.drop_duplicates(subset=None, keep='first')
print(f"Records after removing duplicates: {df.shape[0]}")

Records after removing duplicates: 1766212


In [5]:
# Parse the day-month-year strings in `date` into datetimes, then order the
# rows chronologically.
df = (
    df
    .assign(date=pd.to_datetime(df['date'], format='%d-%m-%Y'))
    .sort_values(by='date')
)

In [6]:
# Count columns that are inspected for extreme values and then capped.
# Kept in one list so the summary and the capping loop cannot drift apart.
cap_cols = ['bio_age_5_17', 'bio_age_17_', 'demo_age_5_17', 'demo_age_17_']

# Check extreme values
print(df[cap_cols].describe())

# Cap outliers at the 99th percentile of each column (upper tail only).
# Series.clip is vectorized, replacing the previous per-row apply/lambda;
# values at or below the cap, and NaN entries, are left as they are.
for col in cap_cols:
    q99 = df[col].quantile(0.99)
    # Cast to float64 so the resulting dtype does not depend on whether the
    # cap happens to be a whole number.
    df[col] = df[col].clip(upper=q99).astype('float64')

        bio_age_5_17    bio_age_17_  demo_age_5_17   demo_age_17_
count  548622.000000  548622.000000  548622.000000  548622.000000
mean       12.525391      11.758889       2.520010      22.851694
std        47.101465      48.937815      12.051044      96.172368
min         0.000000       0.000000       0.000000       0.000000
25%         2.000000       2.000000       0.000000       5.000000
50%         6.000000       6.000000       1.000000      11.000000
75%        13.000000      13.000000       3.000000      24.000000
max      5854.000000    7201.000000    1883.000000   14732.000000


In [7]:
# Sum of the two bio_age_* count columns
df['total_bio'] = df['bio_age_5_17'].add(df['bio_age_17_'])

# Sum of the two demo_age_* count columns
df['total_demo'] = df['demo_age_5_17'].add(df['demo_age_17_'])

# Sum of the three age_* enrollment columns
df['total_enrollment'] = df['age_0_5'].add(df['age_5_17']).add(df['age_18_greater'])

# Biometric records expressed as a percentage of demographic records,
# rounded to two decimals. The ratio is not bounded to 100 (total_bio may
# exceed total_demo). inf/-inf and NaN results (e.g. when total_demo is 0)
# are all mapped to 0.
df['bio_success_rate'] = (
    df['total_bio']
    .div(df['total_demo'])
    .mul(100)
    .round(2)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Calendar month and quarter taken from the parsed `date` column
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter

In [8]:
# Write the cleaned frame to CSV without the row index, then confirm on stdout.
df.to_csv(path_or_buf='working_datasets/cleaned_data.csv', index=False)
print("Cleaned data saved!")

Cleaned data saved!


In [9]:
# Summary statistics for the cleaned frame, with the default quartiles
# (25%, 50%, 75%) spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,date,pincode,bio_age_5_17,bio_age_17_,demo_age_5_17,demo_age_17_,age_0_5,age_5_17,age_18_greater,total_bio,total_demo,total_enrollment,bio_success_rate,month,quarter
count,548622,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0,548622.0
mean,2025-10-18 23:55:54.795104768,516238.932248,11.228819,10.4536,2.222102,20.283982,3.37656,1.459451,0.091212,21.682419,22.506084,4.927223,169.984829,10.120278,3.626845
min,2025-04-01 00:00:00,110001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,2.0
25%,2025-09-15 00:00:00,361120.0,2.0,2.0,0.0,5.0,1.0,0.0,0.0,5.0,5.0,1.0,50.0,9.0,3.0
50%,2025-10-25 00:00:00,515408.0,6.0,6.0,1.0,11.0,2.0,0.0,0.0,12.0,12.0,2.0,100.0,10.0,4.0
75%,2025-11-11 00:00:00,700094.0,13.0,13.0,3.0,24.0,4.0,1.0,0.0,27.0,27.0,5.0,187.5,11.0,4.0
max,2025-12-29 00:00:00,855117.0,95.0,80.0,16.0,161.0,2688.0,1376.0,199.0,175.0,177.0,3965.0,14000.0,12.0,4.0
std,,207004.158918,15.901447,13.463385,3.027146,26.930198,15.710468,8.910486,1.105504,26.480696,28.964785,24.314855,281.85299,1.025234,0.484528


In [10]:
# Print the frame's index range, column names, non-null counts and dtypes
# as a final schema check on the cleaned data.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 548622 entries, 0 to 718408
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   date              548622 non-null  datetime64[ns]
 1   state             548622 non-null  object        
 2   district          548622 non-null  object        
 3   pincode           548622 non-null  int64         
 4   bio_age_5_17      548622 non-null  float64       
 5   bio_age_17_       548622 non-null  float64       
 6   demo_age_5_17     548622 non-null  float64       
 7   demo_age_17_      548622 non-null  float64       
 8   age_0_5           548622 non-null  int64         
 9   age_5_17          548622 non-null  int64         
 10  age_18_greater    548622 non-null  int64         
 11  total_bio         548622 non-null  float64       
 12  total_demo        548622 non-null  float64       
 13  total_enrollment  548622 non-null  int64         
 14  bio_succe