In [1]:
#Importing libraries 
import pandas as pd
import numpy as np
# Lift pandas' column cap so wide frames render every column (None = no limit).
pd.options.display.max_columns = None

In [2]:
# Read the three Gujarat Aadhaar CSV extracts (enrolment, demographic update,
# biometric update -- per the file names), bound in that order.
enrol_guj, demo_guj, bio_guj = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Gujarat/AdhaarMonthlyEnrolmentData_Gujarat.csv',
        'data/Gujarat/AdhaarDemographicMonthlyUpdateData_Gujarat.csv',
        'data/Gujarat/AdhaarBiometricMonthlyUpdateData_Gujarat.csv',
    )
)

# Sanity check: (rows, columns) of each frame, space-separated on one line.
print(*(frame.shape for frame in (enrol_guj, demo_guj, bio_guj)))

(56231, 7) (110091, 6) (260600, 6)


In [3]:
# Preview the first five enrolment rows to confirm the column layout.
enrol_guj.head(n=5)

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,01-09-2025,Gujarat,Ahmadabad,380054,1,0,0
1,01-09-2025,Gujarat,Ahmadabad,382350,3,0,0
2,01-09-2025,Gujarat,Ahmedabad,380005,4,0,0
3,01-09-2025,Gujarat,Ahmedabad,380008,2,1,0
4,01-09-2025,Gujarat,Ahmedabad,380022,1,3,0


In [4]:
# Preview the first five demographic-update rows to confirm the column layout.
demo_guj.head(n=5)

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Gujarat,Gandhinagar,382835,18,85
1,01-03-2025,Gujarat,Rajkot,360030,13,159
2,01-03-2025,Gujarat,Kachchh,370615,33,193
3,01-03-2025,Gujarat,Rajkot,360330,13,192
4,01-03-2025,Gujarat,Kachchh,370490,44,174


In [5]:
# Preview the first five biometric-update rows to confirm the column layout.
bio_guj.head(n=5)

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,29-12-2025,Gujarat,Ahmadabad,380015,1,1
1,29-12-2025,Gujarat,Ahmadabad,380018,0,1
2,29-12-2025,Gujarat,Ahmadabad,380019,0,1
3,29-12-2025,Gujarat,Ahmadabad,380024,0,1
4,29-12-2025,Gujarat,Ahmadabad,380060,0,1


In [6]:
# Normalise the two text key columns in every frame: upper-case first, then trim
# surrounding whitespace, so values differing only by case or padding compare equal.
# The frames are modified in place.
for df in (enrol_guj, demo_guj, bio_guj):
    for key_col in ('district', 'state'):
        df[key_col] = df[key_col].str.upper().str.strip()

In [7]:
# Count the distinct district spellings (missing values excluded) after case/whitespace normalisation.
enrol_guj['district'].nunique(dropna=True)

40

**The above count shows 40 district names due to spelling variations and duplicates in the dataset.
However, Gujarat officially has 34 districts.
One of these districts is newly created and is not present in this dataset.
Therefore, after removing duplicates and excluding the new district, we will standardize the data to 33 districts.**

In [8]:
# Row count per district spelling, most frequent first -- rare spellings near the
# bottom are likely variants of a more common name.
enrol_guj['district'].value_counts(sort=True, ascending=False)

district
AHMEDABAD          4754
VADODARA           3846
SURAT              3562
RAJKOT             2487
KHEDA              2381
ANAND              2326
BHAVNAGAR          2253
JUNAGADH           2235
KACHCHH            2175
SABARKANTHA        2090
JAMNAGAR           2051
MAHESANA           1959
BANASKANTHA        1887
SURENDRA NAGAR     1821
BHARUCH            1635
VALSAD             1508
GANDHINAGAR        1474
AMRELI             1463
PANCHMAHALS        1435
NAVSARI            1169
PATAN              1163
DAHOD              1084
TAPI                913
AHMADABAD           901
GIR SOMNATH         896
ARVALLI             795
MAHISAGAR           752
DEVBHUMI DWARKA     707
CHHOTAUDEPUR        670
PORBANDAR           642
DOHAD               635
MORBI               597
NARMADA             593
BOTAD               508
THE DANGS           273
BANAS KANTHA        192
SABAR KANTHA        166
SURENDRANAGAR       148
PANCH MAHALS         81
DANG                  4
Name: count, dtype: int64

In [9]:
# List the distinct district spellings alphabetically so near-identical names
# (e.g. with/without a space) end up adjacent and are easy to spot.
sorted(pd.unique(enrol_guj['district']))

['AHMADABAD',
 'AHMEDABAD',
 'AMRELI',
 'ANAND',
 'ARVALLI',
 'BANAS KANTHA',
 'BANASKANTHA',
 'BHARUCH',
 'BHAVNAGAR',
 'BOTAD',
 'CHHOTAUDEPUR',
 'DAHOD',
 'DANG',
 'DEVBHUMI DWARKA',
 'DOHAD',
 'GANDHINAGAR',
 'GIR SOMNATH',
 'JAMNAGAR',
 'JUNAGADH',
 'KACHCHH',
 'KHEDA',
 'MAHESANA',
 'MAHISAGAR',
 'MORBI',
 'NARMADA',
 'NAVSARI',
 'PANCH MAHALS',
 'PANCHMAHALS',
 'PATAN',
 'PORBANDAR',
 'RAJKOT',
 'SABAR KANTHA',
 'SABARKANTHA',
 'SURAT',
 'SURENDRA NAGAR',
 'SURENDRANAGAR',
 'TAPI',
 'THE DANGS',
 'VADODARA',
 'VALSAD']

In [None]:
## Build a district-name map so that duplicate spellings collapse to one canonical name

In [10]:
# Variant spellings seen in the district column, grouped under the single name
# each one is rewritten to, then flattened into a {variant: canonical} lookup.
district_map = {
    variant: canonical
    for canonical, variants in {
        "AHMEDABAD": ("AHMADABAD",),
        "BANAS KANTHA": ("BANASKANTHA",),
        "DAHOD": ("DOHAD",),
        "DANGS": ("DANG", "THE DANGS"),
        "PANCH MAHALS": ("PANCHMAHALS",),
        "SABAR KANTHA": ("SABARKANTHA",),
        "SURENDRANAGAR": ("SURENDRA NAGAR",),
        "CHHOTA UDEPUR": ("CHHOTAUDEPUR",),
    }.items()
    for variant in variants
}

In [11]:
# Rewrite every variant spelling to its canonical name in all three frames;
# values with no entry in district_map pass through unchanged.
for df in (enrol_guj, demo_guj, bio_guj):
    df['district'] = df['district'].map(lambda name: district_map.get(name, name))

In [12]:
# Distinct districts in the enrolment frame after applying the spelling map.
enrol_guj['district'].nunique(dropna=True)

33

In [13]:
# Distinct districts in the demographic-update frame after applying the spelling map.
demo_guj['district'].nunique(dropna=True)

33

In [14]:
# Distinct districts in the biometric-update frame after applying the spelling map.
bio_guj['district'].nunique(dropna=True)

33

# Now all three datasets have 33 districts

# Inspecting Duplicate Rows

In [15]:
# Number of enrolment rows whose (date, state, district, pincode) key already
# appeared in an earlier row (the first occurrence of each key is not counted).
enrol_guj.duplicated(subset=['date', 'state', 'district', 'pincode'], keep='first').sum()

np.int64(3061)

In [19]:
# keep=False flags every row of a repeated key, so both the first occurrence
# and its repeats are retained for side-by-side inspection.
enrol_dups = enrol_guj.loc[
    enrol_guj.duplicated(subset=['date', 'state', 'district', 'pincode'], keep=False)
]

# Sorting puts rows that share a key next to each other. `date` has not been
# parsed in the cells shown, so this ordering is textual, not chronological.
enrol_dups.sort_values(by=['date', 'district', 'pincode']).head(n=6)

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
20306,01-01-2026,GUJARAT,AHMEDABAD,380001,3,0,0
54244,01-01-2026,GUJARAT,AHMEDABAD,380001,20,5,0
20307,01-01-2026,GUJARAT,AHMEDABAD,380006,1,0,0
31538,01-01-2026,GUJARAT,AHMEDABAD,380006,1,1,0
20311,01-01-2026,GUJARAT,AHMEDABAD,380007,6,1,0
54239,01-01-2026,GUJARAT,AHMEDABAD,380007,1,0,0


**📌 Why “duplicate” rows exist in UIDAI enrollment data**

The raw UIDAI enrollment dataset contains 56,231 rows, but many rows share the same
date, state, district, and pincode. These are not data errors — they represent multiple Aadhaar enrollment sessions on the same day at the same location.

For example, in Ahmedabad district on 01-01-2026:

Pincode	age_0_5	age_5_17	age_18_greater
380001	   3	   0	         0
380001	   20	   5	         0

This means that in pincode 380001, two different enrollment sessions were recorded on the same day:

One session enrolled 3 children (0–5 years)

Another session enrolled 20 children (0–5 years) and 5 adolescents (5–17 years)

These are two separate UIDAI operations, not duplicate mistakes.

For analysis, these rows are aggregated into a single daily record:

380001 on 01-01-2026 → 23 (0–5), 5 (5–17)


This converts transaction-level UIDAI logs into daily pincode-level operational intelligence, while preserving the true volume of Aadhaar activity.

In [20]:
# Collapse rows sharing a (date, state, district, pincode) key into a single row
# per key by summing the three age-band enrolment counts. The key columns stay
# as regular columns (as_index=False).
enrol_guj_agg = (
    enrol_guj
    .groupby(by=['date', 'state', 'district', 'pincode'], as_index=False)[
        ['age_0_5', 'age_5_17', 'age_18_greater']
    ]
    .sum()
)


In [21]:
# Verify the aggregation: no (date, state, district, pincode) key should repeat, so this should be 0.
enrol_guj_agg.duplicated(subset=['date', 'state', 'district', 'pincode'], keep='first').sum()

np.int64(0)

**The raw UIDAI dataset contained 3,061 duplicate pincode-day records due to multiple enrollment sessions per day. We aggregated these into a single daily operational record per pincode.**

# Now the same process for the demographic and biometric data

In [22]:
# Number of demographic-update rows whose (date, state, district, pincode) key
# already appeared in an earlier row.
demo_guj.duplicated(subset=['date', 'state', 'district', 'pincode'], keep='first').sum()

np.int64(21144)

In [23]:
# keep=False retains every row of a repeated key (first occurrence included)
# so the repeated records can be compared directly.
demo_dups = demo_guj.loc[
    demo_guj.duplicated(subset=['date', 'state', 'district', 'pincode'], keep=False)
]

# Sorting groups rows with the same key together. `date` has not been parsed in
# the cells shown, so the ordering is textual, not chronological.
demo_dups.sort_values(by=['date', 'district', 'pincode']).head(n=6)

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
37736,01-01-2026,GUJARAT,AHMEDABAD,363610,0,5
100079,01-01-2026,GUJARAT,AHMEDABAD,363610,0,5
45409,01-01-2026,GUJARAT,AHMEDABAD,380001,0,20
54800,01-01-2026,GUJARAT,AHMEDABAD,380001,2,32
74788,01-01-2026,GUJARAT,AHMEDABAD,380001,0,20
37737,01-01-2026,GUJARAT,AHMEDABAD,380002,0,2


In [24]:
# Collapse rows sharing a (date, state, district, pincode) key into one row per
# key by summing both demographic-update count columns.
# NOTE(review): the duplicate inspection for this frame shows some repeated keys
# whose rows match in every displayed column (e.g. index 37736 / 100079). Summing
# counts such rows twice -- confirm they are distinct records and not repeated loads.
demo_guj_agg = (
    demo_guj
    .groupby(by=['date', 'state', 'district', 'pincode'], as_index=False)[
        ['demo_age_5_17', 'demo_age_17_']
    ]
    .sum()
)

In [25]:
# Verify the aggregation: no key should repeat in the aggregated demographic frame, so this should be 0.
demo_guj_agg.duplicated(subset=['date', 'state', 'district', 'pincode'], keep='first').sum()

np.int64(0)

In [28]:
# Number of biometric-update rows whose (date, state, district, pincode) key
# already appeared in an earlier row.
bio_guj.duplicated(subset=['date', 'state', 'district', 'pincode'], keep='first').sum()

np.int64(70045)

In [29]:
# keep=False retains every row of a repeated key (first occurrence included)
# so the repeated records can be compared directly.
bio_dups = bio_guj.loc[
    bio_guj.duplicated(subset=['date', 'state', 'district', 'pincode'], keep=False)
]

# Sorting groups rows with the same key together. `date` has not been parsed in
# the cells shown, so the ordering is textual, not chronological.
bio_dups.sort_values(by=['date', 'district', 'pincode']).head(n=6)

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
209623,01-01-2026,GUJARAT,AHMEDABAD,380001,3,11
211113,01-01-2026,GUJARAT,AHMEDABAD,380001,44,63
475,01-01-2026,GUJARAT,AHMEDABAD,380004,1,7
211114,01-01-2026,GUJARAT,AHMEDABAD,380004,27,23
54810,01-01-2026,GUJARAT,AHMEDABAD,380006,3,7
209624,01-01-2026,GUJARAT,AHMEDABAD,380006,0,3


In [31]:
# Collapse rows sharing a (date, state, district, pincode) key into one row per
# key by summing both biometric-update count columns.
bio_guj_agg = (
    bio_guj
    .groupby(by=['date', 'state', 'district', 'pincode'], as_index=False)[
        ['bio_age_5_17', 'bio_age_17_']
    ]
    .sum()
)

In [32]:
# Verify the aggregation: no key should repeat in the aggregated biometric frame, so this should be 0.
bio_guj_agg.duplicated(subset=['date', 'state', 'district', 'pincode'], keep='first').sum()

np.int64(0)