<a href="https://colab.research.google.com/github/93ak/uidai1/blob/main/uidai1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning

Text & State Standardization

In [None]:
import pandas as pd

In [None]:
def sp_low(df,cols):
  """Normalise text columns in place and return the same DataFrame.

  Each column named in ``cols`` is cast to ``str``, has every literal space
  character removed, and is lower-cased. Other columns are left untouched.

  Args:
    df: DataFrame to modify (mutated in place).
    cols: list of column labels to normalise.

  Returns:
    The same ``df`` object, for call chaining.
  """
  def _squash(column):
    as_text = column.astype(str)
    # regex=False: remove the literal ' ' character only, not all whitespace.
    without_spaces = as_text.str.replace(' ', '', regex=False)
    return without_spaces.str.lower()

  df[cols] = df[cols].apply(_squash)
  return df

In [None]:
def state_proc(df):
  """Canonicalise ``df['State']`` in place and return ``df``.

  Every value is cast to ``str`` and compared against the fragments in
  ``keys``. The first fragment (in dict insertion order) that occurs as a
  substring of the value decides the canonical name; values that match no
  fragment are kept unchanged.

  The fragments are lower-case and contain no spaces, so the column is
  expected to have been normalised the same way beforehand (e.g. by
  ``sp_low``).

  Args:
    df: DataFrame with a ``State`` column (mutated in place).

  Returns:
    The same ``df`` object, for call chaining.
  """
  # Order matters: specific fragments ("jharkhand", "arunachal", "himachal",
  # "andaman") must stay ahead of the generic ones ("khand", "chal", "daman")
  # that they contain.
  keys = {
    "andaman": "andamanandnicobarislands",
    "andhra": "andhrapradesh",
    "arunachal": "arunachalpradesh",
    "assam": "assam",
    "bihar": "bihar",
    "chandigarh": "chandigarh",
    "chhat": "chhattisgarh",
    "delhi": "delhi",
    "goa": "goa",
    "gujarat": "gujarat",
    "haryana": "haryana",
    "himachal": "himachalpradesh",
    "jharkhand": "jharkhand",
    "karnataka": "karnataka",
    "kerala": "kerala",
    "ladakh": "ladakh",
    "lakshadweep": "lakshadweep",
    "madhya": "madhyapradesh",
    "maharashtra": "maharashtra",
    "manipur": "manipur",
    "meghalaya": "meghalaya",
    "mizoram": "mizoram",
    "nagaland": "nagaland",
    "odisha": "odisha",
    "orissa": "odisha",
    "pudu": "puducherry",
    "pond": "puducherry",
    "punjab": "punjab",
    "rajasthan": "rajasthan",
    "sikkim": "sikkim",
    "tamilnadu": "tamilnadu",
    "telangana": "telangana",
    "tripura": "tripura",
    "uttarpradesh": "uttarpradesh",
    "khand": "uttarakhand",
    "chal": "uttarakhand",
    "westb": "westbengal",
    "jammu": "jammuandkashmir",
    "dadra": "dadranagarhavelidamandiu",
    "daman": "dadranagarhavelidamandiu",
    # City / locality names that appear in the State column.
    "balanagar": "telangana",
    "jaipur": "rajasthan",
    "madanapalle": "andhrapradesh",
    "darbhanga": "bihar",
    "puttenahalli": "karnataka",
    "nagpur": "maharashtra",
    "rajaannamalaipuram": "tamilnadu",
    'greaterkailash2':'delhi',
    'punecity':'maharashtra',
    'gurgaon':'haryana',
    'puthur':'tamilnadu'
  }

  def map_state(val):
    # First matching fragment wins.
    for k in keys:
        if k in val:
            return keys[k]
    return val

  states = df['State'].astype(str)
  # Resolve each distinct spelling once instead of re-scanning the key table
  # for every row; the column has few distinct values compared with its length.
  lookup = {raw: map_state(raw) for raw in states.unique()}
  df['State'] = states.map(lookup)
  return df

# Preprocessing and inspection

## Biometric data


Preprocessing

In [None]:
# import glob
from datasets import load_dataset
import matplotlib.pyplot as plt

In [None]:
# Load the biometric-update CSV from the "an42/uidai1" dataset repo; the rows
# are read from the "train" split and converted to a pandas DataFrame.
ds = load_dataset("an42/uidai1", data_files="bioapi1.csv")
biodf = ds["train"].to_pandas()

In [None]:
# Parse day-first dates and derive a 'YYYY-MM' bucket for monthly roll-ups.
biodf['Date'] = pd.to_datetime(biodf['Date'], dayfirst=True)
biodf['Year_month'] = biodf['Date'].dt.strftime('%Y-%m')

# Normalise the text columns, then collapse State spellings to canonical names.
biodf = sp_low(biodf, ['State', 'District'])
biodf = state_proc(biodf)

# Drop rows whose State is the literal '100000' and rows dated January 2026.
keep = (biodf['State'] != '100000') & (biodf['Year_month'] != '2026-01')
biodf = biodf[keep]

# Count columns are picked by the (case-sensitive) substring 'age' in their name.
bioage = [col for col in biodf.columns if 'age' in col]

# State x month totals for every selected count column.
biog = biodf.groupby(['State', 'Year_month'])[bioage].sum().reset_index()

Inspection

In [None]:
# Column labels of the biometric frame after preprocessing.
biodf.columns

In [None]:
# Non-null value count per column.
biodf.count()

In [None]:
# Distinct 'YYYY-MM' buckets present in the biometric data.
bioym=biodf['Year_month'].unique()
bioym

In [None]:
# Distinct State values left after standardisation; useful for spotting
# spellings that state_proc did not map.
biost=biodf['State'].unique()
biost

## Enrollment data

Preprocessing



In [None]:
# Load the enrolment CSV from the "an42/uidai1" dataset repo; the rows are
# read from the "train" split and converted to a pandas DataFrame.
ds = load_dataset("an42/uidai1", data_files="enrolapi1.csv")
enroldf = ds["train"].to_pandas()

In [None]:
# Parse day-first dates and derive a 'YYYY-MM' bucket for monthly roll-ups.
enroldf['Date'] = pd.to_datetime(enroldf['Date'], dayfirst=True)
enroldf['Year_month'] = enroldf['Date'].dt.strftime('%Y-%m')

# Normalise the text columns, then collapse State spellings to canonical names.
enroldf = sp_low(enroldf, ['State', 'District'])
enroldf = state_proc(enroldf)

# Drop rows whose State is the literal '100000' and rows dated January 2026.
keep = (enroldf['State'] != '100000') & (enroldf['Year_month'] != '2026-01')
enroldf = enroldf[keep]

# Count columns are picked by the (case-sensitive) substring 'Age' in their
# name; note the capital 'A', unlike the biometric and demographic cells.
enrolage = [col for col in enroldf.columns if 'Age' in col]

# State x month totals for every selected count column.
enrolg = enroldf.groupby(['State', 'Year_month'])[enrolage].sum().reset_index()

Inspection

In [None]:
# Distinct 'YYYY-MM' buckets present in the enrolment data.
enrolym=enroldf['Year_month'].unique()
enrolym

In [None]:
# Distinct State values left after standardisation.
enroldf['State'].unique()

In [None]:
# Non-null value count per column.
enroldf.count()

## Demographic data

In [None]:
# Load the demographic-update CSV from the "an42/uidai1" dataset repo; the rows
# are read from the "train" split and converted to a pandas DataFrame.
ds = load_dataset("an42/uidai1", data_files="demoapi1.csv")
demodf = ds["train"].to_pandas()

In [None]:
# Parse day-first dates and derive a 'YYYY-MM' bucket for monthly roll-ups.
demodf['Date'] = pd.to_datetime(demodf['Date'], dayfirst=True)
demodf['Year_month'] = demodf['Date'].dt.strftime('%Y-%m')

# Normalise the text columns, then collapse State spellings to canonical names.
demodf = sp_low(demodf, ['State', 'District'])
demodf = state_proc(demodf)

# Drop rows whose State is one of the numeric-looking literals '100000' or
# '561203', and rows dated January 2026.
keep = (
    (demodf['State'] != '100000')
    & (demodf['State'] != '561203')
    & (demodf['Year_month'] != '2026-01')
)
demodf = demodf[keep]

# Count columns are picked by the (case-sensitive) substring 'age' in their name.
demoage = [col for col in demodf.columns if 'age' in col]

# State x month totals for every selected count column.
demog = demodf.groupby(['State', 'Year_month'])[demoage].sum().reset_index()

Inspection

In [None]:
# Non-null value count per column.
demodf.count()

In [None]:
# Distinct 'YYYY-MM' buckets present in the demographic data.
demoym=demodf['Year_month'].unique()
demoym

In [None]:
# Distinct State values left after standardisation.
demodf['State'].unique()

In [None]:
# Distinct District values (lower-cased and space-stripped by sp_low).
demodf['District'].unique()

# Visualisation of Records - Month, State-wise

## Total Biometric Updates

In [None]:
# National monthly total: sum every count column per month, then add the
# columns together into a single 'total'.
tot=biodf.groupby('Year_month')[bioage].sum().sum(axis=1).reset_index(name='total')

# Use the returned Axes to label the figure so it can be read on its own.
ax=tot.plot(x='Year_month',y='total',title='Total biometric records')
ax.set_xlabel('Year-month')
ax.set_ylabel('Total records')
plt.show()


In [None]:
# One figure per state, with one line per biometric count column.
for state_name, monthly in biog.groupby('State'):
  monthly.plot(x='Year_month', y=bioage, title=state_name)
  plt.show()

In [None]:
# Per-state monthly totals (all count columns summed) drawn together on one
# log-scaled axis.

plt.figure()

for state, d in biog.groupby('State'):
  # Plot real datetimes rather than 'YYYY-MM' strings: matplotlib places string
  # categories in order of first appearance, so a state that is missing a month
  # would push that month out of chronological order for every later state.
  x = pd.to_datetime(d['Year_month'], errors='raise')
  y = d[bioage].sum(axis=1)
  plt.plot(x, y, label=state)

plt.yscale('log')
# Legend goes outside the axes on the right; there is one entry per state.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()


## Total Enrollments

In [None]:
# National monthly total: sum every count column per month, then add the
# columns together into a single 'total'.
tot=enroldf.groupby('Year_month')[enrolage].sum().sum(axis=1).reset_index(name='total')

# Use the returned Axes to label the figure so it can be read on its own.
ax=tot.plot(x='Year_month',y='total',title='Total enrolment records')
ax.set_xlabel('Year-month')
ax.set_ylabel('Total records')
plt.show()

In [None]:
# One figure per state, with one line per enrolment count column.
for state_name, monthly in enrolg.groupby('State'):
  monthly.plot(x='Year_month', y=enrolage, title=state_name)
  plt.show()

In [None]:
# Per-state monthly enrolment totals drawn together on one log-scaled axis.
plt.figure()

for state_name, monthly in enrolg.groupby('State'):
  # Datetime x values keep the months in chronological order.
  month_start = pd.to_datetime(monthly['Year_month'], errors='raise')
  state_total = monthly[enrolage].sum(axis=1)
  plt.plot(month_start, state_total, label=state_name)

plt.yscale('log')
# Legend goes outside the axes on the right; there is one entry per state.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()


## Total Demographic Updates

In [None]:
# National monthly total: sum every count column per month, then add the
# columns together into a single 'total'.
tot=demodf.groupby('Year_month')[demoage].sum().sum(axis=1).reset_index(name='total')

# Use the returned Axes to label the figure so it can be read on its own.
ax=tot.plot(x='Year_month',y='total',title='Total demographic records')
ax.set_xlabel('Year-month')
ax.set_ylabel('Total records')
plt.show()

In [None]:
# One figure per state, with one line per demographic count column.
for state_name, monthly in demog.groupby('State'):
  monthly.plot(x='Year_month', y=demoage, title=state_name)
  plt.show()

In [None]:
# Per-state monthly demographic-update totals drawn together on one log-scaled axis.
plt.figure()

for state_name, monthly in demog.groupby('State'):
  # Datetime x values keep the months in chronological order.
  month_start = pd.to_datetime(monthly['Year_month'], errors='raise')
  state_total = monthly[demoage].sum(axis=1)
  plt.plot(month_start, state_total, label=state_name)

plt.yscale('log')
# Legend goes outside the axes on the right; there is one entry per state.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

# Z-Score & Ratio Analysis

Normalization & Ratio Analysis - Demographic Data

In [None]:
# Records per (state, month), all count columns combined.
per_state_month = demodf.groupby(['State', 'Year_month'])[demoage].sum()
s = per_state_month.sum(axis=1).reset_index(name='val')

# Records per month over all states.
per_month = demodf.groupby('Year_month')[demoage].sum()
t = per_month.sum(axis=1).reset_index(name='total')

# Express each series relative to its own mean, so 1.0 means "an average month".
state_mean = s.groupby('State')['val'].transform('mean')
s['norm'] = s['val'] / state_mean
t['norm_total'] = t['total'] / t['total'].mean()

# Align on month; ratio > 1 means the state was further above its own average
# that month than the national series was above its average.
# NOTE(review): a state whose mean is 0 yields NaN/inf here - confirm none exist.
m = s.merge(t[['Year_month', 'norm_total']], on='Year_month')
m['ratio'] = m['norm'] / m['norm_total']

# One figure per state.
for state_name, monthly in m.groupby('State'):
    monthly.plot(x='Year_month', y='ratio', title=state_name)
    plt.show()


Standardization & Z-score Analysis - Demographic Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Records per (state, month), all count columns combined.
per_state_month = demodf.groupby(['State', 'Year_month'])[demoage].sum()
s = per_state_month.sum(axis=1).reset_index(name='val')

# Records per month over all states.
per_month = demodf.groupby('Year_month')[demoage].sum()
t = per_month.sum(axis=1).reset_index(name='total')

sc = StandardScaler()

# z-score each state's monthly series; the scaler is re-fitted per state group.
s['z_state'] = s.groupby('State')['val'].transform(
    lambda x: sc.fit_transform(x.values.reshape(-1, 1)).ravel()
)

# z-score the national monthly series.
t['z_total'] = sc.fit_transform(t[['total']]).ravel()

# Align on month and divide the state z-score by the national z-score.
# NOTE(review): z_total is close to 0 for months near the national mean, so
# this ratio can become very large or flip sign there - confirm it is the
# intended metric.
m = s.merge(t[['Year_month', 'z_total']], on='Year_month')
m['ratio'] = m['z_state'] / m['z_total']

# One figure per state.
for state_name, monthly in m.groupby('State'):
    monthly.plot(x='Year_month', y='ratio', title=state_name)
    plt.show()


In [None]:
# Spot check: grand total of all demographic count columns for October 2025.
demodf[demodf['Year_month']=='2025-10'][demoage].sum().sum()

Standardization & Z-score Analysis - Enrollment Data

In [None]:
# Records per (state, month), all count columns combined.
per_state_month = enroldf.groupby(['State', 'Year_month'])[enrolage].sum()
s = per_state_month.sum(axis=1).reset_index(name='val')

# Records per month over all states.
per_month = enroldf.groupby('Year_month')[enrolage].sum()
t = per_month.sum(axis=1).reset_index(name='total')

sc = StandardScaler()

# z-score each state's monthly series; the scaler is re-fitted per state group.
s['z_state'] = s.groupby('State')['val'].transform(
    lambda x: sc.fit_transform(x.values.reshape(-1, 1)).ravel()
)

# z-score the national monthly series.
t['z_total'] = sc.fit_transform(t[['total']]).ravel()

# Align on month and divide the state z-score by the national z-score.
# NOTE(review): z_total is close to 0 for months near the national mean, so
# this ratio can become very large or flip sign there - confirm it is the
# intended metric.
m = s.merge(t[['Year_month', 'z_total']], on='Year_month')
m['ratio'] = m['z_state'] / m['z_total']

# One figure per state.
for state_name, monthly in m.groupby('State'):
    monthly.plot(x='Year_month', y='ratio', title=state_name)
    plt.show()


Standardization & Z-score Analysis - Biometric Data

In [None]:
# Records per (state, month), all count columns combined.
per_state_month = biodf.groupby(['State', 'Year_month'])[bioage].sum()
s = per_state_month.sum(axis=1).reset_index(name='val')

# Records per month over all states.
per_month = biodf.groupby('Year_month')[bioage].sum()
t = per_month.sum(axis=1).reset_index(name='total')

sc = StandardScaler()

# z-score each state's monthly series; the scaler is re-fitted per state group.
s['z_state'] = s.groupby('State')['val'].transform(
    lambda x: sc.fit_transform(x.values.reshape(-1, 1)).ravel()
)

# z-score the national monthly series.
t['z_total'] = sc.fit_transform(t[['total']]).ravel()

# Align on month and divide the state z-score by the national z-score.
# NOTE(review): z_total is close to 0 for months near the national mean, so
# this ratio can become very large or flip sign there - confirm it is the
# intended metric.
m = s.merge(t[['Year_month', 'z_total']], on='Year_month')
m['ratio'] = m['z_state'] / m['z_total']

# One figure per state.
for state_name, monthly in m.groupby('State'):
    monthly.plot(x='Year_month', y='ratio', title=state_name)
    plt.show()

# Histogram Analysis of Daily Records

Enrollment data

In [None]:
import numpy as np

In [None]:
# Daily national total: sum every enrolment count column per date, then add the
# columns together. `s` is reused by the next few cells.
s = enroldf.groupby('Date')[enrolage].sum().sum(axis=1)
# Linear-scale histogram of the daily totals.
s.plot(kind='hist', bins=100)

In [None]:
# Number of distinct dates that have at least one row, per month.
dates = enroldf.groupby('Year_month')['Date'].nunique()
dates

In [None]:
# Ten most frequent daily-total values (how often the same total repeats).
s.value_counts().head(10)

In [None]:
# Number of distinct dates in the daily-total series.
len(s)

In [None]:
# Log-spaced bins need strictly positive edges, so derive them from the
# positive daily totals only; using s.min() directly gives log10(0) = -inf
# whenever a day's total is 0.
pos = s[s > 0]
bins = np.logspace(np.log10(pos.min()), np.log10(pos.max()), 50)
pos.plot(kind='hist', bins=bins)
plt.xscale('log')

In [None]:
# Per-state histogram of daily enrolment totals on log-spaced bins.
for state, df in enroldf.groupby('State'):
  # `daily` (not `s`) so the national daily series from the earlier cell is
  # not overwritten by the last state's data.
  daily = df.groupby('Date')[enrolage].sum().sum(axis=1)
  # Filter before computing the bin edges: log10(0) is -inf.
  daily = daily[daily > 0]
  if daily.empty:
    # Nothing positive to plot for this state.
    continue
  bins = np.logspace(np.log10(daily.min()), np.log10(daily.max()), 50)
  plt.figure()
  plt.hist(daily, bins=bins)
  plt.xscale('log')
  plt.title(state)
  # Render each figure as it is built instead of leaving them all open.
  plt.show()

In [None]:
# State x month table of how many distinct dates have rows; NaN cells are
# state/month combinations with no rows at all.
table = enroldf.pivot_table(
    index='State',
    columns='Year_month',
    values='Date',
    aggfunc='nunique'
)

table

Biometric data

In [None]:
# Daily national total across all biometric count columns.
s = biodf.groupby('Date')[bioage].sum().sum(axis=1)
# Log-spaced bins need strictly positive edges, so derive them from (and plot)
# the positive totals only; a 0 total would give log10(0) = -inf.
pos = s[s > 0]
bins = np.logspace(np.log10(pos.min()), np.log10(pos.max()), 50)
pos.plot(kind='hist', bins=bins)
plt.xscale('log')

In [None]:
# Per-state histogram of daily biometric totals on log-spaced bins.
for state, df in biodf.groupby('State'):
  # `daily` (not `s`) so the national daily series from the earlier cell is
  # not overwritten by the last state's data.
  daily = df.groupby('Date')[bioage].sum().sum(axis=1)
  # Filter before computing the bin edges: log10(0) is -inf.
  daily = daily[daily > 0]
  if daily.empty:
    # Nothing positive to plot for this state.
    continue
  bins = np.logspace(np.log10(daily.min()), np.log10(daily.max()), 50)
  plt.figure()
  plt.hist(daily, bins=bins)
  plt.xscale('log')
  plt.title(state)
  # Render each figure as it is built instead of leaving them all open.
  plt.show()

In [None]:
# Number of distinct dates that have at least one row, per month.
dates = biodf.groupby('Year_month')['Date'].nunique()
dates

In [None]:
# State x month table of how many distinct dates have rows; NaN cells are
# state/month combinations with no rows at all.
table = biodf.pivot_table(
    index='State',
    columns='Year_month',
    values='Date',
    aggfunc='nunique'
)

table

Demographic data

In [None]:
# Daily national total across all demographic count columns.
s = demodf.groupby('Date')[demoage].sum().sum(axis=1)
# Log-spaced bins need strictly positive edges, so derive them from (and plot)
# the positive totals only; a 0 total would give log10(0) = -inf.
pos = s[s > 0]
bins = np.logspace(np.log10(pos.min()), np.log10(pos.max()), 50)
pos.plot(kind='hist', bins=bins)
plt.xscale('log')

In [None]:
# Per-state histogram of daily demographic totals on log-spaced bins.
for state, df in demodf.groupby('State'):
  # `daily` (not `s`) so the national daily series from the earlier cell is
  # not overwritten by the last state's data.
  daily = df.groupby('Date')[demoage].sum().sum(axis=1)
  # Filter before computing the bin edges: log10(0) is -inf.
  daily = daily[daily > 0]
  if daily.empty:
    # Nothing positive to plot for this state.
    continue
  bins = np.logspace(np.log10(daily.min()), np.log10(daily.max()), 50)
  plt.figure()
  plt.hist(daily, bins=bins)
  plt.xscale('log')
  plt.title(state)
  # Render each figure as it is built instead of leaving them all open.
  plt.show()

In [None]:
# Number of distinct dates that have at least one row, per month.
dates = demodf.groupby('Year_month')['Date'].nunique()
dates

In [None]:
# State x month table of how many distinct dates have rows; NaN cells are
# state/month combinations with no rows at all.
table = demodf.pivot_table(
    index='State',
    columns='Year_month',
    values='Date',
    aggfunc='nunique'
)

table

# Pincode Validity and Coverage Analysis

Load pincode data from the Government's official pincode dataset, which has been saved to a Hugging Face dataset.

In [None]:
# Number of distinct Pincode values in the demographic data.
len(demodf['Pincode'].unique())

In [None]:
from datasets import Features, Value

# Explicit schema for the pincode CSV: integer pincode plus state name as text.
features = Features({
    "pincode": Value("int64"),
    "statename": Value("string")
})



In [None]:
# Load the reference pincode CSV with the explicit schema defined above; the
# rows are read from the "train" split and converted to a pandas DataFrame.
ds = load_dataset("an42/uidai1", data_files="pincodes1.csv", features=features)
# ds = ds.remove_columns(["latitude", "longitude"])
pindf = ds["train"].to_pandas()

In [None]:
# Cast pincode to int. NOTE(review): `features` already declares this column
# as int64, so the cast looks redundant - confirm before removing.
pindf['pincode'] = pindf['pincode'].astype(int)
pindf.columns

Comparison - Pincode data vs Demographic updates pincodes

In [None]:
# p = set(map(int,pindf['pincode'].unique()))
# pd = set(map(int, demodf['Pincode'].unique()))
p = set(pindf['pincode'].unique())
pd = set(demodf['Pincode'].unique())
print(len(p - pd))
print(len(pd - p))

In [None]:
p-pd

In [None]:
pd-p

In [None]:
# Rows of the reference list whose pincode never occurs in the demographic
# data, and demographic rows whose pincode is absent from the reference list.
p_only = pindf[~pindf['pincode'].isin(demodf['Pincode'])]
d_only = demodf[~demodf['Pincode'].isin(pindf['pincode'])]


In [None]:
# Distinct unmatched pincodes per state, for each side of the comparison.
# NOTE(review): `statename` has not been passed through sp_low/state_proc, so
# its labels presumably differ in spelling from `State` - confirm before
# comparing the two tables row by row.
p_state_counts = p_only.groupby('statename')['pincode'].nunique()
d_state_counts = d_only.groupby('State')['Pincode'].nunique()


In [None]:
from IPython.display import display

# Show both per-state tables from one cell: reference-only first, then
# demographic-only.
display(p_state_counts.reset_index(name='unique_pincodes'))
display(d_state_counts.reset_index(name='unique_pincodes'))

Comparison - Pincode data vs Enrollments pincodes

In [None]:
# Sizes of the two set differences: reference-only pincodes, then
# enrolment-only pincodes.
p = set(pindf['pincode'].unique())
pe = set(enroldf['Pincode'].unique())
print(len(p - pe))
print(len(pe - p))

In [None]:
# Rows of the reference list whose pincode never occurs in the enrolment data,
# and enrolment rows whose pincode is absent from the reference list.
p_only = pindf[~pindf['pincode'].isin(enroldf['Pincode'])]
e_only = enroldf[~enroldf['Pincode'].isin(pindf['pincode'])]

In [None]:
# Distinct unmatched pincodes per state, for each side of the comparison.
p_state_counts = p_only.groupby('statename')['pincode'].nunique()
e_state_counts = e_only.groupby('State')['Pincode'].nunique()

In [None]:
# Reference-only table first, then enrolment-only.
display(p_state_counts.reset_index(name='unique_pincodes'))
display(e_state_counts.reset_index(name='unique_pincodes'))

Comparison - Pincode data vs Biometric updates pincodes

In [None]:
# Sizes of the two set differences: reference-only pincodes, then
# biometric-only pincodes.
p = set(pindf['pincode'].unique())
pb = set(biodf['Pincode'].unique())
print(len(p - pb))
print(len(pb - p))

In [None]:
# Rows of the reference list whose pincode never occurs in the biometric data,
# and biometric rows whose pincode is absent from the reference list.
p_only = pindf[~pindf['pincode'].isin(biodf['Pincode'])]
b_only = biodf[~biodf['Pincode'].isin(pindf['pincode'])]

In [None]:
# Distinct unmatched pincodes per state, for each side of the comparison.
p_state_counts = p_only.groupby('statename')['pincode'].nunique()
b_state_counts = b_only.groupby('State')['Pincode'].nunique()

In [None]:
# Reference-only table first, then biometric-only.
display(p_state_counts.reset_index(name='unique_pincodes'))
display(b_state_counts.reset_index(name='unique_pincodes'))