In [190]:
import pandas as pd

In [191]:
files = [
    "bio1.csv",
    "bio2.csv",
    "bio3.csv",
    "bio4.csv"
]

df = pd.concat(
    [pd.read_csv(f) for f in files],
    ignore_index=True
)

df.to_csv("bio_merged.csv", index=False)

In [192]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB


In [189]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, explorative=True)
profile.to_file("demo_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                     | 0/6 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████| 6/6 [00:01<00:00,  3.88it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [193]:
df[df.duplicated(keep=False)].sort_values(
    ["date", "state", "district", "pincode"]
)

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
1480030,01-09-2025,Assam,Sonitpur,784174,0,1
1481176,01-09-2025,Assam,Sonitpur,784174,0,1
1480031,01-09-2025,Assam,Sonitpur,784182,4,4
1481177,01-09-2025,Assam,Sonitpur,784182,4,4
1480032,01-09-2025,Assam,South Salmara Mankachar,783127,3,3
...,...,...,...,...,...,...
352842,31-10-2025,Himachal Pradesh,Mandi,175038,6,4
349533,31-10-2025,Himachal Pradesh,Mandi,175039,1,0
352843,31-10-2025,Himachal Pradesh,Mandi,175039,1,0
349534,31-10-2025,Himachal Pradesh,Mandi,175046,6,0


In [194]:
rows_before = df.shape[0]
print(rows_before)

1861108


In [195]:
df_clean= df.drop_duplicates(
    subset=['date', 'pincode', 'bio_age_5_17', 'bio_age_17_'],
    keep='first'
)



In [196]:
rows_after = df_clean.shape[0]

In [197]:
rows_removed = rows_before - rows_after
percent_removed = (rows_removed / rows_before) * 100

print("Rows before :", rows_before)
print("Rows after  :", rows_after)
print("Rows removed:", rows_removed)
print(f"Decrease %  : {percent_removed:.2f}%")

Rows before : 1861108
Rows after  : 1754015
Rows removed: 107093
Decrease %  : 5.75%


In [199]:
df_clean[(df_clean["pincode"] == 784174) & (df_clean["date"] == "01-09-2025")]

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
1480030,01-09-2025,Assam,Sonitpur,784174,0,1
1487773,01-09-2025,Assam,Biswanath,784174,3,3


In [200]:
df_clean = df_clean.reset_index(drop=True)

In [201]:
print("Duplicate rows:", df_clean.duplicated().sum())

Duplicate rows: 0


In [202]:
print("unique Raw states:", df_clean['state'].nunique())

unique Raw states: 57


In [203]:
print(df_clean["state"].unique())

['Goa' 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu & Kashmir'
 'Jammu and Kashmir' 'Jharkhand' 'Karnataka' 'Kerala'
 'Andaman & Nicobar Islands' 'Andaman and Nicobar Islands'
 'Andhra Pradesh' 'Mizoram' 'Nagaland' 'Odisha' 'Orissa' 'Pondicherry'
 'Puducherry' 'Punjab' 'Rajasthan' 'Sikkim' 'Tamil Nadu' 'Telangana'
 'Tripura' 'Uttar Pradesh' 'Uttarakhand' 'West Bengal' 'Ladakh'
 'Lakshadweep' 'Madhya Pradesh' 'Maharashtra' 'Manipur' 'Meghalaya'
 'Arunachal Pradesh' 'Assam' 'Bihar' 'Chandigarh' 'Chhattisgarh'
 'Dadra and Nagar Haveli' 'Daman & Diu' 'Delhi' 'Daman and Diu'
 'West  Bengal' 'Dadra and Nagar Haveli and Daman and Diu'
 'Dadra & Nagar Haveli' 'West bengal' 'West Bangal' 'ODISHA' 'WEST BENGAL'
 'andhra pradesh' 'odisha' 'Westbengal' 'WESTBENGAL' 'Chhatisgarh'
 'Tamilnadu' 'Uttaranchal' 'west Bengal']


In [204]:
df_clean["state_clean"] = df_clean["state"].str.lower().str.strip()
print(sorted(df_clean["state_clean"].unique()))

['andaman & nicobar islands', 'andaman and nicobar islands', 'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chandigarh', 'chhatisgarh', 'chhattisgarh', 'dadra & nagar haveli', 'dadra and nagar haveli', 'dadra and nagar haveli and daman and diu', 'daman & diu', 'daman and diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jammu & kashmir', 'jammu and kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'orissa', 'pondicherry', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu', 'tamilnadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'uttaranchal', 'west  bengal', 'west bangal', 'west bengal', 'westbengal']


In [205]:
fix_map = {
    "orissa": "odisha",
    "pondicherry": "puducherry",
    'chhatisgarh': 'chhattisgarh',
    'balanagar': 'telangana',
    'puttenahalli':'karnataka',
    'raja annamalai puram':'tamil nadu',
    'darbhanga': 'bihar',
    'madanapalle':'andhra pradesh',
    'nagpur':'madhya pradesh',
    
    'west bengli': "west bengal",
    "west bangal": "west bengal",
    "westbengal": "west bengal",
    "west  bengal": "west bengal",
    'uttaranchal':'uttarakhand',
    'jaipur':'rajasthan',

    "jammu & kashmir": "jammu and kashmir",
    'tamilnadu':'tamil nadu',

    "andaman & nicobar islands": "andaman and nicobar islands",

    "dadra & nagar haveli": "dadra and nagar haveli and daman and diu",
    "daman and diu": "dadra and nagar haveli and daman and diu",
    "daman & diu": "dadra and nagar haveli and daman and diu",
    "dadra and nagar haveli": "dadra and nagar haveli and daman and diu",
    "100000": None
}
df_clean["state_clean"] = df_clean["state_clean"].replace(fix_map)
df_clean = df_clean[df_clean["state_clean"].notna()]
print("\nSTEP-2.4 Final Clean Result:")
print("Final Clean States & UTs:", df_clean["state_clean"].nunique())
print(sorted(df_clean["state_clean"].unique()))



STEP-2.4 Final Clean Result:
Final Clean States & UTs: 36
['andaman and nicobar islands', 'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra and nagar haveli and daman and diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jammu and kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal']


In [206]:
df_clean.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,state_clean
0,19-09-2025,Goa,North Goa,403502,0,4,goa
1,19-09-2025,Goa,North Goa,403508,1,4,goa
2,19-09-2025,Goa,North Goa,403513,2,0,goa
3,19-09-2025,Goa,North Goa,403527,2,2,goa
4,19-09-2025,Goa,South Goa,403601,7,3,goa


In [207]:
df_clean['state_clean'] = df_clean['state_clean'].astype(str).str.strip().str.title()
df_clean.head()


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,state_clean
0,19-09-2025,Goa,North Goa,403502,0,4,Goa
1,19-09-2025,Goa,North Goa,403508,1,4,Goa
2,19-09-2025,Goa,North Goa,403513,2,0,Goa
3,19-09-2025,Goa,North Goa,403527,2,2,Goa
4,19-09-2025,Goa,South Goa,403601,7,3,Goa


In [208]:
df_clean.drop(columns=['state'])

Unnamed: 0,date,district,pincode,bio_age_5_17,bio_age_17_,state_clean
0,19-09-2025,North Goa,403502,0,4,Goa
1,19-09-2025,North Goa,403508,1,4,Goa
2,19-09-2025,North Goa,403513,2,0,Goa
3,19-09-2025,North Goa,403527,2,2,Goa
4,19-09-2025,South Goa,403601,7,3,Goa
...,...,...,...,...,...,...
1754010,19-09-2025,West Delhi,110041,102,100,Delhi
1754011,19-09-2025,West Delhi,110043,8,5,Delhi
1754012,19-09-2025,West Delhi,110058,7,5,Delhi
1754013,19-09-2025,North Goa,403102,1,0,Goa


In [281]:
dup_pincode_rows = df_clean[
    df_clean.duplicated(subset=['pincode', 'date'], keep=False)
].sort_values(['pincode', 'date'])

dup_pincode_rows


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,state_clean
1268156,01-03-2025,Delhi,New Delhi,110001,181,159,Delhi
1269660,01-03-2025,Delhi,Central Delhi,110001,10,88,Delhi
1377423,01-09-2025,Delhi,New Delhi,110001,6,4,Delhi
1382001,01-09-2025,Delhi,Central Delhi,110001,0,2,Delhi
338609,01-11-2025,Delhi,New Delhi,110001,1,5,Delhi
...,...,...,...,...,...,...,...
223755,26-10-2025,Bihar,Kishanganj,855117,63,89,Bihar
317478,31-10-2025,Bihar,Katihar,855117,0,1,Bihar
317489,31-10-2025,Bihar,Kishanganj,855117,39,166,Bihar
891204,11-11-2025,Bihar,Purba Champaran,855456,0,1,Bihar


In [295]:
df_clean[(df_clean["pincode"] == 784174) & (df_clean["date"] == "01-09-2025")]

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,state_clean
1386266,01-09-2025,Assam,Sonitpur,784174,0,1,Assam
1393441,01-09-2025,Assam,Biswanath,784174,3,3,Assam


In [296]:
df_agg = (
    df_clean
    .groupby(['date', 'pincode'], as_index=False)
    .agg({
        'state_clean': lambda x: x.mode().iloc[0],
        'district': lambda x: x.mode().iloc[0],
        'bio_age_5_17': 'sum',
        'bio_age_17_': 'sum'
    })
)


In [297]:
df_agg[(df_agg["pincode"] == 784174) & (df_agg["date"] == "01-09-2025")]

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_
106613,01-09-2025,784174,Assam,Biswanath,3,4


In [298]:
dup_pincode_rows = df_agg[
    df_agg.duplicated(subset=['pincode', 'date'], keep=False)
].sort_values(['pincode', 'date'])

dup_pincode_rows

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_


In [299]:
df_agg[(df_agg["pincode"] == 784174) & (df_agg["date"] == "01-09-2025")]

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_
106613,01-09-2025,784174,Assam,Biswanath,3,4


In [300]:
df_agg['date'] = pd.to_datetime(
    df_agg['date'],
    format='%d-%m-%Y'
)


In [301]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408962 entries, 0 to 1408961
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   date          1408962 non-null  datetime64[ns]
 1   pincode       1408962 non-null  int64         
 2   state_clean   1408962 non-null  object        
 3   district      1408962 non-null  object        
 4   bio_age_5_17  1408962 non-null  int64         
 5   bio_age_17_   1408962 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 64.5+ MB


In [302]:
df_agg['date_month']= df_agg['date'].dt.month_name()
df_agg.sample(5)

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month
728198,2025-11-13,758022,Odisha,Kendujhar,6,6,November
1195361,2025-10-25,431516,Maharashtra,Parbhani,3,6,October
605938,2025-11-11,227101,Uttar Pradesh,Lucknow,3,6,November
831964,2025-11-15,638111,Tamil Nadu,Tiruppur,18,16,November
1381500,2025-10-30,500057,Telangana,Hyderabad,2,1,October


In [303]:
df_agg['date_day_name']= df_agg['date'].dt.day_name()
df_agg.head(5)

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month,date_day_name
0,2025-03-01,110001,Delhi,Central Delhi,191,247,March,Saturday
1,2025-03-01,110002,Delhi,Central Delhi,260,427,March,Saturday
2,2025-03-01,110003,Delhi,Central Delhi,347,411,March,Saturday
3,2025-03-01,110005,Delhi,Central Delhi,441,950,March,Saturday
4,2025-03-01,110006,Delhi,North Delhi,901,1526,March,Saturday


In [304]:
df_agg['quater']= df_agg['date'].dt.quarter
df_agg.sample(5)

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month,date_day_name,quater
550574,2025-09-10,798615,Nagaland,Mokokchung,0,1,September,Wednesday,3
345234,2025-09-06,271319,Uttar Pradesh,Gonda,3,0,September,Saturday,3
922883,2025-12-16,827006,Jharkhand,Bokaro,3,1,December,Tuesday,4
423381,2025-12-07,563120,Karnataka,Kolar,3,0,December,Sunday,4
194580,2025-09-03,185152,Jammu And Kashmir,Rajouri,0,3,September,Wednesday,3


In [306]:
df_agg[(df_agg["pincode"] == 784174) & (df_agg["date"] == "2025-09-01")]

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month,date_day_name,quater
106613,2025-09-01,784174,Assam,Biswanath,3,4,September,Monday,3


In [292]:
df_agg.to_csv("biometric.csv", index=False)

In [307]:
df_agg[(df_agg["pincode"] == 784174) & (df_agg["date"] == "2025-09-01")]

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month,date_day_name,quater
106613,2025-09-01,784174,Assam,Biswanath,3,4,September,Monday,3


In [308]:
df_agg.head()

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month,date_day_name,quater
0,2025-03-01,110001,Delhi,Central Delhi,191,247,March,Saturday,1
1,2025-03-01,110002,Delhi,Central Delhi,260,427,March,Saturday,1
2,2025-03-01,110003,Delhi,Central Delhi,347,411,March,Saturday,1
3,2025-03-01,110005,Delhi,Central Delhi,441,950,March,Saturday,1
4,2025-03-01,110006,Delhi,North Delhi,901,1526,March,Saturday,1


In [309]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df_agg, explorative=True)
profile.to_file("biometric.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                     | 0/9 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████| 9/9 [00:01<00:00,  5.32it/s]
  discretized_df.loc[:, column] = self._discretize_column(


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [311]:
df_agg[(df_agg["pincode"] == 784174) & (df_agg["date"] == "2025-09-01")]

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_,date_month,date_day_name,quater
106613,2025-09-01,784174,Assam,Biswanath,3,4,September,Monday,3


In [274]:
df_new = pd.read_csv("biometric.csv")

In [275]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408962 entries, 0 to 1408961
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   date          1408962 non-null  object
 1   pincode       1408962 non-null  int64 
 2   state_clean   1408962 non-null  object
 3   district      1408962 non-null  object
 4   bio_age_5_17  1408962 non-null  int64 
 5   bio_age_17_   1408962 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 64.5+ MB


In [277]:
df_new.head()

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_
0,01-03-2025,110001,Delhi,Central Delhi,191,247
1,01-03-2025,110002,Delhi,Central Delhi,260,427
2,01-03-2025,110003,Delhi,Central Delhi,347,411
3,01-03-2025,110005,Delhi,Central Delhi,441,950
4,01-03-2025,110006,Delhi,North Delhi,901,1526


In [276]:
df_new[(df_new["pincode"] == 784174) & (df_new["date"] == "01-09-2025")]

Unnamed: 0,date,pincode,state_clean,district,bio_age_5_17,bio_age_17_
106613,01-09-2025,784174,Assam,Biswanath,3,4
