In [None]:
import sys
import os
import pandas as pd

# Allow notebook to import from 'src'
sys.path.append(os.path.abspath('../'))

from src.loader import load_category_data
from src.processor import create_master_record

# Input and output folders, relative to the notebook's working directory
RAW_PATH = "../data/raw/"
PROCESSED_PATH = "../data/processed/"


def _load_with_notice(message, category):
    """Print *message*, then return load_category_data(RAW_PATH, category)."""
    print(message)
    return load_category_data(RAW_PATH, category)


# Step 1: Load each category in turn (progress line first, then the load)
enrol_df = _load_with_notice("Loading Enrolment data...", "enrolment")
demo_df = _load_with_notice("Loading Demographic data...", "demographic")
bio_df = _load_with_notice("Loading Biometric data...", "biometric")

# Step 2: Hand the three category frames to the merge helper
print("Merging into Master DataFrame...")
master_df = create_master_record(
    enrol_df,
    demo_df,
    bio_df,
)

# Step 3: Basic Feature Engineering & Date Fix
# The raw dates are day-first strings such as "13-09-2025". Without
# dayfirst=True pandas infers a month-first format ("%m-%d-%Y") from an
# earlier ambiguous value and raises ValueError on the first day > 12.
# errors='coerce' turns anything that still cannot be parsed into NaT
# instead of raising.
master_df['date'] = pd.to_datetime(master_df['date'], dayfirst=True, errors='coerce')

# Drop rows whose date is missing/unparseable and report how many were lost,
# so the data loss is visible in the cell output. The null mask is computed
# once and reused for the count.
unparsed_count = int(master_df['date'].isna().sum())
if unparsed_count:
    master_df = master_df.dropna(subset=['date'])
    print(f"Dropped {unparsed_count} rows due to unparseable dates.")

# Calculate total activity across all update types.
# - 'total_updates' itself matches like='update', so it is excluded; otherwise
#   re-running this step would fold the previous total into the new one.
# - Only numeric/bool columns are summed, so a text or date column whose name
#   happens to contain 'update' cannot break or distort the row sum.
update_columns = [
    column
    for column in master_df.filter(like='update')
    .select_dtypes(include=['number', 'bool'])
    .columns
    if column != 'total_updates'
]
master_df['total_updates'] = master_df[update_columns].sum(axis=1)

# Step 4: Save results
# Build the output path once with os.path.join: it stays valid even if
# PROCESSED_PATH is later written without a trailing slash, and the path
# that is printed is guaranteed to be the path that was written.
os.makedirs(PROCESSED_PATH, exist_ok=True)
output_file = os.path.join(PROCESSED_PATH, "master_aadhaar_df.csv")
master_df.to_csv(output_file, index=False)

# Short run summary for the notebook output
print("-" * 30)
print(f"Success! Master file created with {len(master_df)} rows.")
print(f"Saved to: {output_file}")
print("Columns available:", master_df.columns.tolist())

Loading Enrolment data...
Loading Demographic data...
Loading Biometric data...
Merging into Master DataFrame...


ValueError: time data "13-09-2025" doesn't match format "%m-%d-%Y", at position 43. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.