## Import Libraries

In [1]:
import sqlite3
import pandas as pd
import seaborn as sns
import json
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sams.utils import load_data
from sams.config import datasets
import gc
import polars as pl
import duckdb
from pathlib import Path

[32m2025-11-13 12:03:54.674[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m15[0m - [1mPROJ_ROOT path is: C:\Users\Admin\Documents\GitHub\sams[0m
[32m2025-11-13 12:03:54.711[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m
[32m2025-11-13 12:03:54.711[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m


## 1. Load Data

Load student records from SQLite database and parquet files across all education modules (ITI, Diploma, HSS, DEG, CHSE, BSE).

In [2]:
# Use the path from datasets metadata 
db_path = datasets["sams"]["path"]
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables:", [t[0] for t in tables])

cursor.close()
conn.close()

Tables: ['students', 'institutes', 'results']


In [3]:
db_path = datasets["sams"]["path"]

# Load ITI + Diploma
conn = sqlite3.connect(db_path)
query = """
SELECT academic_year, aadhar_no, student_name, dob,
       admission_status, mark_data, module
FROM students
WHERE module IN ('ITI', 'Diploma');
"""
students_df = pd.read_sql_query(query, conn)
conn.close()

# Extract needed keys (first entry only)
keep_keys = ["YearofPassing", "RollNo", "ExaminationType",
             "HighestQualificationExamBoard", "ExamName"]

def extract_mark(row):
    md = row["mark_data"]
    try:
        md = json.loads(md) if isinstance(md, str) else md
        if isinstance(md, dict): md = [md]
    except:
        md = []
    rec = md[0] if isinstance(md, list) and md else {}
    return {k: rec.get(k) for k in keep_keys}

mark_df = pd.json_normalize(students_df.apply(extract_mark, axis=1))

# Merge + rename
df = pd.concat([students_df.drop(columns=["mark_data"]),
                mark_df.rename(columns={
                    "YearofPassing": "passing_year",
                    "RollNo": "roll_no",
                    "ExaminationType": "exam_type",
                    "HighestQualificationExamBoard": "exam_board",
                    "ExamName": "exam_name"
                })],
               axis=1)

### 1.1 ITI and Diploma Data

- Load ITI and Diploma student records from database
- Parse JSON mark data to extract roll numbers and exam details
- Filter Diploma to include only students with 10th standard as highest qualification

In [4]:
# ITI (all rows)
iti_enrollments = df[df["module"] == "ITI"].reset_index(drop=True)

# Diploma (ONLY exam_name = "10th")
diploma_enrollments = (
    df[(df["module"] == "Diploma") &
       (df["exam_name"].str.lower() == "10th")]
    .reset_index(drop=True)
)

In [5]:
# change the dob in iti_enrollments to YYYY-MM-DD format
iti_enrollments['dob'] = pd.to_datetime(iti_enrollments['dob'], format="%d-%b-%Y", errors="coerce").dt.strftime("%Y-%m-%d")
diploma_enrollments['dob'] = pd.to_datetime(diploma_enrollments['dob'], format="%d-%b-%Y", errors="coerce").dt.strftime("%Y-%m-%d")

In [None]:
iti_marks = load_data(datasets['iti_marks'])
diploma_marks = load_data(datasets['diploma_marks'])

In [6]:
# save df in parquet format
DATA_DIR = Path("C:/Users/Admin/Documents/GitHub/sams/data")
RAW_DATA_DIR = DATA_DIR / "interim"
iti_applications = pd.read_parquet(RAW_DATA_DIR / "iti_applications.pq")
diploma_applications = pd.read_parquet(RAW_DATA_DIR / "diploma_applications.pq")

### 1.2 HSS and DEG Data

- Load Higher Secondary School (HSS) and Degree (DEG) enrollment and application records from parquet files

In [7]:
hss_enrollments = load_data(datasets["hss_enrollments"])
deg_enrollments = load_data(datasets["deg_enrollments"]) 

[32m2025-11-13 12:06:41.081[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_enrollments.pq[0m
[32m2025-11-13 12:07:27.369[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_enrollments.pq[0m
[32m2025-11-13 12:07:27.369[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_enrollments.pq[0m


In [8]:
deg_applications = load_data(datasets["deg_applications"])
hss_applications = load_data(datasets["hss_applications"])

[32m2025-11-13 12:08:06.520[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_applications.pq[0m
[32m2025-11-13 12:08:19.243[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_applications.pq[0m
[32m2025-11-13 12:08:19.243[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_applications.pq[0m


### 1.3 CHSE and BSE Results Data

- Load Council of Higher Secondary Education (CHSE) and Board of Secondary Education (BSE) result records from database
- Assign appropriate exam board names based on module type

In [9]:
db_path = datasets["sams"]["path"]
conn = sqlite3.connect(db_path)

query = """
SELECT
    academic_year,
    student_name,
    dob,
    module,
    CASE 
        WHEN module = 'CHSE' THEN 'CHSE, Odisha'
        WHEN module = 'BSE' THEN 'BSE, Odisha'
        ELSE NULL
    END AS exam_board,
    academic_year AS passing_year,    
    roll_no,
    NULL AS roll_no_decrypted,
    exam_type
    
FROM results
WHERE module IN ('CHSE', 'BSE');
"""

df = pd.read_sql_query(query, conn)
conn.close()

# Split into CHSE and BSE datasets
chse_df = df[df["module"] == "CHSE"].reset_index(drop=True)
bse_df  = df[df["module"] == "BSE"].reset_index(drop=True)

In [10]:
bse_df["dob"] = pd.to_datetime(bse_df["dob"], format="%d-%b-%Y", errors="coerce").dt.strftime("%Y-%m-%d")
chse_df['exam_type'] = chse_df['exam_type'].replace({'REGULAR': 'annual'})

## 2. Roll Number Processing

**Function:** `decrypt_roll()` and `process_roll_numbers_len_format()`

Decrypt encrypted roll numbers and validate them using board-specific length rules:
- **BSE (10th standard):** Must be exactly 9 digits
- **CHSE (12th standard):** Must be exactly 8 digits
- Invalid or mismatched rolls are marked as "NA"

**Logic:**
- Decrypt roll numbers using AES encryption (ECB mode)
- Apply regex matching to identify BSE and CHSE Odisha boards
- Validate decrypted roll numbers against expected length for each board

In [11]:
# code for decryption
from base64 import b64decode
from Crypto.Cipher import AES

def decrypt_roll(enc_text: str,
                 key: bytes = b"y6idXfCVRG5t2dkeBnmHy9jLu6TEn5Du",
                 enforce_min_length: bool = False,
                 min_length: int = None) -> str:
    try:
        if not enc_text or not isinstance(enc_text, str):
            return "NA"

        raw = b64decode(enc_text)
        cipher = AES.new(key, AES.MODE_ECB)
        decrypted = cipher.decrypt(raw)

        pad_len = decrypted[-1]
        if pad_len < 1 or pad_len > 16:
            return "NA"
        decrypted = decrypted[:-pad_len]

        roll_no = decrypted.decode("utf-8").strip()
        return roll_no
    except Exception:
        return "NA"    

In [12]:
def process_roll_numbers_len_format(df: pd.DataFrame, roll_col: str = 'roll_no') -> pd.DataFrame:
    """
    Decrypt roll numbers and validate only by length rule:
    - BSE Odisha: length must be 9
    - CHSE Odisha: length must be 8
    - Other boards: keep decrypted roll as-is
    """

    # Decrypt roll numbers
    df['roll_no_decrypted'] = df[roll_col].map(decrypt_roll)

    # Identify Odisha boards 
    board_col = df['exam_board'].fillna("NA").str.upper()
    # Put the condition to pass these input values of board name        
    mask_bse = (board_col.str.contains(r'\bBOARD OF SECONDARY EDUCATION,\s*ODISHA\b', regex=True)  
                | (board_col.str.contains(r'\bBSE\b(?! MADHYAMA).*ODISHA\b', regex=True) & ~board_col.str.contains(r'\bICSE\b|\bCBSE\b', regex=True)))
    
    mask_chse = (board_col.str.contains(r'\bCOUNCIL OF HIGHER SECONDARY EDUCATION,\s*ODISHA\b', regex=True) 
                 | board_col.str.contains(r'\bCHSE\b.*ODISHA\b', regex=True))

    # Apply validation
    if mask_bse.any():
        rolls_bse = df.loc[mask_bse & df['roll_no_decrypted'].notna(), 'roll_no_decrypted'].astype(str)
        valid_bse = rolls_bse.str.len() == 9
        df.loc[mask_bse & ~valid_bse, 'roll_no_decrypted'] = 'NA'

    if mask_chse.any():
        rolls_chse = df.loc[mask_chse & df['roll_no_decrypted'].notna(), 'roll_no_decrypted'].astype(str)
        valid_chse = rolls_chse.str.len() == 8
        df.loc[mask_chse & ~valid_chse, 'roll_no_decrypted'] = 'NA'

    return df

In [13]:
# Columns to keep, then drop everything else and rename finally
keep_cols = [
    'barcode', 'aadhar_no', 'academic_year', 'module', 'student_name',
    'dob', 'examination_board_of_the_highest_qualification','examination_type', 'year_of_passing' , 'roll_no'
]
hss_enrollments = hss_enrollments[keep_cols].copy()
deg_enrollments = deg_enrollments[keep_cols].copy()
rename_map = {
    'examination_board_of_the_highest_qualification': 'exam_board',
    'examination_type': 'exam_type',
    'year_of_passing': 'passing_year'
}
hss_enroll = hss_enrollments.rename(columns=rename_map)
deg_enroll = deg_enrollments.rename(columns=rename_map)

In [14]:
iti_df = process_roll_numbers_len_format(iti_enrollments)
diploma_df = process_roll_numbers_len_format(diploma_enrollments)
hss_df = process_roll_numbers_len_format(hss_enroll)
deg_df = process_roll_numbers_len_format(deg_enroll)

## 3. Student Key Generation

**Function:** `generate_student_key_df()` (6-variable key)

Create unique composite keys to identify students across datasets and academic years.

**Key Components:**
- Student name
- Roll number (decrypted)
- Date of birth
- Passing year
- Exam board
- Exam type

**Logic:**
- Normalize all fields (lowercase, trim whitespace)
- Concatenate fields with underscore separator
- Calculate diagnostics: total records, unique keys, duplicate keys, unique Aadhaar numbers

In [150]:
def encode_part(s: pd.Series, *, na_label="NA", missing_label="MISSING", lower=False) -> pd.Series:
    """
    Encode parts of a student key by handling missing/NA values consistently.
    """
    is_nan = s.isna()
    t = s.astype(str).str.strip()
    t = t.str.strip('"').str.strip("'")   # remove quotes if present

    out = t.copy()

    # Replace explicit NA and missing values
    out = out.mask(t.eq("NA"), na_label)
    out = out.mask(t.eq("") | is_nan, missing_label)

    # Normalize casing if requested
    if lower:
        out = out.where(out.isin([na_label, missing_label]), out.str.lower().str.strip())

    return out

In [101]:
def generate_student_key_df(df, module_name: str) -> pd.DataFrame:
    """
    Clean key columns in-place, then generate a student_key
    and print diagnostics about duplicates.
    """
    new_df = df.copy()

    key_vars = ["passing_year", "dob",
                "roll_no_decrypted", "exam_board", "exam_type"]

    # Normalize and ensure all key parts are strings
    for col in key_vars + ["student_name"]:
        new_df[col] = new_df[col].astype(str).fillna("").str.strip().str.lower()

    # Construct student key safely
    new_df["student_key"] = (
        new_df["student_name"] + "_" +
        new_df["roll_no_decrypted"] + "_" +
        new_df["dob"] + "_" +
        new_df["passing_year"] + "_" +
        new_df["exam_board"] + "_" +
        new_df["exam_type"]
    )

    # Diagnostics
    total_records = len(new_df)
    unique_aadhar = new_df["aadhar_no"].nunique(dropna=True)
    unique_keys = new_df["student_key"].nunique()

    # Problematic duplicates = same key linked to multiple Aadhaar numbers
    dup_check = (
        new_df.groupby("student_key")["aadhar_no"]
        .nunique(dropna=True)
        .reset_index(name="unique_aadhar_count")
    )
    problematic_keys = dup_check[dup_check["unique_aadhar_count"] > 1]["student_key"]
    duplicate_keys_count = len(problematic_keys)

    print(f"\n[{module_name}]")
    print("Total student records:", total_records)
    print("Unique student keys generated:", unique_keys)
    print("Duplicate student keys:", duplicate_keys_count)
    print("Unique Aadhar numbers:", unique_aadhar)

    return new_df

### 3.1 Student Key Generation (4-Variable)

**Function:** `generate_student_key_four_var()` (4-variable key for CHSE and BSE)

Generate module-specific student keys using 4 variables:
- **CHSE:** roll_no + passing_year + exam_board + exam_type
- **BSE:** roll_no + dob + passing_year + exam_board

In [102]:
def generate_student_key_four_var(df: pd.DataFrame, module_name: str) -> pd.DataFrame:
    """
    Generate a standardized 4-variable student key (`student_key_4_var`) 
    for identity matching across datasets.

    The key is built using module-specific rules:
    - CHSE (Higher Secondary): roll_no_decrypted + passing_year + exam_board + exam_type
    - BSE  (Secondary):        roll_no_decrypted + dob + passing_year + exam_board
    - DEG  (Degree):           roll_no_decrypted + passing_year + exam_board + exam_type
    - HSS  (Higher Secondary): roll_no_decrypted + dob + passing_year + exam_board

    All fields are normalized (lowercase, stripped) before concatenation.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing student records.
    module_name : str
        Module name ("CHSE", "BSE", "DEG", or "HSS")

    Returns
    -------
    pd.DataFrame
        DataFrame with a new column `student_key_4_var`
    """

    new_df = df.copy()
    module = module_name.upper()

    # Select key components based on module
    if module in ["CHSE"]:
        key_parts = ["roll_no_decrypted", "passing_year", "exam_board", "exam_type"]
    elif module in ["BSE"]:
        key_parts = ["roll_no_decrypted", "dob", "passing_year", "exam_board"]
    else:
        raise ValueError(f"Invalid module '{module_name}'. Use 'CHSE', 'BSE', 'DEG', or 'HSS'.")

    # Normalize fields
    for col in key_parts:
        new_df[col] = (
            new_df[col].astype(str).fillna("").str.strip().str.lower()
            if col in new_df.columns else ""
        )

    # Create composite key
    new_df["student_key_4_var"] = new_df[key_parts].agg("_".join, axis=1)

    # Summary
    print(f"\n[{module}] Student Key (4-var) Summary")
    print("Total records:", len(new_df))
    print("Unique keys:", new_df["student_key_4_var"].nunique())

    return new_df

In [17]:
iti_key_df = generate_student_key_df(iti_df, "ITI")
diploma_key_df = generate_student_key_df(diploma_df, "Diploma")
hss_key_df = generate_student_key_df(hss_df, "HSS")
deg_key_df = generate_student_key_df(deg_df, "DEG")


[ITI]
Total student records: 559575
Unique student keys generated: 524796
Duplicate student keys: 1807
Unique Aadhar numbers: 518024

[Diploma]
Total student records: 445850
Unique student keys generated: 414078
Duplicate student keys: 4285
Unique Aadhar numbers: 392904

[Diploma]
Total student records: 445850
Unique student keys generated: 414078
Duplicate student keys: 4285
Unique Aadhar numbers: 392904

[DEG]
Total student records: 2054491
Unique student keys generated: 1634565
Duplicate student keys: 1917
Unique Aadhar numbers: 1506304

[DEG]
Total student records: 2054491
Unique student keys generated: 1634565
Duplicate student keys: 1917
Unique Aadhar numbers: 1506304


In [18]:
hss_key_df = generate_student_key_df(hss_df, "HSS")


[HSS]
Total student records: 3453401
Unique student keys generated: 2961052
Duplicate student keys: 5011
Unique Aadhar numbers: 2650769


### 3.2 Merge Enrollment with Applications

**Function:** `merge_enrollment_applications()`

Merge application-level data with student identity information from enrollment records using barcode as the key. Keeps all application rows 

In [146]:
def merge_enrollment_applications(enroll_df, app_df):
    """
    Merge application rows with student identity columns using barcode.
    Keeps all application rows and adds student info from enrollment table.
    """

    # Columns we want from enrollment (as you listed)
    enroll_cols = [
        "barcode", "student_name", "aadhar_no", "dob", "module",
        "academic_year", "exam_board", "exam_type", "passing_year",
        "roll_no", "roll_no_decrypted", "student_key"
    ]
    
    enroll_reduced = enroll_df[enroll_cols].copy()

    # Only barcode from applications
    app_reduced = app_df[["barcode"]].copy()

    # Merge keeping all application rows
    merged = app_reduced.merge(enroll_reduced, on="barcode", how="left")

    return merged

In [147]:
hss_df2 = merge_enrollment_applications(hss_key_df, hss_applications)
deg_df2 = merge_enrollment_applications(deg_key_df, deg_applications)

## 4. Generate Keys and Calculate Summaries for BSE and CHSE

Apply student key generation functions to all modules and calculate year-wise summaries of total observations and unique keys.

In [None]:
bse_df = process_roll_numbers_len_format(bse_df)
chse_df = process_roll_numbers_len_format(chse_df)
bse_key_df = generate_student_key_four_var(bse_df, "BSE")
chse_key_df = generate_student_key_four_var(chse_df, "CHSE")


[BSE] Student Key (4-var) Summary
Total records: 1617255
Unique keys: 1603360
Unique keys: 1603360

[CHSE] Student Key (4-var) Summary
Total records: 1963873

[CHSE] Student Key (4-var) Summary
Total records: 1963873
Unique keys: 1963873
Unique keys: 1963873


In [None]:
# count the total number of students acrooss all years for the both dtaset and check for unique student_key
# table : Year | Num. observations | Num. unique key
bse_summary = (
    bse_key_df.groupby("academic_year")
    .agg(
        student_count=("student_name", "size"),
        num_unique_keys=("student_key_4_var", "nunique")
    )
    .reset_index()
)
bse_summary

Unnamed: 0,academic_year,num_observations,num_unique_keys
0,2023,541391,537314
1,2024,563426,553608
2,2025,512438,512438


In [None]:
chse_summary = (
    chse_key_df.groupby("academic_year")
    .agg(
        student_count=("student_name", "size"),
        num_unique_keys=("student_key_4_var", "nunique")
    )
    .reset_index()
)
chse_summary

Unnamed: 0,academic_year,student_count,num_unique_keys
0,2020,306230,306230
1,2021,295178,295178
2,2022,301650,301650
3,2023,338220,338220
4,2024,357825,357825
5,2025,364770,364770


## 5. Yearly Summaries (All Modules)

Calculate unique Aadhaar and unique key counts across all academic years for HSS, DEG modules.

In [152]:
hss_df2.columns

Index(['barcode', 'student_name', 'aadhar_no', 'dob', 'module',
       'academic_year', 'exam_board', 'exam_type', 'passing_year', 'roll_no',
       'roll_no_decrypted', 'student_key'],
      dtype='object')

In [163]:
# HSS and DEG 
hss = hss_df2.groupby("academic_year").agg(
    hss_app=("academic_year", "size"),
    hss_aadhar=("aadhar_no", "nunique"),
    hss_key=("student_key", "nunique"),
).reset_index()

deg = deg_df2.groupby("academic_year").agg(
    deg_app=("academic_year", "size"),
    deg_aadhar=("aadhar_no", "nunique"),
    deg_key=("student_key", "nunique"),
).reset_index()

# Final merged table
hss_deg_stats = hss.merge(deg, on="academic_year")
hss_deg_stats["academic_year"] = hss_deg_stats["academic_year"].astype("Int64")
hss_deg_stats

Unnamed: 0,academic_year,hss_app,hss_aadhar,hss_key,deg_app,deg_aadhar,deg_key
0,2018,2237155,367579,410423,1519247,204304,246871
1,2019,2060027,355307,387103,1669860,200209,224421
2,2020,2280404,345900,394490,1594363,191842,221414
3,2021,2591931,375367,446480,2176487,250724,257168
4,2022,2772098,407112,466993,3199753,256801,268724
5,2023,3110640,419134,480728,2233931,236544,242242
6,2024,6083906,473096,487117,2246432,262392,268466


### 5.1 Clean Aadhaar Summary (HSS & DEG)

**Function:** `yearly_clean_aadhar_summary()`

Generate comprehensive yearly metrics for HSS and DEG modules:
- Total Aadhaar entries (before cleaning)
- Unique Aadhaar (after excluding null/placeholder values)
- Missing Aadhaar count
- 1-by-1 matches (clean identity matching)

**Logic:**
- Define bad values: empty strings, 'na', 'null', 'nan', etc.
- Clean and normalize Aadhaar and student_key fields
- Remove bad values and deduplicate pairs
- Calculate one-to-one matches where both Aadhaar and key appear exactly once

In [None]:
def yearly_clean_aadhar_summary(df, prefix):

    # fields we need
    data = df[['academic_year', 'aadhar_no', 'student_key']].copy()

    # Normalize strings (trim + lowercase)
    for col in ['academic_year', 'aadhar_no', 'student_key']:
        data[col] = data[col].astype(str).str.strip().str.lower()

    # Define what counts as an invalid entry
    invalid_entries = {"", "null", "nan"}  

    def summarize(year_df):

        total_rows = len(year_df)
        invalid_aadhar = year_df['aadhar_no'].isin(invalid_entries).sum()

        # keep only rows with valid pairs
        valid_rows = year_df[
            (~year_df['aadhar_no'].isin(invalid_entries)) &
            (~year_df['student_key'].isin(invalid_entries))
        ].drop_duplicates(subset=['aadhar_no', 'student_key'])

        unique_aadhar = valid_rows['aadhar_no'].nunique()

        # identify one-to-one Aadhaar <-> student_key pairs
        aadhar_counts = valid_rows['aadhar_no'].value_counts()
        key_counts = valid_rows['student_key'].value_counts()

        one_to_one = valid_rows[
            valid_rows['aadhar_no'].isin(aadhar_counts[aadhar_counts == 1].index) &
            valid_rows['student_key'].isin(key_counts[key_counts == 1].index)
        ]

        return pd.Series({
            f"{prefix}_total_aadhar": total_rows,
            f"{prefix}_unique_aadhar": unique_aadhar,
            f"{prefix}_invalid_aadhar": invalid_aadhar,
            f"{prefix}_one_to_one": len(one_to_one)
        })

    summary = (
        data.groupby("academic_year", group_keys=False)
            .apply(summarize)
            .reset_index()
            .rename(columns={"academic_year": "year"})
    )

    return summary


# Run for HSS & DEG
hss_clean = yearly_clean_aadhar_summary(hss_key_df, "hss")
deg_clean = yearly_clean_aadhar_summary(deg_key_df, "deg")

hss_deg_total_unique_invalid_aadhar_1by1match = (
    hss_clean.merge(deg_clean, on="year", how="outer")
             .sort_values("year")
)

hss_deg_total_unique_invalid_aadhar_1by1match

## 6. Data Quality Analysis

Assess Aadhaar coverage, identify repeated entries, and evaluate identity matching quality across all modules.

In [49]:
# repeat Aadhaar check by academic year
def repeated_aadhaar_summary(df, module_name):
    print(f"\n--- {module_name.upper()}: Repeated Aadhaar by Year ---")
    repeat_summary = (
        df.groupby(['academic_year', 'aadhar_no'])
          .size()
          .reset_index(name='count')
          .query('count > 5')  # Only keep Aadhaar appearing more than five times
          .sort_values(['academic_year', 'count'], ascending=[True, False])
    )
    print(repeat_summary.to_string(index=False))

# Run for ITI and Diploma
repeated_aadhaar_summary(iti_enrollments, "ITI ENROLLMENT")


--- ITI ENROLLMENT: Repeated Aadhaar by Year ---
 academic_year                                    aadhar_no  count
          2017 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=   3181
          2018 UIC27nlODwzAwV13RAZD1vk8kiSxo2GLRDviArS4Ktg=     10
 academic_year                                    aadhar_no  count
          2017 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=   3181
          2018 UIC27nlODwzAwV13RAZD1vk8kiSxo2GLRDviArS4Ktg=     10


### 6.1 Repeated Aadhaar Detection

**Function:** `repeated_aadhaar_summary()`

Identify Aadhaar numbers appearing more than 5 times within the same academic year (potential data quality issues or shared/placeholder values).

In [50]:
repeated_aadhaar_summary(diploma_enrollments, "Diploma ENROLLMENT")


--- DIPLOMA ENROLLMENT: Repeated Aadhaar by Year ---
 academic_year                                    aadhar_no  count
          2018 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=  10118
          2019 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=  13183
 academic_year                                    aadhar_no  count
          2018 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=  10118
          2019 47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=  13183


In [127]:
def missing_from_enrollments(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    # Group by year + Aadhaar and count occurrences
    repeats = (
        df.groupby(['academic_year', 'aadhar_no'])
          .size()
          .reset_index(name='cnt')
    )

    # Keep rows where Aadhaar repeats (>1)
    hashed = (
        repeats[repeats['cnt'] > 1]
            .groupby('academic_year')['cnt']
            .sum()
            .reset_index()
            .rename(columns={'cnt': f'{prefix}_missing_aadhar'})
    )

    return hashed

In [128]:
iti_missing = missing_from_enrollments(iti_key_df, 'iti')
dip_missing = missing_from_enrollments(diploma_key_df, 'diploma')

In [129]:
print('ITI missing (hashed):')
print(iti_missing.head(7).to_string(index=False))

print('\nDiploma missing (hashed):')
print(dip_missing.head(7).to_string(index=False))


ITI missing (hashed):
 academic_year  iti_missing_aadhar
          2017               10949
          2018                8840
          2020                   4

Diploma missing (hashed):
 academic_year  diploma_missing_aadhar
          2018                   12079
          2019                   13329
          2020                     107


### 6.2 Unreliable Aadhaar Index

**Function:** `build_unreliable_aadhaar_index()`

**Logic:**
- Group by year and Aadhaar to count occurrences
- Flag Aadhaar values exceeding threshold or matching known placeholders
- Return DataFrame with unreliable Aadhaar marked

In [137]:
KNOWN_AADHAAR_PLACEHOLDERS = {
    '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=',  # common placeholder
}

def build_unreliable_aadhaar_index(df: pd.DataFrame,
                                   per_year_threshold: int = 100,
                                   known_placeholders: set[str] | None = None) -> pd.DataFrame:

    known_placeholders = known_placeholders or set()

    # Count occurrences per year
    counts = (
        df.groupby(['academic_year', 'aadhar_no'])
          .size()
          .reset_index(name='cnt')
    )

    # Aadhaar values that repeat above threshold
    heavy = counts[counts['cnt'] >= per_year_threshold][['academic_year', 'aadhar_no']]

    # Aadhaar values that match known placeholder hashes
    known = counts[counts['aadhar_no'].isin(known_placeholders)][['academic_year', 'aadhar_no']]

    # Combine
    out = pd.concat([heavy, known]).drop_duplicates()
    out['unreliable'] = True

    return out


**Function:** `summarize_clean_aadhaar()`

Calculate yearly metrics after excluding unreliable Aadhaar values:
- `unique_aadhar`: Count of unique reliable Aadhaar numbers
- `supposed_missing`: Count of unreliable/placeholder Aadhaar
- `non_unique_aadhar`: Count of Aadhaar that still repeat among reliable rows

**Note:** This approach has limitations. Use `yearly_clean_aadhar_summary()` for more accurate analysis.

In [None]:
def summarize_clean_aadhaar(df: pd.DataFrame,
                            prefix: str,
                            unreliable_idx: pd.DataFrame) -> pd.DataFrame:

    data = df[['academic_year', 'aadhar_no']].copy()

    # Mark unreliable Aadhaar values
    if unreliable_idx is not None and not unreliable_idx.empty:
        data = data.merge(unreliable_idx, on=['academic_year', 'aadhar_no'], how='left')
        data['unreliable'] = data['unreliable'].fillna(False)
    else:
        data['unreliable'] = False

    data['supposed_missing'] = data['unreliable']

    # Keep only reliable Aadhaar rows
    reliable_rows = data[data['unreliable'] == False]

    # Count unique reliable Aadhaar per year
    unique_count = (
        reliable_rows.groupby('academic_year')['aadhar_no']
                     .nunique()
                     .rename(f'{prefix}_unique_aadhar')
    )

    # Count Aadhaar that STILL repeat among reliable rows
    repeated_count = (
        reliable_rows.groupby(['academic_year', 'aadhar_no'])
                     .size()
                     .reset_index(name='cnt')
                     .query('cnt > 1')
                     .groupby('academic_year')
                     .size()
                     .rename(f'{prefix}_non_unique_aadhar')
    )

    # Count unreliable rows per year
    missing_count = (
        data.groupby('academic_year')['supposed_missing']
            .sum()
            .rename(f'{prefix}_supposed_missing')
    )

    # Combine results
    summary = (
        pd.concat([unique_count, missing_count, repeated_count], axis=1)
          .reset_index()
          .fillna(0)
    )

    # Ensure numeric columns are int
    for col in summary.columns:
        if col != 'academic_year':
            summary[col] = summary[col].astype(int)

    return summary


### 6.3 Yearly summary


In [38]:
# Yearly summary for ITI & Diploma with application counts
def create_yearly_summary(enrollment_df, application_df):
    """
    Generate yearly summary combining enrollment and application data.
    
    Parameters:
    - enrollment_df: DataFrame with student_key and aadhar_no (from key generation)
    - application_df: DataFrame with barcode for counting total applications
    """
    
    # Count total applications by year
    app_counts = (
        application_df.groupby('academic_year')
        .size()
        .reset_index(name='total_applications')
    )
    
    # Calculate Aadhaar and 1-by-1 match metrics from enrollment data
    def _metrics(g):
        mapping = g[['aadhar_no', 'student_key']].drop_duplicates()

        # Find unique Aadhaar and unique keys within the year
        unique_aadhar = mapping['aadhar_no'].value_counts() == 1
        unique_key = mapping['student_key'].value_counts() == 1

        # Count 1-by-1 matches (unique Aadhaar to unique key)
        one_to_one = mapping[
            mapping['aadhar_no'].isin(unique_aadhar[unique_aadhar].index) &
            mapping['student_key'].isin(unique_key[unique_key].index)
        ]

        return pd.Series({
            'aadhar': g['aadhar_no'].nunique(),
            '1by1_match': len(one_to_one),
        })

    enroll_summary = (
        enrollment_df.groupby('academic_year', group_keys=False)
          .apply(_metrics)
          .reset_index()
    )
    
    # Merge application counts with enrollment metrics
    summary = app_counts.merge(enroll_summary, on='academic_year', how='outer')
    
    return summary

### 6.4 One-to-One Match Quality (ITI & Diploma)

**Function:** `create_yearly_summary()`

Calculate yearly identity matching quality metrics:
- Total applications by year
- Unique Aadhaar count
- 1-by-1 matches (one Aadhaar maps to exactly one student key)

**Logic:**
- Count applications from application DataFrame
- Deduplicate (Aadhaar, student_key) pairs within each year
- Identify pairs where both Aadhaar and student_key appear exactly once
- Merge application counts with enrollment metrics

In [39]:
# Create summaries for ITI and Diploma
iti_summary = create_yearly_summary(iti_key_df, iti_applications)
diploma_summary = create_yearly_summary(diploma_key_df, diploma_applications)

final_summary = (
    iti_summary.merge(
        diploma_summary, on='academic_year',how='outer', suffixes=('_iti', '_diploma'))
    )

# Rename columns for clarity
final_summary = final_summary.rename(columns={
    'academic_year': 'year',
    'total_applications_iti': 'iti_applications',
    'aadhar_iti': 'iti_aadhar',
    '1by1_match_iti': 'iti_1by1_match',
    'total_applications_diploma': 'diploma_applications',
    'aadhar_diploma': 'diploma_aadhar',
    '1by1_match_diploma': 'diploma_1by1_match',
})

# Final column ordering
final_summary = final_summary[
    [
        'year',
        'iti_applications', 'iti_aadhar', 'iti_1by1_match',
        'diploma_applications', 'diploma_aadhar', 'diploma_1by1_match', 
    ]
]

# Print total applications across all years
print(f"Total ITI Applications: {final_summary['iti_applications'].sum():,}")
print(f"Total Diploma Applications: {final_summary['diploma_applications'].sum():,}\n")

final_summary

  .apply(_metrics)


Total ITI Applications: 2,218,985
Total Diploma Applications: 1,552,260.0



  .apply(_metrics)


Unnamed: 0,year,iti_applications,iti_aadhar,iti_1by1_match,diploma_applications,diploma_aadhar,diploma_1by1_match
0,2017,145612,27337,26065,,,
1,2018,215662,70613,69617,203293.0,49934.0,48543.0
2,2019,261505,64148,64119,186285.0,41590.0,41484.0
3,2020,208581,67412,67407,181710.0,52959.0,52868.0
4,2021,300121,68000,67974,175738.0,54979.0,54949.0
5,2022,303082,74104,74077,244165.0,71768.0,71747.0
6,2023,388380,92085,92069,270782.0,73513.0,73503.0
7,2024,396042,83958,83958,290287.0,76694.0,76682.0


### 6.5 Unique Key count for ITI and Diploma

In [None]:
iti = iti_key_df.groupby('academic_year').agg(
    unique_key=('student_key', 'nunique')
).reset_index()

diploma = diploma_key_df.groupby('academic_year').agg(
    unique_key=('student_key', 'nunique')
).reset_index()

iti.merge(diploma, on='academic_year', suffixes=('_iti', '_diploma')).rename(columns={'academic_year': 'year'})

Unnamed: 0,year,unique_key_iti,unique_key_diploma
0,2018,71194,58826
1,2019,64133,54726
2,2020,67413,52982
3,2021,67987,54964
4,2022,74090,71757
5,2023,92077,73508
6,2024,83958,76688


## 5. Methodology Summary

Overview of the data quality validation approach and key concepts used in this analysis.

---

### 1. Student Key Generation

**Main_goal:** Create unique key for students across different module and check for the 1by1 match with present aadhar

**Approach:**
- Concatenate multiple fields to create a composite key
- Fields used: name + roll_no + dob + passing_year + exam_board + exam_type
- All fields are normalized (lowercase, stripped) for consistency
- Missing values are handled consistently with "MISSING" or "NA" labels


### 2. Roll Number Decryption and Validation

**Process:**
1. **Decryption:** Use AES encryption with ECB mode to decrypt Base64-encoded roll numbers
2. **Validation:** Apply board-specific length rules:
   - BSE (10th standard): Must be exactly 9 digits
   - CHSE (12th standard): Must be exactly 8 digits
3. **Invalid handling:** Mark invalid rolls as "NA" to prevent incorrect matches


### 3. Aadhaar Quality Assessment


**Problems with Aadhaar data:**
- **Missing values:** Same Aadhaar used for multiple students (hashed placeholders)

**Solution - Unreliable Aadhaar Index:**
- Flag Aadhaar values appearing >100 times in a single year 

**Metrics calculated:**
- `unique_aadhar`: Distinct Aadhaar after excluding unreliable ones
- `supposed_missing`: Count of null or unreliable Aadhaar
- `non_unique_aadhar`: Count of Aadhaar IDs still appearing multiple times (WHICH WAS WRONG CONCEPT TO CALCULATE)

### 4. One-to-One (1-by-1) Matching


**Definition:**
A 1-by-1 match occurs when:
- One Aadhaar number maps to exactly one student key (within a year)

**Algorithm:**
1. Create mapping of (Aadhaar, student_key) pairs
2. Count occurrences of each Aadhaar and each student_key
3. Keep only pairs where both appear exactly once

### 5. Yearly Aggregation Strategy

**Key metrics tracked yearly:**
- Applications: Total application records
- Unique Aadhaar: Distinct valid Aadhaar numbers
- Unique Keys: Distinct student identifiers generated
- 1-by-1 Matches: Clean identity matches
- Missing/Unreliable: Data quality issues


### 6. Module-Specific Considerations

**ITI (Industrial Training Institute):**
- Higher missing Aadhaar rates expected, the hash one, loaded from db

**Diploma:**
- Filtered to 10th pass students only, loaded from db

**HSS (Higher Secondary School):**
**DEG (Degree):**
- Both has been imported from pipeline and then merge enrollment to applications dataset on the barcode to include all columns from enrollments dataset at application level

---
