# HSS and Degree Enrollment Summary Analysis

This notebook analyzes enrollment data for Higher Secondary School (HSS) and Degree (DEG) students, focusing on:
- Data quality assessment (Aadhar completeness)
- Student identity matching across datasets
- 1:1 relationship validation between Aadhar numbers and student keys

**Data Sources:**
- `hss_enrollments`: Higher Secondary School enrollment records
- `deg_enrollments`: Degree program enrollment records

In [1]:
# Standard libraries
import os
import json
import sqlite3
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

from sams.config import datasets
from sams.utils import load_data

from IPython.display import display_html

import duckdb
import importlib
import re

[32m2025-11-12 11:13:06.987[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m15[0m - [1mPROJ_ROOT path is: C:\Users\Admin\Documents\GitHub\sams[0m
[32m2025-11-12 11:13:07.081[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m


## 1. Setup and Data Loading

Load the Degree (DEG) and Higher Secondary School (HSS) enrollment datasets. The required libraries are imported in the first code cell above.

In [2]:
# Load the interim enrollment tables: Degree first, then HSS (same order as before).
deg_enrollments, hss_enrollments = (
    load_data(datasets[dataset_name])
    for dataset_name in ("deg_enrollments", "hss_enrollments")
)

[32m2025-11-12 11:13:38.389[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_enrollments.pq[0m
[32m2025-11-12 11:14:16.685[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_enrollments.pq[0m


## 2. Roll Number Decryption

Decrypt and validate roll numbers from BSE (Board of Secondary Education) and CHSE (Council of Higher Secondary Education) Odisha.

In [4]:
# code for decryption
from base64 import b64decode
from Crypto.Cipher import AES

def decrypt_roll(enc_text: str,
                 key: bytes = b"y6idXfCVRG5t2dkeBnmHy9jLu6TEn5Du",
                 enforce_min_length: bool = False,
                 min_length: int = None) -> str:
    """
    Decrypt a base64-encoded, AES-ECB-encrypted roll number.

    The last byte of the decrypted payload is read as the padding length
    (must be 1-16) and that many trailing bytes are dropped. Only the last
    byte is inspected; the individual padding bytes are not verified.

    Parameters
    ----------
    enc_text : str
        Base64 text of the encrypted roll number. Empty values and
        non-string values yield "NA".
    key : bytes
        AES key passed to ``AES.new``.
    enforce_min_length : bool
        When True (and ``min_length`` is given), decrypted values shorter
        than ``min_length`` are rejected. Defaults to False, which keeps
        every successfully decrypted value.
    min_length : int, optional
        Minimum accepted length of the decrypted roll number. Only used
        when ``enforce_min_length`` is True.

    Returns
    -------
    str
        The decrypted, whitespace-stripped roll number, or the sentinel
        string "NA" when the input is empty/non-string, decryption or
        decoding fails, the padding length is out of range, or the
        minimum-length rule rejects the value.
    """
    # NOTE(security): the default key is hard-coded in the notebook. It is kept
    # only so existing calls keep working; prefer passing `key` explicitly from
    # an environment variable or secrets store and removing it from source.
    try:
        # Kept inside the `try` on purpose: truth-testing exotic values
        # (e.g. pd.NA) raises, and those must map to "NA" as well.
        if not enc_text or not isinstance(enc_text, str):
            return "NA"

        raw = b64decode(enc_text)
        decrypted = AES.new(key, AES.MODE_ECB).decrypt(raw)

        pad_len = decrypted[-1]
        if pad_len < 1 or pad_len > 16:
            return "NA"

        roll_no = decrypted[:-pad_len].decode("utf-8").strip()
    except Exception:
        # Best-effort by design: this runs row-wise over whole columns, so a
        # single undecodable value must not abort the run.
        return "NA"

    # Previously these two parameters were accepted but never applied.
    if enforce_min_length and min_length is not None and len(roll_no) < min_length:
        return "NA"

    return roll_no

In [8]:
def process_roll_numbers_len_format(df: pd.DataFrame, roll_col: str = 'roll_no') -> pd.DataFrame:
    """
    Decrypt roll numbers and validate them by a board-specific length rule.

    - BSE Odisha: decrypted roll number must have length 9
    - CHSE Odisha: decrypted roll number must have length 8
    - Other boards: the decrypted roll number is kept as-is

    Values failing the rule are replaced by the string 'NA'.

    Parameters
    ----------
    df : pd.DataFrame
        Enrollment records. Must contain `roll_col` and
        'examination_board_of_the_highest_qualification'.
    roll_col : str
        Name of the column holding the encrypted roll numbers.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with all columns of `df` plus `roll_no_decrypted`.
        The input frame is not modified, so re-running this cell cannot
        change data that earlier cells already loaded or displayed.
    """
    # Required decrypted length per Odisha board.
    bse_length = 9
    chse_length = 8

    decrypted = df[roll_col].map(decrypt_roll)

    # Upper-cased board names; missing boards become "NA" and match no pattern.
    board = df['examination_board_of_the_highest_qualification'].fillna("NA").str.upper()

    is_bse = (
        board.str.contains(r'\bBOARD OF SECONDARY EDUCATION,\s*ODISHA\b', regex=True, na=False)
        | (
            board.str.contains(r'\bBSE\b(?! MADHYAMA).*ODISHA\b', regex=True, na=False)
            # Other boards whose abbreviation contains "BSE" must not match.
            & ~board.str.contains(r'\bICSE\b|\bCBSE\b', regex=True, na=False)
        )
    )
    is_chse = (
        board.str.contains(r'\bCOUNCIL OF HIGHER SECONDARY EDUCATION,\s*ODISHA\b', regex=True, na=False)
        | board.str.contains(r'\bCHSE\b.*ODISHA\b', regex=True, na=False)
    )

    # All masks below share df's index, so no implicit index alignment is
    # needed (the previous subset-based mask relied on it).
    lengths = decrypted.astype(str).str.len()
    invalid = decrypted.notna() & (
        (is_bse & lengths.ne(bse_length))
        | (is_chse & lengths.ne(chse_length))
    )
    decrypted = decrypted.mask(invalid, 'NA')

    return df.assign(roll_no_decrypted=decrypted)

### Validate Roll Numbers by Length

Apply board-specific validation rules:
- **BSE Odisha**: Roll number must be 9 characters
- **CHSE Odisha**: Roll number must be 8 characters

In [9]:
# Decrypt and length-validate roll numbers; each result carries a `roll_no_decrypted` column.
hss_df = process_roll_numbers_len_format(hss_enrollments, roll_col='roll_no')
deg_df = process_roll_numbers_len_format(deg_enrollments, roll_col='roll_no')

### Process Both Datasets

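The cell above decrypts and validates roll numbers for both the HSS and DEG datasets. The next cell defines `encode_part`, a helper that normalizes missing/NA values in student-key components.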

In [10]:
def encode_part(s: pd.Series, *, na_label="NA", missing_label="MISSING", lower=False) -> pd.Series:
    """
    Encode one component of a student key, handling missing/NA values consistently.

    Each value is converted to text, stripped of surrounding whitespace and
    of surrounding double/single quotes, then stripped of whitespace again so
    that padding inside the quotes does not survive.

    Parameters
    ----------
    s : pd.Series
        Raw values of one key component.
    na_label : str
        Replacement for values whose cleaned text is exactly "NA".
    missing_label : str
        Replacement for null values and for values that are empty after cleaning.
    lower : bool
        When True, lower-case every value except the two labels.

    Returns
    -------
    pd.Series
        Cleaned text values with the same index as `s`.
    """
    # Capture nulls first: astype(str) may turn them into text such as "nan".
    is_null = s.isna()

    cleaned = s.astype(str).str.strip()
    # The final strip removes whitespace exposed by removing the quotes, so
    # '" x "' becomes 'x' and '" "' counts as empty regardless of `lower`.
    cleaned = cleaned.str.strip('"').str.strip("'").str.strip()

    # Replace explicit NA markers, then empty/null values.
    out = cleaned.mask(cleaned.eq("NA"), na_label)
    out = out.mask(cleaned.eq("") | is_null, missing_label)

    # Normalize casing if requested, leaving the two labels untouched.
    if lower:
        out = out.where(out.isin([na_label, missing_label]), out.str.lower())

    return out

## 3. Student Key Generation

Create unique composite keys for matching students across datasets based on:
- **HSS**: roll_no + dob + year_of_passing + exam_board
- **DEG**: roll_no + year_of_passing + exam_board + exam_type

In [12]:
def generate_student_key_df(df, module_name: str) -> pd.DataFrame:
    """
    Generate a standardized student key for identity matching across datasets.

    The student key format differs by academic module:
    - DEG (Degree): roll number (decrypted) + passing year + exam board + exam type
    - HSS (Higher Secondary): roll number (decrypted) + date of birth + passing year + exam board

    All components are converted to lowercase strings and stripped of
    surrounding whitespace; missing values become the empty string. The
    components are joined with "_".

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing student records. It is not modified.

    module_name : str
        Academic module name (case-insensitive). Supported values:
        - "DEG" for Degree students
        - "HSS" for Higher Secondary students

    Returns
    -------
    pd.DataFrame
        Copy of `df` with a new column `student_key`. Note that the key
        component columns in the returned frame are overwritten with their
        normalized string form.

    Raises
    ------
    ValueError
        If `module_name` is neither "DEG" nor "HSS".
    KeyError
        If any column required for the key is absent from `df`.
    """
    module_name = module_name.upper()

    if module_name == "DEG":
        key_parts = ["roll_no_decrypted", "year_of_passing", "examination_board_of_the_highest_qualification", "examination_type"]
    elif module_name == "HSS":
        key_parts = ["roll_no_decrypted", "dob", "year_of_passing", "examination_board_of_the_highest_qualification"]
    else:
        raise ValueError(f"Invalid module_name '{module_name}'. Use 'DEG' or 'HSS'.")

    # Fail early with a clear message, before paying for the copy.
    absent = [col for col in key_parts if col not in df.columns]
    if absent:
        raise KeyError(f"Missing key column(s) for module '{module_name}': {absent}")

    new_df = df.copy()

    # Normalize fields. Nulls are detected on the raw column because
    # astype(str) turns them into text such as "nan", which a later
    # fillna("") can no longer see.
    for col in key_parts:
        raw = new_df[col]
        new_df[col] = (
            raw.astype(str)
            .str.strip()
            .str.lower()
            .mask(raw.isna(), "")
        )

    # Build the composite key with a vectorised concatenation instead of a
    # row-wise Python join.
    first, *rest = key_parts
    new_df["student_key"] = new_df[first].str.cat([new_df[col] for col in rest], sep="_")

    return new_df

In [13]:
# Build the module-specific composite keys for each dataset.
hss_key_df = generate_student_key_df(df=hss_df, module_name="HSS")
deg_key_df = generate_student_key_df(df=deg_df, module_name="DEG")

In [14]:
# Inspect the HSS frame's columns; `roll_no_decrypted` and `student_key` should appear at the end.
hss_key_df.columns

Index(['id', 'barcode', 'aadhar_no', 'academic_year', 'module', 'student_name',
       'gender', 'dob', 'social_category', 'orphan', 'es', 'ph', 'address',
       'state', 'district', 'block', 'pin_code', 'annual_income', 'roll_no',
       'highest_qualification', 'board_exam_name_for_highest_qualification',
       'examination_board_of_the_highest_qualification', 'examination_type',
       'year_of_passing', 'total_marks', 'secured_marks', 'percentage',
       'compartmental_status', 'hss_option_details', 'hss_compartments',
       'full_address', 'roll_no_decrypted', 'student_key'],
      dtype='object')

## 4. Enrollment Summary Statistics

### Basic Aadhar Statistics by Year

Summary includes:
- Total enrollments
- Records with Aadhar
- Unique Aadhar numbers
- Missing Aadhar records

In [15]:
# Comprehensive enrollment summary by year
def _aadhar_stats_by_year(enrollments: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """Per-academic-year Aadhar counts, with column names prefixed by `prefix`."""
    return enrollments.groupby("academic_year").agg(
        **{
            f"{prefix}_total": ("aadhar_no", "size"),
            f"{prefix}_with_aadhar": ("aadhar_no", "count"),
            f"{prefix}_unique_aadhar": ("aadhar_no", "nunique"),
            f"{prefix}_missing_aadhar": ("aadhar_no", lambda x: x.isna().sum()),
        }
    )


hss_stats = _aadhar_stats_by_year(hss_enrollments, "hss")
deg_stats = _aadhar_stats_by_year(deg_enrollments, "deg")

# Combine into one table; years present in only one dataset get zeros.
summary = (
    hss_stats
    .join(deg_stats, how="outer")
    .fillna(0)
    .astype(int)
    .sort_index()
)
summary

Unnamed: 0_level_0,hss_total,hss_with_aadhar,hss_unique_aadhar,hss_missing_aadhar,deg_total,deg_with_aadhar,deg_unique_aadhar,deg_missing_aadhar
academic_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018,439528,387069,367580,52459,312823,247212,204304,65611
2019,411118,375768,355307,35350,254342,228079,200209,26263
2020,431636,374368,345900,57268,242661,209834,191842,32827
2021,498180,403448,375367,94732,264152,259152,250724,5000
2022,574238,491504,420678,82734,379911,374911,256801,5000
2023,560769,479176,419134,81593,289849,284849,236544,5000
2024,537932,520066,473096,17866,310753,305753,262392,5000


### Comprehensive Summary Table

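Combined HSS and DEG statistics by academic year, extended with the number of Aadhar numbers that have a 1:1 match with a student key (criteria described in the section that follows the table).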

In [16]:
# Calculate 1:1 Aadhar to student_key matches by year
def count_1to1_matches(df):
    """
    Count, per academic year, the Aadhar numbers that pair 1:1 with a student key.

    Only rows with a non-null `aadhar_no` are considered. Within a year, an
    Aadhar is counted when it is linked to exactly one distinct `student_key`
    and that key is linked to exactly one distinct Aadhar.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain `academic_year`, `aadhar_no` and `student_key`.

    Returns
    -------
    pd.Series
        Indexed by academic year (in order of first appearance); values are
        the counts. Years with no Aadhar-bearing rows get 0.
    """
    has_aadhar = df['aadhar_no'].notna()
    counts = {}

    for year in df['academic_year'].dropna().unique():
        rows = df[(df['academic_year'] == year) & has_aadhar]
        if rows.empty:
            counts[year] = 0
            continue

        # Distinct partners in each direction of the mapping.
        keys_per_aadhar = rows.groupby('aadhar_no')['student_key'].nunique()
        aadhars_per_key = rows.groupby('student_key')['aadhar_no'].nunique()

        # A row qualifies only when both directions are unambiguous.
        is_one_to_one = (
            rows['aadhar_no'].map(keys_per_aadhar).eq(1)
            & rows['student_key'].map(aadhars_per_key).eq(1)
        )
        counts[year] = rows.loc[is_one_to_one, 'aadhar_no'].nunique()

    return pd.Series(counts)

# Attach the per-year 1:1 match counts to the existing stats (aligned on the academic_year index)
hss_stats['hss_aadhar_1by1_match_with_key'] = count_1to1_matches(hss_key_df)
deg_stats['deg_aadhar_1by1_match_with_key'] = count_1to1_matches(deg_key_df)

# Select the columns in presentation order
hss_ordered = hss_stats.loc[:, [
    'hss_total',
    'hss_with_aadhar',
    'hss_unique_aadhar',
    'hss_missing_aadhar',
    'hss_aadhar_1by1_match_with_key',
]]
deg_ordered = deg_stats.loc[:, [
    'deg_total',
    'deg_with_aadhar',
    'deg_unique_aadhar',
    'deg_missing_aadhar',
    'deg_aadhar_1by1_match_with_key',
]]

summary_with_1to1 = (
    hss_ordered
    .join(deg_ordered, how="outer")
    .fillna(0)
    .astype(int)
    .sort_index()
)
summary_with_1to1

Unnamed: 0_level_0,hss_total,hss_with_aadhar,hss_unique_aadhar,hss_missing_aadhar,hss_aadhar_1by1_match_with_key,deg_total,deg_with_aadhar,deg_unique_aadhar,deg_missing_aadhar,deg_aadhar_1by1_match_with_key
academic_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018,439528,387069,367580,52459,343000,312823,247212,204304,65611,199797
2019,411118,375768,355307,35350,338509,254342,228079,200209,26263,197006
2020,431636,374368,345900,57268,333045,242661,209834,191842,32827,188168
2021,498180,403448,375367,94732,364136,264152,259152,250724,5000,246372
2022,574238,491504,420678,82734,408129,379911,374911,256801,5000,250505
2023,560769,479176,419134,81593,410087,289849,284849,236544,5000,233414
2024,537932,520066,473096,17866,464638,310753,305753,262392,5000,259001


### Aadhar-to-Student Key 1:1 Relationship Analysis

**What this measures:**  
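Count of Aadhar numbers, computed separately for each academic year, that have a perfect 1:1 match with student keys. Records with a missing Aadhar are excluded before counting.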

**1:1 Match Criteria:**
- Each Aadhar maps to exactly ONE unique student key
- That student key maps back to exactly ONE unique Aadhar

This helps identify clean, unambiguous student records where Aadhar serves as a reliable unique identifier.