# BSE/CHSE and HSS/DEG Integration

This notebook integrates student records from BSE (Board of Secondary Education) and CHSE (Council of Higher Secondary Education) with HSS (Higher Secondary School) and DEG (Degree) enrollment data using a student key matching approach.

In [1]:
# Standard libraries
import os
import json
import sqlite3
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

from sams.config import datasets
from sams.utils import load_data

from IPython.display import display_html

import duckdb
import importlib
import re

[32m2025-11-11 16:41:15.567[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m15[0m - [1mPROJ_ROOT path is: C:\Users\Admin\Documents\GitHub\sams[0m
[32m2025-11-11 16:41:15.605[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m
[32m2025-11-11 16:41:15.605[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m


## 1. Database Connection and Exploration

Connect to the SAMS database and verify available tables.

In [2]:
# Resolve the SQLite file location from the project's dataset registry.
db_path = datasets["sams"]["path"]
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    # sqlite_master is SQLite's catalogue; type='table' filters out indexes and views.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    cursor.close()
finally:
    # Release the database handle even if the query above raises.
    conn.close()

print("Tables:", [t[0] for t in tables])

Tables: ['students', 'institutes', 'results']


In [3]:
db_path = datasets["sams"]["path"]

# Row counts per board module and academic year, read from the `results` table.
# ORDER BY makes the row order explicit instead of relying on GROUP BY's incidental ordering.
query = """
SELECT
    module,
    academic_year,
    COUNT(*) AS total_records
FROM results
WHERE module IN ('CHSE', 'BSE')
GROUP BY module, academic_year
ORDER BY module, academic_year
"""

conn = sqlite3.connect(db_path)
try:
    df_summary = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()

# The query reads `results`, so the caption names that table.
print("\nSummary of records in results table for CHSE & BSE")
print(df_summary)


Summary of records in students table for CHSE & BSE
  module  academic_year  total_records
0    BSE           2023         541391
1    BSE           2024         563426
2    BSE           2025         512438
3   CHSE           2020         306230
4   CHSE           2021         295178
5   CHSE           2022         301650
6   CHSE           2023         338220
7   CHSE           2024         357825
8   CHSE           2025         364770


## 2. Extract BSE and CHSE Results

Query student results from the database and prepare separate datasets for BSE and CHSE modules.

In [4]:
db_path = datasets["sams"]["path"]

# Pull every BSE/CHSE result row. `exam_board` is derived from `module`,
# `passing_year` mirrors `academic_year`, and `roll_no_decrypted` is a NULL
# placeholder that is filled in by the decryption step later in the notebook.
query = """
SELECT
    academic_year,
    student_name,
    dob,
    module,
    CASE 
        WHEN module = 'CHSE' THEN 'CHSE, Odisha'
        WHEN module = 'BSE' THEN 'BSE, Odisha'
        ELSE NULL
    END AS exam_board,
    academic_year AS passing_year,    
    roll_no,
    NULL AS roll_no_decrypted,
    exam_type,
    total_marks,
    secured_marks
    
FROM results
WHERE module IN ('CHSE', 'BSE');
"""

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()

# Split into CHSE and BSE datasets (reset_index returns fresh frames, not views of df)
chse_df = df[df["module"] == "CHSE"].reset_index(drop=True)
bse_df  = df[df["module"] == "BSE"].reset_index(drop=True)

In [5]:
# BSE dates of birth are parsed with the "%d-%b-%Y" layout and stored as ISO
# "YYYY-MM-DD" strings. Values that are already ISO formatted are kept as-is, so
# re-running this cell does not blank the column (a second "%d-%b-%Y" parse of
# ISO text would coerce every value to NaT).
_dob_raw = bse_df["dob"]
_dob_from_source = pd.to_datetime(_dob_raw, format="%d-%b-%Y", errors="coerce")
_dob_from_iso = pd.to_datetime(_dob_raw, format="%Y-%m-%d", errors="coerce")
bse_df["dob"] = _dob_from_source.fillna(_dob_from_iso).dt.strftime("%Y-%m-%d")

# CHSE labels its main sitting 'REGULAR'; recode it to 'annual'.
chse_df['exam_type'] = chse_df['exam_type'].replace({'REGULAR': 'annual'})

## 3. Data Preprocessing

Standardize date formats and examination types across datasets.

In [6]:
# Preview the first BSE rows.
# NOTE(review): this output shows student names and dates of birth -- clear outputs before sharing.
bse_df.head()

Unnamed: 0,academic_year,student_name,dob,module,exam_board,passing_year,roll_no,roll_no_decrypted,exam_type,total_marks,secured_marks
0,2023,ABINASH MOHANTY,2008-07-10,BSE,"BSE, Odisha",2023,BjmySEYFgo9nafRPCvdRxQ==,,,600,382
1,2023,ABINASH MOHAPATRA,2008-01-11,BSE,"BSE, Odisha",2023,aiXYy3pk7k14QxayJv9o0Q==,,,600,242
2,2023,AYUSH PRASAD BEHERA,2007-09-30,BSE,"BSE, Odisha",2023,sraISh0Oycu9CczXy1aPIg==,,,600,404
3,2023,CHANDAN KUMAR BEHERA,2008-03-28,BSE,"BSE, Odisha",2023,N6fBCLrD8TtU9RVyK1D19Q==,,,600,271
4,2023,DIBYAM DEBABRATA PANDA,2008-04-18,BSE,"BSE, Odisha",2023,NTQwIKfe5qVXhYlmpgJqYQ==,,,600,459


In [7]:
# Preview the first CHSE rows; exam_type 'REGULAR' was recoded to 'annual' in the preprocessing cell.
# NOTE(review): the previous comment here described recoding BSE exam_type (regular -> annual,
# missing -> supplementary); no such recoding exists in this notebook -- confirm whether it is needed.
chse_df.head()

Unnamed: 0,academic_year,student_name,dob,module,exam_board,passing_year,roll_no,roll_no_decrypted,exam_type,total_marks,secured_marks
0,2020,LAXMIPRIYA SAHOO,,CHSE,"CHSE, Odisha",2020,JWYDNYx2mEyC7ANeVRFzKA==,,annual,600,266
1,2020,RUPALI CHAND,,CHSE,"CHSE, Odisha",2020,1sD0KMtHBvCxKU70xBP6xQ==,,annual,600,222
2,2020,SUBHASHREE SAMAL,,CHSE,"CHSE, Odisha",2020,ge9u3EzOR+vpHSatNuD1IA==,,annual,600,246
3,2020,SUDEEPA SAHOO,,CHSE,"CHSE, Odisha",2020,tGOeM7RY445B0sPMkoBrew==,,annual,600,245
4,2020,MADHUSMITA DAS,,CHSE,"CHSE, Odisha",2020,NATgaXBHp16tblabaNJHLg==,,annual,600,275


## 4. Roll Number Decryption

Decrypt and validate roll numbers based on board-specific rules:
- **BSE Odisha**: 9-digit roll numbers
- **CHSE Odisha**: 8-digit roll numbers

In [8]:
# code for decryption
from base64 import b64decode
from Crypto.Cipher import AES

def decrypt_roll(enc_text: str,
                 key: bytes = b"y6idXfCVRG5t2dkeBnmHy9jLu6TEn5Du",
                 enforce_min_length: bool = False,
                 min_length: int = None) -> str:
    """
    Decrypt a base64-encoded, AES-ECB-encrypted roll number.

    Parameters
    ----------
    enc_text : str
        Base64 text of the ciphertext. Empty values and non-strings yield "NA".
    key : bytes
        AES key handed to ``AES.new``.
    enforce_min_length : bool
        When True (and ``min_length`` is given), decrypted values shorter than
        ``min_length`` characters are reported as "NA".
    min_length : int, optional
        Minimum accepted length; only consulted when ``enforce_min_length`` is True.

    Returns
    -------
    str
        The decrypted, whitespace-stripped roll number, or the sentinel "NA"
        whenever the value is missing, cannot be decrypted/decoded, carries
        malformed padding, or fails the optional length check.
    """
    # NOTE(security): the default key is hard-coded in the notebook; it should be
    # supplied from configuration or an environment variable instead -- flagged, not changed,
    # to keep the call signature backward compatible.
    try:
        if not enc_text or not isinstance(enc_text, str):
            return "NA"

        raw = b64decode(enc_text)
        cipher = AES.new(key, AES.MODE_ECB)
        decrypted = cipher.decrypt(raw)

        # The last byte states how many padding bytes were appended (PKCS#7 style).
        pad_len = decrypted[-1]
        if pad_len < 1 or pad_len > 16 or pad_len > len(decrypted):
            return "NA"
        # Every padding byte must equal the pad length; otherwise the plaintext is
        # not validly padded and stripping it would return a truncated bogus value.
        if decrypted[-pad_len:] != bytes([pad_len]) * pad_len:
            return "NA"

        roll_no = decrypted[:-pad_len].decode("utf-8").strip()
    except Exception:
        # Deliberate best-effort: this runs via Series.map over whole columns, so any
        # undecodable value is reported as "NA" rather than aborting the run.
        return "NA"

    if enforce_min_length and min_length is not None and len(roll_no) < min_length:
        return "NA"
    return roll_no

In [9]:
def process_roll_numbers_len_format(df: pd.DataFrame, roll_col: str = 'roll_no') -> pd.DataFrame:
    """
    Decrypt the roll-number column and blank out values whose length does not
    fit the issuing Odisha board.

    Length rules:
    - BSE Odisha rows must decrypt to exactly 9 characters
    - CHSE Odisha rows must decrypt to exactly 8 characters
    - rows from any other board keep whatever was decrypted

    The frame is modified in place (column ``roll_no_decrypted`` is written)
    and the same object is returned. Rejected values become the string 'NA'.
    """

    # Step 1: decrypt every roll number
    df['roll_no_decrypted'] = df[roll_col].map(decrypt_roll)

    # Step 2: classify rows by board, using an upper-cased copy of the board name
    boards = df['exam_board'].fillna("NA").str.upper()

    bse_long_name = boards.str.contains(r'\bBOARD OF SECONDARY EDUCATION,\s*ODISHA\b', regex=True)
    bse_short_name = boards.str.contains(r'\bBSE\b(?! MADHYAMA).*ODISHA\b', regex=True)
    central_board = boards.str.contains(r'\bICSE\b|\bCBSE\b', regex=True)
    mask_bse = bse_long_name | (bse_short_name & ~central_board)

    chse_long_name = boards.str.contains(r'\bCOUNCIL OF HIGHER SECONDARY EDUCATION,\s*ODISHA\b', regex=True)
    chse_short_name = boards.str.contains(r'\bCHSE\b.*ODISHA\b', regex=True)
    mask_chse = chse_long_name | chse_short_name

    # Step 3: enforce the per-board length, BSE first and then CHSE
    for board_mask, required_len in ((mask_bse, 9), (mask_chse, 8)):
        if not board_mask.any():
            continue
        has_value = board_mask & df['roll_no_decrypted'].notna()
        candidates = df.loc[has_value, 'roll_no_decrypted'].astype(str)
        length_ok = candidates.str.len() == required_len
        df.loc[board_mask & ~length_ok, 'roll_no_decrypted'] = 'NA'

    return df

In [10]:
# Decrypt and length-validate roll numbers for both boards.
# The helper writes `roll_no_decrypted` into the frame it is given and returns that same frame.
bse_df = process_roll_numbers_len_format(bse_df)
chse_df = process_roll_numbers_len_format(chse_df)

In [11]:
# Check that roll_no_decrypted is now filled in for the first BSE rows.
bse_df.head()

Unnamed: 0,academic_year,student_name,dob,module,exam_board,passing_year,roll_no,roll_no_decrypted,exam_type,total_marks,secured_marks
0,2023,ABINASH MOHANTY,2008-07-10,BSE,"BSE, Odisha",2023,BjmySEYFgo9nafRPCvdRxQ==,001AA0001,,600,382
1,2023,ABINASH MOHAPATRA,2008-01-11,BSE,"BSE, Odisha",2023,aiXYy3pk7k14QxayJv9o0Q==,001AA0002,,600,242
2,2023,AYUSH PRASAD BEHERA,2007-09-30,BSE,"BSE, Odisha",2023,sraISh0Oycu9CczXy1aPIg==,001AA0003,,600,404
3,2023,CHANDAN KUMAR BEHERA,2008-03-28,BSE,"BSE, Odisha",2023,N6fBCLrD8TtU9RVyK1D19Q==,001AA0004,,600,271
4,2023,DIBYAM DEBABRATA PANDA,2008-04-18,BSE,"BSE, Odisha",2023,NTQwIKfe5qVXhYlmpgJqYQ==,001AA0005,,600,459


## 5. Student Key Generation (BSE/CHSE)

Generate unique student identifiers for matching:
- **CHSE**: `roll_no_decrypted` + `passing_year` + `exam_board` + `exam_type`
- **BSE**: `roll_no_decrypted` + `dob` + `passing_year` + `exam_board`

In [12]:
def encode_part(s: pd.Series, *, na_label="NA", missing_label="MISSING", lower=False) -> pd.Series:
    """
    Turn one student-key component into clean text with uniform placeholders.

    Values are cast to str, trimmed, and stripped of surrounding double and
    single quotes. The literal text "NA" is replaced by ``na_label``; empty
    strings and null entries are replaced by ``missing_label``. With
    ``lower=True`` every value that is not one of the two labels is
    lower-cased and trimmed again.
    """
    null_mask = s.isna()

    # Trim whitespace first, then peel off wrapping quotes.
    cleaned = s.astype(str).str.strip().str.strip('"').str.strip("'")

    # Placeholder substitution: explicit "NA" first, then blanks / nulls.
    encoded = cleaned.mask(cleaned.eq("NA"), na_label)
    encoded = encoded.mask(cleaned.eq("") | null_mask, missing_label)

    if not lower:
        return encoded

    # Labels keep their casing; everything else is lower-cased.
    is_label = encoded.isin([na_label, missing_label])
    return encoded.where(is_label, encoded.str.lower().str.strip())

In [13]:
def generate_student_key_df(df, module_name: str) -> pd.DataFrame:
    """
    Generate a student_key for identity matching, based on education board rules.

    Student key components differ by module:
    - CHSE / DEG: roll_no_decrypted + passing_year + exam_board + exam_type
    - BSE  / HSS: roll_no_decrypted + dob + passing_year + exam_board

    DEG and HSS are accepted with the same component lists used for the
    enrollment data, so this definition can serve every call in the notebook.

    Each component is cast to str, stripped and lower-cased; missing values
    (None/NaN) become an empty string rather than the literal text "nan"/"none".
    The normalized text overwrites the component columns in the returned copy.
    Components are joined with "_". The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing student records.

    module_name : str
        Module name ("CHSE", "BSE", "DEG" or "HSS"; case-insensitive).

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with normalized key columns and a new column `student_key`.

    Raises
    ------
    ValueError
        If ``module_name`` is not one of the supported modules.
    KeyError
        If a required key column is absent from ``df``.
    """

    new_df = df.copy()
    module = module_name.upper()

    # Select student key components based on module
    roll_exam_parts = ["roll_no_decrypted", "passing_year", "exam_board", "exam_type"]
    roll_dob_parts = ["roll_no_decrypted", "dob", "passing_year", "exam_board"]
    parts_by_module = {
        "CHSE": roll_exam_parts,
        "DEG": roll_exam_parts,
        "BSE": roll_dob_parts,
        "HSS": roll_dob_parts,
    }
    if module not in parts_by_module:
        raise ValueError(
            f"Invalid module '{module_name}'. Use 'CHSE', 'BSE', 'DEG' or 'HSS'."
        )
    key_parts = parts_by_module[module]

    absent = [col for col in key_parts if col not in new_df.columns]
    if absent:
        raise KeyError(f"Missing key column(s) for module '{module}': {absent}")

    # Normalize key fields. Missing values are blanked based on the ORIGINAL column:
    # calling fillna after astype(str) would be a no-op because NaN has already
    # become the text "nan" by then.
    for col in key_parts:
        raw = new_df[col]
        new_df[col] = raw.astype(str).str.strip().str.lower().where(raw.notna(), "")

    # Build student key (vectorized concatenation instead of a row-wise Python join)
    first, *rest = key_parts
    new_df["student_key"] = new_df[first].str.cat([new_df[col] for col in rest], sep="_")

    # Summary
    print(f"\n[{module_name}] Student Key Summary")
    print("Total records:", len(new_df))
    print("Unique student keys:", new_df["student_key"].nunique())

    return new_df


In [None]:
# Build student keys for both boards.
# NOTE(review): `bse__key_df` has a doubled underscore, unlike `chse_key_df`; the preview
# cell that follows uses the same spelling, so rename both together if this is tidied up.
bse__key_df  = generate_student_key_df(bse_df, "BSE")
chse_key_df = generate_student_key_df(chse_df, "CHSE")


[BSE] Student Key Summary
Total records: 1617255
Unique student keys: 1603360
Unique student keys: 1603360


In [None]:
# Preview the generated BSE keys (key columns are lower-cased by the key builder).
bse__key_df.head()

Unnamed: 0,academic_year,student_name,dob,module,exam_board,passing_year,roll_no,roll_no_decrypted,exam_type,total_marks,secured_marks,student_key
0,2023,ABINASH MOHANTY,2008-07-10,BSE,"bse, odisha",2023,BjmySEYFgo9nafRPCvdRxQ==,001aa0001,,600,382,"001aa0001_2008-07-10_2023_bse, odisha"
1,2023,ABINASH MOHAPATRA,2008-01-11,BSE,"bse, odisha",2023,aiXYy3pk7k14QxayJv9o0Q==,001aa0002,,600,242,"001aa0002_2008-01-11_2023_bse, odisha"
2,2023,AYUSH PRASAD BEHERA,2007-09-30,BSE,"bse, odisha",2023,sraISh0Oycu9CczXy1aPIg==,001aa0003,,600,404,"001aa0003_2007-09-30_2023_bse, odisha"
3,2023,CHANDAN KUMAR BEHERA,2008-03-28,BSE,"bse, odisha",2023,N6fBCLrD8TtU9RVyK1D19Q==,001aa0004,,600,271,"001aa0004_2008-03-28_2023_bse, odisha"
4,2023,DIBYAM DEBABRATA PANDA,2008-04-18,BSE,"bse, odisha",2023,NTQwIKfe5qVXhYlmpgJqYQ==,001aa0005,,600,459,"001aa0005_2008-04-18_2023_bse, odisha"


In [None]:
# Preview the generated CHSE keys.
chse_key_df.head()

Unnamed: 0,academic_year,student_name,dob,module,exam_board,passing_year,roll_no,roll_no_decrypted,exam_type,total_marks,secured_marks,student_key
0,2020,LAXMIPRIYA SAHOO,,CHSE,"chse, odisha",2020,JWYDNYx2mEyC7ANeVRFzKA==,101aa001,annual,600,266,"101aa001_2020_chse, odisha_annual"
1,2020,RUPALI CHAND,,CHSE,"chse, odisha",2020,1sD0KMtHBvCxKU70xBP6xQ==,101aa002,annual,600,222,"101aa002_2020_chse, odisha_annual"
2,2020,SUBHASHREE SAMAL,,CHSE,"chse, odisha",2020,ge9u3EzOR+vpHSatNuD1IA==,101aa003,annual,600,246,"101aa003_2020_chse, odisha_annual"
3,2020,SUDEEPA SAHOO,,CHSE,"chse, odisha",2020,tGOeM7RY445B0sPMkoBrew==,101aa004,annual,600,245,"101aa004_2020_chse, odisha_annual"
4,2020,MADHUSMITA DAS,,CHSE,"chse, odisha",2020,NATgaXBHp16tblabaNJHLg==,101aa005,annual,600,275,"101aa005_2020_chse, odisha_annual"


## 6. Load Enrollment and Application Data

Load HSS and DEG enrollment and application datasets for integration.

In [None]:
# Load the four enrollment/application tables through the project loader,
# using the entries registered in `datasets`.
deg_enrollments = load_data(datasets["deg_enrollments"]) 
hss_enrollments = load_data(datasets["hss_enrollments"])
deg_applications = load_data(datasets["deg_applications"])
hss_applications = load_data(datasets["hss_applications"])

[32m2025-11-11 15:59:11.954[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_enrollments.pq[0m
[32m2025-11-11 15:59:36.108[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_enrollments.pq[0m
[32m2025-11-11 16:00:22.026[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_applications.pq[0m
[32m2025-11-11 16:00:34.501[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_applications.pq[0m


In [None]:
# Preview the raw DEG enrollment table.
# NOTE(review): this output includes addresses and dates of birth -- clear outputs before sharing.
deg_enrollments.head()

Unnamed: 0,id,barcode,student_name,gender,religion_name,dob,annual_income,address,state,district,...,examination_board_of_the_highest_qualification,examination_type,year_of_passing,roll_no,total_marks,secured_marks,percentage,compartmental_status,deg_option_details,deg_compartments
0,1,18D000005,NHqQtL6JHS7ARG8lWmmkMwYwif+rxScfdlndd1YewK0=,Male,HINDUISM,2000-12-28,"0 - 1,50,000","L-94-B,PHASE-V,DUMUDUMA,BBSR",Odisha,Khurda,...,"CHSE, Odisha",Annual,2018,rFXFPmSkII4WoQCD2iWmGA==,600,419.0,69.83,False,"[{""ReportedInstitute"": ""Basic Science & Humani...",[]
1,2,18D000009,lWnj0IKMRMq3Kmp7BxdjD+xbFH0/HAcPKwT5KGctG84=,Female,HINDUISM,2001-01-15,"0 - 1,50,000","AT-BHURUNGASOLE, PO-NUHAJHALIA, VIA-KOSTHA, PS...",Odisha,Mayurbhanj,...,"CHSE, Odisha",Annual,2018,Uyol5uArJToaCcdfyul0zw==,600,478.0,79.67,False,"[{""ReportedInstitute"": ""Panchayat Samiti (Deg...",[]
2,3,18D000014,rdikqPCn/ufEdn4kV626pMLQr9zLQ2qZGRcjbjEDouY=,Female,HINDUISM,2001-11-03,"0 - 1,50,000","AT/PO- KAMARDA, VIA- KAMARDA",Odisha,Balasore,...,"CHSE, Odisha",Annual,2018,mj1D3ndx5risDBRaSvL/qw==,600,230.0,38.33,False,"[{""ReportedInstitute"": ""Kamarda (Degree) Mahav...",[]
3,4,18D000017,PYmUqx9rFEV4r/fFXzWXfpdgVGs8VP2KH3UzEYauLqw=,Male,HINDUISM,2001-03-10,"0 - 1,50,000","PO/PS-SAHEED NAGAR ,PATHARABANDHA",Odisha,Khurda,...,"CHSE, Odisha",Annual,2018,/tRyA4YR6BB6wzic8MHatg==,600,333.0,55.5,False,"[{""ReportedInstitute"": ""Maharishi (Degree) Col...",[]
4,5,18D000019,tWTPV1Bh+0LwcMk6cQUO6cjhgm9dExJnW6oPLmcusVA=,Female,HINDUISM,2000-05-21,"0 - 1,50,000",AT KUMBHIPADA PO DESIL PS TITILAGARH,Odisha,Bolangir,...,"CHSE, Odisha",Annual,2018,fDOjAv3DM8hJ/6F6kZ674g==,600,236.0,39.33,False,"[{""ReportedInstitute"": ""D.A.V. (Autonomous) Co...",[]


## 7. Prepare HSS/DEG Enrollment Data

Extract relevant columns, decrypt roll numbers, and standardize field names.

In [None]:
# Columns carried over from the raw enrollment tables.
keep_cols = [
    "barcode",
    "student_name",
    "aadhar_no",
    "dob",
    "module",
    "academic_year",
    "examination_board_of_the_highest_qualification",
    "examination_type",
    "year_of_passing",
    "roll_no",
]


def make_enroll_clean(df):
    """
    Return a copy of ``df`` restricted to ``keep_cols`` with an added
    ``roll_no_decrypted`` column.

    The new column is ``roll_no`` passed through ``decrypt_roll``, with nulls
    replaced by "NA", cast to str and stripped of surrounding whitespace.
    """
    subset = df.loc[:, keep_cols].copy()
    decrypted = subset["roll_no"].map(decrypt_roll)
    subset["roll_no_decrypted"] = decrypted.fillna("NA").astype(str).str.strip()
    return subset


# Same cleaning for the Degree and Higher Secondary enrollment tables
deg_enroll = make_enroll_clean(deg_enrollments)
hss_enroll = make_enroll_clean(hss_enrollments)

In [None]:
# Align enrollment column names with the ones used on the BSE/CHSE side.
rename_map = dict(
    year_of_passing="passing_year",
    examination_board_of_the_highest_qualification="exam_board",
    examination_type="exam_type",
)

deg_enroll, hss_enroll = (
    frame.rename(columns=rename_map) for frame in (deg_enroll, hss_enroll)
)

In [None]:
# Render HSS passing years as plain digit strings: unparseable values become "",
# and the ".0" left behind by the float representation is removed.
_passing_year_num = pd.to_numeric(hss_enroll["passing_year"], errors="coerce")
_passing_year_txt = _passing_year_num.fillna("").astype(str)
hss_enroll["passing_year"] = _passing_year_txt.str.replace(".0", "", regex=False)

## 8. Student Key Generation (HSS/DEG)

Generate student keys for HSS and DEG datasets using the same logic:
- **DEG**: `roll_no_decrypted` + `passing_year` + `exam_board` + `exam_type`
- **HSS**: `roll_no_decrypted` + `dob` + `passing_year` + `exam_board`

In [None]:
def generate_student_key_df(df, module_name: str) -> pd.DataFrame:
    """
    Generate a standardized student key for identity matching across datasets.

    The student key format differs by academic module:
    - DEG / CHSE: roll number (decrypted) + passing year + exam board + exam type
    - HSS / BSE:  roll number (decrypted) + date of birth + passing year + exam board

    CHSE and BSE are accepted as well, with the component lists used for the
    board results, so this definition can serve every call in the notebook
    once it is the one bound in the kernel.

    Each component is cast to str, stripped and lower-cased; missing values
    (None/NaN) become an empty string rather than the literal text "nan"/"none".
    The normalized text overwrites the component columns in the returned copy.
    Components are joined with "_". The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing student records.

    module_name : str
        Academic module name (case-insensitive). Supported values:
        - "DEG" for Degree students
        - "HSS" for Higher Secondary students
        - "CHSE" / "BSE" for board result records

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` including a new column `student_key` constructed based on module rules.

    Raises
    ------
    ValueError
        If ``module_name`` is not one of the supported modules.
    KeyError
        If a required key column is absent from ``df``.
    """

    new_df = df.copy()

    module_name = module_name.upper()

    roll_exam_parts = ["roll_no_decrypted", "passing_year", "exam_board", "exam_type"]
    roll_dob_parts = ["roll_no_decrypted", "dob", "passing_year", "exam_board"]
    parts_by_module = {
        "DEG": roll_exam_parts,
        "CHSE": roll_exam_parts,
        "HSS": roll_dob_parts,
        "BSE": roll_dob_parts,
    }
    if module_name not in parts_by_module:
        raise ValueError(
            f"Invalid module_name '{module_name}'. Use 'DEG', 'HSS', 'CHSE' or 'BSE'."
        )
    key_parts = parts_by_module[module_name]

    # Fail early with a clear message instead of partway through key assembly.
    absent = [col for col in key_parts if col not in new_df.columns]
    if absent:
        raise KeyError(f"Missing key column(s) for module '{module_name}': {absent}")

    # Normalize fields. Missing values are blanked based on the ORIGINAL column:
    # calling fillna after astype(str) would be a no-op because NaN has already
    # become the text "nan" by then.
    for col in key_parts:
        raw = new_df[col]
        new_df[col] = raw.astype(str).str.strip().str.lower().where(raw.notna(), "")

    # Build composite student key (vectorized concatenation instead of a row-wise join)
    first, *rest = key_parts
    new_df["student_key"] = new_df[first].str.cat([new_df[col] for col in rest], sep="_")

    return new_df

In [None]:
# Build student keys for the cleaned HSS and DEG enrollment frames.
hss_key_df = generate_student_key_df(hss_enroll, "HSS")
deg_key_df = generate_student_key_df(deg_enroll, "DEG")

In [None]:
# Preview the generated HSS keys.
# NOTE(review): this output includes the aadhar_no column and dates of birth -- clear outputs before sharing.
hss_key_df.head()

Unnamed: 0,barcode,student_name,aadhar_no,dob,module,academic_year,exam_board,exam_type,passing_year,roll_no,roll_no_decrypted,student_key
0,18J1152889,HR81C4EOg2/hE8JRTrn1S6w33F6HtoV5YRG56S79UEc=,�qkm;ú]h¸n ðÿyvâ[ïôw\rxû)tñöšgþ,2003-03-16,HSS,2018,"bse, odisha",Annual,2018,pYe8rJ5KxJ67Y8yXJe6eWw==,328ek0168,"328ek0168_2003-03-16_2018_bse, odisha"
1,18J1152893,RroqbpgtJBXrjGAqGwJEPnBOoDGW7l6AXUWeOI1+mlY=,ga£!oˆ©ª’ïëˆÿvëfëu’jój5›‘f‹ðjôâ,2001-09-06,HSS,2018,"bse, odisha",Annual,2018,+O4ItUgVCxGgjUEKty4yKw==,078cc0010,"078cc0010_2001-09-06_2018_bse, odisha"
2,18J1152906,UaqS4u+DJL0F9xeHIAqVp/b7sijboaNytvQ6rEzCq/Y=,óà®xðjmìøk@ú=ªð tàä±1ž³wá­œü0,2002-11-05,HSS,2018,"bse, odisha",Annual,2018,BAPZomJcVbNhw7VGy0Ks+w==,124bb0022,"124bb0022_2002-11-05_2018_bse, odisha"
3,18J1152926,IDsi45Z1rGc46/6z5Me/lgf+tc9AaoV7u4zpG+XpOMY=,df^»+\t`uö4™”4t¥mágàz¡˜ts,2002-05-12,HSS,2018,"bse, odisha",Annual,2018,KvA4Ys6wwLNAoTFiizpByw==,207cb0090,"207cb0090_2002-05-12_2018_bse, odisha"
4,18J1152933,IDrYpLGzUi0AbxBk/9KcIwOJSNAG8GZt73d/ew9Nw2Q=,,2002-06-25,HSS,2018,"bse, odisha",Annual,2018,XJ5bhexVA3g/ZAb1Ktv0zA==,033fa0010,"033fa0010_2002-06-25_2018_bse, odisha"


In [None]:
# Columns available on the HSS key frame (these are the ones selected by the merge step below).
hss_key_df.columns

Index(['barcode', 'student_name', 'aadhar_no', 'dob', 'module',
       'academic_year', 'exam_board', 'exam_type', 'passing_year', 'roll_no',
       'roll_no_decrypted', 'student_key'],
      dtype='object')

In [None]:
# Columns available on the HSS applications table; `barcode` is the column shared with enrollments.
hss_applications.columns

Index(['aadhar_no', 'barcode', 'academic_year', 'reported_institute',
       'sams_code', 'stream', 'institute_district', 'institute_block',
       'type_of_institute', 'phase', 'year', 'admission_status', 'option_no',
       'num_applications'],
      dtype='object')

## 9. Merge Enrollment with Applications

Combine enrollment data with application records using barcode as the primary key.

In [None]:
def merge_enrollment_applications(enroll_df, app_df):
    """
    Attach student identity columns to every application row via ``barcode``.

    Only the ``barcode`` column of ``app_df`` is used. The result has one row
    per application row (left join), so a student with several application
    rows appears several times, and application barcodes with no enrollment
    match carry nulls in the identity columns.
    """
    # Identity columns taken from the enrollment frame
    identity_columns = [
        "barcode",
        "student_name",
        "aadhar_no",
        "dob",
        "module",
        "academic_year",
        "exam_board",
        "exam_type",
        "passing_year",
        "roll_no",
        "roll_no_decrypted",
        "student_key",
    ]

    application_barcodes = app_df.loc[:, ["barcode"]].copy()
    student_identity = enroll_df.loc[:, identity_columns].copy()

    # Left join: application rows drive the output
    return application_barcodes.merge(student_identity, how="left", on="barcode")

In [None]:
# Expand the keyed enrollment frames to one row per application row (joined on barcode).
hss_df = merge_enrollment_applications(hss_key_df, hss_applications)
deg_df = merge_enrollment_applications(deg_key_df, deg_applications)

In [None]:
# Drop the join column from the merged frames.
# NOTE(review): this rebinds deg_key_df / hss_key_df to the application-level frames without
# `barcode`, so re-running the merge cell above after this one fails (it selects `barcode`
# from these frames). Consider distinct names for the merged frames.
deg_key_df = deg_df.drop(columns=["barcode"])
hss_key_df = hss_df.drop(columns=["barcode"])

## 10. Data Integration and Matching Analysis

Prepare final datasets and analyze student key matches between CHSE and DEG systems.

In [None]:
# Preview the merged DEG frame; a student is repeated once per application row.
deg_key_df.head()

Unnamed: 0,student_name,aadhar_no,dob,module,academic_year,exam_board,exam_type,passing_year,roll_no,roll_no_decrypted,student_key
0,NHqQtL6JHS7ARG8lWmmkMwYwif+rxScfdlndd1YewK0=,,2000-12-28,DEG,2018,"chse, odisha",annual,2018,rFXFPmSkII4WoQCD2iWmGA==,387ma078,"387ma078_2018_chse, odisha_annual"
1,NHqQtL6JHS7ARG8lWmmkMwYwif+rxScfdlndd1YewK0=,,2000-12-28,DEG,2018,"chse, odisha",annual,2018,rFXFPmSkII4WoQCD2iWmGA==,387ma078,"387ma078_2018_chse, odisha_annual"
2,NHqQtL6JHS7ARG8lWmmkMwYwif+rxScfdlndd1YewK0=,,2000-12-28,DEG,2018,"chse, odisha",annual,2018,rFXFPmSkII4WoQCD2iWmGA==,387ma078,"387ma078_2018_chse, odisha_annual"
3,NHqQtL6JHS7ARG8lWmmkMwYwif+rxScfdlndd1YewK0=,,2000-12-28,DEG,2018,"chse, odisha",annual,2018,rFXFPmSkII4WoQCD2iWmGA==,387ma078,"387ma078_2018_chse, odisha_annual"
4,NHqQtL6JHS7ARG8lWmmkMwYwif+rxScfdlndd1YewK0=,,2000-12-28,DEG,2018,"chse, odisha",annual,2018,rFXFPmSkII4WoQCD2iWmGA==,387ma078,"387ma078_2018_chse, odisha_annual"


In [None]:
# List the CHSE key frame's columns rather than rendering the whole frame,
# which holds well over a million rows.
chse_key_df.columns

Index(['academic_year', 'student_name', 'dob', 'module', 'exam_board',
       'passing_year', 'roll_no', 'roll_no_decrypted', 'exam_type',
       'total_marks', 'secured_marks', 'student_key'],
      dtype='object')

### 10.1 Overall Match Summary

Calculate total unique student keys and overall conversion rate from CHSE to DEG.

In [None]:
# CHSE and DEG datasets already built earlier in notebook
chse = chse_key_df[["student_key", "student_name", "academic_year"]].copy()
deg  = deg_key_df[["student_key", "student_name", "academic_year"]].copy()

chse["academic_year"] = chse["academic_year"].astype(str)
deg["academic_year"]  = deg["academic_year"].astype(str)

# Unique key sets. Null keys are excluded: the left join in
# merge_enrollment_applications leaves a null student_key for any application
# barcode with no enrollment record, and a null must not count as a student.
chse_keys = set(chse["student_key"].dropna())
deg_keys  = set(deg["student_key"].dropna())
common_keys = chse_keys & deg_keys


In [None]:
# CHSE and DEG matches across the years (all passing years pooled).
# Conversion % = matched keys / CHSE unique keys, reported as NaN when there are
# no CHSE keys instead of raising ZeroDivisionError.
conversion_pct = (
    round(len(common_keys) / len(chse_keys) * 100, 2) if chse_keys else float("nan")
)

pd.DataFrame({
    "metric": ["CHSE unique keys", "DEG unique keys", "Matched keys", "Conversion %"],
    "value": [
        len(chse_keys),
        len(deg_keys),
        len(common_keys),
        conversion_pct,
    ]
})

Unnamed: 0,metric,value
0,CHSE unique keys,1963873.0
1,DEG unique keys,1608579.0
2,Matched keys,945210.0
3,Conversion %,48.13


### 10.2 Year-wise Matched Students

Identify students who appear in both CHSE and DEG systems for the same passing year.

In [None]:
# Build CHSE & DEG mini tables
chse_sub = chse_key_df[["student_key", "passing_year"]].assign(module="CHSE")
deg_sub  = deg_key_df[["student_key", "passing_year"]].assign(module="DEG")

# Stack, keep only common keys
tmp = pd.concat([chse_sub, deg_sub])
tmp = tmp[tmp["student_key"].isin(common_keys)]

# Group & keep only keys present in both
matched = (
    tmp.groupby(["student_key", "passing_year"])["module"]
       .apply(lambda x: ", ".join(sorted(set(x))))
       .reset_index()
)

student_match = matched[matched["module"] == "CHSE, DEG"].sort_values("passing_year", ascending=False)
student_match.head()

Unnamed: 0,student_key,passing_year,module
472605,"139ma026_2024_chse, odisha_annual",2024,"CHSE, DEG"
309418,"122ga263_2024_chse, odisha_annual",2024,"CHSE, DEG"
309432,"122ga272_2024_chse, odisha_annual",2024,"CHSE, DEG"
309430,"122ga271_2024_chse, odisha_annual",2024,"CHSE, DEG"
309428,"122ga270_2024_chse, odisha_annual",2024,"CHSE, DEG"


### 10.3 Year-wise Conversion Analysis

Analyze CHSE to DEG conversion rates by passing year to identify enrollment trends over time.

In [None]:
# Distinct (student_key, passing_year) pairs per system, with the year as text
chse = chse_key_df.loc[:, ["student_key", "passing_year"]].drop_duplicates()
deg  = deg_key_df.loc[:, ["student_key", "passing_year"]].drop_duplicates()

chse["passing_year"] = chse["passing_year"].astype(str)
deg["passing_year"]  = deg["passing_year"].astype(str)

# Every passing year seen in either system, newest first
yrs = sorted(set(chse["passing_year"]).union(deg["passing_year"]), reverse=True)

# Keys per year, looked up once and reused for the counts and the overlap
chse_by_year = {y: chse.loc[chse["passing_year"] == y, "student_key"] for y in yrs}
deg_by_year  = {y: deg.loc[deg["passing_year"] == y, "student_key"] for y in yrs}

summary = pd.DataFrame({
    "passing_year": yrs,
    "chse_keys": [chse_by_year[y].nunique() for y in yrs],
    "deg_keys":  [deg_by_year[y].nunique() for y in yrs],
    "chse→deg":  [len(set(chse_by_year[y]) & set(deg_by_year[y])) for y in yrs],
})

# Share of a year's CHSE keys that also appear in DEG for the same passing year
summary["pct"] = (summary["chse→deg"] / summary["chse_keys"] * 100).round(2)
summary

Unnamed: 0,passing_year,chse_keys,deg_keys,chse→deg,pct
0,2025,364770,0,0,0.0
1,2024,357825,217939,191135,53.42
2,2023,338220,211138,183455,54.24
3,2022,301650,223173,196590,65.17
4,2021,295178,245410,205557,69.64
5,2020,306230,201169,168473,55.02
6,2019,0,202711,0,
7,2018,0,211309,0,
8,2017,0,38689,0,
9,2016,0,17316,0,


: 