In [1]:

# Standard libraries
import os
import json
import sqlite3
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from sams.config import datasets
from sams.utils import load_data

[32m2025-09-08 18:07:50.297[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m15[0m - [1mPROJ_ROOT path is: C:\Users\Admin\Documents\GitHub\sams[0m
[32m2025-09-08 18:07:50.361[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m


In [2]:
db_path = datasets["sams"]["path"]
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Function to show column names of a table
def describe_table(table_name):
    """Print every column of ``table_name`` as ``name (type)``.

    The schema is read with ``PRAGMA table_info`` through the notebook-level
    ``cursor``. A ``sqlite3.Error`` is reported on stdout rather than raised.
    """
    try:
        print(f"\nVariable of the table: {table_name}")
        cursor.execute(f"PRAGMA table_info({table_name});")
        for column_info in cursor.fetchall():
            # Positions 1 and 2 of each returned row are printed as name and type.
            column_name, declared_type = column_info[1], column_info[2]
            print(f"{column_name} ({declared_type})")
    except sqlite3.Error as err:
        print(f"Error describing table {table_name}: {err}")


# Describe the students table
describe_table("students")

conn.close()


Variable of the table: students
id (INTEGER)
barcode (VARCHAR)
student_name (VARCHAR)
gender (VARCHAR)
religion_name (VARCHAR)
dob (VARCHAR)
nationality (VARCHAR)
annual_income (VARCHAR)
address (VARCHAR)
state (VARCHAR)
district (VARCHAR)
block (VARCHAR)
pin_code (VARCHAR)
social_category (VARCHAR)
domicile (VARCHAR)
s_domicile_category (VARCHAR)
outside_odisha_applicant_state_name (VARCHAR)
odia_applicant_living_outside_odisha_state_name (VARCHAR)
residence_barcode_number (VARCHAR)
tenth_exam_school_address (VARCHAR)
eighth_exam_school_address (VARCHAR)
highest_qualification_exam_board (VARCHAR)
board_exam_name_for_highest_qualification (VARCHAR)
highest_qualification (VARCHAR)
had_two_year_full_time_work_exp_after_tenth (VARCHAR)
gc (VARCHAR)
ph (VARCHAR)
es (VARCHAR)
sports (VARCHAR)
national_cadet_corps (VARCHAR)
pm_care (VARCHAR)
orphan (VARCHAR)
income_barcode (VARCHAR)
tfw (VARCHAR)
ews (VARCHAR)
boc (VARCHAR)
boc_regd_no (VARCHAR)
course_name (VARCHAR)
course_period (VARCHAR)

In [257]:
db_path = datasets["sams"]["path"]

# One row per ITI student record; mark_data is decoded in the next step.
query = """
SELECT
    id,
    academic_year,
    barcode,
    aadhar_no,
    dob,
    highest_qualification,
    mark_data
FROM students
WHERE module = 'ITI'
"""

# Close the connection even when the read fails, so an error does not leave
# the database handle open in the kernel.
conn = sqlite3.connect(db_path)
try:
    students_df = pd.read_sql_query(query, conn)
finally:
    conn.close()
# Function to parse JSON strings safely
def parse_option_data(val):
    """
    Safely parses a JSON string into a list.

    Parameters:
    ----------
    val : str, list, or any
        The value to be parsed. A list is returned unchanged. A string is
        decoded as JSON. Every other type yields an empty list.

    Returns:
    -------
    list
        - the original list, when ``val`` is already a list;
        - the decoded list, when the JSON document is an array;
        - a one-element list holding the decoded object, when the JSON
          document is a single object;
        - an empty list when decoding fails or the document is a scalar
          or ``null``.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []

    try:
        parsed = json.loads(val)
    except json.JSONDecodeError:
        return []

    # json.loads can return a dict, number, string, bool or None as well as a
    # list; normalise so callers can always rely on getting a list back.
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]
    return []

# Decode every mark_data cell to a list, then give each list entry its own row.
students_df['mark_data'] = students_df['mark_data'].apply(parse_option_data)
df_exploded = students_df.explode('mark_data', ignore_index=True)

# Flatten the exploded entries into columns and place them next to the
# remaining student columns (both frames share the same fresh row index).
option_details = pd.json_normalize(df_exploded['mark_data'])
student_columns = df_exploded.drop(columns=['mark_data'])
iti_df = pd.concat([student_columns, option_details], axis=1)

iti_df.head()

Unnamed: 0,id,academic_year,barcode,aadhar_no,dob,highest_qualification,ExamName,YearofPassing,RollNo,ExaminationType,TotalMarks,SecuredMarks,Percentage,CompartmentalStatus,CompartmentalFailMark,SubjectWiseMarks,HighestQualificationBoardExamName,HighestQualificationExamBoard
0,7404718,2017,17T0181047,DeGxV1bCBqOBygfLzt3u268Es6xBaRZq3tf+eEp6WDw=,01-Jan-2002,10TH,10th Pass,2017,PoLOLFcCUYZ/iR42DWBy2A==,Annual,600,291.0,,No,,"Mark SL :50,Mark Math :47,Mark Science :30,Mar...",,"Board of Secondary Education, Odisha, Cuttack-..."
1,7404719,2017,17T0114510,6/Vau/OfdeJ6NAV7iD6bmnbzGEexPB8LP/JcN1eroCs=,20-Apr-2000,10TH,10th Pass,2015,Gb1fbGJ4hl2z+FvcTECm4A==,Annual,600,390.0,,No,,"Mark SL :49,Mark Math :91,Mark Science :54,Mar...",,"Board of Secondary Education, Odisha, Cuttack-..."
2,7404720,2017,17T0346747,EPFmTPfdOGPrc/oLdkNxJ+t1ocsxYmM+j+/2AL7aEBQ=,07-Oct-2001,10TH,10th Pass,2017,LjH/WJQkl9UJzVJCu+CaSQ==,Annual,600,399.0,,No,,"Mark SL :76,Mark Math :68,Mark Science :45,Mar...",,"Board of Secondary Education, Odisha, Cuttack-..."
3,7404721,2017,17T0288737,yg4tiEAGopVFYnXgybQlTen34PqwEI08id2hvZS9spw=,05-Nov-1999,10 TH,10th Pass,2015,3czUb8VotFM3QA1R8yE/Jw==,Annual,600,305.0,,No,,"Mark SL :46,Mark Math :50,Mark Science :49,Mar...",,"Board of Secondary Education, Odisha, Cuttack-..."
4,7404722,2017,17T0151332,5HZd0Oz9ono6DBV2LKOoQiVGsyUreUWNxVEOdVgcrcs=,20-Apr-1999,12 TH,10th Pass,2014,BpFNV6ZmrmdtbEcHJS6eGg==,Annual,600,260.0,,No,,"Mark SL :43,Mark Math :31,Mark Science :47,Mar...",,"Board of Secondary Education, Odisha, Cuttack-..."


In [230]:
iti_df.columns

Index(['id', 'academic_year', 'barcode', 'aadhar_no', 'dob',
       'highest_qualification', 'ExamName', 'YearofPassing', 'RollNo',
       'ExaminationType', 'TotalMarks', 'SecuredMarks', 'Percentage',
       'CompartmentalStatus', 'CompartmentalFailMark', 'SubjectWiseMarks',
       'HighestQualificationBoardExamName', 'HighestQualificationExamBoard'],
      dtype='object')

In [None]:
# Drop unnecessary columns and rename selected ones.
# The result is assigned back to iti_df, so the cell must tolerate being run
# again: rename() already ignores names that are gone, and errors="ignore"
# makes drop() do the same instead of raising KeyError on the second run.
iti_df = (
    iti_df.rename(columns={
        "id": "sams_id",
        "YearofPassing": "passing_year",
        "RollNo": "roll_no",
        "HighestQualificationExamBoard": "exam_board_name_of_highest_qual"
    })
    .drop(
        columns=[
            "ExamName", "ExaminationType", "TotalMarks", "SecuredMarks",
            "Percentage", "CompartmentalStatus", "CompartmentalFailMark",
            "SubjectWiseMarks", "HighestQualificationBoardExamName"
        ],
        errors="ignore",
    )
)

# Verify result
iti_df.columns


Index(['sams_id', 'academic_year', 'barcode', 'aadhar_no', 'dob',
       'highest_qualification', 'ExamName', 'passing_year', 'roll_no',
       'ExaminationType', 'TotalMarks', 'SecuredMarks', 'Percentage',
       'CompartmentalStatus', 'CompartmentalFailMark', 'SubjectWiseMarks',
       'HighestQualificationBoardExamName', 'exam_board_name_of_highest_qual'],
      dtype='object')

In [240]:
iti_df.isna().sum()

sams_id                              0
academic_year                        0
barcode                              0
aadhar_no                            0
dob                                  0
highest_qualification                0
ExamName                             0
passing_year                         0
roll_no                              0
ExaminationType                      0
TotalMarks                           0
SecuredMarks                         9
Percentage                           0
CompartmentalStatus                  0
CompartmentalFailMark                0
SubjectWiseMarks                     0
HighestQualificationBoardExamName    0
exam_board_name_of_highest_qual      0
dtype: int64

In [241]:
key_vars = ['aadhar_no', 'dob', 'highest_qualification', 'passing_year', 'roll_no', 'exam_board_name_of_highest_qual']

# Every row is one enrolment record, so the denominator is the row count.
# Series.count() skips NaN, and the NaN count is subtracted again below, so
# using it here would remove NaN rows from "available" twice.
total = len(iti_df)

summary = []

for col in key_vars:
    unique_vals = iti_df[col].nunique()

    # A value counts as missing when it is NaN, an empty string or the literal "NA".
    missing = (
        iti_df[col].isna().sum()
        + (iti_df[col] == "").sum()
        + (iti_df[col] == "NA").sum()
        )
    available = total - missing

    summary.append({
        "variable": col,
        "total_enrolled": total,
        "unique_values": unique_vals,
        "missing_count": missing,
        "available_count": available
    })

availability_report = pd.DataFrame(summary)
availability_report


Unnamed: 0,variable,total_enrolled,unique_values,missing_count,available_count
0,aadhar_no,559575,518025,0,559575
1,dob,559575,11007,0,559575
2,highest_qualification,559575,1121,87171,472404
3,passing_year,559575,52,0,559575
4,roll_no,559575,473015,0,559575
5,exam_board_name_of_highest_qual,559575,53,1644,557931


In [None]:
# Restrict the check to the key columns
key_columns = ["aadhar_no","dob","highest_qualification","passing_year","roll_no","exam_board_name_of_highest_qual"]
key_frame = iti_df[key_columns]

# One row per key column, one column per kind of missing marker
missing_summary = pd.DataFrame({
    "nan_missing": key_frame.isna().sum(),
    "na_string": key_frame.eq("NA").sum(),
    "blank_string": key_frame.eq("").sum(),
})

# Row-wise sum of the three marker counts
missing_summary["total"] = missing_summary[
    ["nan_missing", "na_string", "blank_string"]
].sum(axis=1)

missing_summary

Unnamed: 0,nan_missing,na_string,blank_string,total
aadhar_no,0,0,0,0
dob,0,0,0,0
highest_qualification,0,72198,14973,87171
passing_year,0,0,0,0
roll_no,0,0,0,0
exam_board_name_of_highest_qual,0,1644,0,1644


In [247]:
# Per academic year, count rows whose exam board is the literal string "NA"
board_is_na = iti_df["exam_board_name_of_highest_qual"].eq("NA")
na_by_year = (
    board_is_na.groupby(iti_df["academic_year"])
               .sum()
               .reset_index(name="na_string_count")
)

print(na_by_year)

   academic_year  na_string_count
0           2017              102
1           2018              183
2           2019               52
3           2020              218
4           2021              312
5           2022              296
6           2023              481
7           2024                0


In [None]:
iti_df.head()

Unnamed: 0,sams_id,academic_year,barcode,aadhar_no,dob,highest_qualification,passing_year,roll_no,exam_board_name_of_highest_qual,student_key
0,7404718,2017,17T0181047,DeGxV1bCBqOBygfLzt3u268Es6xBaRZq3tf+eEp6WDw=,01-Jan-2002,10TH,2017,PoLOLFcCUYZ/iR42DWBy2A==,"Board of Secondary Education, Odisha, Cuttack-...",2017_01-Jan-2002_10th_PoLOLFcCUYZ/iR42DWBy2A==...
1,7404719,2017,17T0114510,6/Vau/OfdeJ6NAV7iD6bmnbzGEexPB8LP/JcN1eroCs=,20-Apr-2000,10TH,2015,Gb1fbGJ4hl2z+FvcTECm4A==,"Board of Secondary Education, Odisha, Cuttack-...",2015_20-Apr-2000_10th_Gb1fbGJ4hl2z+FvcTECm4A==...
2,7404720,2017,17T0346747,EPFmTPfdOGPrc/oLdkNxJ+t1ocsxYmM+j+/2AL7aEBQ=,07-Oct-2001,10TH,2017,LjH/WJQkl9UJzVJCu+CaSQ==,"Board of Secondary Education, Odisha, Cuttack-...",2017_07-Oct-2001_10th_LjH/WJQkl9UJzVJCu+CaSQ==...
3,7404721,2017,17T0288737,yg4tiEAGopVFYnXgybQlTen34PqwEI08id2hvZS9spw=,05-Nov-1999,10 TH,2015,3czUb8VotFM3QA1R8yE/Jw==,"Board of Secondary Education, Odisha, Cuttack-...",2015_05-Nov-1999_10th_3czUb8VotFM3QA1R8yE/Jw==...
4,7404722,2017,17T0151332,5HZd0Oz9ono6DBV2LKOoQiVGsyUreUWNxVEOdVgcrcs=,20-Apr-1999,12 TH,2014,BpFNV6ZmrmdtbEcHJS6eGg==,"Board of Secondary Education, Odisha, Cuttack-...",2014_20-Apr-1999_12th_BpFNV6ZmrmdtbEcHJS6eGg==...


In [None]:
iti_df['highest_qualification'].value_counts()

highest_qualification
10th                     359892
12th                      76918
NA                        72198
                          14973
Graduate and above        10263
                          ...  
Intermedia in Arts            1
diploma                       1
PLUS 2 VOCATIONAL             1
M.Sc.(CS)                     1
Master in Engineering         1
Name: count, Length: 1121, dtype: int64

In [None]:
from sams.preprocessing.iti_diploma_nodes import _fix_qual_names

# Standardise from iti_df itself. `df` is not defined at this point on a fresh
# kernel, and a Series taken from another frame would be index-aligned on
# assignment rather than matched row by row.
iti_df["highest_qualification_std"] = _fix_qual_names(iti_df["highest_qualification"])
iti_df["highest_qualification_std"].value_counts()


highest_qualification_std
10th                  369713
12th                   83941
na                     72199
                       14973
Graduate and above     14487
                       ...  
plus tow arts              1
bscphysics                 1
1th pass                   1
arts pass                  1
bce odisha                 1
Name: count, Length: 389, dtype: int64

In [None]:
# ---- Config: key fields for the fingerprint ----
# Column names as they exist in iti_df after the rename step
# (YearofPassing -> passing_year, RollNo -> roll_no,
#  HighestQualificationExamBoard -> exam_board_name_of_highest_qual).
KEY_FIELDS = ['highest_qualification', 'passing_year', 'roll_no', 'exam_board_name_of_highest_qual']

# ---- Helpers ----
def is_missing_series(s: pd.Series) -> pd.Series:
    """Return a boolean mask that is True where a value counts as missing.

    A value is missing when it is null (NaN/None), or when its text form,
    stripped of surrounding whitespace, is empty or the literal ``"NA"``.
    """
    # Take the null mask before casting: astype(str) turns NaN/None into
    # ordinary text, after which isna() can no longer see them.
    is_null = s.isna()
    text = s.astype(str).str.strip()
    return is_null | text.eq("") | text.eq("NA")

def row_has_all_key_fields(df: pd.DataFrame, cols) -> pd.Series:
    """Return a boolean mask, indexed like ``df``, that is True for rows where
    none of the columns in ``cols`` is flagged by ``is_missing_series``."""
    any_missing = pd.Series(False, index=df.index)
    for name in cols:
        any_missing = any_missing | is_missing_series(df[name])
    # "every field present" is the negation of "some field missing"
    return ~any_missing

# ---- A1. Scale: rows & students ----
total_rows = len(iti_df)

# Run the missing check on the raw column: once the values are cast to str a
# real NaN becomes the text "nan" and would be counted as a valid Aadhaar.
aadhar_missing = is_missing_series(iti_df['aadhar_no'])
valid_aadhar = ~aadhar_missing

# Stripped text form, used only as the grouping / distinct-count key.
aadhar = iti_df['aadhar_no'].astype(str).str.strip()

students_total = aadhar[valid_aadhar].nunique()
rows_with_missing_aadhar = int(aadhar_missing.sum())

# ---- A2. Redundancy: students with >1 row ----
rows_per_student = (
    iti_df.loc[valid_aadhar]
          .groupby(aadhar[valid_aadhar])
          .size()
)
students_multi_rows = int((rows_per_student > 1).sum())
# max(..., 1) guards the percentage against division by zero on an empty frame.
pct_students_multi_rows = round(100 * students_multi_rows / max(students_total, 1), 2)

# ---- A3. Coverage: students with at least one complete record (all 4 key fields present in the same row) ----
complete_row_mask = row_has_all_key_fields(iti_df, KEY_FIELDS)
has_complete_row_per_student = complete_row_mask[valid_aadhar].groupby(aadhar[valid_aadhar]).any()
students_with_complete = int(has_complete_row_per_student.sum())
pct_students_with_complete = round(100 * students_with_complete / max(students_total, 1), 2)

# Display summary
summary = pd.DataFrame([
    {"Metric": "Total rows", "Value": total_rows},
    {"Metric": "Rows with missing/invalid Aadhaar", "Value": rows_with_missing_aadhar},
    {"Metric": "Students (unique Aadhaar)", "Value": students_total},
    {"Metric": "Students with >1 row", "Value": f"{students_multi_rows} ({pct_students_multi_rows}%)"},
    {"Metric": "Students with ≥1 complete record (4 key fields)", "Value": f"{students_with_complete} ({pct_students_with_complete}%)"},
])

summary

In [None]:
from sams.preprocessing.iti_diploma_nodes import _fix_qual_names

def encode_part(s: pd.Series, *, na_label="NA", missing_label="MISSING", lower=False) -> pd.Series:
    """
    Turn a Series into text parts with explicit placeholders.

    - Values are cast to str and stripped of surrounding whitespace.
    - A stripped value equal to the literal 'NA' becomes ``na_label``.
    - Null values and values that are empty after stripping become
      ``missing_label``.
    - With ``lower=True`` every value except the two placeholders is lowercased.
    """
    null_mask = s.isna()
    stripped = s.astype(str).str.strip()
    blank_or_null = stripped.eq("") | null_mask

    # Order matters only if na_label itself were blank; kept as NA first, missing second.
    encoded = (
        stripped
        .mask(stripped.eq("NA"), na_label)
        .mask(blank_or_null, missing_label)
    )

    if not lower:
        return encoded

    is_placeholder = encoded.isin([na_label, missing_label])
    return encoded.where(is_placeholder, encoded.str.lower())


# Generate student_key from passing year, dob, highest qualification,
# roll number and exam board. Note: this uses the raw highest_qualification
# column, not a standardized version of it.
iti_df['student_key'] = (
    encode_part(iti_df['passing_year']) + "_" +
    encode_part(iti_df['dob']) + "_" +
    encode_part(iti_df["highest_qualification"]) + "_" +
    encode_part(iti_df['roll_no']) + "_" +
    encode_part(iti_df['exam_board_name_of_highest_qual'])
)

# Summary of keys
total_keys = len(iti_df['student_key'])
total_records = len(iti_df)
unique_aadhar = iti_df["aadhar_no"].nunique()

unique_keys = iti_df['student_key'].nunique()
non_unique_key_uses = total_keys - unique_keys

print("Total student records processed for generating key:", total_keys)
print("Unique Aadhaar numbers:", unique_aadhar)
print("Unique student key generated:", unique_keys)

# Duplicate key diagnostics
key_counts = iti_df['student_key'].value_counts()
duplicate_keys = key_counts[key_counts > 1]
duplicate_rows = iti_df['student_key'].isin(duplicate_keys.index).sum()

print("Duplicate student_keys (How many unique keys were reused):", len(duplicate_keys))

# Later cells read `df` as the full keyed frame, so it must not be replaced
# by a 5-row preview. Bind it to the full frame and display only the head.
df = iti_df
df.head()

Total student records processed for generating key: 559575
Unique Aadhaar numbers: 518025
Unique student key generated: 543916
Duplicate student_keys (How many unique keys were reused): 14874


Unnamed: 0,sams_id,academic_year,barcode,aadhar_no,dob,highest_qualification,passing_year,roll_no,exam_board_name_of_highest_qual,student_key,highest_qualification_std
0,7404718,2017,17T0181047,DeGxV1bCBqOBygfLzt3u268Es6xBaRZq3tf+eEp6WDw=,01-Jan-2002,10TH,2017,PoLOLFcCUYZ/iR42DWBy2A==,"Board of Secondary Education, Odisha, Cuttack-...",2017_01-Jan-2002_10TH_PoLOLFcCUYZ/iR42DWBy2A==...,10th
1,7404719,2017,17T0114510,6/Vau/OfdeJ6NAV7iD6bmnbzGEexPB8LP/JcN1eroCs=,20-Apr-2000,10TH,2015,Gb1fbGJ4hl2z+FvcTECm4A==,"Board of Secondary Education, Odisha, Cuttack-...",2015_20-Apr-2000_10TH_Gb1fbGJ4hl2z+FvcTECm4A==...,10th
2,7404720,2017,17T0346747,EPFmTPfdOGPrc/oLdkNxJ+t1ocsxYmM+j+/2AL7aEBQ=,07-Oct-2001,10TH,2017,LjH/WJQkl9UJzVJCu+CaSQ==,"Board of Secondary Education, Odisha, Cuttack-...",2017_07-Oct-2001_10TH_LjH/WJQkl9UJzVJCu+CaSQ==...,10th
3,7404721,2017,17T0288737,yg4tiEAGopVFYnXgybQlTen34PqwEI08id2hvZS9spw=,05-Nov-1999,10 TH,2015,3czUb8VotFM3QA1R8yE/Jw==,"Board of Secondary Education, Odisha, Cuttack-...",2015_05-Nov-1999_10 TH_3czUb8VotFM3QA1R8yE/Jw=...,10th
4,7404722,2017,17T0151332,5HZd0Oz9ono6DBV2LKOoQiVGsyUreUWNxVEOdVgcrcs=,20-Apr-1999,12 TH,2014,BpFNV6ZmrmdtbEcHJS6eGg==,"Board of Secondary Education, Odisha, Cuttack-...",2014_20-Apr-1999_12 TH_BpFNV6ZmrmdtbEcHJS6eGg=...,12th


In [196]:
df.columns

Index(['sams_id', 'academic_year', 'barcode', 'aadhar_no', 'dob',
       'highest_qualification', 'passing_year', 'roll_no',
       'exam_board_name_of_highest_qual', 'student_key'],
      dtype='object')

In [None]:
# TODO: check the uniqueness of each key variable: compare the enrolment count with the
# number of unique values of 'aadhar_no', 'dob', 'highest_qualification', 'passing_year', 'roll_no' and 'exam_board_name_of_highest_qual'.

In [195]:
high_qual_count = df['highest_qualification'].value_counts()
high_qual_count

highest_qualification
10th                     359892
12th                      76918
NA                        72198
                          14973
Graduate and above        10263
                          ...  
Intermedia in Arts            1
diploma                       1
PLUS 2 VOCATIONAL             1
M.Sc.(CS)                     1
Master in Engineering         1
Name: count, Length: 1121, dtype: int64

In [None]:
na_by_year = (
    df.groupby("academic_year")["exam_board_name_of_highest_qual"]
      .apply(lambda s: (s == "NA").sum())
      .reset_index(name="na_string_count")
)

print(na_by_year)

   academic_year  na_string_count
0           2017              102
1           2018              183
2           2019               52
3           2020              218
4           2021              312
5           2022              296
6           2023              481
7           2024                0


In [183]:
rows_with_duplicated_keys = df[df['student_key'].isin(duplicate_keys.index)]

print(f"Total rows with duplicated composite keys: {len(rows_with_duplicated_keys)}")

Total rows with duplicated composite keys: 57705


In [184]:
rows_with_duplicated_keys.sample(5)

Unnamed: 0,sams_id,academic_year,barcode,aadhar_no,dob,highest_qualification,passing_year,roll_no,exam_board_name_of_highest_qual,student_key
223990,7628708,2020,20T0001304,w0xEqev+9oEfD/MyabsPOE6RUN8lUpx4tuBb692UEOc=,04-Oct-2001,10th,2017,zpYPVPKkyc2AWkMEZJGe8w==,"Board of Secondary Education, Odisha, Cuttack-...",2017_04-Oct-2001_zpYPVPKkyc2AWkMEZJGe8w==_Boar...
107100,7511818,2018,18T0445707,lH/7VmdItB99eMkCN/KOyPhDPIebYmOlWNYNvlVikrE=,15-Apr-2002,10th,2018,SH3OqhzJpeUi1vWjnhFKqg==,"Board of Secondary Education, Odisha, Cuttack-...",2018_15-Apr-2002_SH3OqhzJpeUi1vWjnhFKqg==_Boar...
234404,7639122,2020,20T00595811,2GjDm+A9wn4tOGg10s+vlvAJ36Yo71K/sONy0wZ4I2Q=,28-Mar-2000,,2016,6u1d1T1CpyBohL0mj3GfBQ==,"Board of Secondary Education, Odisha, Cuttack-...",2016_28-Mar-2000_6u1d1T1CpyBohL0mj3GfBQ==_Boar...
38039,7442757,2018,18T00425011,7vYpSJ+/EDL5JfTVAufkeLZAwfVHUP6JHNcfpF/XGwk=,04-Jan-1998,10th,2013,5c3VfBnaShSwrvEpCdgiLQ==,"Board of Secondary Education, Odisha, Cuttack-...",2013_04-Jan-1998_5c3VfBnaShSwrvEpCdgiLQ==_Boar...
451399,7856117,2023,23T09361110,OrS6q1qU3W7JmgUkUgiKjavml3VLaE0pNRNyQj1ZutU=,06-Nov-2006,10th,2022,32odOEti2gXn3tNcaH2Qag==,"Board of Secondary Education, Odisha, Cuttack-...",2022_06-Nov-2006_32odOEti2gXn3tNcaH2Qag==_Boar...


In [185]:
# Count how many times each composite key appears
key_counts = df['student_key'].value_counts()

# Keep only keys that appear more than once
duplicate_keys = key_counts[key_counts > 1]
duplicate_keys

student_key
2010_15-May-1993_9wIzGIy7YkWroQ7fX790ag==_Board of Secondary Education, Odisha, Cuttack-753001                                                     7
2015_06-May-1999_5cx6HNlPN9N2PAHg/EB5Fg==_Board of Secondary Education, Odisha, Cuttack-753001                                                     6
2014_15-Jun-1999_2wkNbSPM350/xZ773pooCQ==_Board of Secondary Education, Odisha, Cuttack-753001                                                     6
2002_21-Apr-1987_jWlOQX0hinwWcp7RPujpQw==_Board of Secondary Education, Odisha, Cuttack-753001                                                     6
2015_30-Jul-1998_PhYyRVzo5FqtDeZZW3sT2A==_Central Board of Secondary Education, 02-Community Centre, Shaiksha Kendra, Preet Vihar, Delhi-110301    6
                                                                                                                                                  ..
2018_06-Mar-2002_mUjMSEtTdhV1PIf9uan01A==_Board of Secondary Education, Odisha, Cuttack-753001

In [186]:
# Count how many times each composite_key appears
key_counts = df['student_key'].value_counts()

# Filter only duplicate keys (count > 1)
duplicate_keys = key_counts[key_counts > 1]

# Merge the count info back into the dataframe
dup_df = df[df['student_key'].isin(duplicate_keys.index)].copy()
dup_df['dup_count'] = dup_df['student_key'].map(duplicate_keys)

# Sort so the most repeated keys appear first, and group within key
dup_df = dup_df.sort_values(by=['dup_count', 'student_key', 'sams_id', 'academic_year'], 
                            ascending=[False, True, True, True])

# Preview
dup_df.head()


Unnamed: 0,sams_id,academic_year,barcode,aadhar_no,dob,highest_qualification,passing_year,roll_no,exam_board_name_of_highest_qual,student_key,dup_count
26853,7431571,2017,17T0182328,nKRxoM4K/9+CxLTs+Rdv5/BJGXU6PkZEmBdOzCY3Ahw=,15-May-1993,HIGHER SECONDARY,2010,9wIzGIy7YkWroQ7fX790ag==,"Board of Secondary Education, Odisha, Cuttack-...",2010_15-May-1993_9wIzGIy7YkWroQ7fX790ag==_Boar...,7
100193,7504911,2018,18T01361710,nKRxoM4K/9+CxLTs+Rdv5/BJGXU6PkZEmBdOzCY3Ahw=,15-May-1993,10th,2010,9wIzGIy7YkWroQ7fX790ag==,"Board of Secondary Education, Odisha, Cuttack-...",2010_15-May-1993_9wIzGIy7YkWroQ7fX790ag==_Boar...,7
167615,7572333,2019,19T0079425,nKRxoM4K/9+CxLTs+Rdv5/BJGXU6PkZEmBdOzCY3Ahw=,15-May-1993,,2010,9wIzGIy7YkWroQ7fX790ag==,"Board of Secondary Education, Odisha, Cuttack-...",2010_15-May-1993_9wIzGIy7YkWroQ7fX790ag==_Boar...,7
232055,7636773,2020,20T0314988,nKRxoM4K/9+CxLTs+Rdv5/BJGXU6PkZEmBdOzCY3Ahw=,15-May-1993,Graduate and above,2010,9wIzGIy7YkWroQ7fX790ag==,"Board of Secondary Education, Odisha, Cuttack-...",2010_15-May-1993_9wIzGIy7YkWroQ7fX790ag==_Boar...,7
308878,7713596,2021,21T0029982,nKRxoM4K/9+CxLTs+Rdv5/BJGXU6PkZEmBdOzCY3Ahw=,15-May-1993,10th,2010,9wIzGIy7YkWroQ7fX790ag==,"Board of Secondary Education, Odisha, Cuttack-...",2010_15-May-1993_9wIzGIy7YkWroQ7fX790ag==_Boar...,7


In [None]:
# Filter for 2020 (the filter value must match the year named in the
# variable names and in the printed message below)
df_2020 = df[df['academic_year'] == 2020]

# Find duplicate Aadhar numbers in 2020: keep only values seen more than once
dup_aadhar_2020 = df_2020['aadhar_no'].value_counts()
dup_aadhar_2020 = dup_aadhar_2020[dup_aadhar_2020 > 1]

# Total number of distinct duplicated Aadhar numbers
print("\nTotal distinct duplicated Aadhar numbers in 2020:", len(dup_aadhar_2020))


In [187]:
df.columns

Index(['sams_id', 'academic_year', 'barcode', 'aadhar_no', 'dob',
       'highest_qualification', 'passing_year', 'roll_no',
       'exam_board_name_of_highest_qual', 'student_key'],
      dtype='object')

In [None]:
# Per-year aggregates: record count vs. distinct Aadhaar numbers and distinct
# student keys within each academic year.
grp = df.groupby("academic_year")

summary = pd.concat([
    # totals (count() only counts non-null aadhar_no values)
    grp["aadhar_no"].count().rename("total_enrolled"),
    # distinct values within the year
    grp["aadhar_no"].nunique().rename("unique_aadhar"),
    grp["student_key"].nunique().rename("unique_student_keys"),
    
    
], axis=1).reset_index()

# Difference columns: records in the year minus distinct values in the year,
# i.e. how many records repeat an Aadhaar / key already seen in that year.
summary["diff_enroll_vs_aadhar"] = summary["total_enrolled"] - summary["unique_aadhar"]
summary["diff_enroll_vs_keys"] = summary["total_enrolled"] - summary["unique_student_keys"]

# Add one row for totals across all years.
# NOTE: the unique_* entries of this row are sums of the per-year distinct
# counts, so a value that appears in several years is counted once per year;
# they are not the number of distinct values over the whole dataset.
summary.loc["Total"] = [
    "All Years",
    summary["total_enrolled"].sum(),
    summary["unique_aadhar"].sum(),
    summary["unique_student_keys"].sum(),
    summary["diff_enroll_vs_aadhar"].sum(),
    summary["diff_enroll_vs_keys"].sum()
]

summary

Unnamed: 0,academic_year,total_enrolled,unique_aadhar,unique_student_keys,diff_enroll_vs_aadhar,diff_enroll_vs_keys
0,2017,34654,27337,30243,7317,4411
1,2018,75211,70613,71432,4598,3779
2,2019,64148,64148,64126,0,22
3,2020,67415,67412,67409,3,6
4,2021,68000,68000,67991,0,9
5,2022,74104,74104,74085,0,19
6,2023,92085,92085,92081,0,4
7,2024,83958,83958,83957,0,1
Total,All Years,559575,547657,551324,11918,8251


In [None]:
summary = (
    df.groupby("academic_year")
      .agg(
          total_enrolled=("aadhar_no", "count"),
          unique_aadhar=("aadhar_no", "nunique"),
          unique_keys=("student_key", "nunique")
      )
      .reset_index()
)

# Add difference column
summary["diff_keys_vs_aadhar"] = summary["unique_keys"] - summary["unique_aadhar"]

# Add one row for totals across all years
summary.loc["Total"] = [
    "All Years",
    summary["total_enrolled"].sum(),
    summary["unique_aadhar"].sum(),
    summary["unique_keys"].sum(),
    summary["diff_keys_vs_aadhar"].sum()
]

print(summary)


In [None]:
# Per-year aggregates to check unique aadhar and student_key
grp = df.groupby("academic_year")

summary = pd.concat([
    # total records per year
    grp["aadhar_no"].count().rename("student_records"),
    # unique counts
    grp["student_key"].nunique().rename("unique_student_keys"),
    grp["aadhar_no"].nunique().rename("unique_aadhar"),
    # duplicated counts (values that appear >1)
    grp["student_key"].apply(lambda s: s.value_counts().gt(1).sum()).rename("duplicate_student_keys"),
    grp["aadhar_no"].apply(lambda s: s.value_counts().gt(1).sum()).rename("duplicate_aadhar"),
], axis=1).reset_index()

summary


In [None]:
# Aadhaar → how many distinct keys in the same year?
aadhar_key_counts = (
    df.groupby(["academic_year", "aadhar_no"])["student_key"]
      .nunique()
      .reset_index(name="n_keys")
)

# Filter only those with more than 1 key
trouble_students = aadhar_key_counts[aadhar_key_counts["n_keys"] > 1]

# The preview must be the last expression so that it is what the cell shows;
# a trailing bare `df` would display the whole frame instead.
trouble_students.head()

Unnamed: 0,academic_year,aadhar_no,n_keys
25,2017,+3/iaW/UMl82KYssYR0sd2zl9HAjGDOCBHl56OMeO84=,2
28,2017,+3DI3Zw9wsP/GOHK7V4w6RKaXw8FeAUdLAbdBA8IFf4=,2
31,2017,+3XJqw5EG91YwZA5u/NtO52uOyhjkDk/J9KCoXODx9Q=,2
37,2017,+4lDGPuOAq0sFuF9djEfwECb9HhTveq7U/qE0WO/ZYI=,2
43,2017,+5Rl8dotsgjRIJP9bRf8nqZJ7T9vKO/lnTyWjl8cpaE=,2


In [148]:
import sqlite3
import pandas as pd

db_path = datasets["sams"]["path"]
conn = sqlite3.connect(db_path)

# Get list of all columns; the connection is closed as soon as the schema has
# been read so this cell does not leave an open handle behind.
try:
    cursor = conn.execute("PRAGMA table_info(students)")
    all_columns = [col[1] for col in cursor.fetchall()]
finally:
    conn.close()

print("All columns in students table:", all_columns)


All columns in students table: ['id', 'barcode', 'student_name', 'gender', 'religion_name', 'dob', 'nationality', 'annual_income', 'address', 'state', 'district', 'block', 'pin_code', 'social_category', 'domicile', 's_domicile_category', 'outside_odisha_applicant_state_name', 'odia_applicant_living_outside_odisha_state_name', 'residence_barcode_number', 'tenth_exam_school_address', 'eighth_exam_school_address', 'highest_qualification_exam_board', 'board_exam_name_for_highest_qualification', 'highest_qualification', 'had_two_year_full_time_work_exp_after_tenth', 'gc', 'ph', 'es', 'sports', 'national_cadet_corps', 'pm_care', 'orphan', 'income_barcode', 'tfw', 'ews', 'boc', 'boc_regd_no', 'course_name', 'course_period', 'beauty_culture_type', 'sams_code', 'reported_institute', 'reported_branch_or_trade', 'institute_district', 'type_of_institute', 'phase', 'year', 'admission_status', 'enrollment_status', 'applied_status', 'date_of_application', 'application_status', 'aadhar_no', 'registrat

In [None]:
conn = sqlite3.connect(db_path)

query = """
SELECT
    id,
    barcode,
    student_name,
    gender,
    dob,
    aadhar_no,
    highest_qualification,
    mark_data
FROM students
WHERE module = 'ITI'
  AND academic_year = 2017
  AND aadhar_no = '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU='
"""

row_df = pd.read_sql_query(query, conn)
conn.close()
row_df.head()

Unnamed: 0,id,barcode,student_name,gender,dob,aadhar_no,academic_year,highest_qualification,year_of_passing,roll_no,...,mark_data,reported_institute,reported_branch_or_trade,type_of_institute,phase,admission_status,applied_status,application_status,date_of_application,option_data
0,7413376,17T0413558,SANTOSH KUMAR PANDA,Male,30-Apr-2000,+GL8v/l9lFbqydlpzyJ5x63fJZL2UfJFsvYukfaLEvc=,2017,10TH PASS,,,...,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2...","ITI Bhubaneswar, Khurda",Refrigeration & Air Conditioning Technician(NSQF),Govt.,3.0,Yes,Yes,Accepted,02-Aug-2017,[]
1,7434901,17T0201080,SANTOSH KUMAR PANDA,Male,30-Apr-2000,+GL8v/l9lFbqydlpzyJ5x63fJZL2UfJFsvYukfaLEvc=,2017,10TH CLASS PASS,,,...,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2...",,,,,No,No,Pending,02-Jun-2017,"[{""Phase"": ""1"", ""Option_No"": ""1"", ""Institute_N..."
2,7434903,17T0377676,SANTOSH KUMAR PANDA,Male,30-Apr-2000,+GL8v/l9lFbqydlpzyJ5x63fJZL2UfJFsvYukfaLEvc=,2017,Matriculation,,,...,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2...",,,,,No,No,Pending,21-Jul-2017,"[{""Phase"": ""1"", ""Option_No"": ""1"", ""Institute_N..."


In [None]:
# Show only non-null columns for that aadhar no
non_nulls = row_df.loc[:, row_df.notna().any(axis=0)]
non_nulls.T

Unnamed: 0,0,1,2
id,7413376,7434901,7434903
barcode,17T0413558,17T0201080,17T0377676
student_name,SANTOSH KUMAR PANDA,SANTOSH KUMAR PANDA,SANTOSH KUMAR PANDA
gender,Male,Male,Male
dob,30-Apr-2000,30-Apr-2000,30-Apr-2000
aadhar_no,+GL8v/l9lFbqydlpzyJ5x63fJZL2UfJFsvYukfaLEvc=,+GL8v/l9lFbqydlpzyJ5x63fJZL2UfJFsvYukfaLEvc=,+GL8v/l9lFbqydlpzyJ5x63fJZL2UfJFsvYukfaLEvc=
academic_year,2017,2017,2017
highest_qualification,10TH PASS,10TH CLASS PASS,Matriculation
mark_data,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2...","[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2...","[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2..."
reported_institute,"ITI Bhubaneswar, Khurda",,


In [None]:
conn = sqlite3.connect(db_path)

query = """
SELECT
    id,
    barcode,
    student_name,
    gender,
    dob,
    aadhar_no,
    highest_qualification,
    mark_data
FROM students
WHERE module = 'ITI'
  AND academic_year = 2017
  AND aadhar_no = '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU='
"""

sketcky_aadhar = pd.read_sql_query(query, conn)
conn.close()
sketcky_aadhar.head()

Unnamed: 0,id,barcode,student_name,gender,dob,aadhar_no,highest_qualification,mark_data
0,7404734,17T0236830,PRIYABRATA BEHERA,Male,28-Oct-2000,47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=,10TH PASS,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2..."
1,7405343,17T0216512,LABANYA SUNDAR PATRA,Male,02-Mar-1996,47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=,12TH,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2..."
2,7405378,17T0023407,UTTAM KUMAR MOHANTY,Male,10-Sep-2001,47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=,10TH,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2..."
3,7405670,17T0424126,SATYAJIT JENA,Male,14-Apr-2000,47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=,,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2..."
4,7406777,17T0069387,MADHUSMITA BADI,Female,20-Feb-2000,47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=,,"[{""ExamName"": ""10th Pass"", ""YearofPassing"": ""2..."


In [155]:
import sqlite3
import pandas as pd

# Look up specific ITI rows by barcode. The barcodes are bound as query
# parameters, so the list can be edited without touching the SQL text.
barcodes_to_check = ['17T00017310', '17T0002705']
placeholders = ", ".join("?" for _ in barcodes_to_check)

query = f"""
SELECT
    id,
    barcode,
    student_name
FROM students
WHERE module = 'ITI'
  AND barcode IN ({placeholders})
"""

conn = sqlite3.connect(db_path)
try:
    barcode_df = pd.read_sql_query(query, conn, params=barcodes_to_check)
finally:
    # Guarantee the connection is released even if the query fails.
    conn.close()

print(barcode_df.T)  # transpose for easier viewing


                              0            1
id                      7439328      7411375
barcode             17T00017310   17T0002705
student_name  VIKUNARAYAN SAHOO  SILU MALANA


In [None]:
def missing_summary(df, cols):
    """Tally missing-value markers for selected columns of ``df``.

    Three kinds of "missing" are counted per column: real NaN/None values,
    the literal string "NA", and the empty string "".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : iterable of str
        Column names to summarise; each must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per requested column with the fields "column", "'NA'",
        "'' (empty)" and "Total Missing". The NaN count is folded into
        "Total Missing" but has no column of its own, so the total can be
        larger than the sum of the two visible counts.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    report_columns = ["column", "'NA'", "'' (empty)", "Total Missing"]
    rows = []
    for col in cols:
        values = df[col]
        nan_count = values.isna().sum()
        na_string = (values == "NA").sum()
        empty_count = (values == "").sum()
        rows.append({
            "column": col,
            "'NA'": na_string,
            "'' (empty)": empty_count,
            "Total Missing": nan_count + na_string + empty_count,
        })
    # Passing the columns explicitly keeps the header stable even when `cols`
    # is empty (a bare DataFrame([]) would have no columns at all).
    return pd.DataFrame(rows, columns=report_columns)

# Report the overall size of the frame before tallying missing markers.
print("Total rows in dataset: {}".format(len(df)))

# Columns whose missing-value markers are summarised below.
key_columns = ["highest_qualification", "YearofPassing", "RollNo", "HighestQualificationExamBoard"]

missing_report = missing_summary(df, cols=key_columns)
display(missing_report)


Total rows in dataset: 559575


Unnamed: 0,column,'NA','' (empty),Total Missing
0,highest_qualification,72198,14973,87171
1,YearofPassing,0,0,0
2,RollNo,0,0,0
3,HighestQualificationExamBoard,1644,0,1644


In [None]:
# Compare the number of distinct composite keys with the total row count
print("Unique keys in iti_df: {} / {}".format(df['composite_key'].nunique(), len(df)))

Unique keys in iti_df: 542756 / 559575


In [None]:
# Keep every row whose composite key occurs more than once
# (keep=False marks all occurrences, not only the repeats)
duplicates_df = df.loc[df['composite_key'].duplicated(keep=False)]

# Number of rows taking part in a duplicated key
print("Rows with duplicate composite keys: {}".format(len(duplicates_df)))

# Number of distinct keys that are duplicated
print("Unique composite keys duplicated: {}".format(duplicates_df['composite_key'].nunique()))

Rows with duplicate composite keys: 32730
Unique composite keys duplicated: 15911


In [None]:
# Look at a few duplicate groups: keep at most five rows per composite key
example_dupes = duplicates_df.groupby('composite_key').head(5)

# Preview only the fields relevant to the key. This must be the last
# expression in the cell: a bare expression followed by another statement is
# evaluated and thrown away, and the full frame would be shown instead.
example_dupes[
    [
        'aadhar_no',
        'highest_qualification',
        'YearofPassing',
        'RollNo',
        'HighestQualificationExamBoard',
        'composite_key',
    ]
].head()

Unnamed: 0,aadhar_no,highest_qualification,YearofPassing,RollNo,HighestQualificationExamBoard,composite_key
0,DeGxV1bCBqOBygfLzt3u268Es6xBaRZq3tf+eEp6WDw=,10TH,2017,PoLOLFcCUYZ/iR42DWBy2A==,"Board of Secondary Education, Odisha, Cuttack-...",10th_2017_PoLOLFcCUYZ/iR42DWBy2A==_Board of Se...
2,EPFmTPfdOGPrc/oLdkNxJ+t1ocsxYmM+j+/2AL7aEBQ=,10TH,2017,LjH/WJQkl9UJzVJCu+CaSQ==,"Board of Secondary Education, Odisha, Cuttack-...",10th_2017_LjH/WJQkl9UJzVJCu+CaSQ==_Board of Se...
7,AM1BwT3d2R+NQ2nuBdCHt4Qhw83J+GG5svrRrRWvjrE=,,2017,ygiPoVgEWXgegbZaYL66Aw==,"Board of Secondary Education, Odisha, Cuttack-...",_2017_ygiPoVgEWXgegbZaYL66Aw==_Board of Second...
31,2Pneod71Uf+OJ17wca36YJ/UPUBG6vBtc/s6awUm8aw=,10TH PASS,2017,h/PsI1W/Ko0xE9jIPRfFBw==,"Board of Secondary Education, Odisha, Cuttack-...",10th pass_2017_h/PsI1W/Ko0xE9jIPRfFBw==_Board ...
34,guVWBg8CbeqpP+7HsPcu9ixqGj+4K5cUD5fFkniTcLs=,10th,2017,oTiyDPHbNOxVEaJG+XFmqA==,"Board of Secondary Education, Odisha, Cuttack-...",10th_2017_oTiyDPHbNOxVEaJG+XFmqA==_Board of Se...


In [43]:
# For each duplicated composite key, count the distinct aadhar_no values and
# keep only the keys that map to more than one.
conflicts = (
    duplicates_df
    .groupby('composite_key')['aadhar_no']
    .nunique()
    .reset_index(name='unique_aadhar_count')
    .loc[lambda counts: counts['unique_aadhar_count'] > 1]
)

print("Composite keys linked to multiple Aadhaar numbers: {}".format(len(conflicts)))


Composite keys linked to multiple Aadhaar numbers: 1128
