### Libraries


In [1]:
# Standard libraries
import os
import json
import sqlite3
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Project-specific configuration
from sams.config import RAW_DATA_DIR
from sams.utils import load_data
from sams.config import datasets

[32m2025-09-23 10:44:40.012[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m15[0m - [1mPROJ_ROOT path is: C:\Users\Admin\Documents\GitHub\sams[0m
[32m2025-09-23 10:44:40.124[0m | [1mINFO    [0m | [36msams.config[0m:[36m<module>[0m:[36m92[0m - [1mLoaded 0 geocodes from cache[0m


### Loading the data

In [2]:
# Open the SQLite database registered under datasets["sams"] and list its tables.
db_path = datasets["sams"]["path"]  # later cells reconnect using this same path

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    # sqlite_master is SQLite's schema table; rows with type='table' are the tables.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    cursor.close()
finally:
    # Release the database handle even if the query above raises.
    conn.close()

print("Tables:", [t[0] for t in tables])

Tables: ['students', 'institutes']


In [3]:
# # Connect to the database
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # Delete rows for DEG module in academic year 2023
# print("Deleting rows")
# cursor.execute("DELETE FROM students WHERE module = 'DEG' AND academic_year = 2023")

# # Commit and close
# conn.commit()
# cursor.close()
# conn.close()

# print("Removed")

In [4]:
# Print the column names of the `students` table.
# `sqlite3` is already imported in the notebook's first cell, so it is not re-imported here.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    # PRAGMA table_info yields one row per column: (cid, name, type, notnull, dflt_value, pk).
    cursor.execute("PRAGMA table_info(students)")
    columns = cursor.fetchall()
    cursor.close()
finally:
    # Release the database handle even if the PRAGMA raises.
    conn.close()

print("\nColumns in 'students' table:")
for col in columns:
    print(col[1])  # index 1 of each table_info row is the column name


Columns in 'students' table:
id
barcode
student_name
gender
religion_name
dob
nationality
annual_income
address
state
district
block
pin_code
social_category
domicile
s_domicile_category
outside_odisha_applicant_state_name
odia_applicant_living_outside_odisha_state_name
residence_barcode_number
tenth_exam_school_address
eighth_exam_school_address
highest_qualification_exam_board
board_exam_name_for_highest_qualification
highest_qualification
had_two_year_full_time_work_exp_after_tenth
gc
ph
es
sports
national_cadet_corps
pm_care
orphan
income_barcode
tfw
ews
boc
boc_regd_no
course_name
course_period
beauty_culture_type
sams_code
reported_institute
reported_branch_or_trade
institute_district
type_of_institute
phase
year
admission_status
enrollment_status
applied_status
date_of_application
application_status
aadhar_no
registration_number
mark_data
module
academic_year
contact_no
option_data
examination_board_of_the_highest_qualification
examination_type
year_of_passing
roll_no
total_mar

In [3]:
# Load the `deg_enrollments` dataset registered in `datasets` via the project loader.
deg_enrollments = load_data(datasets["deg_enrollments"]) 

[32m2025-09-23 10:44:46.273[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m75[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_enrollments.pq[0m


In [4]:
# Size of the frame as (rows, columns).
deg_enrollments.shape

(2054491, 32)

In [5]:
# Column labels available in `deg_enrollments`.
deg_enrollments.columns

Index(['id', 'barcode', 'student_name', 'gender', 'religion_name', 'dob',
       'annual_income', 'address', 'state', 'district', 'block', 'pin_code',
       'social_category', 'board_exam_name_for_highest_qualification',
       'highest_qualification', 'ph', 'es', 'sports', 'orphan', 'aadhar_no',
       'module', 'academic_year',
       'examination_board_of_the_highest_qualification', 'examination_type',
       'year_of_passing', 'roll_no', 'total_marks', 'secured_marks',
       'percentage', 'compartmental_status', 'deg_option_details',
       'deg_compartments'],
      dtype='object')

In [None]:
# NOTE(review): this repeats the `deg_enrollments` load already performed by an earlier
# cell with the same call; it looks redundant and can probably be removed.
deg_enrollments = load_data(datasets["deg_enrollments"])


In [6]:
# Load the `deg_applications` and `deg_marks` datasets.
deg_applications = load_data(datasets["deg_applications"])
# Later cells call deg_marks.head() and deg_marks.shape, so the frame must be loaded
# for the notebook to run top-to-bottom on a fresh kernel.
# NOTE(review): assumes `datasets` has a "deg_marks" entry — confirm.
deg_marks = load_data(datasets["deg_marks"])

[32m2025-09-23 11:05:35.404[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m75[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_applications.pq[0m


In [9]:
# Load the `hss_enrollments` dataset and preview its first rows.
# NOTE(review): the saved preview shows per-student fields (dob, gender, social_category)
# next to what look like encoded identifiers — consider clearing this output before
# sharing or committing the notebook.
hss_enrollments = load_data(datasets["hss_enrollments"])
hss_enrollments.head()

[32m2025-09-23 13:10:37.305[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m75[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_enrollments.pq[0m


Unnamed: 0,barcode,aadhar_no,academic_year,module,student_name,gender,dob,social_category,orphan,es,...,board_exam_name_for_highest_qualification,examination_board_of_the_highest_qualification,examination_type,year_of_passing,total_marks,secured_marks,percentage,compartmental_status,hss_option_details,hss_compartments
0,22H4912639,£4vÐ0ã\nO„©³%É@äÁXÔAŒ\¡2Çœ©+#´,2022,HSS,7YsWktVbq816K4kzi0Tvla4urZgbH4A+60ZFliLBvqM=,Male,22-Apr-2004,SC,NO,NO,...,"Board of Secondary Education, Orissa, Cuttack-...","BSE, Odisha",Annual,2022,600,520.0,86.67,NO,"[{""ReportedInstitute"": ""Government (SSD) Highe...",[]
1,22H1266274,£4vÐ0ã\nO„©³%É@äÁXÔAŒ\¡2Çœ©+#´,2022,HSS,7YsWktVbq816K4kzi0Tvla4urZgbH4A+60ZFliLBvqM=,Male,22-Apr-2004,SC,NO,NO,...,"Board of Secondary Education, Orissa, Cuttack-...","BSE, Odisha",Annual,2022,600,520.0,86.67,NO,"[{""ReportedInstitute"": ""Buxi Jagabandhu Bidyad...",[]
2,21H4032033,£4wQ²îµËI¶Ã……?%½¯†I¥±ÄáW^M#$Ã,2021,HSS,by31dul/d+d92Kzto+gtizZh1ToLWKGOiBxytowUXGY=,Female,26-Nov-2005,OBC,NO,NO,...,"Board of Secondary Education, Orissa, Cuttack-...","BSE, Odisha",Annual,2021,600,523.0,87.17,NO,"[{""ReportedInstitute"": ""North Odisha Higher Se...",[]
3,23H1211722,£44'“æQXj=%µšØNZXV¸¾¶x¤ö±,2023,HSS,xMg2ie5HzvOo5JwFBLKTqsgt9YbOGBR9v5MbRQTUg0I=,Male,18-Jun-2007,GEN,NO,NO,...,"Board of Secondary Education, Orissa, Cuttack-...","BSE, Odisha",Annual,2023,600,271.0,45.17,NO,"[{""ReportedInstitute"": ""Government Higher Seco...",[]
4,24H8792210,£44'“æQXj=%µšØNZXV¸¾¶x¤ö±,2024,HSS,xMg2ie5HzvOo5JwFBLKTqsgt9YbOGBR9v5MbRQTUg0I=,Male,18-Jun-2007,SEBC,NO,NO,...,"Board of Secondary Education, Orissa, Cuttack-...","BSE, Odisha",Annual,2023,600,271.0,45.17,NO,"[{""ReportedInstitute"": ""Government Higher Seco...",[]


In [11]:
# Load the `hss_marks` dataset. `load_data` and `datasets` are already imported in the
# notebook's first cell, so they are not re-imported here.
hss_marks = load_data(datasets["hss_marks"])

[32m2025-09-23 17:06:40.386[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m75[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_marks.pq[0m


In [12]:
# Preview the first rows of `hss_marks`.
hss_marks.head()

Unnamed: 0,aadhar_no,barcode,academic_year,module,board_exam_name_for_highest_qualification,highest_qualification,examination_board_of_the_highest_qualification,examination_type,year_of_passing,total_marks,secured_marks,percentage,compartmental_status,comp_subject,comp_fail_mark,comp_pass_mark
0,,18J0000045,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,426.0,71.0,NO,,,
1,,18J0000057,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,203.0,33.83,NO,,,
2,,18J0000067,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,292.0,48.67,NO,,,
3,,18J0000075,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,289.0,48.17,NO,,,
4,,18J0000081,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,409.0,68.17,NO,,,


In [None]:
# NOTE(review): same call as the previous cell. The output saved with this cell lists a
# `comp_index` column that the other `hss_marks` outputs in this notebook do not show,
# so it looks stale — re-run or remove this cell.
hss_marks.head()


Unnamed: 0,aadhar_no,barcode,academic_year,module,board_exam_name_for_highest_qualification,highest_qualification,examination_board_of_the_highest_qualification,examination_type,year_of_passing,total_marks,secured_marks,percentage,compartmental_status,comp_index,comp_subject,comp_fail_mark,comp_pass_mark
0,,18J0000045,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,426.0,71.0,NO,,,,
1,,18J0000057,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,203.0,33.83,NO,,,,
2,,18J0000067,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,292.0,48.67,NO,,,,
3,,18J0000075,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,289.0,48.17,NO,,,,
4,,18J0000081,2018,HSS,"Board of Secondary Education, Orissa, Cuttack-...",10TH,"BSE, Odisha",Annual,2018,600,409.0,68.17,NO,,,,


In [14]:
# Column labels available in `hss_marks`.
hss_marks.columns

Index(['aadhar_no', 'barcode', 'academic_year', 'module',
       'board_exam_name_for_highest_qualification', 'highest_qualification',
       'examination_board_of_the_highest_qualification', 'examination_type',
       'year_of_passing', 'total_marks', 'secured_marks', 'percentage',
       'compartmental_status', 'comp_subject', 'comp_fail_mark',
       'comp_pass_mark'],
      dtype='object')

In [13]:
# Size of the frame as (rows, columns).
hss_marks.shape

(3463531, 16)

In [12]:
# Load the `hss_applications` dataset and report its (rows, columns).
# NOTE(review): the saved output shows only 1,855 rows, far fewer than the other tables
# loaded in this notebook — confirm the underlying file is complete.
hss_applications = load_data(datasets['hss_applications'])
hss_applications.shape

[32m2025-09-23 13:23:48.587[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m75[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\hss_applications.pq[0m


(1855, 14)

In [7]:
# Size of the frame as (rows, columns).
deg_applications.shape

(14640073, 15)

In [17]:
# Preview the first rows of `deg_applications`.
deg_applications.head()

Unnamed: 0,aadhar_no,academic_year,barcode,stream,subject,admission_status,reported_institute,sams_code,institute_district,institute_block,type_of_institute,year,phase,option_no,num_applications
0,,2018,18D000058,Arts,Education,NOT SELECTED,"Balikhanda (Degree) College, Balikhanda",2099301,Balasore,Simulia,Non-Govt Aided (662 Categories),2018,1,3,10
1,,2018,18D000058,Arts,Education,NOT SELECTED,"Jambeswar (Degree) Mahavidyalaya, Garsang",2079301,Balasore,Khaira,Non-Govt Aided (662 Categories),2018,1,4,10
2,,2018,18D000058,Arts,Odia,NOT SELECTED,"Charampa (Degree) College, Charampa",4022301,Bhadrak,Bhadrak,Non-Govt Aided (488 Categories),2018,1,1,10
3,,2018,18D000058,Arts,Odia,NOT SELECTED,Biranchi Narayan Madhab Arjuna (Degree) Colleg...,4072303,Bhadrak,Tihidi,Non-Govt Aided (488 Categories),2018,1,2,10
4,,2018,18D000058,Arts,Sociology,NOT SELECTED,"Maa Sarada Devi (Degree) College, Kothar",4069302,Bhadrak,Dhamnagar,Non-Govt Aided (662 Categories),2018,1,5,10


In [18]:
# Preview the first rows of `deg_marks` (the frame must already be loaded in the kernel).
deg_marks.head()

Unnamed: 0,aadhar_no,academic_year,barcode,board_exam_name_for_highest_qualification,highest_qualification,module,examination_board_of_the_highest_qualification,examination_type,year_of_passing,total_marks,secured_marks,percentage,compartmental_status,comp_subject,comp_fail_mark,comp_pass_mark
0,,2018,18D000005,"Council of Higher Secondary Education, Odisha",+2 SCIENCE,DEG,"CHSE, Odisha",Annual,2018,600,419.0,69.83,NO,,,
1,,2018,18D000009,"Council of Higher Secondary Education, Odisha",+2 ARTS,DEG,"CHSE, Odisha",Annual,2018,600,478.0,79.67,NO,,,
2,,2018,18D000014,"Council of Higher Secondary Education, Odisha",+2 ARTS,DEG,"CHSE, Odisha",Annual,2018,600,230.0,38.33,NO,,,
3,,2018,18D000017,"Council of Higher Secondary Education, Odisha",+2 SCIENCE,DEG,"CHSE, Odisha",Annual,2018,600,333.0,55.5,NO,,,
4,,2018,18D000019,"Council of Higher Secondary Education, Odisha",+2 ARTS,DEG,"CHSE, Odisha",Annual,2018,600,236.0,39.33,NO,,,


In [None]:
# NOTE(review): repeats the `deg_marks.head()` call from the previous cell and its saved
# output is identical — this cell is probably removable.
deg_marks.head()

Unnamed: 0,aadhar_no,academic_year,barcode,board_exam_name_for_highest_qualification,highest_qualification,module,examination_board_of_the_highest_qualification,examination_type,year_of_passing,total_marks,secured_marks,percentage,compartmental_status,comp_subject,comp_fail_mark,comp_pass_mark
0,,2018,18D000005,"Council of Higher Secondary Education, Odisha",+2 SCIENCE,DEG,"CHSE, Odisha",Annual,2018,600,419.0,69.83,NO,,,
1,,2018,18D000009,"Council of Higher Secondary Education, Odisha",+2 ARTS,DEG,"CHSE, Odisha",Annual,2018,600,478.0,79.67,NO,,,
2,,2018,18D000014,"Council of Higher Secondary Education, Odisha",+2 ARTS,DEG,"CHSE, Odisha",Annual,2018,600,230.0,38.33,NO,,,
3,,2018,18D000017,"Council of Higher Secondary Education, Odisha",+2 SCIENCE,DEG,"CHSE, Odisha",Annual,2018,600,333.0,55.5,NO,,,
4,,2018,18D000019,"Council of Higher Secondary Education, Odisha",+2 ARTS,DEG,"CHSE, Odisha",Annual,2018,600,236.0,39.33,NO,,,


In [7]:
# Size of `deg_marks` as (rows, columns); the frame must already be loaded in the kernel.
deg_marks.shape

(2064490, 16)

In [None]:
# Preview the first rows of `deg_enrollments`.
deg_enrollments.head()

In [31]:
# Count rows in `students` per academic year for the DEG module.
# The module value is bound as a parameter: in SQL a double-quoted "DEG" is an identifier,
# and SQLite only treats it as a string through a legacy fallback.
# ORDER BY makes the printed row order deterministic.
query = """
SELECT
    module,
    academic_year,
    COUNT(*) AS count_students
FROM students
WHERE module = ?
GROUP BY module, academic_year
ORDER BY academic_year
"""

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute(query, ("DEG",))
    rows = cursor.fetchall()
    cursor.close()
finally:
    # Release the database handle even if the query raises.
    conn.close()

# Each row is (module, academic_year, count_students).
for row in rows:
    print(f"Year: {row[1]}, Count: {row[2]}")

Year: 2018, Count: 312823
Year: 2019, Count: 254342
Year: 2020, Count: 242661
Year: 2021, Count: 264152
Year: 2022, Count: 379911
Year: 2023, Count: 289849
Year: 2024, Count: 310753


In [7]:
# NOTE(review): `deg_applications` is already loaded by an earlier cell with this same
# call; this reload looks redundant and can probably be removed.
deg_applications = load_data(datasets["deg_applications"])

[32m2025-09-15 10:31:28.501[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from C:\Users\Admin\Documents\GitHub\sams\data\interim\deg_applications.pq[0m


In [8]:
# Column labels available in `deg_applications`.
deg_applications.columns

Index(['barcode', 'aadhar_no', 'academic_year', 'year', 'phase',
       'reported_institute', 'sams_code', 'institute_district',
       'institute_block', 'type_of_institute', 'stream', 'subject',
       'option_no', 'admission_status', 'num_applications'],
      dtype='object')

In [None]:
# Per-academic-year summary of `num_applications`.
# The cleaned rows go into a new frame so the loaded `deg_applications` stays untouched
# and this cell can be re-run without depending on its own earlier mutation.
deg_applications_valid = (
    deg_applications
    # Coerce before dropping so values that cannot be parsed become NaN and are
    # excluded from every statistic, including the student count.
    .assign(
        num_applications=lambda frame: pd.to_numeric(
            frame["num_applications"], errors="coerce"
        )
    )
    .dropna(subset=["aadhar_no", "num_applications", "academic_year"])
)

# NOTE(review): the preview of `deg_applications` shows several rows per barcode that
# repeat the same `num_applications`. If that holds in general, the mean/std/quantiles
# below are weighted by rows rather than by student — confirm the intended unit.
summary = (
    deg_applications_valid.groupby("academic_year")
    .agg(
        Number_of_Students=("aadhar_no", "nunique"),  # distinct students per year
        Mean=("num_applications", "mean"),
        Std_Dev=("num_applications", "std"),
        P25=("num_applications", lambda x: x.quantile(0.25)),
        Median=("num_applications", "median"),
        P75=("num_applications", lambda x: x.quantile(0.75)),
    )
    .reset_index()
    .rename(columns={
        "academic_year": "Academic Year",
        "Number_of_Students": "Number of Students",
        "Std_Dev": "Std Dev",
        "P25": "25th",
        "P75": "75th",
    })
    .round(2)
)

print(summary)

   Academic Year  Number of Students   Mean  Std Dev  25th  Median  75th
0           2018              202946   7.28     4.24   5.0     5.0   9.0
1           2019              198918   7.98     4.25   5.0     6.0  10.0
2           2020              190598   7.98     4.27   5.0     6.0  10.0
3           2021              249005  11.34     7.12   5.0    10.0  15.0
4           2022              255005  10.72     5.38   5.0    10.0  14.0
5           2023              234901  10.63     7.34   5.0     8.0  14.0
6           2024              260542  10.10     7.68   5.0     7.0  12.0
