In [428]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
from sams.config import datasets, SCTEVT_DIR, TABLES_DIR
import glob
import os
from loguru import logger
import re
from sams.utils import load_data
import sqlite3 as sqlite
from sams.config import SAMS_DB
from sams.analysis.utils import save_table_excel
 


# Data Preparation 

In [429]:
# List all xlsx files in the SCTEVT_DIR
xlsx_files = glob.glob(os.path.join(SCTEVT_DIR / "ITI_admission_and_results", "*.xlsx"))
sorted(xlsx_files)

['/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2018.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2019_21.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2020_21.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2021_21_Part_1.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2021_21_Part_2.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2022_21_Part_1.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/AdmittedTrainee2022_21_Part_2.xlsx',
 '/Users/ymohanty/Documents/GitHub/sams/data/external/sctevt/ITI_admission_and_results/ExamResultSheet_2018Year1.xlsx',
 '/Users/ymohanty/Documents/GitHub/s

In [430]:

# Convert all xlsx files in a directory to csv files
def convert_xlsx_to_csv(directory=SCTEVT_DIR / "ITI_admission_and_results"):
    xlsx_files = glob.glob(str(directory / "*.xlsx"))
    for xlsx_file in xlsx_files:
        csv_file = os.path.splitext(xlsx_file)[0] + ".csv"
        if not os.path.exists(csv_file):
            df = pd.read_excel(xlsx_file)
            logger.info(f"Converting {xlsx_file} to {csv_file}")
            df.to_csv(csv_file, index=False)
        

convert_xlsx_to_csv()



In [431]:


def load_admitted_trainee_csv(directory=SCTEVT_DIR / "ITI_admission_and_results"):
    files = glob.glob(str(directory / "AdmittedTrainee*.csv"))
    dfs = []
    for file in files:

        basename = file.split('/')[-1]
        year = basename.split("_")[0][-4:]
        if "_" not in basename:
            year = basename.split(".")[0][-4:]
        df = pd.read_csv(file)
        df['year'] = int(year)
        dfs.append(df)
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()

admitted_trainees = load_admitted_trainee_csv(SCTEVT_DIR / "ITI_admission_and_results")

  df = pd.read_csv(file)


In [432]:

def load_exam_results_csv(directory=SCTEVT_DIR / "ITI_admission_and_results"):
    files = glob.glob(str(directory / "ExamResultSheet*.csv"))
    dfs = []
    for file in files:
        basename = file.split('/')[-1]
        year = basename.split("_")[1][:4]
        match = re.search(r'Year(\d{1})', basename)
        exam_year = int(match.group(1)) if match else None
        df = pd.read_csv(file)
        df['year'] = int(year)
        df['exam_year'] = int(exam_year)
        dfs.append(df)
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()
    
exam_result_sheet = load_exam_results_csv(SCTEVT_DIR / "ITI_admission_and_results")

In [433]:
def clean_exam_result_sheet(df: pd.DataFrame, path = SCTEVT_DIR / "ITI_admission_and_results/ODISHA-Complete Result of CTS of Session 2022& 2023.csv") -> pd.DataFrame:
    df = df[["Roll No", "Overall Result", "year", "exam_year"]].rename(columns={"Roll No": "roll_num", "Overall Result": "overall_result"})
    df['roll_num'] = df['roll_num'].astype(str).str.strip("'")
    df['overall_result'] = df['overall_result'].apply(lambda x: x.split(" ")[0] if isinstance(x, str) else x)
    df['overall_result'] = df['overall_result'].str.lower()

    results_2022_23 = pd.read_csv(path)
    results_2022_23 = results_2022_23[["prnnumber", "status", "admission_year"]].rename(columns={"prnnumber": "roll_num", "status": "overall_result", "admission_year": "year"})
    results_2022_23["exam_year"] = 1
    results_2022_23["exam_year"] = results_2022_23.apply(lambda x: 2 if x["year"] == 2022 else 1, axis=1)
    results_2022_23["roll_num"] = "00" + results_2022_23["roll_num"].astype(str).str[1:]

    df = pd.concat([df, results_2022_23], ignore_index=True)
    return df

def clean_admitted_trainee_sheet(df: pd.DataFrame) -> pd.DataFrame:
    df = df[["year", "Roll_Num", "ITI_Code", "Trade_Name", "Gender"]].rename(columns={"Roll_Num": "roll_num", "ITI_Code": "iti_code", "Trade_Name": "trade", "Gender":"gender"})
    df['roll_num'] = df['roll_num'].astype(str).str.strip("'")
    df.loc[df["year"] == 2018, "trade"] = df.loc[df["year"] == 2018, "trade"].astype(str) + " (NSQF)"
    return df


    


In [434]:
admitted_trainees_cleaned = clean_admitted_trainee_sheet(admitted_trainees)
exam_result_sheet_cleaned = clean_exam_result_sheet(exam_result_sheet)

In [435]:
admitted_trainees_cleaned[admitted_trainees_cleaned["year"] == 2022]["roll_num"].nunique()



53966

In [436]:
def combine_admitted_and_exam_results(admitted_df: pd.DataFrame, exam_df: pd.DataFrame) -> pd.DataFrame:
    combined_df = pd.merge(admitted_df, exam_df, on=["roll_num", "year"], how="left")
    combined_df["matched"] = combined_df["overall_result"].notnull()
    # pivoted = combined_df.pivot_table(
    #     index=["year", "roll_num", "iti_code", "trade", "gender"],
    #     columns="exam_year",
    #     values="overall_result",
    #     aggfunc="first"
    # ).reset_index()

    # pivoted = pivoted.rename(
    #     columns={1.0: "overall_result_y1", 2.0: "overall_result_y2"}
    # )

    # return combined_df


    pivoted = combined_df.pivot_table(
        index=["year", "roll_num", "iti_code", "trade", "gender", "matched"],
        columns="exam_year",
        values="overall_result",
        aggfunc="first"
    ).reset_index()

    pivoted = pivoted.rename(
        columns={1.0: "overall_result_y1", 2.0: "overall_result_y2"}
    )

    # Merge back to admitted_df to retain all roll_num
    result = pd.merge(
        admitted_df,
        pivoted[["year", "roll_num", "overall_result_y1", "overall_result_y2"]],
        on=["year", "roll_num"],
        how="left"
    )

    return result

sctevt_df = combine_admitted_and_exam_results(admitted_trainees_cleaned, exam_result_sheet_cleaned)

    

In [437]:
sctevt_df[sctevt_df["year"] == 2022]["overall_result_y2"].count()

32918

In [438]:
# Get institute level information and trade information from SAMS
sams_iti_enrollments = load_data(datasets["iti_enrollments"])
trades = sams_iti_enrollments[["reported_branch_or_trade", "course_period"]].rename(columns={"reported_branch_or_trade": "trade"}).drop_duplicates()
conn = sqlite.connect(SAMS_DB)
query = "SELECT * FROM institutes WHERE module = 'ITI'"
institutes = pd.read_sql_query(query, conn)[["ncvtmis_code", "type_of_institute"]].rename(columns={"ncvtmis_code":"iti_code"}).drop_duplicates()

[32m2025-05-28 12:27:30.267[0m | [1mINFO    [0m | [36msams.utils[0m:[36mload_data[0m:[36m70[0m - [1mLoading data from /Users/ymohanty/Documents/GitHub/sams/data/interim/iti_enrollments.pq[0m


In [439]:
# Combine to get institute and trade information
sctevt_df = pd.merge(sctevt_df, institutes, on="iti_code", how="left")
sctevt_df = pd.merge(sctevt_df, trades, on="trade", how="left")
sctevt_df

Unnamed: 0,year,roll_num,iti_code,trade,gender,overall_result_y1,overall_result_y2,type_of_institute,course_period
0,2019,00190821000063,PR21000357,Computer Operator and Programming Assistant (N...,Female,pass,,Pvt.,1 Year
1,2019,00190821000064,PR21000357,Electrician (NSQF),Male,pass,pass,Pvt.,2 Years
2,2019,00190821000065,PR21000357,Fitter (NSQF),Male,pass,pass,Pvt.,2 Years
3,2019,00190821000066,PR21000357,Welder (NSQF),Male,fail,,Pvt.,1 Year
4,2019,00190821000067,PR21000357,Electrician (NSQF),Male,pass,pass,Pvt.,2 Years
...,...,...,...,...,...,...,...,...,...
260099,2021,00210821018669,GU21000526,Mechanic Diesel (NSQF),Male,pass,,Govt.,1 Year
260100,2021,00210821018670,GU21000526,Mechanic Diesel (NSQF),Male,pass,,Govt.,1 Year
260101,2021,00210821018671,GU21000526,Mechanic Diesel (NSQF),Male,fail,,Govt.,1 Year
260102,2021,00210821018672,GU21000526,Mechanic Diesel (NSQF),Male,pass,,Govt.,1 Year


In [440]:
# Dropouts
sctevt_df["dropout"] = False
sctevt_df["dropout"] = sctevt_df.apply(lambda x: True if (pd.isna(x["overall_result_y1"]) and x["course_period"] == "1 Year") or (pd.isna(x["overall_result_y2"]) and x["course_period"] == "2 Years") else x["dropout"], axis=1) 

# Deck exhibits

In [441]:
def pretty_pivot(pivoted_df: pd.DataFrame) -> pd.DataFrame:
    pivoted_df = pivoted_df.copy()
    pivoted_df["Total"] = pivoted_df.sum(axis=1)
    cols = ["Total"] + [col for col in pivoted_df.columns if col != "Total"]
    pivoted_df = pivoted_df[cols]
    pivoted_df = pivoted_df.astype(int)
    return pivoted_df

In [442]:
# SAMS Students admitted ITI
students_admitted_over_time = pd.pivot_table(
    sams_iti_enrollments,
    index="year",
    columns="type_of_institute",
    values="aadhar_no",
    aggfunc="nunique"
)
students_admitted_over_time = students_admitted_over_time[students_admitted_over_time.index > 2017]
sams_students_admitted_over_time = pretty_pivot(students_admitted_over_time)
sams_students_admitted_over_time



type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,50340,14863,35477
2019,47791,15781,32010
2020,44007,14265,29742
2021,45801,12449,33352
2022,41402,13208,28194
2023,63688,21764,41924
2024,60568,23420,37148


In [443]:
# SAMS Students admitted female
students_admitted_over_time = pd.pivot_table(
    sams_iti_enrollments[sams_iti_enrollments["gender"] == "Female"],
    index="year",
    columns="type_of_institute",
    values="aadhar_no",
    aggfunc="nunique"
)
students_admitted_over_time = students_admitted_over_time[students_admitted_over_time.index > 2017]
sams_students_admitted_over_time_female = pretty_pivot(students_admitted_over_time)

In [444]:
# SCTEVT dropout all 
sctevt_dropout_over_time = pd.pivot_table(
    sctevt_df,
    index="year",
    columns="type_of_institute",
    values="dropout",
    aggfunc="sum"
)
sctevt_dropout_over_time = pretty_pivot(sctevt_dropout_over_time)
sctevt_dropout_over_time

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,5386,3170,2216
2019,3727,2127,1600
2020,9374,3512,5862
2021,12356,3451,8905
2022,14466,4452,10014


In [445]:
# SCTEVT enrollment
sctevt_admitted_over_time = pd.pivot_table(
    sctevt_df,
    index="year",
    columns="type_of_institute",
    values="roll_num",
    aggfunc="nunique"
)
sctevt_admitted_over_time = pretty_pivot(sctevt_admitted_over_time)
sctevt_admitted_over_time

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,50000,13847,36153
2019,49029,15394,33635
2020,49691,16794,32897
2021,57401,16574,40827
2022,53948,18945,35003


In [446]:
# SCTEVT enrollment female
sctevt_admitted_female = pd.pivot_table(
    sctevt_df[sctevt_df["gender"] == "Female"],
    index="year",
    columns="type_of_institute",
    values="roll_num",
    aggfunc="nunique",
)
sctevt_admitted_female = pretty_pivot(sctevt_admitted_female)
sctevt_admitted_female

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,4605,2434,2171
2019,4496,2261,2235
2020,5790,2976,2814
2021,7756,3351,4405
2022,7402,4261,3141


In [447]:
# SCTEVT pass  total / government / private
sctevt_pass = pd.pivot_table(
    sctevt_df[(sctevt_df["overall_result_y1"] == "pass") & (sctevt_df["course_period"] == "1 Year") | 
              (sctevt_df["overall_result_y2"] == "pass") & (sctevt_df["course_period"] == "2 Years")],
    index="year",
    columns="type_of_institute",
    values="roll_num",
    aggfunc="nunique",
)
sctevt_pass = pretty_pivot(sctevt_pass )
sctevt_pass

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,26898,9196,17702
2019,33638,11574,22064
2020,30024,10942,19082
2021,37334,11517,25817
2022,34542,12688,21854


In [448]:
# SCTEVT pass female 
sctevt_pass_female = sctevt_df[sctevt_df["gender"] == "Female"]
sctevt_pass_female = pd.pivot_table(
    sctevt_pass_female[(sctevt_df["overall_result_y1"] == "pass") & (sctevt_df["course_period"] == "1 Year") | 
              (sctevt_df["overall_result_y2"] == "pass") & (sctevt_df["course_period"] == "2 Years")],
    index="year",
    columns="type_of_institute",
    values="roll_num",
    aggfunc="nunique",
)
sctevt_pass_female = pretty_pivot(sctevt_pass_female)
sctevt_pass_female


  sctevt_pass_female[(sctevt_df["overall_result_y1"] == "pass") & (sctevt_df["course_period"] == "1 Year") |


type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,2256,1542,714
2019,2273,1478,795
2020,2539,1694,845
2021,3508,2213,1295
2022,3941,2699,1242


In [449]:
# SCTEVT enrollment male
sctevt_admitted_male = pd.pivot_table(
    sctevt_df[sctevt_df["gender"] == "Male"],
    index="year",
    columns="type_of_institute",
    values="roll_num",
    aggfunc="nunique",
)
sctevt_admitted_male= pretty_pivot(sctevt_admitted_male)
sctevt_admitted_male

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,45395,11413,33982
2019,44533,13133,31400
2020,43901,13818,30083
2021,49645,13223,36422
2022,46545,14684,31861


In [450]:
# SCTEVT dropout female
sctevt_dropout_over_time_female = pd.pivot_table(
    sctevt_df[sctevt_df["gender"] == "Female"],
    index="year",
    columns="type_of_institute",
    values="dropout",
    aggfunc="sum"
)
sctevt_dropout_over_time_female = pretty_pivot(sctevt_dropout_over_time_female)
sctevt_dropout_over_time_female

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,788,529,259
2019,497,347,150
2020,1617,604,1013
2021,2586,687,1899
2022,2786,1124,1662


In [451]:
sctevt_df["retained"] = sctevt_df["dropout"].apply(lambda x: not x)
# SCTEVT retained all
sctevt_retained_over_time = pd.pivot_table(
    sctevt_df,
    index="year",
    columns="type_of_institute",
    values="retained",
    aggfunc="sum"
)
sctevt_retained_over_time = pretty_pivot(sctevt_retained_over_time)
sctevt_retained_over_time

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,44614,10677,33937
2019,45302,13267,32035
2020,40317,13282,27035
2021,45045,13123,31922
2022,39482,14493,24989


In [452]:
# SCTEVT retained female
sctevt_retained_over_time_female = pd.pivot_table(
    sctevt_df[sctevt_df["gender"] == "Female"],
    index="year",
    columns="type_of_institute",
    values="retained",
    aggfunc="sum"
)
sctevt_retained_over_time_female = pretty_pivot(sctevt_retained_over_time_female)
sctevt_retained_over_time_female

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,3817,1905,1912
2019,3999,1914,2085
2020,4173,2372,1801
2021,5170,2664,2506
2022,4616,3137,1479


In [453]:
# SCTEVT dropout male
sctevt_dropout_over_time_male = pd.pivot_table(
    sctevt_df[sctevt_df["gender"] == "Male"],
    index="year",
    columns="type_of_institute",
    values="dropout",
    aggfunc="sum"
)
sctevt_dropout_over_time_male = pretty_pivot(sctevt_dropout_over_time_male)
sctevt_dropout_over_time_male

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,4598,2641,1957
2019,3230,1780,1450
2020,7757,2908,4849
2021,9770,2764,7006
2022,11679,3328,8351


In [454]:
# SCTEVT retained male
sctevt_retained_over_time_male = pd.pivot_table(
    sctevt_df[sctevt_df["gender"] == "Male"],
    index="year",
    columns="type_of_institute",
    values="retained",
    aggfunc="sum"
)
sctevt_retained_over_time_male = pretty_pivot(sctevt_retained_over_time_male)
sctevt_retained_over_time_male

type_of_institute,Total,Govt.,Pvt.
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,40797,8772,32025
2019,41303,11353,29950
2020,36144,10910,25234
2021,39875,10459,29416
2022,34866,11356,23510


In [455]:
# Export to Excel
dfs = [
    sams_students_admitted_over_time,
    sams_students_admitted_over_time_female,
    sctevt_admitted_over_time,
    sctevt_admitted_female,
    sctevt_admitted_male,
    sctevt_pass,
    sctevt_pass_female,
    sctevt_dropout_over_time,
    sctevt_dropout_over_time_female,
    sctevt_retained_over_time,
    sctevt_retained_over_time_female,
    sctevt_dropout_over_time_male,
    sctevt_retained_over_time_male
]
sheet_names = [
    "SAMS Students Admitted",
    "SAMS Students Admitted (Female)",
    "SCTEVT Students Admitted",
    "SCTEVT Students Admitted (Female)",
    "SCTEVT Students Admitted (Male)",
    "SCTEVT Pass",
    "SCTEVT Pass (Female)",
    "SCTEVT Dropout",
    "SCTEVT Dropout (Female)",
    "SCTEVT Retained",
    "SCTEVT Retained (Female)",
    "SCTEVT Dropout (Male)",
    "SCTEVT Retained (Male)"
]
save_table_excel(
    dfs=dfs,
    sheet_names=sheet_names,
    outfile=TABLES_DIR / "sctevt_iti_admission_and_results.xlsx",
    index=[True]*len(dfs)
)


