In [58]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from dotenv import load_dotenv
from typing import Dict, Union
from pprint import pprint
from pathlib import Path
from llm_assessor import (
    ranged_error
)

load_dotenv(override=True)

True

In [None]:
# Helper Functions

def extract_total_cost(x: Union[int, Dict]) -> float:

    try: 
        z = literal_eval(x)["total_cost"]
    except ValueError: 
        z = x
    
    return z

In [None]:
# Set working directory - Not required if using Jupyter outside of VScode
workdir = os.environ["workdir"]
os.chdir(workdir)
os.getcwd()

In [None]:
# Load Dataframe
data_dir = Path("./validation_results")

teacher_marked_questions = pd.read_csv(data_dir / "processed_data" / "student_answers_llm_graded.csv")

In [None]:
teacher_marked_questions.head()

# Compute performance statistics

In [None]:
teacher_marked_questions_reduced = teacher_marked_questions[~teacher_marked_questions.question_type.isin(["bs_mcq", "bs_computation"])]

In [None]:
# Token Costing
teacher_marked_questions_reduced['llm_grading_total_cost'] = teacher_marked_questions_reduced.apply(lambda x: extract_total_cost(x.llm_graded_answer_token_costing) + extract_total_cost(x.llm_awarded_marks_token_costing), axis=1)

token_cost_by_question_type = teacher_marked_questions.groupby(by=["subject_id", "question_type"], as_index=False).agg({"llm_grading_total_cost": ["mean", "sum"], "question_id": "count"})
token_cost_by_question_type.columns = ['subject_id', 'question_type', 'mean_llm_grading_total_cost', 'sum_llm_grading_total_cost', 'number_of_questions']

token_cost_by_question_type

In [62]:
# Hitrate
## Mean overall
mean_hitrate = np.nanmean(teacher_marked_questions_reduced.llm_mark_hitrate)

## Mean by question type
mean_hitrate_by_question_type = teacher_marked_questions_reduced.groupby(by=["question_type"], as_index = False).agg({"question_id": "count", "llm_mark_hitrate": ["sum", np.nanmean]}).rename(columns={"question_id": "number_of_questions", "llm_mark_hitrate": "proportion_of_questions_covered"})
mean_hitrate_by_question_type.columns = ["_".join(col_name).rstrip('_') for col_name in mean_hitrate_by_question_type.columns]

print(f"Overall Mean:  {mean_hitrate:.2f}")
mean_hitrate_by_question_type

Overall Mean:  0.30


  mean_hitrate_by_question_type = teacher_marked_questions_reduced.groupby(by=["question_type"], as_index = False).agg({"question_id": "count", "llm_mark_hitrate": ["sum", np.nanmean]}).rename(columns={"question_id": "number_of_questions", "llm_mark_hitrate": "proportion_of_questions_covered"})


Unnamed: 0,question_type,number_of_questions_count,proportion_of_questions_covered_sum,proportion_of_questions_covered_nanmean
0,hs_analyse,4,0,0.0
1,hs_explain,8,3,0.375
2,hs_judgement,4,1,0.25
3,hs_spag,4,2,0.5


In [63]:
# Hitrate by Question ID
teacher_marked_questions_reduced.head()

question_hit_rate = teacher_marked_questions_reduced.groupby(by=['question_id', "question_type", "total_marks"], as_index=False).agg({"llm_mark_hitrate": ["count", "sum", np.nanmean]})
question_hit_rate.columns = ["question_id", "question_type", "total_marks", "number_of_questions", "hitrate_frequency", "mean_hitrate"]

question_hit_rate

  question_hit_rate = teacher_marked_questions_reduced.groupby(by=['question_id', "question_type", "total_marks"], as_index=False).agg({"llm_mark_hitrate": ["count", "sum", np.nanmean]})


Unnamed: 0,question_id,question_type,total_marks,number_of_questions,hitrate_frequency,mean_hitrate
0,1.0,hs_explain,4,4,1,0.25
1,2.0,hs_explain,12,4,2,0.5
2,3.0,hs_analyse,8,4,0,0.0
3,4.1,hs_judgement,16,4,1,0.25
4,4.2,hs_spag,4,4,2,0.5


In [None]:
# Mean Absolute Error
teacher_marked_questions_reduced["marks_mean_absolute_error"] = teacher_marked_questions_reduced.apply(lambda row: ranged_error(x=row['llm_awarded_marks'], range_of_values=[row['awarded_marks']]), axis=1)

# Overall
marks_mea = np.mean(teacher_marked_questions_reduced.marks_mean_absolute_error)

## Mean by question type
marks_mea_by_question_type = teacher_marked_questions_reduced.groupby(by=["question_type"], as_index = False).agg({"question_id": "count", "marks_mean_absolute_error": "mean"}).rename(columns={"question_id": "number_of_questions"})

print(f"Overall Mean:  {marks_mea:.2f}")
marks_mea_by_question_type

In [None]:
# Mean Absolute Error by question deviation
question_type_mark_deviations = teacher_marked_questions_reduced.groupby(by=["question_type", "marks_mean_absolute_error"], as_index = False).agg({"question_id": "count"})
question_type_totals = teacher_marked_questions_reduced.groupby(by=["question_type"], as_index = False).agg({"question_id": "count"})

question_type_mark_deviation_joined = pd.merge(question_type_mark_deviations, question_type_totals, on = "question_type", how="left")
question_type_mark_deviation_joined['questions_proportion'] = question_type_mark_deviation_joined.apply(lambda x: x['question_id_x']/ x['question_id_y'], axis = 1)

question_type_mark_deviation_joined = question_type_mark_deviation_joined.rename(columns={"question_id_x": "number_of_questions", "question_id_y": "number_of_questions_by_question_type"})
question_type_mark_deviation_joined

In [None]:
overall_stats = pd.DataFrame({"mean_hitrate": [np.round(mean_hitrate, 2)], "marks_mean_abolute_error": [np.round(marks_mea, 2)], "number_of_questions": [teacher_marked_questions.shape[0]], "elapsed_time_in_minutes": [np.round( np.sum(teacher_marked_questions.elapsed_time_in_seconds), 0) / 60], "llm_grading_total_cost": np.round( np.sum(token_cost_by_question_type.sum_llm_grading_total_cost), 2 )})
overall_stats

In [None]:
# Save validation statistics tables
savedir = Path(workdir) / "validation_results" / "validation_statistics"

Path(savedir).mkdir(parents=True, exist_ok=True)

In [None]:
teacher_marked_questions.to_csv(savedir / "examiner_llm_comparison.csv", index=False)

In [None]:
overall_stats.to_csv(savedir / "mean_hitrate.csv", index=False)
mean_hitrate_by_question_type.to_csv(savedir / "mean_hitrate_by_question_type.csv", index=False)
marks_mea_by_question_type.to_csv(savedir / "mea_marks_by_question_type.csv", index=False)
question_type_mark_deviation_joined.to_csv(savedir / "question_type_marks_deviation.csv", index=False)
question_hit_rate.to_csv(savedir / "question_hit_rate.csv", index=False)
token_cost_by_question_type.to_csv(savedir / "token_cost_by_question_type", index=False)