In [None]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from dotenv import load_dotenv
from typing import Dict, Union
from pprint import pprint
from pathlib import Path
from llm_assessor import (
    ranged_error
)

# Load key/value pairs from a .env file into the process environment.
# override=True is passed so that, per python-dotenv, values from the file
# replace variables that are already set in the environment.
load_dotenv(override=True)

In [None]:
# Helper Functions
def extract_total_cost(x: Union[int, float, str, Dict]) -> float:
    """Return the total cost held in a token-costing cell as a float.

    Accepted inputs:
      * a dict containing a ``"total_cost"`` key;
      * the string representation of such a dict (e.g. a cell read back
        from CSV), which is parsed with ``ast.literal_eval``;
      * a plain number, or a string that ``float()`` can parse.

    Raises:
        KeyError: the (parsed) dict has no ``"total_cost"`` entry.
        ValueError / TypeError: the value cannot be converted to float.
    """
    parsed = x
    if isinstance(x, str):
        try:
            parsed = literal_eval(x)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. "nan"); float() below decides.
            parsed = x

    if isinstance(parsed, dict):
        return float(parsed["total_cost"])

    # Anything that is not a dict is converted from the ORIGINAL value so that
    # numeric strings and numbers behave exactly as float() defines.
    return float(x)

In [None]:
# Set working directory - Not required if using Jupyter outside of VScode
# The target directory is read from the "workdir" environment variable.
workdir = os.environ.get("workdir")
if workdir is None:
    # Same exception type as a plain os.environ[...] lookup, but with a hint
    # about how to fix it.
    raise KeyError(
        "Environment variable 'workdir' is not set; define it in the shell or "
        "in the .env file before running this cell."
    )
os.chdir(workdir)
# Echo the resulting working directory as the cell output.
os.getcwd()

In [None]:
# Load Dataframe
# Folder holding the validation artefacts, relative to the current working directory.
data_dir = Path("./validation_results")

# Read the graded-answers CSV from the processed_data sub-folder.
graded_answers_csv = data_dir.joinpath("processed_data", "student_answers_llm_graded.csv")
teacher_marked_questions = pd.read_csv(graded_answers_csv)

# Compute performance statistics

In [None]:
# Question types to leave out of the statistics; an empty list keeps every row.
question_types_to_exclude = []

# Keep only rows whose question_type is not excluded. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning or write into a view of the loaded data.
teacher_marked_questions_reduced = teacher_marked_questions.loc[
    ~teacher_marked_questions["question_type"].isin(question_types_to_exclude)
].copy()

In [None]:
# Token Costing
# Per-answer LLM cost = sum of the two token-costing columns, each parsed with
# extract_total_cost. Parsing is done column-wise (no per-row Series is built)
# and .assign returns a new frame rather than writing into the existing one.
teacher_marked_questions_reduced = teacher_marked_questions_reduced.assign(
    llm_grading_total_cost=(
        teacher_marked_questions_reduced["llm_graded_answer_token_costing"].map(extract_total_cost)
        + teacher_marked_questions_reduced["llm_awarded_marks_token_costing"].map(extract_total_cost)
    )
)

# Mean and total cost plus row count for every (subject, question type) pair.
# Named aggregation sets the output column names explicitly instead of
# relabelling the aggregated columns by position.
token_cost_by_question_type = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_type"], as_index=False)
    .agg(
        mean_llm_grading_total_cost=("llm_grading_total_cost", "mean"),
        sum_llm_grading_total_cost=("llm_grading_total_cost", "sum"),
        number_of_questions=("question_id", "count"),
    )
)

token_cost_by_question_type

In [None]:
# Hitrate
## Mean overall
# One row per subject: number of rows, number of mark hits and mean hit rate.
# Named aggregation yields the flat column names directly, and the built-in
# "mean" (which skips NaN) replaces np.nanmean, whose use as a groupby
# aggregator is deprecated aliasing in recent pandas releases.
mean_hitrate_by_subject = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id"], as_index=False)
    .agg(
        number_of_questions_count=("question_id", "count"),
        proportion_of_questions_covered_sum=("llm_mark_hitrate", "sum"),
        # The "_nanmean" suffix is kept so the column header is unchanged.
        proportion_of_questions_covered_nanmean=("llm_mark_hitrate", "mean"),
    )
)
mean_hitrate_by_subject

In [None]:
# Same Level hitrate
## Mean overall
# One row per subject: number of rows, number of level hits and mean level
# hit rate. Named aggregation yields the flat column names directly, and the
# built-in "mean" (which skips NaN) replaces np.nanmean, whose use as a
# groupby aggregator is deprecated aliasing in recent pandas releases.
mean_same_level_hitrate_by_subject = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id"], as_index=False)
    .agg(
        number_of_questions_count=("question_id", "count"),
        proportion_of_questions_covered_sum=("llm_level_hitrate", "sum"),
        # The "_nanmean" suffix is kept so the exported CSV header is unchanged.
        proportion_of_questions_covered_nanmean=("llm_level_hitrate", "mean"),
    )
)
mean_same_level_hitrate_by_subject

In [None]:
# same level pm1 hitrate
## Mean overall
# mean_same_level_pm1_hitrate_by_subject = teacher_marked_questions_reduced.groupby(by=["subject_id"], as_index = False).agg({"question_id": "count", "same_level_hitrate_pm1": ["sum", np.nanmean]}).rename(columns={"question_id": "number_of_questions", "same_level_hitrate_pm1": "proportion_of_questions_covered"})
# mean_same_level_pm1_hitrate_by_subject.columns = ["_".join(col_name).rstrip('_') for col_name in mean_same_level_pm1_hitrate_by_subject.columns]
# mean_same_level_pm1_hitrate_by_subject

In [None]:
# Hitrate
## Mean by question type
# One row per (subject, question type): number of rows, number of mark hits
# and mean hit rate. Named aggregation yields the flat column names directly,
# and the built-in "mean" (which skips NaN) replaces np.nanmean, whose use as
# a groupby aggregator is deprecated aliasing in recent pandas releases.
mean_hitrate_by_question_type = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_type"], as_index=False)
    .agg(
        number_of_questions_count=("question_id", "count"),
        mark_hitrate_proportion_of_questions_covered_sum=("llm_mark_hitrate", "sum"),
        # The "_nanmean" suffix is kept so the exported CSV header is unchanged.
        mark_hitrate_proportion_of_questions_covered_nanmean=("llm_mark_hitrate", "mean"),
    )
)

mean_hitrate_by_question_type

In [None]:
# Level Hitrate
## Mean by question type
# One row per (subject, question type): number of rows, number of level hits
# and mean level hit rate. Named aggregation yields the flat column names
# directly, and the built-in "mean" (which skips NaN) replaces np.nanmean,
# whose use as a groupby aggregator is deprecated aliasing in recent pandas.
mean_level_hitrate_by_question_type = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_type"], as_index=False)
    .agg(
        number_of_questions_count=("question_id", "count"),
        level_hitrate_proportion_of_questions_covered_sum=("llm_level_hitrate", "sum"),
        # The "_nanmean" suffix is kept so the exported CSV header is unchanged.
        level_hitrate_proportion_of_questions_covered_nanmean=("llm_level_hitrate", "mean"),
    )
)

mean_level_hitrate_by_question_type

In [None]:
# Hitrate by Question ID
# One row per (subject, question, question type, total marks).
#   number_of_questions - rows with a non-null llm_mark_hitrate
#   hitrate_frequency   - sum of llm_mark_hitrate
#   mean_hitrate        - NaN-skipping mean of llm_mark_hitrate
# Output columns are named explicitly rather than relabelled by position, and
# "mean" replaces np.nanmean, whose use as a groupby aggregator is deprecated
# aliasing in recent pandas releases.
question_hit_rate = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_id", "question_type", "total_marks"], as_index=False)
    .agg(
        number_of_questions=("llm_mark_hitrate", "count"),
        hitrate_frequency=("llm_mark_hitrate", "sum"),
        mean_hitrate=("llm_mark_hitrate", "mean"),
    )
    .sort_values(
        by=["subject_id", "question_id", "question_type", "mean_hitrate"],
        ascending=True,
    )
)
question_hit_rate

In [None]:
# Level Hitrate by Question ID
# One row per (subject, question, question type, total marks).
#   number_of_questions     - rows with a non-null llm_level_hitrate
#   level_hitrate_frequency - sum of llm_level_hitrate
#   mean_level_hitrate      - NaN-skipping mean of llm_level_hitrate
# Output columns are named explicitly rather than relabelled by position, and
# "mean" replaces np.nanmean, whose use as a groupby aggregator is deprecated
# aliasing in recent pandas releases.
level_question_hit_rate = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_id", "question_type", "total_marks"], as_index=False)
    .agg(
        number_of_questions=("llm_level_hitrate", "count"),
        level_hitrate_frequency=("llm_level_hitrate", "sum"),
        mean_level_hitrate=("llm_level_hitrate", "mean"),
    )
    .sort_values(
        by=["subject_id", "question_id", "question_type", "mean_level_hitrate"],
        ascending=True,
    )
)
level_question_hit_rate

In [None]:
# Mean Absolute Error
# Per-row error from ranged_error, called with the LLM mark as `x` and the
# teacher's awarded mark as a one-element `range_of_values`.
teacher_marked_questions_reduced["marks_mean_absolute_error"] = teacher_marked_questions_reduced.apply(
    lambda answer: ranged_error(
        x=answer["llm_awarded_marks"],
        range_of_values=[answer["awarded_marks"]],
    ),
    axis=1,
)

# Overall
marks_mea = np.mean(teacher_marked_questions_reduced["marks_mean_absolute_error"])

## Mean by question type
# Row count and mean error for every (subject, question type) pair, ordered by
# subject, question type and then row count.
marks_mea_by_question_type = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_type"], as_index=False)
    .agg(
        number_of_questions=("question_id", "count"),
        marks_mean_absolute_error=("marks_mean_absolute_error", "mean"),
    )
    .sort_values(by=["subject_id", "question_type", "number_of_questions"], ascending=True)
)

print(f"Overall Mean:  {marks_mea:.2f}")
marks_mea_by_question_type

In [None]:
# Mean scaled error metric

# Overall
# NaN entries in scaled_error_metric are ignored by np.nanmean.
scaled_metric_mean = np.nanmean(teacher_marked_questions_reduced["scaled_error_metric"])

## Mean by question type
# Row count and mean scaled error for every (subject, question type) pair,
# ordered by subject, question type and then row count.
scaled_metric_mean_by_question_type = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_type"], as_index=False)
    .agg(
        number_of_questions=("question_id", "count"),
        scaled_error_metric=("scaled_error_metric", "mean"),
    )
    .sort_values(by=["subject_id", "question_type", "number_of_questions"], ascending=True)
)

print(f"Overall Mean:  {scaled_metric_mean:.2f}")
scaled_metric_mean_by_question_type

In [None]:
# Mean Absolute Error by question deviation
# For every (subject, question type) pair: how many rows fall on each distinct
# error value, and what share of that pair's rows this represents.
#
# Both the per-error counts and the totals are grouped by subject AND question
# type, and merged on both keys. Counting errors without subject_id and merging
# on question_type alone would pool counts across subjects and divide them by a
# single subject's total whenever two subjects share a question type.
deviation_group_keys = ["subject_id", "question_type"]

question_type_mark_deviations = (
    teacher_marked_questions_reduced
    .groupby(by=deviation_group_keys + ["marks_mean_absolute_error"], as_index=False)
    .agg({"question_id": "count"})
)
question_type_totals = (
    teacher_marked_questions_reduced
    .groupby(by=deviation_group_keys, as_index=False)
    .agg({"question_id": "count"})
)

# After the merge, question_id_x is the count for one error value and
# question_id_y is the total for the (subject, question type) pair.
question_type_mark_deviation_joined = pd.merge(
    question_type_mark_deviations,
    question_type_totals,
    on=deviation_group_keys,
    how="left",
)
question_type_mark_deviation_joined["questions_proportion"] = (
    question_type_mark_deviation_joined["question_id_x"]
    / question_type_mark_deviation_joined["question_id_y"]
)

question_type_mark_deviation_joined = question_type_mark_deviation_joined.rename(
    columns={
        "question_id_x": "number_of_questions",
        "question_id_y": "number_of_questions_by_question_type",
    }
)
ordered_cols = [
    "subject_id",
    "question_type",
    "marks_mean_absolute_error",
    "number_of_questions",
    "number_of_questions_by_question_type",
    "questions_proportion",
]
question_type_mark_deviation_joined = (
    question_type_mark_deviation_joined[ordered_cols]
    .sort_values(by=["subject_id", "question_type", "marks_mean_absolute_error"], ascending=True)
    .reset_index(drop=True)
)

question_type_mark_deviation_joined

In [None]:
# Per-subject summary table: mean hit rates and mean scaled error (rounded to
# two decimals), row count, total elapsed time in minutes and total LLM cost
# (rounded to two decimals).
overall_stats = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id"], as_index=False)
    .agg({
        "llm_mark_hitrate": "mean",
        "llm_level_hitrate": "mean",
        # "same_level_hitrate_pm1": "mean",   (metric currently disabled)
        "scaled_error_metric": "mean",
        "question_id": "count",
        "elapsed_time_in_seconds": "sum",
        "llm_grading_total_cost": "sum",
    })
    .assign(
        mean_hitrate=lambda stats: stats["llm_mark_hitrate"].round(2),
        mean_llm_level_hitrate=lambda stats: stats["llm_level_hitrate"].round(2),
        # mean_same_level_hitrate_pm1=lambda stats: stats["same_level_hitrate_pm1"].round(2),
        mean_scaled_error_metric=lambda stats: stats["scaled_error_metric"].round(2),
        number_of_questions=lambda stats: stats["question_id"],
        elapsed_time_in_minutes=lambda stats: stats["elapsed_time_in_seconds"] / 60,
        llm_grading_total_cost=lambda stats: stats["llm_grading_total_cost"].round(2),
    )
    .loc[:, [
        "subject_id",
        "mean_hitrate",
        "mean_llm_level_hitrate",
        # "mean_same_level_hitrate_pm1",
        "mean_scaled_error_metric",
        "number_of_questions",
        "elapsed_time_in_minutes",
        "llm_grading_total_cost",
    ]]
)

overall_stats

In [None]:
# Save validation statistics tables
# NOTE(review): this uses `workdir`, which is defined in the "Set working
# directory" cell; that cell's own comment describes it as optional.
savedir = Path(workdir).joinpath("validation_results", "validation_statistics")

# Create the output folder together with any missing parents; an already
# existing folder is accepted as-is.
savedir.mkdir(parents=True, exist_ok=True)

In [None]:
# Export the frame as loaded (not the reduced frame that later cells add columns to).
teacher_marked_questions.to_csv(
    path_or_buf=savedir / "examiner_llm_comparison.csv",
    index=False,
)

In [None]:
# Write every statistics table to its own CSV inside savedir, without the index.
# NOTE(review): overall_stats is stored under the name "mean_hitrate.csv" - confirm
# that this file name is intended.
tables_to_save = {
    "mean_hitrate.csv": overall_stats,
    "mean_hitrate_by_question_type.csv": mean_hitrate_by_question_type,
    "mean_level_hitrate_by_question_type.csv": mean_level_hitrate_by_question_type,
    "mea_marks_by_question_type.csv": marks_mea_by_question_type,
    "question_type_marks_deviation.csv": question_type_mark_deviation_joined,
    "question_hit_rate.csv": question_hit_rate,
    "level_question_hit_rate.csv": level_question_hit_rate,
    "token_cost_by_question_type.csv": token_cost_by_question_type,
    "scaled_metric_mean_by_question_type.csv": scaled_metric_mean_by_question_type,
    "mean_same_level_hitrate_by_subject.csv": mean_same_level_hitrate_by_subject,
    # "mean_same_level_pm1_hitrate_by_subject.csv": mean_same_level_pm1_hitrate_by_subject,
}

for csv_name, table in tables_to_save.items():
    table.to_csv(savedir / csv_name, index=False)