In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from dotenv import load_dotenv
from typing import Dict, Union
from pprint import pprint
from pathlib import Path
from llm_assessor import (
    ranged_error
)

# Load environment variables before any cell reads os.environ; a later cell requires
# `workdir` (presumably supplied by a local .env file — TODO confirm).
load_dotenv(override=True)

ERROR:root:Failed to update token costs. Using static costs.
  TOKEN_COSTS = TOKEN_COSTS_STATIC


True

In [19]:
# Helper Functions

def extract_total_cost(x: Union[int, float, str, Dict]) -> float:
    """Return the total token cost stored in a costing cell as a float.

    Args:
        x: One of
            * a dict containing a ``"total_cost"`` entry,
            * the string form of such a dict (e.g. as read back from a CSV),
            * a plain number, or a string holding a plain number.

    Returns:
        The ``"total_cost"`` entry when ``x`` is (or parses to) a dict,
        otherwise ``x`` itself, converted with ``float``.

    Raises:
        KeyError: ``x`` is (or parses to) a dict without ``"total_cost"``.
        SyntaxError: ``x`` is a string that is not valid Python literal syntax.
        ValueError, TypeError: the final ``float`` conversion fails.
    """
    # An already-parsed dict cannot go through literal_eval (which only accepts
    # strings/AST nodes), so read the field directly.
    if isinstance(x, dict):
        return float(x["total_cost"])

    try:
        z = literal_eval(x)["total_cost"]
    except (ValueError, TypeError):
        # ValueError: x is not a string literal_eval can evaluate (e.g. a number or NaN).
        # TypeError: x parsed to something that cannot be indexed by a string key
        # (e.g. "0.5" parses to a float). In both cases treat x as the cost itself.
        z = x

    return float(z)

In [3]:
# Set working directory - Not required if using Jupyter outside of VScode
# Reads the project root from the `workdir` environment variable (KeyError if it is unset)
# and makes it the process's current directory. The data-loading cell uses a path relative
# to the current directory, so this cell has to run first.
workdir = os.environ["workdir"]
os.chdir(workdir)
os.getcwd()  # last expression of the cell: displays the directory now in effect

'/Users/junaid/Developer/audenshaw_exam_validation'

In [4]:
# Load the graded student answers from the processed-data folder.
# The path is relative, so it resolves against the current working directory.
data_dir = Path("./validation_results")

teacher_marked_questions = pd.read_csv(
    data_dir.joinpath("processed_data", "student_answers_llm_graded.csv")
)

# Compute performance statistics

In [20]:
# Drop the question types listed in `isin([...])` (currently none, so every row is kept).
# Take an explicit copy: later cells add columns to this frame, and writing into a
# filtered selection of `teacher_marked_questions` would raise SettingWithCopyWarning
# as soon as the exclusion list actually removes rows.
teacher_marked_questions_reduced = teacher_marked_questions[
    ~teacher_marked_questions.question_type.isin([])
].copy()

In [24]:
# Token costing
# Per-answer cost = cost of the grading call + cost of the mark-awarding call.
teacher_marked_questions_reduced["llm_grading_total_cost"] = teacher_marked_questions_reduced.apply(
    lambda row: extract_total_cost(row["llm_graded_answer_token_costing"])
    + extract_total_cost(row["llm_awarded_marks_token_costing"]),
    axis=1,
)

# Mean and total cost, plus the number of rows, per subject and question type.
token_cost_by_question_type = teacher_marked_questions_reduced.groupby(
    by=["subject_id", "question_type"], as_index=False
).agg(
    mean_llm_grading_total_cost=("llm_grading_total_cost", "mean"),
    sum_llm_grading_total_cost=("llm_grading_total_cost", "sum"),
    number_of_questions=("question_id", "count"),
)

token_cost_by_question_type

Unnamed: 0,subject_id,question_type,mean_llm_grading_total_cost,sum_llm_grading_total_cost,number_of_questions
0,aqa_history,hs_analyse,0.058919,0.41243,7
1,aqa_history,hs_explain,0.11674,1.634365,14
2,aqa_history,hs_judgement,0.115154,0.806075,7
3,aqa_history,hs_spag,0.016621,0.11635,7
4,edexcel_business_studies,bs_analyse,0.016589,0.132715,8
5,edexcel_business_studies,bs_calculate,0.004922,0.07875,16
6,edexcel_business_studies,bs_discuss,0.01596,0.06384,4
7,edexcel_business_studies,bs_evaluate,0.026015,0.10406,4
8,edexcel_business_studies,bs_explain,0.010349,0.248365,24
9,edexcel_business_studies,bs_identify,0.004517,0.01807,4


In [48]:
# Hitrate
## Mean overall
# Per subject: number of rows, number of hits (sum of the hit flags) and mean hit rate.
# The string "mean" is used instead of `np.nanmean`: pandas already dispatches that
# callable to its own groupby mean (which skips missing values) but emits a
# FutureWarning for it. Output column names are spelled out so they stay exactly as
# before, including the historical "_nanmean" suffix.
mean_hitrate_by_subject = teacher_marked_questions_reduced.groupby(
    by=["subject_id"], as_index=False
).agg(
    number_of_questions_count=("question_id", "count"),
    proportion_of_questions_covered_sum=("llm_mark_hitrate", "sum"),
    proportion_of_questions_covered_nanmean=("llm_mark_hitrate", "mean"),
)
mean_hitrate_by_subject

  mean_hitrate_by_subject = teacher_marked_questions_reduced.groupby(by=["subject_id"], as_index = False).agg({"question_id": "count", "llm_mark_hitrate": ["sum", np.nanmean]}).rename(columns={"question_id": "number_of_questions", "llm_mark_hitrate": "proportion_of_questions_covered"})


Unnamed: 0,subject_id,number_of_questions_count,proportion_of_questions_covered_sum,proportion_of_questions_covered_nanmean
0,aqa_history,35,9,0.272727
1,edexcel_business_studies,108,55,0.591398


In [50]:
# Hitrate
## Mean by question type
# Per subject and question type: number of rows, number of hits (sum of the hit flags)
# and mean hit rate. The string "mean" replaces `np.nanmean`: pandas already dispatches
# that callable to its own groupby mean (which skips missing values) but emits a
# FutureWarning for it. Output column names are spelled out so the saved CSV header
# stays exactly as before, including the historical "_nanmean" suffix.
mean_hitrate_by_question_type = teacher_marked_questions_reduced.groupby(
    by=["subject_id", "question_type"], as_index=False
).agg(
    number_of_questions_count=("question_id", "count"),
    proportion_of_questions_covered_sum=("llm_mark_hitrate", "sum"),
    proportion_of_questions_covered_nanmean=("llm_mark_hitrate", "mean"),
)

mean_hitrate_by_question_type

  mean_hitrate_by_question_type = teacher_marked_questions_reduced.groupby(by=["subject_id", "question_type"], as_index = False).agg({"question_id": "count", "llm_mark_hitrate": ["sum", np.nanmean]}).rename(columns={"question_id": "number_of_questions", "llm_mark_hitrate": "proportion_of_questions_covered"})


Unnamed: 0,subject_id,question_type,number_of_questions_count,proportion_of_questions_covered_sum,proportion_of_questions_covered_nanmean
0,aqa_history,hs_analyse,7,0,0.0
1,aqa_history,hs_explain,14,4,0.307692
2,aqa_history,hs_judgement,7,1,0.142857
3,aqa_history,hs_spag,7,4,0.571429
4,edexcel_business_studies,bs_analyse,8,3,0.5
5,edexcel_business_studies,bs_calculate,16,10,0.714286
6,edexcel_business_studies,bs_discuss,4,0,0.0
7,edexcel_business_studies,bs_evaluate,4,1,0.333333
8,edexcel_business_studies,bs_explain,24,9,0.391304
9,edexcel_business_studies,bs_identify,4,3,1.0


In [35]:
# Hitrate by Question ID
# Per question: number of non-missing hit flags, number of hits and mean hit rate.
# The string "mean" replaces `np.nanmean`: pandas already dispatches that callable to
# its own groupby mean (which skips missing values) but emits a FutureWarning for it.
# (The previous bare `.head()` call was removed: it was not the last expression of the
# cell, so it displayed nothing.)
question_hit_rate = teacher_marked_questions_reduced.groupby(
    by=["subject_id", "question_id", "question_type", "total_marks"], as_index=False
).agg(
    number_of_questions=("llm_mark_hitrate", "count"),
    hitrate_frequency=("llm_mark_hitrate", "sum"),
    mean_hitrate=("llm_mark_hitrate", "mean"),
)

# Ascending on every key (the sort_values default).
question_hit_rate = question_hit_rate.sort_values(
    by=["subject_id", "question_id", "question_type", "mean_hitrate"]
)
question_hit_rate

  question_hit_rate = teacher_marked_questions_reduced.groupby(by=['subject_id','question_id', "question_type", "total_marks"], as_index=False).agg({"llm_mark_hitrate": ["count", "sum", np.nanmean]})


Unnamed: 0,subject_id,question_id,question_type,total_marks,number_of_questions,hitrate_frequency,mean_hitrate
0,aqa_history,1.0,hs_explain,4,6,2,0.333333
1,aqa_history,2.0,hs_explain,12,7,2,0.285714
2,aqa_history,3.0,hs_analyse,8,6,0,0.0
3,aqa_history,4.1,hs_judgement,16,7,1,0.142857
4,aqa_history,4.2,hs_spag,4,7,4,0.571429
5,edexcel_business_studies,1.1,bs_mcq,1,4,4,1.0
6,edexcel_business_studies,1.2,bs_mcq,1,4,4,1.0
7,edexcel_business_studies,1.3,bs_explain,3,4,2,0.5
8,edexcel_business_studies,1.4,bs_explain,3,4,1,0.25
9,edexcel_business_studies,2.1,bs_mcq,2,4,4,1.0


In [36]:
# Mean Absolute Error
# Per-answer error of the LLM mark against the examiner mark, delegated to
# `ranged_error` with a one-element range (exact error definition lives in
# llm_assessor — TODO confirm it is the absolute difference).
teacher_marked_questions_reduced["marks_mean_absolute_error"] = teacher_marked_questions_reduced.apply(
    lambda answer: ranged_error(
        x=answer["llm_awarded_marks"],
        range_of_values=[answer["awarded_marks"]],
    ),
    axis=1,
)

# Overall mean of the per-answer errors
marks_mea = teacher_marked_questions_reduced["marks_mean_absolute_error"].mean()

## Mean by question type, with the number of rows behind each mean
marks_mea_by_question_type = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id", "question_type"], as_index=False)
    .agg(
        number_of_questions=("question_id", "count"),
        marks_mean_absolute_error=("marks_mean_absolute_error", "mean"),
    )
    .sort_values(
        by=["subject_id", "question_type", "number_of_questions"],
        ascending=[True, True, True],
    )
)

print(f"Overall Mean:  {marks_mea:.2f}")
marks_mea_by_question_type

Overall Mean:  0.66


Unnamed: 0,subject_id,question_type,number_of_questions,marks_mean_absolute_error
0,aqa_history,hs_analyse,7,1.714286
1,aqa_history,hs_explain,14,0.928571
2,aqa_history,hs_judgement,7,2.857143
3,aqa_history,hs_spag,7,0.857143
4,edexcel_business_studies,bs_analyse,8,0.375
5,edexcel_business_studies,bs_calculate,16,0.25
6,edexcel_business_studies,bs_discuss,4,1.0
7,edexcel_business_studies,bs_evaluate,4,0.75
8,edexcel_business_studies,bs_explain,24,0.708333
9,edexcel_business_studies,bs_identify,4,0.0


In [40]:
# Mean Absolute Error by question deviation
# For every subject / question type, how many answers fall on each error value and what
# share of that question type's answers they represent.
#
# Both tables are keyed by (subject_id, question_type). Counting and merging on
# question_type alone would pair each error bucket with the totals of every subject that
# uses the same question-type name, duplicating rows and distorting the proportions.
question_type_mark_deviations = teacher_marked_questions_reduced.groupby(
    by=["subject_id", "question_type", "marks_mean_absolute_error"], as_index=False
).agg({"question_id": "count"})
question_type_totals = teacher_marked_questions_reduced.groupby(
    by=["subject_id", "question_type"], as_index=False
).agg({"question_id": "count"})

# Left join keeps every error bucket; `validate` asserts the totals table has exactly one
# row per key. The shared "question_id" count columns come out as question_id_x (bucket
# count) and question_id_y (question-type total).
question_type_mark_deviation_joined = pd.merge(
    question_type_mark_deviations,
    question_type_totals,
    on=["subject_id", "question_type"],
    how="left",
    validate="many_to_one",
)
question_type_mark_deviation_joined["questions_proportion"] = (
    question_type_mark_deviation_joined["question_id_x"]
    / question_type_mark_deviation_joined["question_id_y"]
)

question_type_mark_deviation_joined = question_type_mark_deviation_joined.rename(columns={"question_id_x": "number_of_questions", "question_id_y": "number_of_questions_by_question_type"})
ordered_cols = ["subject_id", "question_type", "marks_mean_absolute_error", "number_of_questions", "number_of_questions_by_question_type", "questions_proportion"]
question_type_mark_deviation_joined = question_type_mark_deviation_joined[ordered_cols]
question_type_mark_deviation_joined = question_type_mark_deviation_joined.sort_values(by=["subject_id", "question_type", "marks_mean_absolute_error"], ascending=[True, True, True]).reset_index(drop=True)

question_type_mark_deviation_joined

Unnamed: 0,subject_id,question_type,marks_mean_absolute_error,number_of_questions,number_of_questions_by_question_type,questions_proportion
0,aqa_history,hs_analyse,0.0,1,7,0.142857
1,aqa_history,hs_analyse,1.0,2,7,0.285714
2,aqa_history,hs_analyse,2.0,3,7,0.428571
3,aqa_history,hs_analyse,4.0,1,7,0.142857
4,aqa_history,hs_explain,0.0,5,14,0.357143
5,aqa_history,hs_explain,1.0,5,14,0.357143
6,aqa_history,hs_explain,2.0,4,14,0.285714
7,aqa_history,hs_judgement,0.0,1,7,0.142857
8,aqa_history,hs_judgement,2.0,2,7,0.285714
9,aqa_history,hs_judgement,3.0,1,7,0.142857


In [64]:
# Per-subject headline figures: mean hit rate, number of rows, total grading time
# (converted from seconds to minutes) and total LLM cost. Hit rate and cost are
# rounded to two decimals.
overall_stats = (
    teacher_marked_questions_reduced
    .groupby(by=["subject_id"], as_index=False)
    .agg(
        mean_hitrate=("llm_mark_hitrate", "mean"),
        number_of_questions=("question_id", "count"),
        elapsed_time_in_seconds=("elapsed_time_in_seconds", "sum"),
        llm_grading_total_cost=("llm_grading_total_cost", "sum"),
    )
    .assign(
        mean_hitrate=lambda stats: stats["mean_hitrate"].map(lambda value: np.round(value, 2)),
        elapsed_time_in_minutes=lambda stats: stats["elapsed_time_in_seconds"].map(
            lambda seconds: seconds / 60
        ),
        llm_grading_total_cost=lambda stats: stats["llm_grading_total_cost"].map(
            lambda value: np.round(value, 2)
        ),
    )
    [["subject_id", "mean_hitrate", "number_of_questions", "elapsed_time_in_minutes", "llm_grading_total_cost"]]
)
overall_stats

Unnamed: 0,subject_id,mean_hitrate,number_of_questions,elapsed_time_in_minutes,llm_grading_total_cost
0,aqa_history,0.27,35,4.807091,2.97
1,edexcel_business_studies,0.59,108,7.600546,1.04


In [65]:
# Save validation statistics tables
# Output folder under the project working directory; created together with any missing
# parents, and left untouched if it already exists.
savedir = Path(workdir).joinpath("validation_results", "validation_statistics")

savedir.mkdir(parents=True, exist_ok=True)

In [66]:
# Write the loaded graded-answers frame to `savedir` without the row index.
# NOTE(review): this exports `teacher_marked_questions`, not
# `teacher_marked_questions_reduced` — confirm the columns computed in the cells above
# are not expected in this file.
teacher_marked_questions.to_csv(savedir / "examiner_llm_comparison.csv", index=False)

In [67]:
# Write every summary table to its own CSV in `savedir`, without the row index.
# Insertion order of the mapping is the order in which the files are written.
tables_by_filename = {
    "mean_hitrate.csv": overall_stats,
    "mean_hitrate_by_question_type.csv": mean_hitrate_by_question_type,
    "mea_marks_by_question_type.csv": marks_mea_by_question_type,
    "question_type_marks_deviation.csv": question_type_mark_deviation_joined,
    "question_hit_rate.csv": question_hit_rate,
    "token_cost_by_question_type.csv": token_cost_by_question_type,
}
for filename, table in tables_by_filename.items():
    table.to_csv(savedir / filename, index=False)