In [39]:
import os
import logging
import pandas as pd
import yaml
from pathlib import Path
from collections import OrderedDict
from typing import Union

In [40]:
# Set the working directory to the project root.
# The root is read from the `workdir` environment variable. Later cells build
# their data and output paths from `workdir`, so when the variable is unset or
# empty we fall back to the current directory instead of failing with KeyError.
workdir = os.environ.get("workdir") or os.getcwd()
os.chdir(workdir)
os.getcwd()

'/Users/junaid/Developer/audenshaw_exam_validation'

In [41]:
# Configure root logging for the notebook.
# `datefmt` only has an effect when the format string contains %(asctime)s,
# so the timestamp field is part of the format.
LOG_FORMAT = "%(asctime)s-%(process)d-%(levelname)s-%(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)


In [42]:
# Extract Level from Mark Scheme YAML Helper Function
def extract_level_structure(markstring):
    """
    Extract the mark range of every numbered marking level from a YAML mark scheme.

    A top-level entry counts as a marking level when its key ends in
    ``_<integer>`` (e.g. ``Level_2``) and its value is a mapping holding
    ``marks_awarded: {minimum: ..., maximum: ...}``. Entries that do not match
    this shape are skipped.

    Parameters
    ----------
    markstring: str
        The YAML mark scheme text to be parsed.

    Returns
    -------
    levels_extracted: Union[dict, None]
        - ``{level_number: [minimum, maximum]}`` in ascending mark order (a
          single-element list when both bounds are equal). If no level has the
          marks ``[0]``, a ``0: [0]`` entry is added; level 0 is placed first.
        - None if the input is not parseable text (e.g. NaN from a merge), the
          YAML is not a mapping, or no numbered level with marks was found.

    Notes
    -----
    Errors raised by ``yaml.safe_load`` for malformed YAML are not caught here.
    """
    # Missing cells (e.g. NaN after a left merge) cannot be parsed as YAML.
    if not isinstance(markstring, (str, bytes)) and not hasattr(markstring, "read"):
        return None

    data = yaml.safe_load(markstring)
    if not isinstance(data, dict):
        # Empty documents load as None; scalars/lists carry no level mapping.
        return None

    levels_dict = OrderedDict()
    for level, content in data.items():
        try:
            level_number = int(level.split("_")[-1])
            marks_for_level = content.get("marks_awarded")
            # The set collapses equal bounds; sorted() guarantees [min, max]
            # order, which plain set iteration does not (list({1, 8}) == [8, 1]).
            marks = sorted(
                {
                    int(marks_for_level["minimum"]),
                    int(marks_for_level["maximum"]),
                }
            )
        except (AttributeError, KeyError, TypeError, ValueError):
            # Not a numbered level, or its marks are absent/malformed: skip it.
            continue
        levels_dict[level_number] = marks

    # Decide on None before the zero-mark level is inserted, otherwise the
    # result could never be empty.
    if not levels_dict:
        return None

    # If there is no level worth exactly 0 marks, add one.
    if [0] not in levels_dict.values():
        levels_dict[0] = [0]

    if 0 in levels_dict:
        levels_dict.move_to_end(key=0, last=False)

    return dict(levels_dict)

def detect_marking_levels(markstring: str) -> Union[bool, None]:
    """
    Detects if a YAML string contains marking levels.

    A marking level is any top-level string key containing ``Level_``.
    Non-string keys (e.g. a YAML integer key) are ignored rather than
    invalidating the whole mark scheme.

    Parameters
    ----------
    markstring: str
        The YAML string to be parsed.

    Returns
    -------
    level_detected_flag: Union[bool, None]
        - True if marking levels are detected.
        - None if no marking levels are detected, the parsed YAML is not a
          mapping, or an error occurs during parsing.
    """
    try:
        data = yaml.safe_load(markstring)
    except Exception:
        # Best-effort by contract: unparseable input (malformed YAML, NaN from
        # a missing cell, ...) is reported as "no levels detected".
        return None

    if not isinstance(data, dict):
        return None

    has_level_key = any(isinstance(key, str) and "Level_" in key for key in data)

    level_detected_flag = True if has_level_key else None
    return level_detected_flag


In [43]:
# Input tables live in <workdir>/data; read each CSV into its own DataFrame.
data_dir = Path(workdir, "data")

questions_csv = data_dir / "questions.csv"
mark_scheme_csv = data_dir / "mark_scheme.csv"
answers_csv = data_dir / "answers.csv"

question_info = pd.read_csv(questions_csv)
mark_scheme_table = pd.read_csv(mark_scheme_csv)
student_answers = pd.read_csv(answers_csv)


In [44]:
# Attach question metadata to every student answer. The left join keeps each
# answer row even when no matching question record exists.
question_answers_merged = student_answers.merge(
    question_info,
    how="left",
    on=["question_id", "subject_id"],
)

In [53]:
# Flag every mark scheme whose YAML text contains marking levels
marking_level_flags = mark_scheme_table["structured_mark_scheme_text"].apply(detect_marking_levels)
mark_scheme_table["questions_with_marking_level_flag"] = marking_level_flags

In [54]:
# Join the mark scheme text and its marking-level flag onto the answers, then
# expose the text under the shorter name "mark_scheme_text".
mark_scheme_columns = [
    "question_id",
    "subject_id",
    "structured_mark_scheme_text",
    "questions_with_marking_level_flag",
]

question_answers_ms_merged = (
    question_answers_merged
    .merge(
        mark_scheme_table[mark_scheme_columns],
        how="left",
        on=["question_id", "subject_id"],
    )
    .rename(columns={"structured_mark_scheme_text": "mark_scheme_text"})
)

In [55]:
# Keep only the columns needed downstream, in a fixed order, and sort the rows
# by student then question with a fresh 0..n-1 index.
columns = [
    "subject_id",
    "question_id",
    "question_type",
    "student_id",
    "question_text",
    "mark_scheme_text",
    "questions_with_marking_level_flag",
    "context",
    "answer_text",
    "awarded_marks",
    "total_marks",
    "answer_id",
    "linked_answer_id",
    "topic_id",
    "answer_scanned_image",
]

student_answers_pivoted_merged = (
    question_answers_ms_merged[columns]
    .sort_values(by=["student_id", "question_id"])
    .reset_index(drop=True)
)

In [56]:
# Sanity check: the merges must not change the number of distinct rows
# compared with the raw answers table.
unique_answer_rows = len(student_answers.drop_duplicates())
unique_merged_rows = len(student_answers_pivoted_merged.drop_duplicates())
assert unique_answer_rows == unique_merged_rows

In [64]:
# Question types for which at least one mark scheme was flagged as having levels
has_marking_levels = student_answers_pivoted_merged["questions_with_marking_level_flag"] == 1
valid_types = list(set(student_answers_pivoted_merged.loc[has_marking_levels, "question_type"].tolist()))


def _level_structure_for_row(row):
    """Return the row's level structure, or None when its question type never has levels."""
    if row["question_type"] not in valid_types:
        return None
    return extract_level_structure(row["mark_scheme_text"])


# Extract the level -> marks structure row by row
student_answers_pivoted_merged["level_structure"] = student_answers_pivoted_merged.apply(
    _level_structure_for_row,
    axis=1,
)

In [65]:
# Write the augmented answers table to
# <workdir>/validation_results/processed_data, creating the folder if needed.
savedir = Path(workdir, "validation_results", "processed_data")
savedir.mkdir(parents=True, exist_ok=True)

output_csv = savedir / "student_answers_augmented.csv"
student_answers_pivoted_merged.to_csv(output_csv, index=False)