In [10]:
import os
import logging
import pandas as pd
from pathlib import Path

In [11]:
# Resolve the project working directory.
# The "workdir" environment variable takes precedence when it is set. When it
# is unset or empty, fall back to the kernel's current directory instead of
# raising KeyError, so the notebook also runs when started from the project
# root without that variable. Later cells build their paths from `workdir`,
# so it must always be defined.
workdir = os.environ.get("workdir") or os.getcwd()
os.chdir(workdir)
os.getcwd()

'/Users/junaid/Developer/audenshaw_exam_validation'

In [12]:
# Log line layout: timestamp, process id, level name, message.
# %(asctime)s has to appear in the format string; without it the `datefmt`
# argument is never applied and log lines carry no timestamp at all.
LOG_FORMAT = "%(asctime)s-%(process)d-%(levelname)s-%(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# NOTE: basicConfig is a no-op if the root logger already has handlers
# configured (pass force=True to override an existing configuration).
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)


In [13]:
# Folder holding the input CSV files, located under the working directory.
data_dir = Path(f"{workdir}/data")

# Read the three input tables: questions.csv, mark_scheme.csv and answers.csv.
question_info = pd.read_csv(data_dir.joinpath("questions.csv"))
mark_scheme_table = pd.read_csv(data_dir.joinpath("mark_scheme.csv"))
student_answers = pd.read_csv(data_dir.joinpath("answers.csv"))


In [14]:
# Left-join the questions table onto the answers: every answer row is kept and
# gains the columns of the question sharing its (question_id, subject_id) key.
question_answers_merged = student_answers.merge(
    question_info,
    how="left",
    on=["question_id", "subject_id"],
)

In [15]:
# Attach the mark scheme text to each answer row with a left join on
# (question_id, subject_id); only the key columns and the text column of the
# mark scheme table take part in the join. The joined column is then exposed
# under the shorter name "mark_scheme_text".
question_answers_ms_merged = (
    question_answers_merged
    .merge(
        mark_scheme_table[["question_id", "subject_id", "structured_mark_scheme_text"]],
        how="left",
        on=["question_id", "subject_id"],
    )
    .rename(columns={"structured_mark_scheme_text": "mark_scheme_text"})
)

In [16]:
# Columns kept in the output table, in their final order.
columns = [
    "subject_id",
    "question_id",
    "question_type",
    "student_id",
    "question_text",
    "mark_scheme_text",
    "context",
    "answer_text",
    "awarded_marks",
    "total_marks",
    "answer_id",
    "linked_answer_id",
    "topic_id",
    "answer_scanned_image",
]

# Keep only those columns, order rows by student then question, and rebuild a
# clean 0..n-1 index.
student_answers_pivoted_merged = (
    question_answers_ms_merged[columns]
    .sort_values(by=["student_id", "question_id"])
    .reset_index(drop=True)
)

In [17]:
# Validate the joins before saving.
# Both merges are left joins starting from student_answers, so they can never
# drop rows. A different row count therefore means a (question_id, subject_id)
# key matched more than one row in the questions or mark scheme table and
# answer rows were duplicated. The raw row counts are compared directly,
# because comparing only de-duplicated counts hides that case whenever the
# extra rows are exact copies of each other.
n_source_rows = len(student_answers)
n_merged_rows = len(student_answers_pivoted_merged)
assert n_source_rows == n_merged_rows, (
    f"Row count changed after merging: {n_source_rows} answers in, "
    f"{n_merged_rows} rows out - check for duplicate (question_id, subject_id) "
    "keys in the questions / mark scheme tables"
)

# The number of distinct rows must be preserved as well.
n_source_unique = student_answers.drop_duplicates().shape[0]
n_merged_unique = student_answers_pivoted_merged.drop_duplicates().shape[0]
assert n_source_unique == n_merged_unique, (
    f"Distinct row count changed after merging: {n_source_unique} in, "
    f"{n_merged_unique} out"
)

In [18]:
# Write the augmented answers table to
# <workdir>/validation_results/processed_data/student_answers_augmented.csv.
savedir = Path(workdir).joinpath("validation_results", "processed_data")
# Create the output folder (and any missing parents); no error if it exists.
savedir.mkdir(parents=True, exist_ok=True)

output_path = savedir / "student_answers_augmented.csv"
student_answers_pivoted_merged.to_csv(output_path, index=False)