In [None]:
import ast
import base64
import concurrent.futures
import cv2
import glob
import json
import numpy as np
import os
import pandas as pd
import pdfplumber
import re
import requests
import time
from collections import OrderedDict
from dotenv import load_dotenv
from itertools import compress
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
from typing import List

load_dotenv(override=True)

In [None]:
# Helper Functions
def encode_image(image_path):
  """Read a file and return its contents as base64 text.

  Args:
    image_path: Path of the file to read; it is opened in binary mode.

  Returns:
    The base64 encoding of the file's bytes, as a UTF-8 decoded ``str``.
  """
  with open(image_path, "rb") as handle:
    raw_bytes = handle.read()

  encoded_bytes = base64.b64encode(raw_bytes)
  return str(encoded_bytes, 'utf-8')

def extract_json_from_markdown(markdown_string):
    """Parse JSON out of a model reply that may be wrapped in a Markdown code fence.

    The text is first tried as plain JSON. Only if that fails is a code fence
    unwrapped: everything between the first opening fence (with an optional
    ``json`` language tag) and the last closing fence is parsed instead. This
    tolerates prose before or after the fence and a missing closing fence.

    Args:
        markdown_string: The raw reply text.

    Returns:
        A list. If the JSON document is already a list it is returned as is,
        otherwise the parsed value is wrapped in a one-element list.

    Raises:
        json.JSONDecodeError: If no parseable JSON document is found.
    """
    fence = "`" * 3
    text = markdown_string.strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        opening = text.find(fence)
        if opening == -1:
            # No fence to unwrap: the text simply is not JSON.
            raise

        body = text[opening + len(fence):]
        # Drop the optional language tag that follows the opening fence.
        if body[:4].lower() == "json":
            body = body[4:]

        # Use the LAST fence as the terminator so backticks inside JSON
        # string values do not truncate the document.
        closing = body.rfind(fence)
        if closing != -1:
            body = body[:closing]

        data = json.loads(body.strip())

    if isinstance(data, list):
        return data
    return [data]

def extract_tuple(text):
    """Locate a tuple of single lowercase letters written out inside ``text``.

    Three literal shapes are recognised, checked in this priority order:
    a two-letter tuple such as ``('a', 'b')``, a one-letter tuple such as
    ``('a',)`` or ``('a')``, and the empty tuple ``()``.

    Args:
        text: The string to search.

    Returns:
        A 2-tuple of letters, a 1-tuple of one letter, an empty tuple, or
        ``None`` when none of the three shapes occurs in ``text``.
    """
    pair = re.search(r"\('([a-z])', '([a-z])'\)", text)
    if pair is not None:
        return pair.groups()

    single = re.search(r"\('([a-z])',?\)", text)
    if single is not None:
        return (single.group(1),)

    if re.search(r"\(\)", text) is not None:
        return ()

    return None

def merge_dict_on_key(dicts: List, key: str):
    """Collapse several dicts into one, space-joining the values stored under ``key``.

    Args:
        dicts: Non-empty list of dicts that each contain ``key`` with a string value.
        key: The key whose values are concatenated with a single space.

    Returns:
        For a one-element list, that element itself (not a copy). Otherwise a
        shallow copy of the first dict whose ``key`` entry is replaced by the
        joined string; all other entries come from the first dict only.
    """
    if len(dicts) == 1:
        return dicts[0]

    joined = " ".join([entry[key] for entry in dicts])

    merged = {**dicts[0]}
    merged[key] = joined
    return merged

def merge_dict_on_common_keys(dicts):
  """Merge dicts key by key, using the first dict's keys as the schema.

  For every key of the first dict, the values from all dicts are collected and
  de-duplicated in first-seen order. A single distinct value is kept as is;
  several distinct values are concatenated with no separator (so they must be
  strings).

  Args:
    dicts: Non-empty list of dicts; each must contain every key of the first.

  Returns:
    For a one-element list, that element itself. Otherwise a new dict.
  """
  if len(dicts) == 1:
      return dicts[0]

  merged = {}
  for field in list(dicts[0].keys()):
      distinct = list(OrderedDict.fromkeys([entry[field] for entry in dicts]))
      merged[field] = distinct[0] if len(distinct) == 1 else "".join(distinct)

  return merged

def dict_reorder(
        input_dict,
        keys_order
        ):
    """Build an ``OrderedDict`` holding ``input_dict``'s values in a chosen key order.

    Only the keys named in ``keys_order`` are copied; any other key of
    ``input_dict`` is left out of the result.

    Args:
        input_dict: Mapping to read values from.
        keys_order: Iterable of keys, in the desired output order.

    Returns:
        An ``OrderedDict`` with one entry per key in ``keys_order``.

    Raises:
        KeyError: If a requested key is missing from ``input_dict``.
    """
    return OrderedDict((key, input_dict[key]) for key in keys_order)

def json_to_markdown(json_list):
  """
  Render a list of dictionaries as a single markdown document.

  Each dictionary starts with a 40-dash divider line; every key becomes a
  level-2 heading followed by its value. Non-string values are passed through
  ``str`` and every ``#`` in a value is backslash-escaped.

  Args:
    json_list: A list of dictionaries representing JSON objects.

  Returns:
    A string containing the markdown document (empty for an empty list).
  """
  divider = f"{'-'*40}\n\n"

  chunks = []
  for record in json_list:
    chunks.append(divider)
    for heading, content in record.items():
      text = content if isinstance(content, str) else str(content)
      escaped_value = text.replace("#", "\\#")
      chunks.append(f"## {heading}\n{escaped_value}\n\n")

  return "".join(chunks)

def crop_image_borders_from_path(image_path, top_crop, bottom_crop, left_crop, right_crop):
  """Load an image from disk with OpenCV and crop its borders.

  Args:
    image_path: The path to the image file (``str`` or path-like).
    top_crop: Number of pixels to crop from the top.
    bottom_crop: Number of pixels to crop from the bottom.
    left_crop: Number of pixels to crop from the left.
    right_crop: Number of pixels to crop from the right.

  Returns:
    The cropped image as a NumPy array.

  Raises:
    ValueError: If OpenCV could not read an image from ``image_path``.
  """

  # os.fspath leaves str unchanged and converts pathlib.Path to str; some
  # OpenCV builds only accept a plain string filename.
  img = cv2.imread(os.fspath(image_path))

  # cv2.imread signals failure by returning None instead of raising, which
  # would otherwise surface below as an opaque AttributeError on `.shape`.
  if img is None:
    raise ValueError(f"Could not read an image from {image_path!r}")

  # Only the first two axes (rows, columns) are needed for cropping.
  height, width = img.shape[:2]

  return img[top_crop:height-bottom_crop, left_crop:width-right_crop]

def crop_image_borders(image, top_crop, bottom_crop, left_crop, right_crop):
  """Crops the borders of an in-memory image.

  Works for both multi-channel (H x W x C) and single-channel (H x W) images:
  only the first two axes are sliced, and any channel axis is carried through
  unchanged.

  Args:
    image: The PIL Image (or anything ``np.array`` can convert).
    top_crop: Number of pixels to crop from the top.
    bottom_crop: Number of pixels to crop from the bottom.
    left_crop: Number of pixels to crop from the left.
    right_crop: Number of pixels to crop from the right.

  Returns:
    The cropped image as a NumPy array.
  """

  img = np.array(image)

  # Read only rows/columns; unpacking three values from `.shape` would raise
  # on grayscale images, whose arrays are two-dimensional.
  height, width = img.shape[:2]

  return img[top_crop:height-bottom_crop, left_crop:width-right_crop]
  
def extract_parentheses_content(text):
  """Return the parenthesised span of a string, parentheses included.

  The span runs from the first ``(`` to the last ``)`` that can be reached
  without crossing a newline (the match is greedy). The surrounding
  parentheses are part of the returned string.

  Args:
    text: The input string.

  Returns:
    The matched span such as ``"(a, b)"``, or ``None`` when there is no match.
  """

  found = re.search(r'\((.*)\)', text)
  if found is None:
    return None
  return found.group(0)

def _merge_dicts(dict1, dict2):
    res = {**dict1, **dict2}
    return res


In [None]:
# Prompts
# Cover-page prompt: asks the model for one JSON object describing the paper.
# Only the "paper_reference" field of the reply is read later in this notebook.
extract_paper_reference_prompt = """
You are given a cover page for an exam paper. 
Extract the following data from the page and express the output as a json object.

1. Candidate Number
2. Maximum mark for the paper
3. The Subject of the Exam 
4. The Paper Reference
5. The Exam Board that is issuing the paper
6. The Paper number
7. Month/Year of the Exam
8. Title of the Exam paper

Obey the following format

{
"candidate_number": int,
"max_mark": int,
"exam_subject": str,
"paper_reference": str,
"exam_board": str,
"paper_number": int,
"month_year": yyyy/mm,
"paper_title": str
}
"""

# Per-page prompt: asks for the lowercase question-part letters printed on the page,
# returned as a tuple literal. The reply is parsed with ast.literal_eval further down,
# so the model is told to return nothing but the tuple.
extract_question_parts_prompt = """
You are given a page from an exam paper. 
Extract the question parts mentioned in the page. 
Every question number will have a lower case letter denoting the question part, they will be enclosed in brackets.
Read the page and return in the letters of the questions as a tuple.

If no question parts are detected, return an empty tuple. Return nothing but the tuple.
"""

# Variant of the question-part prompt that asks for the printed, boxed question NUMBER
# instead of part letters.
# NOTE(review): this prompt is not referenced anywhere else in the notebook as shown.
extract_question_parts_hist_prompt = """
You are given a page from an exam paper. 
Extract the question number mentioned in the page - the question number is enclosed in a box, with two squares, near the top left of the page.
The question number is in printed text, it is not handwritten, do not return any handwritten numbers.
Read the page and return question numbers identified in a tuple.

If no question numbers are detected, return an empty tuple. Return nothing but the tuple.
"""

# Prompt for pages on which no question parts were detected: requests a single JSON
# object with only the handwritten text, under the key "answer_text".
extract_handwriting_prompt = """
You are given a page from an exam paper. Your role is to extract the following details from the page:
1. Handwritten Text

Obey the following format

{
"answer_text": str
}

If there is no information about the fields requested, return the value of those fields to be an empty string. 
Do not invent anything.
"""

# Prompt for pages that carry at least one question part: requests a JSON list of
# {"question_text", "answer_text"} objects, one per question on the page.
# The trailing .strip() removes the leading/trailing newlines of the triple-quoted literal.
mcq_prompt = """
You are given a page from an exam paper. Your role is to extract the following details from the page:
1. Printed Question Text
2. Answer Text

Obey the following format
[{
"question_text": str,
"answer_text": str
},
{
"question_text": str,
"answer_text": str
}]

Here are some guidelines that you should follow:
1. There are some multiple choice questions on this page, for these questions, extract only the question text and report only the letter for the selected options.
2. If there are multiple questions, report all of them in a list of jsons obeying the format specified.
3. If there is no information about the fields requested, return the value of those fields to be an empty string. 
4. Do not invent anything.
""".strip()

In [None]:
# Model Configuration
# Name of the chat-completions model put into every request payload.
model_name = "gpt-4o"

# OpenAI API Key, read from the environment (None when the variable is unset).
api_key = os.environ.get("OPENAI_API_KEY")

# HTTP headers shared by all requests to the API.
headers = {}
headers["Content-Type"] = "application/json"
headers["Authorization"] = f"Bearer {api_key}"

In [None]:
# Identifiers of the script being processed. Both are interpolated into the input
# PDF path and into the output directories/filenames used throughout the notebook.
subject_id = "edexcel_business_studies"
candidate_id = 5002

In [None]:
# Load PDF
# The input is expected at ./data/<subject_id>/<candidate_id>.pdf
pdfpath = Path(f"./data/{subject_id}/")

# Render the PDF with pdf2image; the result is indexed and enumerated per page below.
images = convert_from_path(pdfpath / f"{candidate_id}.pdf")

In [86]:
# Subset to pages that have questions
# NOTE(review): `question_images` is not referenced again in the notebook as shown; the
# save step iterates over the full `images` list and the same 1:22 slice is applied to
# the DataFrame later. The bounds are hard-coded for this paper.
question_images = images[1:22]

In [87]:
# Save page images to disk
# Output directory: ./saved_imgs/<subject_id>/<candidate_id>, created if missing.
image_savedir = Path(f"./saved_imgs/{subject_id}/{candidate_id}")
image_savedir.mkdir(parents=True, exist_ok=True)

def save_image(idx, image, image_savedir):
    """Write one page image into ``image_savedir`` as ``image_page{idx}.png``."""
    target = image_savedir / f"image_page{idx}.png"
    image.save(target, 'png')

# Save every page of `images` in parallel on a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    tasks = []
    for idx, image in enumerate(images):
        tasks.append(executor.submit(save_image, idx, image, image_savedir=image_savedir))

    # Block until each save has finished; .result() re-raises any exception
    # that occurred inside the worker.
    for task in tasks:
        task.result()


# Extract Paper Reference

In [None]:
# Base64-encode the saved page with index 0, which is sent to the model as the cover page.
cover_page_encoded = encode_image(image_savedir / "image_page0.png")

In [None]:
# Send the cover page together with the paper-reference prompt in one user message.
payload = {
    "model": model_name,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": f"{extract_paper_reference_prompt}"
          },
          {
            "type": "image_url",
            "image_url": {
              # The pages are saved as PNG files, so label the data URL accordingly.
              "url": f"data:image/png;base64,{cover_page_encoded}"
            }
          }
        ]
      }
    ],
    "max_tokens": 500
}

# A timeout stops the cell from hanging forever on a stalled connection.
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=120,
)
# Surface HTTP errors (bad key, rate limit, ...) directly rather than as a
# KeyError on "choices" when the error body is indexed below.
response.raise_for_status()
response_content = response.json()["choices"][0]["message"]["content"]


In [None]:
# The helper always returns a list, so take its first element and keep only the
# "paper_reference" field; it is used below as a DataFrame column and in the cache filename.
paper_reference = extract_json_from_markdown(response_content)[0]["paper_reference"]
print(paper_reference)

# Encode Images and Prepare DataFrame

In [None]:
# Collect every saved PNG. The sort is on the path strings (lexicographic),
# not on the numeric page index.
img_paths = glob.glob(str(image_savedir)+"/*.png")
img_paths.sort()
img_paths

In [None]:
# Recover the page index from each filename: "image_page12.png" -> stem
# "image_page12" -> "page12" -> 12. Path.stem is used instead of splitting on
# "." and "/", so directories containing dots and Windows path separators
# cannot corrupt the parse.
page_num = [int(Path(img_path).stem.split("_")[1][4:]) for img_path in img_paths]

# Base64 payload for every page, in the same order as img_paths / page_num.
encoded_imgs = [encode_image(img_path) for img_path in img_paths]

print(dict(zip(img_paths, page_num)))

In [None]:
# Make a DataFrame
# One row per saved page; paper_reference and subject_id are scalars repeated on every row.
# NOTE(review): the column is named "encoded_image_cropped" but nothing in the notebook as
# shown crops the images before encoding.
img_df = pd.DataFrame({"page_number": page_num, "paper_reference": paper_reference, "subject_id": subject_id, "encoded_image_cropped": encoded_imgs})
# img_paths was sorted as strings, so re-sort numerically by page number.
img_df = img_df.sort_values(by='page_number').reset_index(drop=True)
# Keep positional rows 1..21, which drops the first (cover) page.
# NOTE(review): the 1:22 bounds are hard-coded; confirm them for other papers.
img_df = img_df[1:22].reset_index(drop=True)
img_df

# Extract Question Number & Question Parts

In [None]:
# Logic to cache
## Check if we have a saved dataframe corresponding to the paper reference
dir_contents = glob.glob("./extracted_questions_by_page_number/**", recursive=True)

# "/" cannot appear in a filename, so the reference is sanitised before it is
# used both for the cache lookup and for the saved CSV name.
sanitised_paper_ref = paper_reference.replace("/", "-")
matching_files_lst = [i for i in dir_contents if sanitised_paper_ref in i]

if len(matching_files_lst) == 0:
    # Extract Question Numbers
    # Add Question Parts - Use a multithreaded approach
    def extract_question_part(idx, row):
        """Ask the model which question parts are printed on one page.

        Returns ``(idx, response_content)`` so results can be put back into
        DataFrame order after the threads finish out of order.
        """
        print("Processing Page:\n", row["page_number"])

        payload = {
            "model": model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"{extract_question_parts_prompt}"
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The pages are saved as PNG files, so label the data URL accordingly.
                                "url": f"data:image/png;base64,{row['encoded_image_cropped']}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 800
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,  # do not hang forever on a stalled connection
        )
        # Turn HTTP errors into a clear exception instead of a KeyError on "choices".
        response.raise_for_status()
        response_content = response.json()["choices"][0]["message"]["content"]

        return idx, response_content

    def parse_question_parts(raw_response):
        """Convert the model's reply into a tuple of question-part letters.

        Text around the tuple (prose, code fences) is ignored by evaluating
        only the span from the first "(" to the last ")". A bare string, which
        is what the literal ('a') evaluates to, is wrapped into a 1-tuple so
        every row holds a real tuple and the cached CSV can be read back.
        """
        text = raw_response.strip()
        span = re.search(r"\(.*\)", text, flags=re.DOTALL)
        parsed = ast.literal_eval(span.group(0) if span else text)
        if isinstance(parsed, str):
            return (parsed,)
        return tuple(parsed)

    extracted_tuples = []
    failed_pages = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future to its page number so failures can be reported by page.
        futures = {
            executor.submit(extract_question_part, idx, row): int(row["page_number"])
            for idx, row in img_df.iterrows()
        }

        for future in concurrent.futures.as_completed(futures):
            try:
                idx, response_content = future.result()
                extracted_tuples.append((idx, response_content))
            except Exception as e:
                print(f"Exception occurred: {e}")
                failed_pages.append(futures[future])

    # A missing result would otherwise only show up as a length-mismatch error
    # when the column is assigned below; fail here with the affected pages.
    if failed_pages:
        raise RuntimeError(f"Question-part extraction failed for page(s): {sorted(failed_pages)}")

    # Sort the results by the original index to retain the order
    extracted_tuples.sort(key=lambda x: x[0])

    # Extract just the responses in the correct order
    extracted_tuples_ordered = [response for idx, response in extracted_tuples]

    img_df["question_parts"] = [parse_question_parts(i) for i in extracted_tuples_ordered]
    # A page that contains part 'a' marks the start of a new question.
    img_df["contains_a"] = img_df['question_parts'].apply(lambda x: 'a' in x)
    img_df["number_of_question_parts"] = img_df['question_parts'].apply(lambda x: len(x))

    # Save Extracted Numbers Table (only pages on which at least one part was found)
    extracted_question_by_page_num_df = img_df[img_df.number_of_question_parts > 0].reset_index(drop=True)
    extracted_question_by_page_num_df = extracted_question_by_page_num_df[['page_number', "paper_reference", "question_parts"]]

    extracted_questions_savedir = Path(f"./extracted_questions_by_page_number/{subject_id}/")
    extracted_questions_savedir.mkdir(parents=True, exist_ok=True)

    extracted_question_by_page_num_df.to_csv(extracted_questions_savedir / f"{ sanitised_paper_ref }_extracted_questions_by_page.csv" ,index=False)

else:
    print("Using Cached Extracted Question Number Table")
    # Match DataFrames
    extracted_question_by_page_num_df = pd.read_csv(matching_files_lst[0])

    # Drop a previous "question_parts" column first so re-running this cell does
    # not make the merge produce question_parts_x / question_parts_y.
    img_df = pd.merge(
        img_df.drop(columns=["question_parts"], errors="ignore"),
        extracted_question_by_page_num_df[['page_number', 'question_parts']],
        how="left",
        on=["page_number"],
    )
    # Pages absent from the cache had no question parts; the CSV stores tuples
    # as their repr, so they are evaluated back into tuples here.
    img_df['question_parts'] = img_df.apply(lambda x: () if pd.isnull(x['question_parts']) else ast.literal_eval(x['question_parts']), axis=1)
    img_df["contains_a"] = img_df['question_parts'].apply(lambda x: 'a' in x)
    img_df["number_of_question_parts"] = img_df['question_parts'].apply(lambda x: len(x))

In [None]:
# Add a Question Number - presumes all pages are in correct order
# Every page that contains part 'a' opens a new question, so the question number
# is the running count of such pages. Pages before the first 'a' get number 0.
counter = 0 # starting question - 1
q_num = []

for has_part_a in img_df["contains_a"].tolist():
    counter += has_part_a
    q_num.append(counter)

q_num_dict = [{"question_number": number} for number in q_num]

img_df["question_number"] = q_num

img_df

# Extract Question and Answer Text from Page

In [None]:
# Extract Question and Answer Text from page
def extract_question_answer_text(idx, row):
    """Send one page to the model and return its raw text reply.

    Pages with no detected question parts use the handwriting-only prompt;
    all other pages use the question/answer prompt.

    Args:
        idx: Row label of the page in ``img_df``; passed through unchanged so
            the caller can restore the original order.
        row: The page's row, providing ``page_number``,
            ``number_of_question_parts`` and ``encoded_image_cropped``.

    Returns:
        A tuple ``(idx, response_content)``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    print("Processing Page Number:\n", row['page_number'])

    if row['number_of_question_parts'] == 0:
        prompt = extract_handwriting_prompt
    else:
        prompt = mcq_prompt

    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"{prompt}"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # The pages are saved as PNG files, so label the data URL accordingly.
                            "url": f"data:image/png;base64,{row['encoded_image_cropped']}",
                            "detail": "auto"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 1000
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,  # do not hang forever on a stalled connection
    )
    # Turn HTTP errors into a clear exception instead of a KeyError on "choices".
    response.raise_for_status()
    response_content = response.json()["choices"][0]["message"]["content"]

    return idx, response_content

extracted_questions = []
failed_pages = []

with concurrent.futures.ThreadPoolExecutor() as executor:

    # Submit one task per DataFrame row, remembering which page each future belongs to
    futures = {
        executor.submit(extract_question_answer_text, idx, row): int(row["page_number"])
        for idx, row in img_df.iterrows()
    }

    for future in concurrent.futures.as_completed(futures):
        try:
            idx, response_content = future.result()
            extracted_questions.append((idx, response_content))
        except Exception as e:
            print(f"Exception occurred: {e}")
            failed_pages.append(futures[future])

# Every row needs a reply: a missing one would leave the result list shorter
# than img_df and break the column assignment that follows. Fail here, naming
# the affected pages, instead of with a length-mismatch error later.
if failed_pages:
    raise RuntimeError(f"Question/answer extraction failed for page(s): {sorted(failed_pages)}")

# Sort the results by the original index to retain the order
extracted_questions.sort(key=lambda x: x[0])

# Extract just the responses in the correct order
extracted_questions_ordered = [response for idx, response in extracted_questions]

In [None]:
# Parse each page's raw reply into a list of record dicts (the helper wraps a single
# JSON object in a one-element list). The assignment requires exactly one reply per
# row of img_df, in row order.
img_df["extracted_text"] = [extract_json_from_markdown(i) for i in extracted_questions_ordered]

img_df

In [None]:
# Collect Questions and Answers
all_questions_lst = []

# sorted(): process questions in ascending order instead of relying on set iteration order
for q_num in sorted(set(img_df.question_number)):
    pages_for_question = img_df[img_df.question_number == q_num]

    # Build one part label per expected record. A page with no printed question
    # part is a continuation page: it contributes a single entry labelled with the
    # most recent part seen so far. Tracking the last seen part (rather than looking
    # one position back) keeps the label across several consecutive continuation
    # pages and cannot wrap around to the end of the list at position 0.
    augmented_question_parts = []
    last_seen_part = ''
    for page_parts in pages_for_question.question_parts.to_list():
        for question_part in (list(page_parts) or ['']):
            if question_part != '':
                last_seen_part = question_part
            augmented_question_parts.append(last_seen_part)

    # Concatenate the per-page record lists of this question into one flat list.
    extracted_questions_by_part = pages_for_question.extracted_text.sum()

    # Combine question number and question parts.
    # NOTE(review): this pairs labels and records purely by position, so it assumes
    # the model returned exactly one record per question part on every page.
    for idx, question_part in enumerate(augmented_question_parts):
        record = extracted_questions_by_part[idx]
        record["question_part"] = question_part
        record["question_number"] = q_num
        # Handwriting-only replies carry no question text; normalise to "".
        if not record.get("question_text"):
            record["question_text"] = ""

    # Align Questions and Answers: one output entry per distinct (number, part),
    # in first-seen order, merging all records that share that pair.
    unique_qp_qn = list(OrderedDict.fromkeys(
        [(record['question_number'], record["question_part"]) for record in extracted_questions_by_part]
    ))

    for qn, qp in unique_qp_qn:
        question_set = [
            record for record in extracted_questions_by_part
            if record['question_number'] == qn and record['question_part'] == qp
        ]
        all_questions_lst.append(merge_dict_on_common_keys(question_set))

In [None]:
# Put every record's fields into a fixed order. dict_reorder copies only the keys
# listed here, so any other key on a record is dropped at this point.
all_questions_ordered_lst = [dict_reorder(i, keys_order=["question_number", "question_part", "question_text", "answer_text"]) for i in all_questions_lst]
all_questions_ordered_lst

In [None]:
# Convert to JSON
# Serialise the ordered records with 4-space indentation.
json_object = json.dumps(all_questions_ordered_lst, indent=4)

# Output directory: ./extracted_questions_answers/<subject_id>, created if missing.
json_savedir = Path(f"./extracted_questions_answers/{subject_id}")
json_savedir.mkdir(parents=True, exist_ok=True)

# Write <candidate_id>_extracted_questions.json into the output directory.
json_output_path = json_savedir / f"{candidate_id}_extracted_questions.json"
with open(json_output_path, "w") as outfile:
    outfile.write(json_object)

In [None]:
# Convert to Markdown
markdown_doc = json_to_markdown(all_questions_ordered_lst)

# Print or write the markdown document to a file
print(markdown_doc)

# Write it to a file. Unlike the JSON dump (ASCII-escaped by default), the markdown
# carries the extracted text verbatim, so the encoding is set explicitly: a
# platform-default encoding such as cp1252 can fail on non-ASCII characters.
with open(json_savedir / f"{candidate_id}_extracted_questions.md", "w", encoding="utf-8") as f:
  f.write(markdown_doc)