In [1]:
import os
import base64
import json
import requests
from dotenv import load_dotenv
from pathlib import Path
from PIL import Image
import pdfplumber
from pdf2image import convert_from_path
from langchain_community.document_loaders import PyPDFLoader

load_dotenv(override=True)

True

In [2]:
# OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Function to encode the image
def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')


In [10]:
pdfpath = Path("./data/edexcel_business_studies/")

images = convert_from_path(pdfpath / "5123.pdf", fmt='png')

KeyboardInterrupt: 

In [11]:
len(images)

28

In [12]:
images

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmI

In [309]:
[images[5:8] + images[13:16]]

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=4595x6495>]

In [311]:
# Save a subset of pages
[image.save(f"img{idx+1}.png", 'png') for idx, image in enumerate(images[5:8] + images[13:16])]

[None, None, None, None, None, None]

In [314]:
# Load PDF
import glob
img_paths = list(glob.glob("*.png"))
img_paths = sorted(img_paths)
img_paths

['img1.png', 'img2.png', 'img3.png', 'img4.png', 'img5.png', 'img6.png']

In [315]:
import pandas as pd
page_num = [1, 2, 3, 4, 5, 6]
encoded_imgs = [encode_image(v) for v in img_paths]

img_df = pd.DataFrame({"page_number": page_num, "encoded_image": encoded_imgs})
img_df = img_df.sort_values(by='page_number').reset_index(drop=True)
img_df

Unnamed: 0,page_number,encoded_image
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...


In [316]:
prompt = """
You are given a page from an exam paper. 
Extract the question parts mentioned in the page. 
Every question number will have a letter denoting the question.
Read the page and return in the letters of the questions as a tuple.

If no question parts are detected, return an empty tuple. Return nothing but the tuple.
"""

headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

model_name = "gpt-4o"

In [317]:
extracted_tuples = []
for idx, encoded_img in enumerate(list(img_df.encoded_image)):

  payload = {
      "model": model_name,
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": f"{prompt}"
            },
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:image/jpeg;base64,{encoded_img}"
              }
            }
          ]
        }
      ],
      "max_tokens": 500
  }

  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
  response_content = response.json()["choices"][0]["message"]["content"]
  
  extracted_tuples.append(response_content)

["('a', 'b')", "('c', 'd')", "('e',)", "('a', 'b')", "('c',)", '()']

In [64]:
import re

def extract_tuple(text):
    # Define the regex patterns to find the tuples
    pattern_ab = r"\('([a-z])', '([a-z])'\)"
    pattern_c = r"\('([a-z])',?\)"
    pattern_empty = r"\(\)"
    
    # Search for the tuples in the text
    match_ab = re.search(pattern_ab, text)
    match_c = re.search(pattern_c, text)
    match_empty = re.search(pattern_empty, text)
    
    # Check for the tuples and return the corresponding result
    if match_ab:
        return match_ab.groups()
    elif match_c and not match_ab:
        return (match_c.groups()[0],)
    elif match_empty:
        return ()
    else:
        return None

In [318]:
question_parts = [extract_tuple(i) for i in extracted_tuples]
question_parts

[('a', 'b'), ('c', 'd'), ('e',), ('a', 'b'), ('c',), ()]

In [319]:
img_df['question_parts'] = question_parts
img_df

Unnamed: 0,page_number,encoded_image,question_parts
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)"
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c, d)"
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(e,)"
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)"
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c,)"
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,()


In [320]:
img_df['contains_a'] = img_df['question_parts'].apply(lambda x: 'a' in x)
img_df['number_of_question_parts'] = img_df['question_parts'].apply(lambda x: len(x))
img_df

Unnamed: 0,page_number,encoded_image,question_parts,contains_a,number_of_question_parts
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c, d)",False,2
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(e,)",False,1
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c,)",False,1
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,(),False,0


In [321]:
counter = 0
q_num = []
for i in list(img_df.contains_a):
    counter += i
    q_num.append(counter)

img_df["question_number"] = q_num
img_df

Unnamed: 0,page_number,encoded_image,question_parts,contains_a,number_of_question_parts,question_number
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,1
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c, d)",False,2,1
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(e,)",False,1,1
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,2
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c,)",False,1,2
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,(),False,0,2


In [322]:
prompt1 = """
You are given a page from an exam paper. Your role is to extract the following details from the page:
1. Printed Question Text
2. Handwritten Answer Text

Obey the following format
[
{
"question_text": str,
"answer_text": str
}
]

If there are multiple questions, report all of them in a list of jsons obeying the format specified.
If there is no information about the fields requested, return the value of those fields to be an empty string. 
Do not invent anything.
"""

prompt2 = """
You are given a page from an exam paper. Your role is to extract the following details from the page:
1. Handwritten Text

Obey the following format

{
"answer_text": str
}

If there is no information about the fields requested, return the value of those fields to be an empty string. 
Do not invent anything.
"""

In [323]:
img_df

Unnamed: 0,page_number,encoded_image,question_parts,contains_a,number_of_question_parts,question_number
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,1
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c, d)",False,2,1
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(e,)",False,1,1
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,2
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c,)",False,1,2
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,(),False,0,2


In [324]:
extracted_questions = []
for idx, row in img_df.iterrows():
    
  if row['number_of_question_parts'] == 0:
    prompt = prompt2
  else:
    prompt = prompt1
  
  payload = {
    "model": model_name,
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": f"{prompt}"
          },
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:image/jpeg;base64,{row['encoded_image']}"
            }
          }
        ]
      }
    ],
    "max_tokens": 500
    }

  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
  response_content = response.json()["choices"][0]["message"]["content"]
  
  extracted_questions.append(response_content)

In [344]:
print(extracted_questions[1])

{
  "data": [
    {
      "question_text": "Explain one disadvantage to a small business of not paying its employees on time.",
      "answer_text": "One disadvantage is that employees may quit. If employees quit there is more work for less people. This makes the business difficult to run."
    },
    {
      "question_text": "Explain one advantage to a small business from using retained profit as a source of finance.",
      "answer_text": "One advantage to this is there is nobody to pay back. This means a more stable revenue which may lead to better investments for the business."
    }
  ]
}


In [110]:
def extract_json_from_markdown(markdown_string):
    # Remove the triple backticks and 'json' from the input string
    json_string = markdown_string.strip().strip('```json').strip('```').strip()
    
    # Parse the cleaned JSON string into a Python dictionary
    data = json.loads(json_string)

    if isinstance(data, list):
        return data
    else: 
        return [data]

In [326]:
img_df["extracted_text"] = [extract_json_from_markdown(i) for i in extracted_questions]
img_df

Unnamed: 0,page_number,encoded_image,question_parts,contains_a,number_of_question_parts,question_number,extracted_text
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,1,[{'details': [{'question_text': 'Which one of ...
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c, d)",False,2,1,[{'data': [{'question_text': 'Explain one disa...
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(e,)",False,1,1,[{'question_text': 'Discuss the impact on a sm...
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,2,[{'question_text': 'State one element of the m...
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c,)",False,1,2,[{'question_text': 'In order to make the busin...
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,(),False,0,2,[{'answer_text': ''}]


In [116]:
for idx, q_num in enumerate(list(set(img_df.question_number))):

    img_df[img_df.question_numer == q_num].extracted_text

[1]

In [262]:
q_num = 1
l = img_df[img_df.question_number == q_num].question_parts.to_list()
m = [list(i) for i in l]

for j in m:
    if len(j) == 0:
        j.append('')

question_parts_collapsed = [item for sublist in m for item in sublist]
question_parts_collapsed

augmented_question_parts = []
for idx, question_part in enumerate(question_parts_collapsed):
    if question_part == '':
        augmented_question_parts.append(question_parts_collapsed[idx - 1])
    else:
        augmented_question_parts.append(question_part)

augmented_question_parts


['a', 'b', 'c', 'c']

In [283]:
extracted_questions_by_part = img_df[img_df.question_number == q_num].extracted_text.sum()

for idx, question_part in enumerate(augmented_question_parts):
    extracted_questions_by_part[idx]["question_part"] = question_part
    extracted_questions_by_part[idx]["question_number"] = q_num
    if not extracted_questions_by_part[idx].get("question_text"):
        extracted_questions_by_part[idx]["question_text"] = ""

extracted_questions_by_part

[{'question_text': 'State one element of the marketing mix, other than price, for Lili Heating Ltd.',
  'answer_text': 'Age',
  'question_part': 'a',
  'question_number': 1},
 {'question_text': 'Outline one reason why the government would want Lili Heating Ltd to be successful.',
  'answer_text': 'Everybody has heating therefore the government can tax the company more as it will have higher income.',
  'question_part': 'b',
  'question_number': 1},
 {'question_text': 'In order to make the business more successful, Lili Heating Ltd is considering two options:\n\nOption 1: offer a discounted price to female customers\n\nOption 2: use social media to promote the business.\n\n(c) Justify which one of these two options Lili Heating Ltd should choose.',
  'answer_text': 'In my opinion the business should take option two as many people are willing to pay high prices and almost everybody is on social media. This is better advertising. Option 1 may not make a big difference as it is uncommon fo

In [284]:
unique_qp_qn = list(set([(i['question_number'], i["question_part"]) for i in extracted_questions_by_part]))
unique_qp_qn 

[(1, 'b'), (1, 'a'), (1, 'c')]

In [285]:
output_lst = []

for idx, record in enumerate(unique_qp_qn):
    qp = record[1]
    qn = record[0]

    question_set = list(compress(extracted_questions_by_part, [(i['question_number'] == qn and i['question_part'] == qp) for i in extracted_questions_by_part]))
    print(question_set)
    output_lst.append(merge_dict_on_common_keys(question_set))

[{'question_text': 'Outline one reason why the government would want Lili Heating Ltd to be successful.', 'answer_text': 'Everybody has heating therefore the government can tax the company more as it will have higher income.', 'question_part': 'b', 'question_number': 1}]
[{'question_text': 'State one element of the marketing mix, other than price, for Lili Heating Ltd.', 'answer_text': 'Age', 'question_part': 'a', 'question_number': 1}]
[{'question_text': 'In order to make the business more successful, Lili Heating Ltd is considering two options:\n\nOption 1: offer a discounted price to female customers\n\nOption 2: use social media to promote the business.\n\n(c) Justify which one of these two options Lili Heating Ltd should choose.', 'answer_text': 'In my opinion the business should take option two as many people are willing to pay high prices and almost everybody is on social media. This is better advertising. Option 1 may not make a big difference as it is uncommon for females to b

In [286]:
output_lst

[[{'question_text': 'Outline one reason why the government would want Lili Heating Ltd to be successful.',
   'answer_text': 'Everybody has heating therefore the government can tax the company more as it will have higher income.',
   'question_part': 'b',
   'question_number': 1}],
 [{'question_text': 'State one element of the marketing mix, other than price, for Lili Heating Ltd.',
   'answer_text': 'Age',
   'question_part': 'a',
   'question_number': 1}],
 {'question_text': 'In order to make the business more successful, Lili Heating Ltd is considering two options:\n\nOption 1: offer a discounted price to female customers\n\nOption 2: use social media to promote the business.\n\n(c) Justify which one of these two options Lili Heating Ltd should choose.',
  'answer_text': 'In my opinion the business should take option two as many people are willing to pay high prices and almost everybody is on social media. This is better advertising. Option 1 may not make a big difference as it is u

In [234]:
from collections import OrderedDict
from itertools import compress


In [301]:
# Helper Functions

def merge_dict_on_common_keys(dicts):

    if len(dicts) == 1:
        comb_dict = dicts[0]
    else:
        dict_keys = list(dicts[0].keys())
        comb_dict = {}

        for key in dict_keys:
            vals = [record[key] for record in dicts] 
            dedup_vals = list(OrderedDict.fromkeys(vals))

            if len(dedup_vals) == 1:
                comb_dict[key] = dedup_vals[0]
            else: 
                comb_dict[key] = "".join(dedup_vals)
        
    return comb_dict
  


In [328]:
img_df

Unnamed: 0,page_number,encoded_image,question_parts,contains_a,number_of_question_parts,question_number,extracted_text
0,1,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,1,[{'details': [{'question_text': 'Which one of ...
1,2,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c, d)",False,2,1,[{'data': [{'question_text': 'Explain one disa...
2,3,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(e,)",False,1,1,[{'question_text': 'Discuss the impact on a sm...
3,4,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(a, b)",True,2,2,[{'question_text': 'State one element of the m...
4,5,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,"(c,)",False,1,2,[{'question_text': 'In order to make the busin...
5,6,iVBORw0KGgoAAAANSUhEUgAAEfMAABlfCAIAAAAvdKTrAA...,(),False,0,2,[{'answer_text': ''}]


In [329]:
all_questions_lst = []
for idx, q_num in enumerate(list(set(img_df.question_number))):

    # Get question parts for a question number
    print(q_num)
    l = img_df[img_df.question_number == q_num].question_parts.to_list()
    m = [list(i) for i in l]

    for j in m:
        if len(j) == 0:
            j.append('')

    question_parts_collapsed = [item for sublist in m for item in sublist]
    question_parts_collapsed

    augmented_question_parts = []
    for idx, question_part in enumerate(question_parts_collapsed):
        if question_part == '':
            augmented_question_parts.append(question_parts_collapsed[idx - 1])
        else:
            augmented_question_parts.append(question_part)

    # Combine question number and question parts
    extracted_questions_by_part = img_df[img_df.question_number == q_num].extracted_text.sum()

    for idx, question_part in enumerate(augmented_question_parts):
        extracted_questions_by_part[idx]["question_part"] = question_part
        extracted_questions_by_part[idx]["question_number"] = q_num
        if not extracted_questions_by_part[idx].get("question_text"):
            extracted_questions_by_part[idx]["question_text"] = ""
    
    # Align Questions and Answers
    unique_qp_qn = list(set([(i['question_number'], i["question_part"]) for i in extracted_questions_by_part]))

    #output_lst = []
    for idx, record in enumerate(unique_qp_qn):
        qp = record[1]
        qn = record[0]

        question_set = list(compress(extracted_questions_by_part, [(i['question_number'] == qn and i['question_part'] == qp) for i in extracted_questions_by_part]))
        all_questions_lst.append(merge_dict_on_common_keys(question_set))
    
    #all_questions_lst.append(output_lst)

1


IndexError: list index out of range

In [305]:
all_questions_lst

[{'question_text': 'Outline one reason why the government would want Lili Heating Ltd to be successful.',
  'answer_text': 'Everybody has heating therefore the government can tax the company more as it will have higher income.',
  'question_part': 'b',
  'question_number': 1},
 {'question_text': 'State one element of the marketing mix, other than price, for Lili Heating Ltd.',
  'answer_text': 'Age',
  'question_part': 'a',
  'question_number': 1},
 {'question_text': 'In order to make the business more successful, Lili Heating Ltd is considering two options:\n\nOption 1: offer a discounted price to female customers\n\nOption 2: use social media to promote the business.\n\n(c) Justify which one of these two options Lili Heating Ltd should choose.',
  'answer_text': 'In my opinion the business should take option two as many people are willing to pay high prices and almost everybody is on social media. This is better advertising. Option 1 may not make a big difference as it is uncommon fo

In [335]:
extracted_questions_by_part

[{'details': [{'question_text': 'Which one of the following could a small business use to add value?\nSelect one answer.\n A Locate in a convenient place\n  B Pay a higher rate of taxation\n  C Reduce its cash inflows\n  D Use a long-term source of finance',
    'answer_text': ''},
   {'question_text': 'Using the information in Figure 1, calculate the revenue for March. You are advised to show your workings.',
    'answer_text': '7200 units sold. £5.20 each. 7200 x £5.20 = £37,440'}],
  'question_part': 'a',
  'question_number': 1,
  'question_text': ''},
 {'data': [{'question_text': 'Explain one disadvantage to a small business of not paying its employees on time.',
    'answer_text': 'One disadvantage is that employees may quit. If employees quit there is more work for less people. This makes the business difficult to run.'},
   {'question_text': 'Explain one advantage to a small business from using retained profit as a source of finance.',
    'answer_text': 'One advantage to this i