In [1]:
!pip install PyPDF2
!pip install pymupdf Pillow pytesseract



In [1]:
from PyPDF2 import PdfReader

def read(file_path, start, end):
    """Extract the text of a range of PDF pages as a single string.

    Args:
        file_path: Path of the PDF file to open.
        start: Index of the first page to extract (used directly as a
            sequence index into the reader's pages).
        end: Exclusive upper bound of the page indices to extract.

    Returns:
        The text of pages ``start`` .. ``end - 1`` joined with newlines, or
        ``None`` (after printing a message) when ``end`` is larger than the
        number of pages in the document.
    """
    with open(file_path, 'rb') as pdf_file:
        pages = PdfReader(pdf_file).pages

        # Only the upper bound is validated; `start` is taken as given.
        if end > len(pages):
            print("end number outside of total pages")
            return None

        page_texts = [pages[index].extract_text() for index in range(start, end)]

    return "\n".join(page_texts)

In [2]:
import re

def filter(text):
    """Clean extracted text before it is parsed.

    Non-breaking spaces are converted to ordinary spaces first, then every
    occurrence of the publisher's license/DOI notice is removed.

    Note: this function shadows the built-in ``filter`` for the rest of the
    notebook.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    license_notice = r"© The Author\(s\), under exclusive license to Springer Nature Switzerland AG \d{4}\s+V\s*\.\s*K\s*\.\s*Kohli\s*et\s*al\.,\s*Comprehensive Multiple-Choice Questions in Pathology\s*,\s*https://doi\.org/10\.1007/[\d\w\-_.]+"

    # Normalise spaces first so the notice pattern's literal spaces can match.
    normalised = text.replace("\xa0", " ")
    return re.sub(license_notice, "", normalised)


In [3]:
import numpy as np 

def get_questions_and_answers(text):
    """Split the book text into blocks of question lines and answer lines.

    The text is cleaned with ``filter``, its first four lines are skipped and
    every remaining line is stripped and routed by a small state machine:

    * ``"questions"`` (initial state): lines are collected until one contains
      "Answers and Explanations"; that line closes the question block and
      switches to ``"answers"``.
    * ``"answers"``: lines are collected until one contains "Bibliography";
      that line closes the answer block and switches to the idle state.
    * idle (``None``): lines are ignored until one contains
      "Multiple Choice Questions", which switches back to ``"questions"``.

    Args:
        text: Full extracted text to parse.

    Returns:
        A tuple ``(questions, answers)``; each is a list of blocks and each
        block is a list of stripped lines.
    """
    questions, answers = [], []
    pending_questions, pending_answers = [], []
    mode = "questions"

    for raw_line in filter(text).split('\n')[4:]:
        line = raw_line.strip()

        if mode == "questions":
            if "Answers and Explanations" in line:
                questions.append(pending_questions)
                pending_questions = []
                mode = "answers"
            else:
                pending_questions.append(line)
            continue

        if mode == "answers":
            if "Bibliography" in line:
                answers.append(pending_answers)
                pending_answers = []
                mode = None
                continue
            pending_answers.append(line)

        # Reached while idle, or right after storing an answer line.
        # NOTE(review): in the answers state this switches back to questions
        # without closing the pending answer block, so those lines are carried
        # into the next block that gets closed - confirm this is intended.
        if "Multiple Choice Questions" in line:
            mode = "questions"

    # NOTE(review): lines still pending when the text ends are not returned.
    return questions, answers

In [4]:
def restructure_questions(questions):
    """Turn blocks of question lines into one row per question.

    Each block's lines are joined with spaces and split in front of every
    question number ("1. " .. "99. ") and every option label ("A." .. "E.").
    A piece starting with a question number opens a new row; the following
    pieces are appended to that row as its options.

    Args:
        questions: List of blocks, each a list of text lines.

    Returns:
        A list of rows ``["Set_<block>: q<n>", question_text, option, ...]``.
        ``<block>`` and ``<n>`` are running counters starting at 1, not the
        numbers printed in the text.
    """
    number_prefix = re.compile(r'^\d{1,2}\.\s')
    result = []

    def split_q(text):
        # Zero-width lookahead: split *before* each marker so it stays in its piece.
        pattern = r"(?=\b\d{1,2}\.\s|A\.\s*|B\.\s*|C\.\s*|D\.\s*|E\.\s*)"
        split_text = re.split(pattern, text)
        return [part.strip() for part in split_text if part.strip() and len(part) > 1]

    for set_number, question_block in enumerate(questions, start=1):
        pieces = split_q(" ".join(question_block))
        current = []
        question_number = 1

        for piece in pieces:
            prefix = number_prefix.match(piece)
            if prefix:
                # A new numbered question starts: flush the row built so far.
                if current:
                    result.append([f"Set_{set_number}: q{question_number}"] + current)
                    question_number += 1
                # Strip exactly the matched number prefix. A fixed three-character
                # slice left a stray leading space on two-digit question numbers.
                current = [piece[prefix.end():].lstrip()]
            else:
                current.append(piece)

        if current:
            result.append([f"Set_{set_number}: q{question_number}"] + current)

    return result

In [5]:
def restructure_answers(answers):
    """Turn blocks of answer lines into ``[id, answer, explanation]`` rows.

    Each block's lines are joined with newlines and split on every
    "<number>. " marker. For each non-empty piece, the first line becomes the
    answer and the remaining lines, joined with spaces, the explanation.

    Args:
        answers: List of blocks, each a list of text lines.

    Returns:
        A list of rows ``["Set_<block>: q<n>", answer_line, explanation]``.
        ``<block>`` and ``<n>`` are running counters starting at 1; ``<n>``
        counts the pieces produced by the split, so a number inside an
        explanation also starts a new row.
    """
    result = []

    def split_a(text):
        pattern = r"\d+\.\s"
        return [part.strip() for part in re.split(pattern, text) if part.strip()]

    for set_number, answer_block in enumerate(answers, start=1):
        entries = split_a("\n".join(answer_block))

        for entry_number, entry in enumerate(entries, start=1):
            # partition() also handles single-line entries: the answer is the
            # whole line and the explanation is empty. Slicing on find("\n")
            # (-1 when absent) cut the last character off the answer and
            # duplicated the line as its explanation.
            answer_line, _, explanation = entry.partition("\n")
            result.append([
                f"Set_{set_number}: q{entry_number}",
                answer_line,
                explanation.replace("\n", " "),
            ])

    return result
        

In [None]:
import fitz  
import io
from PIL import Image

textbook = "QA.pdf"
textpic = fitz.open(textbook)

# Walk every page and write each embedded image to the working directory as
# page<page number>_<image number>.<extension>.
for page_index in range(len(textpic)):
    page = textpic.load_page(page_index)
    image_list = page.get_images(full=True)

    if image_list:
        print(f"[+] Found {len(image_list)} images on page {page_index + 1}")
    else:
        print(f"[!] No images found on page {page_index + 1}")

    for image_index, img in enumerate(image_list, start=1):
        # The first field of an image entry is the xref passed to extract_image().
        xref = img[0]

        base_image = textpic.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        image = Image.open(io.BytesIO(image_bytes))

        image_filename = f"page{page_index + 1}_{image_index}.{image_ext}"
        # Hand Pillow the path so it opens and closes the file itself; passing
        # a bare open(...) handle left that handle unclosed.
        image.save(image_filename)

        print(f"Saved image {image_filename}")

In [6]:
textbook = "QA.pdf"
# read() iterates range(start, end), so this extracts page indices 9 .. 190.
contents = read(textbook, 9, 191)
if contents is None:
    # read() signals an out-of-range `end` by returning None; stop here with a
    # clear message instead of failing later inside the parser.
    raise ValueError(f"Requested page range is outside of {textbook}")
questions_d, answers_d = get_questions_and_answers(contents)

In [7]:
# Convert the raw question blocks into rows of [id, question text, options...].
questions = restructure_questions(questions_d)

In [8]:
# Convert the raw answer blocks into rows of [id, answer line, explanation].
answers = restructure_answers(answers_d)

In [9]:
import pandas as pd

# Question rows may have different lengths; pandas pads the shorter ones.
# Assigning the labels below requires the frame to be exactly eight columns wide.
q_table = pd.DataFrame(questions)
q_table.columns = [
    "id", "Question",
    "A", "B", "C", "D", "E", "F",
]

# Answer rows always have three fields: id, answer line, explanation.
a_table = pd.DataFrame(answers)
a_table.columns = [
    "id", "Answer", "explaination",
]

In [10]:
# Keep only the rows whose answer field starts with "Answer"; other rows come
# from numbers inside explanations that were mistaken for answer markers.
# NOTE(review): reset_index() without drop=True stores the old row positions in
# an extra "index" column that ends up in the CSV files, and re-running this
# cell adds another such column - confirm whether drop=True was intended.
a_table = a_table.loc[
    a_table["Answer"].str[:6] == "Answer"
].reset_index()

In [11]:
# dropna() returns a new frame; chain it into to_csv() so the filtered frame is
# the one written. Calling it as a bare statement discarded the result.
# q_table itself is left untouched for the later cells.
q_table.dropna(how='all').to_csv('questions.csv', index=False)

In [12]:
# dropna() returns a new frame; chain it into to_csv() so the filtered frame is
# the one written. Calling it as a bare statement discarded the result.
# a_table itself is left untouched for the later cells.
a_table.dropna(how='all').to_csv('answers.csv', index=False)

In [13]:
# Put questions and answers side by side, without their id columns.
# concat aligns rows on the index, so row i of q_table is paired with row i of
# a_table; the two tables are assumed to list the questions in the same order.
table = pd.concat(
    [q_table.drop(columns=["id"]), a_table.drop(columns=["id"])],
    axis="columns",
)
table.to_csv('final.csv', index=False)