In [2]:
!pip install PyPDF2
!pip install pymupdf Pillow pytesseract



In [5]:
from PyPDF2 import PdfReader
import fitz  # PyMuPDF
import os
import shutil


def read(file_path, start, end):
    """Extract the text of PDF pages ``start`` .. ``end - 1`` as one string.

    Args:
        file_path: Path of the PDF file to open.
        start: Zero-based index of the first page to read (inclusive).
        end: Zero-based index one past the last page to read (exclusive).

    Returns:
        The text of the requested pages joined with newlines, or ``None``
        (after printing a message) when the requested range is invalid.

    Side effects:
        Once the range has been validated, the ``images`` folder in the
        current working directory is deleted (if present) and recreated empty.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)
        if end > num_pages:
            print("end number outside of total pages")
            return None
        # A negative start would make range() yield negative indices, which
        # address pages from the end of the document instead of failing.
        if start < 0:
            print("start number outside of total pages")
            return None

        # NOTE(review): this wipes any previously extracted images each time
        # text is read -- confirm that this is still wanted.
        folder = "images"
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs(folder)

        # `or ""` keeps the join below safe should a page yield no text object.
        result = [
            pdf_reader.pages[page_num].extract_text() or ""
            for page_num in range(start, end)
        ]

    return "\n".join(result)

In [9]:
import re

def filter(text):
    """Normalise raw page text before it is split into questions and answers.

    Non-breaking spaces (``\\xa0``) are replaced by ordinary spaces first, and
    then every occurrence of the publisher copyright/DOI notice matched by the
    pattern below is removed.

    Note: this function shadows the builtin ``filter`` inside the notebook.
    """
    license_notice = re.compile(
        r"© The Author\(s\), under exclusive license to Springer Nature Switzerland AG \d{4}\s+V\s*\.\s*K\s*\.\s*Kohli\s*et\s*al\.,\s*Comprehensive Multiple-Choice Questions in Pathology\s*,\s*https://doi\.org/10\.1007/[\d\w\-_.]+"
    )
    normalised = text.replace("\xa0", " ")
    return license_notice.sub("", normalised)


In [11]:
import numpy as np 

def get_questions_and_answers(text):
    """Split the book text into per-section question and answer line lists.

    The text is first passed through ``filter``; its first four lines are
    skipped and every remaining line is stripped. Scanning starts in question
    mode and switches on marker lines:

    * a line containing "Answers and Explanations" closes the current
      question block and starts collecting answer lines;
    * a line containing "Bibliography" closes the current answer block and
      stops collecting;
    * a line containing "Multiple Choice Questions" seen while collecting
      answers or while idle switches back to collecting questions.

    Blocks still open when the text ends are not returned.

    Returns:
        ``(questions, answers)`` -- two lists of lists of stripped lines.
    """
    questions, answers = [], []
    pending_q, pending_a = [], []
    mode = "questions"

    for raw_line in filter(text).split('\n')[4:]:
        line = raw_line.strip()

        if mode == "questions":
            if "Answers and Explanations" in line:
                questions.append(pending_q)
                pending_q = []
                mode = "answers"
            else:
                # A "Multiple Choice Questions" heading seen here is kept as
                # an ordinary question line; the mode does not change.
                pending_q.append(line)
            continue

        if mode == "answers":
            if "Bibliography" in line:
                answers.append(pending_a)
                pending_a = []
                mode = "idle"
                continue
            pending_a.append(line)

        # Reached while collecting answers (after storing the line) or idle.
        if "Multiple Choice Questions" in line:
            mode = "questions"

    return questions, answers

In [13]:
def restructure_questions(questions):
    """Turn per-set question line lists into one row per question.

    Args:
        questions: List of sets; each set is a list of text lines containing
            numbered questions ("1. ...") followed by lettered options
            ("A. ..." to "E. ...").

    Returns:
        A list of rows ``["Set_<y>: q<x>", question_text, option, ...]`` where
        ``y`` is the 1-based position of the set and ``x`` the 1-based position
        of the question inside that set.
    """
    # Zero-width split points: before a 1-2 digit question number or before an
    # option letter A-E followed by a period.
    split_pattern = r"(?=\b\d{1,2}\.\s|A\.\s*|B\.\s*|C\.\s*|D\.\s*|E\.\s*)"
    number_prefix = re.compile(r'^\d{1,2}\.\s+')

    def split_q(text):
        split_text = re.split(split_pattern, text)
        return [part.strip() for part in split_text if part.strip() and len(part) > 1]

    result = []

    for y, question_block in enumerate(questions, start=1):
        text = " ".join(question_block)
        current = []
        x = 1

        for part in split_q(text):
            if number_prefix.match(part):
                # A new question number closes the question collected so far.
                if current:
                    result.append([f"Set_{y}: q{x}"] + current)
                    x += 1
                # Strip the whole "N. " / "NN. " prefix; a fixed three-character
                # slice leaves a leading space on two-digit question numbers.
                current = [number_prefix.sub("", part, count=1)]
            else:
                current.append(part)

        if current:
            result.append([f"Set_{y}: q{x}"] + current)

    return result

In [15]:
def restructure_answers(answers):
    """Turn per-set answer line lists into ``[id, answer, explanation]`` rows.

    Args:
        answers: List of sets; each set is a list of text lines.

    Returns:
        A list of three-element rows ``["Set_<y>: q<x>", first_line, rest]``.
        Each set's lines are joined with newlines and split at every
        "<number>. " that is directly followed by a capital "A" (presumably
        the start of "Answer" -- verify against the source text). The first
        line of each chunk becomes the answer, the remaining lines (joined
        with spaces) the explanation. A chunk without any newline yields an
        empty explanation.
    """
    boundary = re.compile(r'(?<!\d)\b\d+\.\s+(?=[A])')

    def split_a(text):
        return [part.strip() for part in boundary.split(text) if part.strip()]

    result = []
    for y, answer in enumerate(answers, start=1):
        text = "\n".join(answer)
        for x, chunk in enumerate(split_a(text), start=1):
            # partition() keeps the whole chunk as the answer when there is no
            # newline; slicing with find() == -1 would cut its last character
            # and repeat the chunk as the explanation.
            first_line, _, rest = chunk.partition("\n")
            result.append([f"Set_{y}: q{x}", first_line, rest.replace("\n", " ")])
    return result
        

In [17]:
def remove_answers_and_explanations(text):
    """Strip "Answers and Explanations" headings out of ``text``.

    Each match starts at the literal heading and extends lazily (across
    newlines) up to and including the first whitespace character that is not
    directly preceded by a period, or to the end of the string. The result is
    returned with surrounding whitespace removed.
    """
    heading = re.compile(r'Answers and Explanations.*?((?<!\.)\s|$)', re.DOTALL)
    without_heading = heading.sub('', text)
    return without_heading.strip()

def clean_sentences(double_list):
    """Apply ``remove_answers_and_explanations`` to every string of a nested list.

    Returns a new list of lists with the same shape as ``double_list``; the
    input lists are not modified.
    """
    cleaned = []
    for sublist in double_list:
        cleaned.append(list(map(remove_answers_and_explanations, sublist)))
    return cleaned



def ending(text):
    """Cut off a trailing fragment that follows the last period of ``text``.

    If the text ends with a period followed by one or more whitespace-separated
    words containing no further period, that period and everything after it is
    replaced by a single ".". The result is stripped of surrounding whitespace.
    Text that already ends right after a period is only stripped.
    """
    trailing_fragment = re.compile(r"\.\s*[^\s.]+(\s+[^\s.]+)*\s*$")
    return trailing_fragment.sub('.', text).strip()

def explanation(answers):
    """Run ``ending`` over the third element of every row that has one.

    Rows with fewer than three elements are left untouched. The rows are
    modified in place and the same ``answers`` list is returned.
    """
    for row in answers:
        if len(row) < 3:
            continue
        row[2] = ending(row[2])
    return answers



In [19]:
# Source PDF; read() takes zero-based page indices and excludes the end
# index, so this extracts the text of pages 9..190.
textbook = "QA.pdf"
contents = read(textbook, 9, 191)
# Raw per-section line lists; they are restructured into rows in later cells.
questions_d, answers_d = get_questions_and_answers(contents)
#answers_d = clean_sentences(answer)
#print(answers_d)

In [21]:
# One row per question: ["Set_<y>: q<x>", question text, option, ...].
questions = restructure_questions(questions_d)

In [23]:
# One row per answer: [id, answer line, explanation].
answers = restructure_answers(answers_d)
# Remove stray "Answers and Explanations" headings from every cell of every row.
answers = clean_sentences(answers)
# Trim the trailing fragment after the last period of each explanation.
answers = explanation(answers)
#print(answers)

In [25]:
import pandas as pd

# Rows shorter than the longest one are padded with NaN by the DataFrame constructor.
q_table = pd.DataFrame(data=questions)
# Assigning 8 names only works if the longest question row has exactly 8
# entries (id, question text and six further parts).
q_table.columns = ["id", "Question", "A", "B", "C", "D", "E", "F"]

a_table = pd.DataFrame(data=answers)
# "explaination" (sic) is the column name used in the exported CSV files.
a_table.columns = ["id", "Answer", "explaination"]

In [27]:
# Keep only rows whose answer text starts with "Answer". reset_index() is
# called without drop=True, so the previous row labels are kept as a new
# "index" column. NOTE(review): re-running this cell on the already filtered
# frame would try to insert "index" a second time -- confirm before re-running.
a_table = a_table[a_table["Answer"].str[:6] == "Answer"].reset_index()

In [29]:
# dropna() returns a new frame instead of modifying q_table, so it has to be
# chained into the export for fully empty rows to be left out of the file.
q_table.dropna(how='all').to_csv('questions.csv', index=False)

In [31]:
# dropna() returns a new frame instead of modifying a_table, so it has to be
# chained into the export for fully empty rows to be left out of the file.
a_table.dropna(how='all').to_csv('answers.csv', index=False)

In [33]:
# Reload the exported answers, strip the leading "Answer: " label from the
# Answer column and overwrite answers.csv with the result.
a_table_m = pd.read_csv('answers.csv')
a_table_m['Answer'] = a_table_m['Answer'].str.replace(r'^Answer:\s*', '', regex=True)
a_table_m.to_csv('answers.csv', index=False)

In [75]:
# Manually written answer row for Set 22, question 2. It is only referenced by
# the disabled insertion code below.
new_row = {
    'id': 'Set_22: q2',
    'Answer': 'C. Meningioma',
    'explaination': 'Meningiomas are relatively common neoplasms derived from meningothelial cells of the arachnoid. The World Health organization (WHO) classified meningiomas in three groups'
}

#a_table_m = pd.concat([a_table_m.iloc[:572],row, a_table_m.iloc[572:]]).reset_index(drop=True)
#a_table_m.to_csv('answers.csv', index=False) ## only needs to be run once


# [set number, question number] of every question that has an image.
question = [[1,25],[4,5],[4,21],[5,13],[7,5],[7,6],[7,21],[8,14],[8,20],[8,24],[8,46],[9,6],[9,16],[10,9],[10,10],[10,14],[10,14],[12,6],[12,11],[12,12],[12,32],[14,1],[14,13],[14,20],[14,28],[16,21],[16,25],[17,11],[18,7],[18,10],[19,11],[20,3],[20,4],[20,10],[20,11],[20,12],[20,12],[20,13],[20,17],[20,18],[20,19],[20,20],[20,21],[20,23],[20,24],[20,26],[20,27],[20,28],[22,4],[22,27],[22,28],[23,3],[23,5],[23,11],[23,12]]
a_table_m['image'] = ' '

# The loop variables are deliberately not called `set`: a top-level variable
# of that name would replace the builtin for every later cell of the notebook.
for set_no, q_no in question:
    set_id = 'Set_{}: q{}'.format(set_no, q_no)
    matching_rows = a_table_m.index[a_table_m['id'] == set_id]
    if len(matching_rows) > 0:
        # Only the first row with this id receives the image path.
        a_table_m.at[matching_rows[0], 'image'] = '/images/Set_{}_q{}.png'.format(set_no, q_no)

a_table_m.to_csv('answers.csv', index=False)


In [97]:
# NOTE(review): no cell in this notebook writes questions_f.csv or
# answers_f.csv -- presumably edited copies of questions.csv / answers.csv;
# confirm where they come from.
q_table_f = pd.read_csv('questions_f.csv')
a_table_f = pd.read_csv('answers_f.csv')

In [99]:
# Place question and answer columns side by side (aligned on row labels),
# leaving out the "id" column of both tables, and export the result.
table = pd.concat([q_table_f.drop(["id"], axis = 1), a_table_f.drop(["id"], axis = 1)], axis=1)
table.to_csv('final.csv', index=False)

In [101]:
# NOTE: drop() returns a new frame and the result is not assigned, so
# a_table_f keeps its "index" column and that column ends up in final.csv.
a_table_f.drop(columns=['index'])
# Same side-by-side layout as before, but the question table keeps its "id"
# column; this overwrites final.csv.
table_json = pd.concat([q_table_f, a_table_f.drop(["id"], axis = 1)], axis=1)
table_json.to_csv('final.csv', index=False)

# for modifying the images list:

In [238]:
# [set number, question number] of every question that has an image, with
# each pair listed once.
question = [[1,25],[4,5],[4,21],[5,13],[7,5],[7,6],[7,21],[8,14],[8,20],[8,24],[8,46],[9,6],[9,16],[10,9],[10,10],[10,14],[12,6],[12,11],[12,12],[12,32],[14,1],[14,13],[14,20],[14,28],[16,21],[16,25],[17,11],[18,7],[18,10],[19,11],[20,3],[20,4],[20,10],[20,11],[20,12],[20,13],[20,17],[20,18],[20,19],[20,20],[20,21],[20,23],[20,24],[20,26],[20,27],[20,28],[22,4],[22,27],[22,28],[23,3],[23,5],[23,11],[23,12]]
# Displayed as the cell output.
len(question)

53

In [225]:
# NOTE: drop() returns a new frame and the result is not assigned, so
# a_table_f keeps its "index" column and that column ends up in final.csv.
a_table_f.drop(columns=['index'])
table_json = pd.concat([q_table_f, a_table_f.drop(["id"], axis = 1)], axis=1) # keep the id column first
table_json.to_csv('final.csv', index=False)

In [227]:
# Reload the combined question/answer table written to final.csv.
df_image = pd.read_csv('final.csv')
# Not the last expression of the cell, so this line produces no output.
df_image

def extract_numbers(s):
    """Return every run of decimal digits in ``s`` as a list of ints, in order of appearance."""
    digit_runs = re.findall(r'\d+', s)
    return list(map(int, digit_runs))

# Take the ids ("Set_<y>: q<x>") from the loaded table so that this cell does
# not depend on a `question_ids` variable left over in the kernel.
question_ids = df_image['id'].tolist()

# One [set number, question number] pair per row of df_image.
extracted_values = [extract_numbers(s) for s in question_ids]

print(extracted_values)

Unnamed: 0,id,Question,A,B,C,D,E,index,Answer,explaination
0,Set_1: q1,Programmed cell death is also called as:,A. Degeneration,B. Calcification,C. Apoptosis,D. Necrosis,E. Atrophy,0.0,C. Apoptosis,Apoptosis is a complex type of programmed cell...
1,Set_1: q2,Which of the following medical condition is be...,A. Acanthosis nigricans,B. Xeroderma pigmentosum,C. Basal cell nevus syndrome,D. Bloom syndrome,E. Werner syndrome,1.0,B. Xeroderma pigmentosum,Xeroderma pigmentosum (XP) is a rare disorder ...
2,Set_1: q3,Which of the following is a pathologic cause o...,A. Endometrial hyperplasia,B. Compensatory hyperplasia after partial hepa...,C. Hormonal stimulation seen in breast develop...,D. Antigenic stimulation seen in lymphoid hype...,E. Cardiac muscle in hypertension,2.0,A. Endometrial hyperplasia,Hyperplasia is an increase in the number of ce...
3,Set_1: q4,"In pathology, what is the name given when ther...",A. Hyperplasia,B. Hypertrophy,C. Metaplasia,D. Dysplasia,E. Atrophy,3.0,C. Metaplasia,Metaplasia is a reversible change which replac...
4,Set_1: q5,Which of the following is the best statement f...,A. Perinuclear yellow-brown pigment,B. Black-brown pigment,C. Golden yellow-brown granular pigment,D. Protein,E. Lipid,4.0,A. Perinuclear yellow-brown pigment,Lipofuscin is a yellow-brown aging pigment. It...
...,...,...,...,...,...,...,...,...,...,...
623,Set_23: q19,Which of the following eye conditions lead to...,A. Keratoconus,B. Pterygium,C. Corneal ectasia,D. Interstitial keratitis,E. Pellucid marginal degeneration,664.0,A. Keratoconus,Keratoconus is a disorder of the cornea. It le...
624,Set_23: q20,Which of the following eye conditions have pl...,A. Diabetes mellitus,B. Xanthelasma,C. Necrobiotic xanthogranuloma,D. Tuberous xanthomas,E. Orbital lipogranulomas,665.0,B. Xanthelasma,Most eyelid lesions are benign such as hordeol...
625,Set_23: q21,A 61-year-old female presents with painless p...,A. Fibrous histiocytoma,B. Cavernous hemangioma,C. Hemangiopericytoma,D. Solitary fibrous tumor,E. Intraocular melanoma,666.0,B. Cavernous hemangioma,Patients of cavernous hemangioma present with ...
626,Set_23: q22,A 42-year-old female presents for an evaluati...,A. Basal cell carcinoma,B. Squamous cell carcinoma,C. Molluscum contagiosum,D. Melanoma,E. Pyogenic granuloma,667.0,C. Molluscum contagiosum,Molluscum contagiosum presents with a raised d...


In [240]:
# Build one image path (or '') per question id by walking `question_ids` and
# `question` in lockstep.
image_name = []
i = 0  # position in question_ids / extracted_values
j = 0  # position of the next not yet matched entry of `question`

for q_id in question_ids:
    # `j` only advances on a match, so this relies on `question` listing its
    # pairs in the same order as they occur in `question_ids`; an entry that
    # never matches keeps all later entries from being assigned.
    if j < len(question) and question[j][0] == extracted_values[i][0] and question[j][1] == extracted_values[i][1]:
        image_name.append(f'/images/Set_{question[j][0]}_q{question[j][1]}.png')
        j += 1
    else:
        image_name.append('') 
    
    i += 1

# Display the result
# (only the last expression, the length, is shown as the cell output)
image_name
len(image_name)

628

In [253]:
# Attach the image paths; the assignment requires image_name to have one
# entry per row of df_image.
df_image['image'] = image_name
df_image
df_image.to_csv('FINAL.csv', index=False)
# Second export without the "index" and "id" columns.
df_image.drop(columns=['index', 'id']).to_csv('for_jasonl.csv', index=False)

# image test below

In [533]:
pip install pymupdf

Note: you may need to restart the kernel to use updated packages.


In [534]:
# Disabled code kept as a string literal: nothing below is executed, the cell
# only evaluates to (and displays) the string itself. The disabled code would
# save every image of pages 10..189 as images/page<i-8>_img<n>.png.
"""
import fitz  # PyMuPDF
import os
import shutil

# Path to the PDF file
pdf_path = "QA.pdf"
# Path to the folder where images will be saved
folder = "images"

if os.path.exists(folder):
    shutil.rmtree(folder)
    
os.makedirs(folder)

doc = fitz.open(pdf_path)

for i in range(10,190):
    # Get the list of images on the page
    image_list = doc.get_page_images(i)
    
    for img_index, img in enumerate(image_list):
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(fitz.csRGB, pix)
        pix.save(os.path.join(folder, "page{}_img{}.png".format(i-8, img_index)))

print("Image extraction completed.")
"""





'\nimport fitz  # PyMuPDF\nimport os\nimport shutil\n\n# Path to the PDF file\npdf_path = "QA.pdf"\n# Path to the folder where images will be saved\nfolder = "images"\n\nif os.path.exists(folder):\n    shutil.rmtree(folder)\n    \nos.makedirs(folder)\n\ndoc = fitz.open(pdf_path)\n\nfor i in range(10,190):\n    # Get the list of images on the page\n    image_list = doc.get_page_images(i)\n    \n    for img_index, img in enumerate(image_list):\n        xref = img[0]\n        pix = fitz.Pixmap(doc, xref)\n        pix = fitz.Pixmap(fitz.csRGB, pix)\n        pix.save(os.path.join(folder, "page{}_img{}.png".format(i-8, img_index)))\n\nprint("Image extraction completed.")\n'

In [535]:
# Module-level state shared between successive get_pic() calls.
last_num = -1   # page number passed to the previous get_pic() call
img_index = 0   # index of the image taken from the current page's image list
counter = 0     # number of images saved so far; indexes get_pic's question list

def get_pic(page_num):
    """Save one embedded image of PDF page ``page_num`` as ``images/Set_<s>_q<q>.png``.

    The function is meant to be called once per image, in order. It keeps its
    position in the module-level variables ``last_num`` (page of the previous
    call), ``img_index`` (which image of the page to take) and ``counter``
    (which entry of the question list names the output file).

    Image selection:
        * a page different from the previous call starts at image index 0,
          a repeated page takes the next index;
        * pages listed in ``special_page`` start at index 1 instead, and use
          index 2 when the page is repeated (presumably because the first
          image on those pages is not a question figure -- TODO confirm).

    Args:
        page_num: Page number passed to ``doc.get_page_images``.

    Raises:
        IndexError: If the page has no image at the selected index or all
            entries of the question list have been used.
    """
    global last_num, img_index, counter
    # NOTE(review): [10,14] and [20,12] are listed twice, so the second image
    # saved for each of them overwrites the first file -- confirm intended.
    question = [[1,25],[4,5],[4,21],[5,13],[7,5],[7,6],[7,21],[8,14],[8,20],[8,24],[8,46],[9,6],[9,16],[10,9],[10,10],[10,14],[10,14],[12,6],[12,11],[12,12],[12,32],[14,1],[14,13],[14,20],[14,28],[16,21],[16,25],[17,11],[18,7],[18,10],[19,11],[20,3],[20,4],[20,10],[20,11],[20,12],[20,12],[20,13],[20,17],[20,18],[20,19],[20,20],[20,21],[20,23],[20,24],[20,26],[20,27],[20,28],[22,4],[22,27],[22,28],[23,3],[23,5],[23,11],[23,12]]

    pdf_path = "QA.pdf"
    folder = "images"

    if last_num == page_num:
        img_index += 1
    else:
        img_index = 0

    special_page = (49, 94, 112, 142, 155, 172, 184)
    if page_num in special_page:
        img_index = 1
        if last_num == page_num:
            img_index += 1

    doc = fitz.open(pdf_path)
    try:
        image_list = doc.get_page_images(page_num)
        xref = image_list[img_index][0]  # first field of an image entry is its xref

        pix = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(fitz.csRGB, pix)  # convert to RGB before saving

        set_no, q_no = question[counter]
        counter += 1
        pix.save(os.path.join(folder, "Set_{}_q{}.png".format(set_no, q_no)))
    finally:
        # The document is opened anew on every call, so release it every time.
        doc.close()

    last_num = page_num
    
    
folder = "images"

# Start from an empty output folder on every run.
if os.path.exists(folder):
    shutil.rmtree(folder)

os.makedirs(folder)

# Page number of every image to extract, in the order in which get_pic()
# consumes its question list (a page is repeated once per image taken from
# it). Assigned here, with the same values as in the following cell, so that
# this cell also runs on a fresh kernel.
page_num = [11,29,31,36,49,50,51,57,58,58,60,71,73,79,79,79,80,94,95,95,97,112,113,114,115,129,130,136,142,143,149,155,155,156,156,157,157,157,158,158,158,158,159,159,159,160,160,160,172,174,175,184,184,185,185]

for i in page_num:
    get_pic(i)

In [536]:
# Page number of every image to extract; a page is repeated once per image
# taken from it. The get_pic() loop in the previous cell iterates this list.
page_num = [11,29,31,36,49,50,51,57,58,58,60,71,73,79,79,79,80,94,95,95,97,112,113,114,115,129,130,136,142,143,149,155,155,156,156,157,157,157,158,158,158,158,159,159,159,160,160,160,172,174,175,184,184,185,185]
# [set number, question number] pairs; this appears to be the list paired
# position by position with page_num (get_pic holds its own copy of it).
question = [[1,25],[4,5],[4,21],[5,13],[7,5],[7,6],[7,21],[8,14],[8,20],[8,24],[8,46],[9,6],[9,16],[10,9],[10,10],[10,14],[10,14],[12,6],[12,11],[12,12],[12,32],[14,1],[14,13],[14,20],[14,28],[16,21],[16,25],[17,11],[18,7],[18,10],[19,11],[20,3],[20,4],[20,10],[20,11],[20,12],[20,12],[20,13],[20,17],[20,18],[20,19],[20,20],[20,21],[20,23],[20,24],[20,26],[20,27],[20,28],[22,4],[22,27],[22,28],[23,3],[23,5],[23,11],[23,12]]