In [1468]:
!pip install PyPDF2
!pip install pymupdf Pillow pytesseract



In [1469]:
from PyPDF2 import PdfReader

def read(file_path, start, end):
    """Extract the text of a contiguous page range from a PDF file.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to open.
    start : int
        Zero-based index of the first page to read.
    end : int
        Zero-based index one past the last page to read (exclusive bound).

    Returns
    -------
    str or None
        The text of every page in ``range(start, end)`` joined by newlines,
        or ``None`` (after printing a message) when the range is invalid.
    """
    # Reject a negative start before touching the file: a negative page index
    # would presumably be resolved from the end of the document and silently
    # return the wrong pages instead of failing.
    if start < 0:
        print("start number outside of total pages")
        return None

    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)
        if end > num_pages:
            print("end number outside of total pages")
            return None

        # Fall back to "" if a page yields no text so the join below cannot
        # fail on a non-string entry.
        result = [
            pdf_reader.pages[page_num].extract_text() or ""
            for page_num in range(start, end)
        ]
    return "\n".join(result)

In [1470]:
import re

def filter(text):
    """Normalise non-breaking spaces and strip the publisher licence footer.

    Every "\xa0" is first turned into a plain space, then each occurrence of
    the Springer licence/DOI line is removed from the text.

    Note: this name shadows the built-in ``filter`` for the rest of the notebook.
    """
    license_pattern = r"© The Author\(s\), under exclusive license to Springer Nature Switzerland AG \d{4}\s+V\s*\.\s*K\s*\.\s*Kohli\s*et\s*al\.,\s*Comprehensive Multiple-Choice Questions in Pathology\s*,\s*https://doi\.org/10\.1007/[\d\w\-_.]+"

    # Normalise first so the literal spaces in the pattern can match.
    normalized = text.replace("\xa0", " ")
    return re.sub(license_pattern, "", normalized)


In [1471]:
import numpy as np 

def get_questions_and_answers(text):
    """Split extracted book text into per-set question and answer line lists.

    The text is cleaned with ``filter`` and scanned line by line with a small
    state machine driven by three headings:

    * "Multiple Choice Questions" starts a question section,
    * "Answers and Explanations" closes the question section and starts the
      answer section,
    * "Bibliography" closes the answer section.

    Parameters
    ----------
    text : str
        Raw text of the pages to parse.

    Returns
    -------
    tuple[list[list[str]], list[list[str]]]
        ``(questions, answers)``; each element is the list of stripped lines
        collected for one set. Every stored question set gets exactly one
        answer set.
    """
    questions = []
    answers = []

    current_Q = []
    current_A = []

    # "q": collecting questions, "a": collecting answers, None: between sets.
    # Parsing starts inside a question section.
    mode = "q"

    # The first four lines are skipped (presumably the title lines of the
    # opening section — TODO confirm against the source PDF).
    for line in filter(text).split('\n')[4:]:
        line = line.strip()

        if mode == "q":
            if "Answers and Explanations" in line:
                questions.append(current_Q)
                current_Q = []
                mode = "a"
            else:
                current_Q.append(line)
        elif mode == "a":
            if "Bibliography" in line:
                answers.append(current_A)
                current_A = []
                mode = None
            elif "Multiple Choice Questions" in line:
                # A new question section began without a closing
                # "Bibliography": store the answers collected so far instead
                # of letting them leak into the next set.
                answers.append(current_A)
                current_A = []
                mode = "q"
            else:
                current_A.append(line)
        elif "Multiple Choice Questions" in line:
            mode = "q"

    # Keep an answer section that runs to the end of the text unterminated.
    if mode == "a":
        answers.append(current_A)

    return questions, answers

In [1472]:
def restructure_questions(questions):
    """Turn raw question line blocks into one row per question.

    Parameters
    ----------
    questions : list[list[str]]
        One list of text lines per question set.

    Returns
    -------
    list[list[str]]
        Rows of the form ``["Set_<set>: q<n>", question text, option, ...]``
        where sets and questions are numbered from 1 in order of appearance.
    """
    # Split in front of a question number ("12. ") or a standalone option
    # label ("A." … "E."). The word boundary before the label keeps
    # abbreviations that merely end in A–E followed by a full stop (e.g.
    # "DNA.") from being cut in the middle of a sentence.
    split_pattern = re.compile(r"(?=\b\d{1,2}\.\s|\b[A-E]\.\s*)")
    # Leading "<number>. " of a question, one or two digits.
    number_prefix = re.compile(r"^\d{1,2}\.\s+")

    def split_q(text):
        return [
            part.strip()
            for part in split_pattern.split(text)
            if part.strip() and len(part) > 1
        ]

    result = []

    for set_no, question_block in enumerate(questions, start=1):
        current = []
        question_no = 1

        for piece in split_q(" ".join(question_block)):
            if re.match(r'^\d{1,2}\.\s', piece):
                # A numbered piece starts a new question: flush the previous one.
                if current:
                    result.append([f"Set_{set_no}: q{question_no}"] + current)
                    question_no += 1
                # Remove the whole number prefix regardless of its width
                # (a fixed three-character slice left a space after "10.").
                current = [number_prefix.sub("", piece, count=1)]
            else:
                current.append(piece)

        if current:
            result.append([f"Set_{set_no}: q{question_no}"] + current)

    return result

In [1473]:
def restructure_answers(answers):
    """Turn raw answer line blocks into one row per answer.

    Parameters
    ----------
    answers : list[list[str]]
        One list of text lines per answer set.

    Returns
    -------
    list[list[str]]
        Rows of the form ``["Set_<set>: q<n>", first line, remaining text]``.
        The remaining text has its newlines replaced by spaces and is ``""``
        when the chunk consists of a single line.
    """
    # A new chunk starts at "<number>. " that is immediately followed by "A".
    answer_start = re.compile(r'(?<!\d)\b\d+\.\s+(?=[A])')

    def split_a(text):
        return [part.strip() for part in answer_start.split(text) if part.strip()]

    result = []
    for set_no, answer in enumerate(answers, start=1):
        for question_no, chunk in enumerate(split_a("\n".join(answer)), start=1):
            # partition() keeps a single-line chunk intact; slicing on
            # find("\n") == -1 cut off its last character and copied the whole
            # chunk into the explanation field.
            first_line, _, rest = chunk.partition("\n")
            result.append(
                [f"Set_{set_no}: q{question_no}", first_line, rest.replace("\n", " ")]
            )
    return result
        

In [1474]:
def remove_answers_and_explanations(text):
    """Delete "Answers and Explanations" heading remnants from ``text``.

    Each match runs from the heading up to and including the first whitespace
    character that is not directly preceded by a full stop, or to the end of
    the text. The result is stripped of surrounding whitespace.
    """
    heading_re = re.compile(r'Answers and Explanations.*?((?<!\.)\s|$)', re.DOTALL)
    return heading_re.sub('', text).strip()

def clean_sentences(double_list):
    """Apply ``remove_answers_and_explanations`` to every string of a nested list.

    Returns a new list of lists with the same shape as ``double_list``.
    """
    cleaned = []
    for sublist in double_list:
        cleaned.append([remove_answers_and_explanations(sentence) for sentence in sublist])
    return cleaned



def ending(text):
    """Cut off the trailing fragment that follows the final full stop.

    When the text ends with a full stop followed only by dot-free words, that
    tail is replaced by a single "."; otherwise the text is returned unchanged
    apart from stripping surrounding whitespace.
    """
    trailing_fragment = re.compile(r"\.\s*[^\s.]+(\s+[^\s.]+)*\s*$")
    return trailing_fragment.sub('.', text).strip()

def explanation(answers):
    """Run ``ending`` over the third field of every row that has one.

    Rows are modified in place; rows with fewer than three fields are left
    alone. The same ``answers`` list is returned.
    """
    for row in (candidate for candidate in answers if len(candidate) >= 3):
        row[2] = ending(row[2])
    return answers

In [1475]:
# Extract the text of pages 9..190 (zero-based, end-exclusive) of the source PDF
# and split it into raw question / answer line blocks, one block per set.
# NOTE(review): `read` returns None when the end page exceeds the document
# length, which would make the parsing call below fail — confirm the page
# range matches QA.pdf.
textbook = "QA.pdf"
contents = read(textbook, 9, 191)
questions_d, answers_d = get_questions_and_answers(contents)
#answers_d = clean_sentences(answer)
#print(answers_d)

In [1476]:
# Reshape the raw question blocks into rows of [id, question text, option, ...].
questions = restructure_questions(questions_d)

In [1477]:
# Build [id, answer line, explanation] rows, strip leftover
# "Answers and Explanations" heading text from every field, then cut the
# trailing fragment after the last full stop of each explanation.
answers = restructure_answers(answers_d)
answers = clean_sentences(answers)
answers = explanation(answers)
#print(answers)

In [1478]:
import pandas as pd

# Tabulate the question rows: id, question text, then up to six option columns.
# NOTE(review): assigning a fixed list of eight names presumably fails when the
# widest parsed row does not have exactly eight fields — confirm.
q_table = pd.DataFrame(data=questions)
q_table.columns = ["id", "Question", "A", "B", "C", "D", "E", "F"]

# Tabulate the answer rows: id, answer line, explanation.
# NOTE(review): "explaination" is misspelled, but the same key is used where a
# row is added by hand later in this file, so renaming needs a coordinated change.
a_table = pd.DataFrame(data=answers)
a_table.columns = ["id", "Answer", "explaination"]

In [1479]:
# Keep only the rows whose Answer field starts with "Answer".
# reset_index() is called without drop=True, so the previous row labels are
# kept as a new 'index' column (a later cell refers to that column).
# NOTE(review): re-running this cell presumably fails because the 'index'
# column already exists — confirm.
a_table = a_table[a_table["Answer"].str[:6] == "Answer"].reset_index()

In [1480]:
# DataFrame.dropna returns a new frame and leaves q_table untouched, so the
# filtered result is written directly; previously the result was discarded and
# the unfiltered table was saved.
q_table.dropna(how='all').to_csv('questions.csv', index=False)

In [1481]:
# DataFrame.dropna returns a new frame and leaves a_table untouched, so the
# filtered result is written directly; previously the result was discarded and
# the unfiltered table was saved.
a_table.dropna(how='all').to_csv('answers.csv', index=False)

In [1482]:
# Reload the saved answers, remove the leading "Answer: " label from every
# answer, and overwrite the file with the cleaned column.
a_table_m = pd.read_csv('answers.csv').assign(
    Answer=lambda frame: frame['Answer'].str.replace(r'^Answer:\s*', '', regex=True)
)
a_table_m.to_csv('answers.csv', index=False)

In [1483]:
# Hand-written answer row for set 22, question 2 (presumably missed by the
# parser — TODO confirm), spliced into the answers table at a fixed position.
new_row = {
    'id': 'Set_22: q2', 
    'Answer': 'C. Meningioma', 
    'explaination': 'Meningiomas are relatively common neoplasms derived from meningothelial cells of the arachnoid. The World Health organization (WHO) classified meningiomas in three groups'
}

row = pd.DataFrame([new_row])

# Row position at which the manual entry is inserted.
insert_at = 572

# The original cell was only safe to run once: every re-run inserted the row
# again. Skip the insert when an identical answer/explanation pair is already
# present so the cell can be executed repeatedly.
already_present = (
    (a_table_m['Answer'] == new_row['Answer'])
    & (a_table_m['explaination'] == new_row['explaination'])
).any()

if not already_present:
    a_table_m = pd.concat(
        [a_table_m.iloc[:insert_at], row, a_table_m.iloc[insert_at:]]
    ).reset_index(drop=True)

a_table_m.to_csv('answers.csv', index=False)

In [1484]:
# Reload both tables from disk for the final merge.
# NOTE(review): 'questions_f.csv' is not written by any visible cell (only
# 'questions.csv' is) — presumably a hand-corrected copy; confirm it exists
# before running.
q_table_f = pd.read_csv('questions_f.csv')
a_table_f = pd.read_csv('answers.csv')

In [1485]:
# Put the question and answer columns side by side after dropping both id
# columns, then export the combined table.
# NOTE(review): rows are paired through the default row index of the two
# loaded files, so both presumably need the same order and row count — confirm.
table = pd.concat([q_table_f.drop(["id"], axis = 1), a_table_f.drop(["id"], axis = 1)], axis=1)
table.to_csv('final.csv', index=False)

In [1486]:
# Remove the leftover 'index' column before building the final table.
# DataFrame.drop returns a new frame, so the result must be kept; the original
# discarded it and the column still ended up in final.csv. errors='ignore'
# keeps the cell re-runnable once the column is gone.
a_table_f = a_table_f.drop(columns=['index'], errors='ignore')
table_json = pd.concat([q_table_f.drop(["id"], axis = 1), a_table_f.drop(["id"], axis = 1)], axis=1)
table_json.to_csv('final.csv', index=False)