In [121]:
import pymupdf
import re
import numpy as np

# global function

In [451]:
def crop_header_footer_pdf_page(page):
    height, width = page.rect.height, page.rect.width
    header = min(0.06 * height, height)  # Ensure header is within bounds
    footer = min(0.06 * height, height)  # Ensure footer is within bounds

    # Ensure the crop box is within the media box
    crop_box = (0, header, width , height - footer)

    if crop_box[1] < crop_box[3] and crop_box[3] <= height:
        page.set_cropbox(crop_box)
    else:
        raise ValueError("Calculated crop box is outside the media box boundaries.")
    return page



def extract_bold_block(lines):
    """
    extract each bold block from the list of lines
    each block start from a line in blod font
    and ends with a line in bold font
    
    
    :param lines: list of lines, each line is a list of spans
    """
    bold_blocks = []
    bold_block = []

    for line in lines:
        if 'bold' in line['font'].lower().replace('-',' '):
            if bold_block:
                bold_blocks.append(bold_block.copy())
                bold_block = []
            
        bold_block.append(line.copy())

    bold_blocks.append(bold_block.copy())
    return bold_blocks

def extract_bold_block_and_lines(lines):
    """
    extract each bold block from the list of lines
    each block start from a line in blod font
    and ends with a line in bold font
    
    
    :param lines: list of lines, each line is a list of spans
    """
    bold_blocks = []
    bold_block = []

    for line in lines:
        if 'bold' in line['font'].lower().replace('-',' ') or 'underline' in line['font'].lower().replace('-',' '):

            if bold_block:
                bold_blocks.append(bold_block.copy())
                bold_block = []
            
        bold_block.append(line.copy())

    bold_blocks.append(bold_block.copy())
    return bold_blocks


def extract_question_block_not_bold(lines):
    """
    extract each question block from the list of lines
    each question start from a line with 'Question X' in blod font
    and ends with a line with 'Question X+1' in bold font
    
    X goes from 1 to 20
    The question number 20 ends with the last line of the last page
    
    
    :param lines: list of lines, each line is a list of spans
    """
    questions = []
    question = []
    question_number = 1

    regex_question = re.compile(r'question \d+', re.IGNORECASE)
    

    for idx, line in enumerate(lines):
        # if line corresponds to regex_question
        if regex_question.search(line['text']):

            if idx - 1 > 0:
                
                if lines[idx - 1]["text"].strip() == "" or lines[idx - 1]["text"].strip().endswith(("?", ":", ".")):
                    if question:
                        questions.append(question.copy())
                        question = []
                    question_number += 1
                      
        question.append(line.copy())

    questions.append(question.copy())
    return questions


def extract_question_block(lines):
    """
    extract each question block from the list of lines
    each question start from a line with 'Question X' in blod font
    and ends with a line with 'Question X+1' in bold font
    
    X goes from 1 to 20
    The question number 20 ends with the last line of the last page
    
    
    :param lines: list of lines, each line is a list of spans
    """
    questions = []
    question = []
    question_number = 1

    regex_question = re.compile(r'question \d+', re.IGNORECASE)
    

    for line in lines:
        if 'bold' in line['font'].lower():
            # if line corresponds to regex_question
            if regex_question.match(line['text']):
                if question:
                    questions.append(question.copy())
                    question = []
                question_number += 1
            
            # if line['text'].lower().startswith(f'question {question_number}'):

            #     if question:
            #         questions.append(question.copy())
            #         question = []
            #     question_number += 1
            
        question.append(line.copy())

    questions.append(question.copy())
    return questions


def build_line(spans):
    output = spans[0].copy()

    fonts = {
    }

    fonts[spans[0]['font']] = len(spans[0]['text'].replace(' ', ''))
    
    for span in spans[1::]:
        output['text'] += span['text']

        fonts[span['font']] = fonts.get(span['font'], 0) + len(span['text'].replace(' ', ''))

    output['font'] = max(fonts, key=fonts.get)
    if len(output['text'].replace(' ', '')) <= 5:
        output['font'] = 'arial'

    return output


def prepross_pdf(pdf, *, number_of_first_pages_to_skip=0, number_of_last_pages_to_skip=0):
    """
    extract each line from the pdf preprocess
    """
    lines = []

    if number_of_first_pages_to_skip < 0 or number_of_last_pages_to_skip < 0:
        raise ValueError("number_of_first_pages_to_skip and number_of_last_pages_to_skip should be positive integers")
    
    if number_of_first_pages_to_skip + number_of_last_pages_to_skip >= len(pdf):
        raise ValueError("number_of_first_pages_to_skip + number_of_last_pages_to_skip should be less than the number of pages in the pdf")
    
    pages = pdf[number_of_first_pages_to_skip:-number_of_last_pages_to_skip]
    if number_of_last_pages_to_skip == 0:
        pages = pdf[number_of_first_pages_to_skip:]
        
    
    for page in pages:
        page = crop_header_footer_pdf_page(page)
        
        for block in page.get_text('dict', flags=11)['blocks']:
            for line in block['lines']:
                lines.append(build_line(line['spans'].copy()))
    
    return lines

                
def make_block_unique(block):
    """
    make a block unique by removing the duplicate lines
    """
    unique_block = []
    for line in block:
        if line not in unique_block:
            unique_block.append(line)

    return unique_block

## func for question

In [452]:
def repaire_question_split(line):
    """
    split each - question - into sub questions
    split each • question • into sub questions
    """

    dash_split = line['text'].split('━')
    dot_split = line['text'].split('•')

    if len(dash_split)  == 1 and len(dot_split) == 1:
        return [line]

    if len (dash_split) > 1 and len(dot_split) > 1:
        raise ValueError("Ask for help")
    
    to_work = []
    
    if len(dash_split) > 1:
        print(f"len(dash_split): {len(dash_split)}")
        print(f"dash_split: {dash_split}")
        to_work = dash_split

    if len(dot_split) > 1:
        print(f"len(dot_split): {len(dot_split)}")
        print(f"dot_split: {dot_split}")
        to_work = dot_split
        
    # return pair of [0, 1], [0, 1], [0, 2], ...
    output = []

    header = to_work[0]
    print("\nHEADER: ", header)
    for idx, sub in enumerate(to_work[1:]):
        output.append({
            'code': line['code'] + '.' + str(idx + 1),
            'text': header + ' ' + sub
        })

    print('\n\n')
    print(output)
    return output

def find_start_of_sentence(sentence, index_End_Of_Sentence):
    end_Of_Sentence_Char = ['.', '?', '!']
    idx = index_End_Of_Sentence - 1

    while idx >= 0 and sentence[idx] not in end_Of_Sentence_Char:
        idx -= 1


    if idx == 0:
        return sentence[:index_End_Of_Sentence + 1]
    
    return sentence[idx + 1: index_End_Of_Sentence + 1]

## PaperD1

In [460]:
path_file = '../data/PaperD/question/2022_PaperD1-2_questions_EN.pdf'
path_file = '../data/PaperD/question/2022_PaperD1-1_questions_EN.pdf'
path_file = '../data/PaperD/question/2023_PaperD1-2_questions_EN.pdf'
path_file = '../data/PaperD/question/2023_PaperD1-1_questions_EN.pdf'
path_file = '../data/PaperD/question/2024_PaperD1-1_questions_EN.pdf'
path_file = '../data/PaperD/question/2024_PaperD1-2_questions_EN.pdf'

verbose = True
pdf = pymupdf.open(path_file)

lines = prepross_pdf(pdf, number_of_first_pages_to_skip=1)

question_block = extract_question_block(lines)
print(len(question_block))
print()

for block in question_block:
    
    uni = make_block_unique(block)
    
    question_line = uni[0]
    if verbose:
        print(f"question_line: {question_line['text']}")
    
    regex_number = re.compile(r'\d+')
    question_number = int(regex_number.search(question_line['text']).group())
    if verbose:
        print(f'question_no: {question_number}')
    

    # for line in uni:
    #     print(f"line: {line['text']}")
    # print('-------------------\n\n')
    # continue
    
    idx = 2 # skip question number and marks 
    # regex for a sub question, it must start with (a) or (b) or (c) or (d) or 1. or 2. or 3. or 4.
    # regex_sub_question = re.compile(r'^(\([a-z]\)|[1-9]\.|[0-9]\))')
    regex_sub_question = re.compile(r'^(\([a-z]\)|[1-9]\.|[0-9]\)|\([0-9]\))')
    
    context = uni[idx]['text']
    while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
        context += uni[idx]['text']
        idx += 1
    
    if verbose:
        print(f"context: {context}")

    sub_questions = []
    while idx < len(uni):

        sub_question_text = uni[idx]['text']
        idx +=1
        # print(f'following lines: {uni[idx]["text"]}')
        while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
            # print('inside : {}'.format(uni[idx]['text']))
            sub_question_text += uni[idx]['text']
            idx += 1


        code = regex_sub_question.search(sub_question_text).group()
        if verbose:
            print(f'code : {code}')

        # remove the code from the subquestion
        sub_question_text = sub_question_text[len(code):].strip()
        if verbose:
            print(f'subquestoin : {sub_question_text}')


        sub_questions.append({
            'code': code,
            'text': sub_question_text
        })

        
    # repaire subquestions
    #  - if code is (1.), then the next code should be (2.)
    #  - if code is (a), then the next code should be (b)
    #
    # - if code is 1. and then (a) or (b) or (c) or (d) then it is a sub sub question -> should be merged with the previous sub question as 1;a (contains the text of both), 1.b (contains the text of both), 1.c (contains the text of both), 1.d (contains the text of both)
    
    # regex_number_question = re.compile(r'[1-9]\.|[0-9]\)')
    # regex_letter_question = re.compile(r'\([a-z]\)')
    # sub_questions_repaired = []
    # idx = 0
    # while idx < len(sub_questions):
    #     sub_question = sub_questions[idx]
    #     code = sub_question['code']
    #     text = sub_question['text']
    #     idx += 1

    #     if regex_number_question.match(code):
    #         # it is a sub question
    #         # check if it has a sub sub question
    #         found = False
    #         while idx < len(sub_questions) and regex_letter_question.match(sub_questions[idx]['code']):
    #             found = True

    #             tmp = sub_question.copy()
    #             tmp['code'] += sub_questions[idx]['code']
    #             tmp['text'] += sub_questions[idx]['text']
    #             sub_questions_repaired.append(tmp)

    #             idx += 1
            
    #         if not found:
    #             sub_questions_repaired.append(sub_question)
            

    #     elif regex_letter_question.match(code):
    #         # it is a sub sub question
    #         # merge it with the previous sub question
    #         sub_questions_repaired.append(sub_question)

    #     else:
    #         raise ValueError(f"unknown code {code}")

            
    # print("==============================\n\n")
    # for sub_question in sub_questions_repaired:
    #     print(f"code: {sub_question['code']}")
    #     print(f"text: {sub_question['text']}")
        


        
        
    # repaire subquestions
    #  - if code is (1.), then the next code should be (2.)
    #  - if code is (a), then the next code should be (b)
    #
    # - if code is 1. and then (a) or (b) or (c) or (d) then it is a sub sub question -> should be merged with the previous sub question as 1;a (contains the text of both), 1.b (contains the text of both), 1.c (contains the text of both), 1.d (contains the text of both)
    
    regex_number_question = re.compile(r'[1-9]\.|[0-9]\)|\([0-9]\)')
    regex_letter_question = re.compile(r'\([a-z]\)')
    sub_questions_repaired = []
    idx = 0
    while idx < len(sub_questions):
        sub_question = sub_questions[idx]
        code = sub_question['code'].strip()
        text = sub_question['text']
        idx += 1

        if regex_number_question.match(code):
            # it is a sub question
            # check if it has a sub sub question
            found = False
            while idx < len(sub_questions) and regex_letter_question.match(sub_questions[idx]['code']):
                found = True

                tmp = sub_question.copy()
                tmp['code'] += sub_questions[idx]['code']
                tmp['text'] += sub_questions[idx]['text']
                
                repaired = repaire_question_split(tmp)
                for rep in repaired:
                    sub_questions_repaired.append(rep)

                idx += 1
            
            if not found:
                repaired = repaire_question_split(sub_question)
                for rep in repaired:
                    sub_questions_repaired.append(rep)
            

        elif regex_letter_question.match(code):
            # it is a sub sub question
            # merge it with the previous sub question
            repaired = repaire_question_split(sub_question)
            for rep in repaired:
                sub_questions_repaired.append(rep)

        else:

            print(regex_number_question.match(code))
            print(regex_letter_question.match(code))
            raise ValueError(f"unknown code |{code}")

    
    # if we do not find any sub question, we look for '?'
    if not sub_questions_repaired:
        index_of_question_mark = np.where(np.array(list(context)) == '?')[0]
        # get all questions mark index in context
        

        print("===-----====---\n")
        print(index_of_question_mark)
        
        if len(index_of_question_mark) == 0:
            print(f'no question mark : {find_start_of_sentence(context, len(context) - 1)}')
        # for each Question mark, get the all sentence (where it start by a capital letter)
        for qm_idx in index_of_question_mark:
            print(f'found : {find_start_of_sentence(context, qm_idx)}')
        # print(question_mark_split)
            
    print("==============================\n\n")
    for sub_question in sub_questions_repaired:
        print(f"code: {sub_question['code']}")
        print(f"text: {sub_question['text']}")


        

        
        
    


    print('-------------------\n\n')
    

3

question_line: QUESTION 3
question_no: 3
context: The opposition period for European patent EP-1 expired yesterdayThe opposition period for European patent EP-1 expired yesterdayOn 16 January 2024, A and B, two nationals of Spain, A residing in Mexico and B residing in Spain, jointly filed a notice of opposition against EP-1. The opposition fee was paid on that date. The notice was filed in Spanish by A and B, named in that order, and signed by both of them. It includes the opponents' particulars, correctly identifies the opposed patent EP-1 and contains a statement of the extent to which EP-1 is opposed and properly substantiated grounds for this.On 29 January 2024, the opposition division issued an invitation to remedy deficiencies, setting a time limit of two months.Yesterday B filed a translation of the notice of opposition in English.What procedural steps need to be taken to remedy the deficiencies and for substantive examination of the joint opposition to start?
===-----====--


# Extract PaperD2

In [69]:
def repaire_question_split(line):
    """
    split each - question - into sub questions
    split each • question • into sub questions
    """

    dash_split = line['text'].split('━')
    dot_split = line['text'].split('•')

    if len(dash_split)  == 1 and len(dot_split) == 1:
        return [line]

    if len (dash_split) > 1 and len(dot_split) > 1:
        raise ValueError("Ask for help")
    
    to_work = []
    
    if len(dash_split) > 1:
        print(f"len(dash_split): {len(dash_split)}")
        print(f"dash_split: {dash_split}")
        to_work = dash_split

    if len(dot_split) > 1:
        print(f"len(dot_split): {len(dot_split)}")
        print(f"dot_split: {dot_split}")
        to_work = dot_split
        
    # return pair of [0, 1], [0, 1], [0, 2], ...
    output = []

    header = to_work[0]
    print("\nHEADER: ", header)
    for idx, sub in enumerate(to_work[1:]):
        output.append({
            'code': line['code'] + '.' + str(idx + 1),
            'text': header + ' ' + sub
        })

    print('\n\n')
    print(output)
    return output



    pass

In [482]:
path_file = '../data/PaperD/question/2021_PaperD2_questions_EN.pdf'
path_file = '../data/PaperD/question/2024_PaperD2_questions_EN.pdf'
path_file = '../data/PaperD/question/2023_PaperD2_questions_EN.pdf'
path_file = '../data/PaperD/question/2022_PaperD2_questions_EN.pdf'

verbose = True
pdf = pymupdf.open(path_file)

lines = prepross_pdf(pdf, number_of_first_pages_to_skip=1)

print(lines[0]['text'])
blocks = [lines]
# question_block = extract_question_block(lines)
print(len(blocks))
print()

for block in blocks:
    

    uni = make_block_unique(block)
   
    context = uni[0]['text']
    idx = 1 # skip question number and marks 
   

    # regex for a sub question, it must start with (a) or (b) or (c) or (d) or 1. or 2. or 3. or 4.
    regex_sub_question = re.compile(r'^(\([a-z]\)|[1-9]\.|[0-9]\)|\([0-9]\))')

    # while we do not reach the first sub question, we keep adding the lines to the context / document
    while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
        context += uni[idx]['text']
        idx += 1
    
    if verbose:
        print(f"context: {context}")

    # we are now at the first sub question
    sub_questions = []
    while idx < len(uni):

        sub_question_text = uni[idx]['text']
        idx +=1
        while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
            # print('inside : {}'.format(uni[idx]['text']))
            sub_question_text += uni[idx]['text']
            idx += 1


        code = regex_sub_question.search(sub_question_text).group()
        if verbose:
            print(f'code : {code}')

        # remove the code from the subquestion
        sub_question_text = sub_question_text[len(code):].strip()
        if verbose:
            print(f'subquestoin : {sub_question_text}')


        sub_questions.append({
            'code': code,
            'text': sub_question_text
        })

        
    # repaire subquestions
    #  - if code is (1.), then the next code should be (2.)
    #  - if code is (a), then the next code should be (b)
    #
    # - if code is 1. and then (a) or (b) or (c) or (d) then it is a sub sub question -> should be merged with the previous sub question as 1;a (contains the text of both), 1.b (contains the text of both), 1.c (contains the text of both), 1.d (contains the text of both)
    
    regex_number_question = re.compile(r'[1-9]\.|[0-9]\)|\([0-9]\)')
    regex_letter_question = re.compile(r'\([a-z]\)')
    sub_questions_repaired = []
    idx = 0
    while idx < len(sub_questions):
        sub_question = sub_questions[idx]
        code = sub_question['code'].strip()
        text = sub_question['text']
        idx += 1

        if regex_number_question.match(code):
            # it is a sub question
            # check if it has a sub sub question
            found = False
            while idx < len(sub_questions) and regex_letter_question.match(sub_questions[idx]['code']):
                found = True

                tmp = sub_question.copy()
                tmp['code'] += sub_questions[idx]['code']
                tmp['text'] += sub_questions[idx]['text']
                
                repaired = repaire_question_split(tmp)
                for rep in repaired:
                    sub_questions_repaired.append(rep)

                idx += 1
            
            if not found:
                repaired = repaire_question_split(sub_question)
                for rep in repaired:
                    sub_questions_repaired.append(rep)
            

        elif regex_letter_question.match(code):
            # it is a sub sub question
            # merge it with the previous sub question
            repaired = repaire_question_split(sub_question)
            for rep in repaired:
                sub_questions_repaired.append(rep)

        else:

            print(regex_number_question.match(code))
            print(regex_letter_question.match(code))
            raise ValueError(f"unknown code |{code}")

            
    print("==============================\n\n")
    for sub_question in sub_questions_repaired:
        print(f"code: {sub_question['code']}")
        print(f"text: {sub_question['text']}")
        


        
        


        

        
        
    


    print('-------------------\n\n')
    

Today is 8 March 2022. 
1

context: Today is 8 March 2022. You receive the following e-mail from the firm Optimisme.Dear Representative,[001] My   name   is   Candide.   I   am   the   General   Manager   of   Italian   company Optimisme (OPT). OPT specialises in plastic recycling. We design our own recycling machines  in-house  and  we  have  them  made  by  Torre  (TOR),  an  Italian  company manufacturing plastic recycling machines. When we started working with TOR in 2018, we signed an agreement. Under this agreement, TOR is bound to secrecy, and all intellectual property generated since then belongs to OPT. In future, we will manufacture these machines ourselves.[002] A known process of recycling includes cutting the plastic waste into small pieces in a mill, i.e. in a container with blades rotating at its bottom. An inorganic filler in the form of a powder is fed to the mill to be mixed with the plastic pieces to increase rigidity of the plastic. The problem with this known machi

# Extract answers

## func for answers

In [635]:
def split_as_part(blocks):
    """
    split the blocks into parts
    each part is a list of blocks
    """
    parts = []
    part = []
    # part I or part II
    regex_part = re.compile(r'Part I|part II', re.IGNORECASE)
    for block in blocks:
        # check if it contains the regex
        if regex_part.search(block[0]['text'].lower()):
            if part:
                parts.append(part.copy())
                part = []
        part.append(block.copy())

    parts.append(part.copy())

    return parts

def split_as_question(part):
    """
    split the part into questions
    each question is a list of blocks
    """
    questions = []
    question = []


    # question number
    regex_question = re.compile(r'question \d+', re.IGNORECASE)
    for block in part:
        # check if it contains the regex
        if regex_question.search(block[0]['text'].lower().strip()):

            if question:
                questions.append(question.copy())
                # print(f'question length: {len(question)}')
                question = []
        question.extend(block.copy())

    questions.append(question.copy())

    return questions

    
def extract_comment_part(part):
    """
    extract the comment part
    """
    comments = []
    # comment
    regex_question = re.compile(r'question \d+', re.IGNORECASE)
    for question in part[1::]:

        # get the part that match the regex
        quest = regex_question.search(question[0]['text'].lower())
        if quest:
            quest = quest.group()
        else:
            quest = None

        
        question_number = None
        if quest:
            question_number = int(re.search(r'\d+', quest).group())
        
        

        comment = ""
        for line in question[1::]:
            # print(line['text'])
            if line['text'] != '':
                comment +=   line['text'] + ' '
            
        comments.append((quest, question_number, comment))
        

    # if empty just take the first one
    if comments == []:
        comment = ""
        for line in part[0]:
            comment +=  line['text']
        
        comments.append(('OneForAll', None, comment))
        
    return comments

    
def extract_answer_part(part, *, verbose=False):
    
    answers = []
    
    regex_question = re.compile(r'question \d+', re.IGNORECASE)
    regex_sub_question = re.compile(r'^([a-z]\)|\([a-z]\)|[1-9]\.|[0-9]\)|\([0-9]\))')

    for question in part[1::]:
        
        if verbose:
            print(question[0]['text'])

        # get the part that match the regex
        quest = regex_question.search(question[0]['text'].lower())
        if quest:
            quest = quest.group()
        else:
            quest = None
        
        question_number = None
        if quest:
            question_number = int(re.search(r'\d+', quest).group())
        
        
        sub_questions = []
        idx = 1 if quest else 0
        while idx < len(question):

            header_idx = idx
            header_subtext = question[idx]['text']
            sub_question_text = ""
            idx +=1
            # print(f'following lines: {uni[idx]["text"]}')
            done = False
            while not done:
                while idx < len(question) and not regex_sub_question.match(question[idx]['text']):
                    # print('inside : {}'.format(uni[idx]['text']))
                    sub_question_text += question[idx]['text']
                    idx += 1

                if idx == len(question):
                    break
                
                if idx - 1 > 0:
                    
                    if verbose:
                        print('previous ::::')
                        print(f'{question[idx - 1]["text"]}')
                        print(f'{question[idx - 1]["text"].strip().endswith(("?", ":", "."))}')
                    
                        
                    if question[idx - 1]["text"].strip() == "" or question[idx - 1]["text"].strip().endswith(("?", ":", ".")):
                        done = True
                    else:
                        sub_question_text += question[idx]['text']
                        idx += 1
                        
            



            if idx == len(question) and not regex_sub_question.match(header_subtext):
                sub_questions.append({
                    'code': None,
                    'text': (header_subtext + ' ' + sub_question_text).strip()
                })

                if verbose:
                    print('\t------------------------')
                    print(f'code : {None}\ntext : {(header_subtext + " " + sub_question_text).strip()}')
                    print('\t------------------------')

                continue
            

            if verbose:
                print(f'\t\tidx: {idx} / {len(question)}')
                print(f'\t\tsub_text: {sub_question_text}') 
            
            code = header_subtext
            # remove the code from the subquestion
            # sub_question_text = sub_question_text[len(code):].strip()
            sub_question_text = sub_question_text.strip()
            if verbose:
                print(f'subquestoin : {sub_question_text}')

            if len(code) > 20:
                code2 = regex_sub_question.search(code).group()
                
                if not question[header_idx]['font'] == 'underlined':
                    sub_question_text = code[len(code2)::] + sub_question_text

                code = code2


            sub_questions.append({
                'code': code,
                'text': sub_question_text
            })

            if verbose:
                print('\t------------------------')
                print(f'code : {code}\ntext : {sub_question_text}')
                print('\t------------------------')

        answers.append((quest, question_number, sub_questions))


    return answers

    

## main for answer

In [490]:


pdf = pymupdf.open('../data/PaperD/answer/2023_PaperD_answers_EN.pdf')
pdf = pymupdf.open('../data/PaperD/answer/2022_PaperD_answers_EN.pdf')
pdf = pymupdf.open('../data/PaperD/answer/2021_PaperD_answers_EN.pdf.pdf')
pdf = pymupdf.open('../data/PaperD/answer/2024_PaperD_answers_EN.pdf')

# import os
#list all pdf in current dir
# pdf_files = [f for f in os.listdir() if f.endswith('.pdf')]
# print(pdf_files)
# pdf = pymupdf.open('../data/EQE/2014_PreEx_answers.pdf')

for page in pdf[1::]:
    continue
    for block in page.get_text('dict', flags=11)['blocks']:
        for line in block['lines']:
            for span in line['spans']:
                print(span.keys())
                print(span['ascender'], span['descender'])
                # print(span['bbox'].height)
                bbox = pymupdf.Rect(span['bbox'])
                print(bbox.height / (span['ascender'] - span['descender']))
                print(f'font: {span["font"]}|\t text: "{span["text"]}"\t size: {span["size"]}')
                print('-------------------\n\n')
            print('------- EOLine -------')
        print('========= EOBlock ==========\n')
    print('------- EOPage -------')


lines = prepross_pdf(pdf, number_of_first_pages_to_skip=0, number_of_last_pages_to_skip=1)
# print(f'number of lines: {len(lines)}')

bold_blocks = extract_bold_block(lines)
count = 0
# print(f'number of bold blocks: {len(bold_blocks)}')
for block in bold_blocks:
    # print(f'\t lines : {len(block)}')
    count += len(block)
# print(f'number of lines in bold blocks: {count}')

print(len(bold_blocks))
print(len(bold_blocks[1]))
part_blocks = split_as_part(bold_blocks)

count = 0
for part in part_blocks:
    for block in part:
        # print(len(block))
        count += len(block)
        
print(f'number of lines in part blocks: {count}')
print(len(bold_blocks))
print()

global_comment = None
part1 = {
    "comments": [],
    "answers": []
}

part2 = {
    "comments": [],
    "answers": []
}

for bblock in part_blocks:
    # print(f'block: {bblock[0]}')
    print(bblock[0][0]['text'])

    # regex must end with I
    regex_part = re.compile(r'Part I$', re.IGNORECASE)
    if regex_part.search(bblock[0][0]['text'].lower().strip()):

        questions = split_as_question(bblock)
        print(f'questions finds : {len(questions)}')

        if len(bblock) == 1:
            questions = extract_question_block_not_bold(bblock[0])
            # questions = split_as_question(bblock[0])
        print(f'number of questions blocks : {len(questions)}')
        
        regex_solution = re.compile(r'solution', re.IGNORECASE)
        if regex_solution.search(bblock[0][0]['text'].lower().strip()):
            print('solution part 1')
            part1['answers'] = questions
            continue
        else:
            print('comment part 1')
            part1['comments'] = questions
            continue

    regex_part = re.compile(r'Part II$', re.IGNORECASE)
    if regex_part.search(bblock[0][0]['text'].lower().strip()):
        print(len(bblock))
        
        questions = split_as_question(bblock)
        print(f'questions finds : {len(questions)}')
        if len(bblock) == 1:
            questions = extract_question_block_not_bold(bblock[0])

        print(f'number of questions: {len(questions)}')
        for quest in questions:
            print(f'question: {quest[0]["text"]}')
            print(f'number of sub blocks: {len(quest)}')
        
        regex_solution = re.compile(r'solution', re.IGNORECASE)
        if regex_solution.search(bblock[0][0]['text'].lower().strip()):
            print('solution part 2')
            part2['answers'] = questions
            continue
        else:
            print('comment part 2')
            part2['comments'] = questions
            continue

    global_comment = bblock

    
    # question_blocks = split_as_question(bblock)
    # print(f'number of questions: {len(question_blocks)}')
    # for quest in question_blocks:
    #     print(f'question: {quest[0][0]["text"]}')
    #     print(f'number of sub blocks: {len(quest)}')
    
    # print()

print('=====================\n\n')
# print(part1)
print(f'part1: comments: {part1["comments"] != []} answers: {part1["answers"] != []}')
print(len(part1['comments']))
print(len(part1['answers']))
print('=====================\n\n')
print(f'part2: comments: {part2["comments"] != []} answers: {part2["answers"] != []}')
print(len(part2['comments']))
print(len(part2['answers']))
# print(part2)
print('=====================\n\n')
# print(global_comment)
        
# extract_answer_data(pdf, verbose = True)
print('part 2, comments')
for a in extract_comment_part(part2['comments']):
    print(a[0])
    print(f'\t{a[1]}')

print('part 1, comments')
for a in extract_comment_part(part1['comments']):
    print(a[0])
    print(f'\t{a[1]}')




print('=====================\n\n')
print(extract_answer_part(part1['answers']))
print('=====================\n\n')
ans_p2 = extract_answer_part(part2['answers'])
for a in ans_p2:
   print(a) 
# print(extract_comment_part(part1['answers']))
# print(extract_comment_part(part1['comments']))
  

26
4
number of lines in part blocks: 399
26

Examiners’ Report – Paper D 2024
Examiners’ Report – Paper D 2024, Part I
number of blocks: 6
block length: 1
block length: 12
block length: 13
block length: 10
block length: 12
block length: 8
Examiners’ Report – Paper D 2024, Part I LiberationSerif-Bold
Question 1 (14 marks) LiberationSerif-Bold
This question was generally well answered. Most candidates stated that PCT-3 was likely filed LiberationSerif
in Spanish, as it is an accepted language by the SPTO. Fewer candidates however realized that LiberationSerif
PCT-3 must have been filed in Spanish, as it is the only language accepted by the SPTO and LiberationSerif
Question 2 (12 marks) LiberationSerif-Bold
Most candidates realized that the examination fee has not been paid and a reply to the written LiberationSerif
opinion was not filed in time. Some candidates forgot that the designation fee also has to be LiberationSerif
paid. Many candidates failed to recognize that the application is

# paper D1 as func

In [456]:
def extract_question_block_header(block, *, start_index = 1):
    """
        extract the correction note from a correction block
    """
    uni = block
    idx = start_index
    regex_sub_question = re.compile(r'^(\([a-z]\)|[1-9]\.|[0-9]\)|\([0-9]\))')
    
    context = uni[idx]['text']
    while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
        context += uni[idx]['text']
        idx += 1
    
    return context, idx


In [523]:
def extract_questionD1_data(path_file, *, verbose=False):
    
    pdf = pymupdf.open(path_file)

    lines = prepross_pdf(pdf, number_of_first_pages_to_skip=1)
    
    question_block = extract_question_block(lines)

    format_qestions = []
    for block in question_block:
        
        
        uni = make_block_unique(block)
        
        question_line = uni[0]
        if verbose:
            print(f"question_line: {question_line['text']}")
        
        regex_number = re.compile(r'\d+')
        search = regex_number.search(question_line['text'])
        if not search:
            continue

        question_number = int(search.group())
        if verbose:
            print(f'question_no: {question_number}')
        


        # idx = 2 # skip question number and marks 
        # regex for a sub question, it must start with (a) or (b) or (c) or (d) or 1. or 2. or 3. or 4.
        regex_sub_question = re.compile(r'^(\([a-z]\)|[1-9]\.|[0-9]\)|\([0-9]\))')

        question_context, idx = extract_question_block_header(uni, start_index=2)
                
        if verbose:
            print(f"context: {question_context}")

        question_format = {
            'question_number': question_number,
            "question_type": "OpenQuestion",
            'context': question_context,

            'questions': []
        }
        
        sub_questions = []
        while idx < len(uni):

            sub_question_text = uni[idx]['text']
            idx +=1
            while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
                sub_question_text += uni[idx]['text']
                idx += 1


            code = regex_sub_question.search(sub_question_text).group()
            if verbose:
                print(f'code : {code}')


            # remove the code from the subquestion
            sub_question_text = sub_question_text[len(code):].strip()
            if verbose:
                print(f'subquestoin : {sub_question_text}')


            sub_questions.append({
                'code': code,
                'text': sub_question_text
            })

            
            
        # repaire subquestions
        #  - if code is (1.), then the next code should be (2.)
        #  - if code is (a), then the next code should be (b)
        #
        # - if code is 1. and then (a) or (b) or (c) or (d) then it is a sub sub question -> should be merged with the previous sub question as 1;a (contains the text of both), 1.b (contains the text of both), 1.c (contains the text of both), 1.d (contains the text of both)
        
        regex_number_question = re.compile(r'[1-9]\.|[0-9]\)|\([0-9]\)')
        regex_letter_question = re.compile(r'\([a-z]\)')
        sub_questions_repaired = []
        idx = 0
        while idx < len(sub_questions):
            sub_question = sub_questions[idx]
            code = sub_question['code'].strip()
            text = sub_question['text']
            idx += 1

            if regex_number_question.match(code):
                # it is a sub question
                # check if it has a sub sub question
                found = False
                while idx < len(sub_questions) and regex_letter_question.match(sub_questions[idx]['code']):
                    found = True

                    tmp = sub_question.copy()
                    tmp['code'] += sub_questions[idx]['code']
                    tmp['text'] += sub_questions[idx]['text']
                    
                    repaired = repaire_question_split(tmp)
                    for rep in repaired:
                        sub_questions_repaired.append(rep)

                    idx += 1
                
                if not found:
                    repaired = repaire_question_split(sub_question)
                    for rep in repaired:
                        sub_questions_repaired.append(rep)
                

            elif regex_letter_question.match(code):
                # it is a sub sub question
                # merge it with the previous sub question
                repaired = repaire_question_split(sub_question)
                for rep in repaired:
                    sub_questions_repaired.append(rep)

            else:

                print(regex_number_question.match(code))
                print(regex_letter_question.match(code))
                raise ValueError(f"unknown code |{code}")


        # if we do not find any sub question, we look for '?'
        if not sub_questions_repaired:
            index_of_question_mark = np.where(np.array(list(question_context)) == '?')[0]
            # get all questions mark index in context
            

            print("===-----====---\n")
            print(index_of_question_mark)
            
            if len(index_of_question_mark) == 0:
                question_text = find_start_of_sentence(question_context, len(question_context) - 1)
                
                question_format['context'] = question_context[:-len(question_text)]
                question_format['questions'].append({
                    'question_code': None,
                    'question_text': question_text
                })
                    
                # print(f'no question mark : {find_start_of_sentence(question_context, len(question_context) - 1)}')
                format_qestions.append(question_format)
                continue

                
            else:
                # for each Question mark, get the all sentence (where it start by a capital letter)
                qm_idx = sorted(index_of_question_mark)
                text = ""
                for qm_idx in index_of_question_mark:
                    text += find_start_of_sentence(question_context, qm_idx) 
                
                question_format['context'] = question_context[:-len(text)]
                question_format['questions'].append({
                    'question_code': None,
                    'question_text': text
                })
                format_qestions.append(question_format)
                continue
                # print(question_mark_split)

                

        sub_question_format = []
        for sub_question in sub_questions_repaired:
            sub_question_format.append({
                'question_code': sub_question['code'],
                'question_text': sub_question['text'],
            }) 
        
        question_format['questions'] = sub_question_format
        format_qestions.append(question_format.copy())
        print(format_qestions)
    

    return format_qestions
            # print(f"code: {sub_question['code']}")
            # print(f"text: {sub_question['text']}")


            

            
            
        



        
# extract_questionD1_data("../data/PaperD/question/2021_PaperD1-1_questions_EN.pdf", verbose=False)
extract_questionD1_data("../data/PaperD/question/2022_PaperD1-1_questions_EN.pdf", verbose=False)
    

===-----====---

[752 796]
===-----====---

[946]
[{'question_number': 1, 'question_type': 'OpenQuestion', 'context': 'European patent application EP-F was filed in January 2017 by applicant F. The EPO European patent application EP-F was filed in January 2017 by applicant F. The EPO issued a communication under Article 94(3) EPC dated 1 September 2020 and which sets a time limit of four months. Thereafter no acts, including the payment of fees, were performed with respect to EP-F. Since applicant F did not reply to the communication, the EPO issued a communication dated 4 March 2021 notifying a loss of rights under Rule 112(1) EPC.Despite exercising all due care required by the circumstances, applicant F only became aware of the notification of loss of rights on 3 February 2022.', 'questions': [{'question_code': None, 'question_text': 'Which steps need to be performed to ensure that the prosecution of EP-F continues?By when do these steps have to be performed?'}]}, {'question_number':

[{'question_number': 1,
  'question_type': 'OpenQuestion',
  'context': 'European patent application EP-F was filed in January 2017 by applicant F. The EPO European patent application EP-F was filed in January 2017 by applicant F. The EPO issued a communication under Article 94(3) EPC dated 1 September 2020 and which sets a time limit of four months. Thereafter no acts, including the payment of fees, were performed with respect to EP-F. Since applicant F did not reply to the communication, the EPO issued a communication dated 4 March 2021 notifying a loss of rights under Rule 112(1) EPC.Despite exercising all due care required by the circumstances, applicant F only became aware of the notification of loss of rights on 3 February 2022.',
  'questions': [{'question_code': None,
    'question_text': 'Which steps need to be performed to ensure that the prosecution of EP-F continues?By when do these steps have to be performed?'}]},
 {'question_number': 2,
  'question_type': 'OpenQuestion',


# Question Part2 as func


In [637]:

path_file = '../data/PaperD/question/2021_PaperD2_questions_EN.pdf'
path_file = '../data/PaperD/question/2024_PaperD2_questions_EN.pdf'
path_file = '../data/PaperD/question/2023_PaperD2_questions_EN.pdf'
path_file = '../data/PaperD/question/2022_PaperD2_questions_EN.pdf'

def extract_questionD2_data(path_pdf, *, verbose=False):
    pdf = pymupdf.open(path_file)

    lines = prepross_pdf(pdf, number_of_first_pages_to_skip=1)

    if verbose:
        print(lines[0]['text'])

    blocks = [lines]

    for block in blocks:
        

        uni = make_block_unique(block)
    
        # regex for a sub question, it must start with (a) or (b) or (c) or (d) or 1. or 2. or 3. or 4.
        regex_sub_question = re.compile(r'^(\([a-z]\)|[1-9]\.|[0-9]\)|\([0-9]\))')
        
        context, idx = extract_question_block_header(uni, start_index=1)

            
        if verbose:
            print(f"context: {context}")


        # we are now at the first sub question
        question_format = {
            'question_number': 'part2',
            'context': context,

            'questions' : []
        }

        sub_questions = []
        while idx < len(uni):

            sub_question_text = uni[idx]['text']
            idx +=1
            while idx < len(uni) and not regex_sub_question.match(uni[idx]['text']):
                # print('inside : {}'.format(uni[idx]['text']))
                sub_question_text += uni[idx]['text']
                idx += 1


            code = regex_sub_question.search(sub_question_text).group()
            if verbose:
                print(f'code : {code}')

            # remove the code from the subquestion
            sub_question_text = sub_question_text[len(code):].strip()
            if verbose:
                print(f'subquestoin : {sub_question_text}')


            sub_questions.append({
                'code': code,
                'text': sub_question_text
            })

            
        # repaire subquestions
        #  - if code is (1.), then the next code should be (2.)
        #  - if code is (a), then the next code should be (b)
        #
        # - if code is 1. and then (a) or (b) or (c) or (d) then it is a sub sub question -> should be merged with the previous sub question as 1;a (contains the text of both), 1.b (contains the text of both), 1.c (contains the text of both), 1.d (contains the text of both)
        
        number_question = re.compile(r'[1-9]\.|[0-9]\)|\([0-9]\)')
        regex_letter_question = re.compile(r'\([a-z]\)')
        sub_questions_repaired = []
        idx = 0
        while idx < len(sub_questions):
            sub_question = sub_questions[idx]
            code = sub_question['code'].strip()
            text = sub_question['text']
            sub_question['questions'] = []
            idx += 1

            ssubs = []
            if regex_number_question.match(code):
                # it is a sub question
                # check if it has a sub sub question
                found = False
                while idx < len(sub_questions) and regex_letter_question.match(sub_questions[idx]['code']):
                    found = True
                    
                    ssubs.append({
                        'question_code': sub_questions[idx]['code'], 
                        'question_text': sub_questions[idx]['text']
                    })
                    
                    idx += 1
                
                # TODO: repainre this
                # if len(ssubs) == 0:
                #     # WTF IS THIS, it got reversed but i don't remember what is used to be
                #     sub_question['']
                sub_question['questions'] = ssubs
                sub_questions_repaired.append(sub_question)


            elif regex_letter_question.match(code):
                # it is a sub sub question
                # merge it with the previous sub question
                ssubs.append({
                    'question_code': None, 
                    'question_text': sub_question['text']
                })
                sub_question['questions'] = ssubs
                sub_questions_repaired.append(sub_question)

            else:


                print(regex_number_question.match(code))
                print(regex_letter_question.match(code))
                raise ValueError(f"unknown code |{code}")


        
        
        # regex_number_question = re.compile(r'[1-9]\.|[0-9]\)|\([0-9]\)')
        # regex_letter_question = re.compile(r'\([a-z]\)')
        # sub_questions_repaired = []
        # idx = 0
        # while idx < len(sub_questions):
        #     sub_question = sub_questions[idx]
        #     code = sub_question['code'].strip()
        #     text = sub_question['text']
        #     idx += 1

        #     if regex_number_question.match(code):
        #         # it is a sub question
        #         # check if it has a sub sub question
        #         found = False
        #         while idx < len(sub_questions) and regex_letter_question.match(sub_questions[idx]['code']):
        #             found = True

        #             tmp = sub_question.copy()
        #             tmp['code'] += sub_questions[idx]['code']
        #             tmp['text'] += sub_questions[idx]['text']
                    
        #             repaired = repaire_question_split(tmp)
        #             for rep in repaired:
        #                 sub_questions_repaired.append(rep)

        #             idx += 1
                
        #         if not found:
        #             repaired = repaire_question_split(sub_question)
        #             for rep in repaired:
        #                 sub_questions_repaired.append(rep)
                

        #     elif regex_letter_question.match(code):
        #         # it is a sub sub question
        #         # merge it with the previous sub question
        #         repaired = repaire_question_split(sub_question)
        #         for rep in repaired:
        #             sub_questions_repaired.append(rep)

        #     else:

        #         print(regex_number_question.match(code))
        #         print(regex_letter_question.match(code))
        #         raise ValueError(f"unknown code |{code}")

                
        print("==============================\n\n")
        for sub_question in sub_questions_repaired:
            
                
            regex_number = re.compile(r'\d+')
            search = regex_number.search(sub_question['code'])
            question_number = sub_question['code']
            if search:
                question_number = int(search.group())


            question_format['questions'].append({
                'question_number': question_number,
                'question_text': sub_question['text'],
                'questions': sub_question['questions']
            })
            print(f"code: {question_number}")
            print(f"text: {sub_question['text']}")
            
    return question_format


            
            


            

            
            
        


        # print('-------------------\n\n')
        
extract_questionD2_data(path_file, verbose=False)



code: 1
text: Outline the patent situation as it currently stands for:
code: 2
text: How would you advise us to respond to the opposition against OPT-EP1?
code: 3
text: What could we do to improve our situation?
code: 4
text: After the improvements, what products and methods could we stop TOR from making or using, and in which countries?


{'question_number': 'part2',
 'context': 'You receive the following e-mail from the firm Optimisme.You receive the following e-mail from the firm Optimisme.Dear Representative,[001] My   name   is   Candide.   I   am   the   General   Manager   of   Italian   company Optimisme (OPT). OPT specialises in plastic recycling. We design our own recycling machines  in-house  and  we  have  them  made  by  Torre  (TOR),  an  Italian  company manufacturing plastic recycling machines. When we started working with TOR in 2018, we signed an agreement. Under this agreement, TOR is bound to secrecy, and all intellectual property generated since then belongs to OPT. In future, we will manufacture these machines ourselves.[002] A known process of recycling includes cutting the plastic waste into small pieces in a mill, i.e. in a container with blades rotating at its bottom. An inorganic filler in the form of a powder is fed to the mill to be mixed with the plastic pieces to increase rigidity of the pl

# extract answer as func

In [632]:



pdf = pymupdf.open('../data/PaperD/answer/2023_PaperD_answers_EN.pdf')
pdf = pymupdf.open('../data/PaperD/answer/2022_PaperD_answers_EN.pdf')
pdf = pymupdf.open('../data/PaperD/answer/2021_PaperD_answers_EN.pdf.pdf')
pdf = pymupdf.open('../data/PaperD/answer/2024_PaperD_answers_EN.pdf')

def extract_answer_data(pdf_path, *, verbose=False):
    pdf = pymupdf.open(pdf_path)
    
    lines = prepross_pdf(pdf, number_of_first_pages_to_skip=0, number_of_last_pages_to_skip=1)

    bold_blocks = extract_bold_block(lines)

    part_blocks = split_as_part(bold_blocks)

    
    global_comment = None
    part1 = {
        "comments": [],
        "answers": []
    }

    part2 = {
        "comments": [],
        "answers": []
    }

    for bblock in part_blocks:
        
        if verbose:
            print(bblock[0][0]['text'])

        # regex must end with I
        regex_part = re.compile(r'Part I$', re.IGNORECASE)
        if regex_part.search(bblock[0][0]['text'].lower().strip()):

            questions = split_as_question(bblock)
            if len(bblock) == 1:
                questions = extract_question_block_not_bold(bblock[0])
            
            if verbose:
                print(f'number of questions blocks : {len(questions)}')
            
            regex_solution = re.compile(r'solution', re.IGNORECASE)
            if regex_solution.search(bblock[0][0]['text'].lower().strip()):
                part1['answers'] = questions
                continue
            else:
                part1['comments'] = questions
                continue

        regex_part = re.compile(r'Part II$', re.IGNORECASE)
        if regex_part.search(bblock[0][0]['text'].lower().strip()):
            
            questions = split_as_question(bblock)
            if len(bblock) == 1:
                questions = extract_question_block_not_bold(bblock[0])

            if verbose:
                print(f'number of questions: {len(questions)}')

           
            regex_solution = re.compile(r'solution', re.IGNORECASE)
            if regex_solution.search(bblock[0][0]['text'].lower().strip()):
                part2['answers'] = questions
                continue
            else:
                part2['comments'] = questions
                continue

        global_comment = bblock


    treat_comment = ""
    for line in global_comment[1::]:
        for span in line:
            treat_comment += span['text']

    output = {
        'part1': {
           'comments': extract_comment_part(part1['comments']),
           'answers': extract_answer_part(part1['answers'])
        },
        
        'part2': {
            'comments': extract_comment_part(part2['comments']),
            'answers': extract_answer_part(part2['answers'])  
        },
        
        'global_comment': treat_comment
    }

    return output

# extract_answer_data('../data/PaperD/answer/2022_PaperD_answers_EN.pdf', verbose=False)   
# extract_answer_data('../data/PaperD/answer/2023_PaperD_answers_EN.pdf', verbose=False)   
extract_answer_data('../data/PaperD/answer/2021_PaperD_answers_EN.pdf.pdf', verbose=False)   

{'part1': {'comments': [('question 1',
    1,
    '  Most candidates realized that EP-A2 was not filed in the language prescribed, however  fewer realized legal consequences and accordingly the necessity to revive EP1 so that a  new divisional can be filed. Many candidates erroneously suggested to merely file a  translation for EP-A2.    '),
   ('question 2',
    2,
    '  This question was reasonably well answered. Most of the candidates realized that the  EPO is not bound by the decision of any receiving office under the “unintentional criterion”.  Many candidates also correctly indicated the steps required to ensure that the restoration  of the right of priority is effective in the European phase. The required payment of a fee for  requesting restoration was sometimes forgotten.    '),
   ('question 6',
    6,
    '  The question regarding the procedural steps to be taken before the EPO to request  amendment of the claim in respect of Germany was well answered, although some  candid

# Export as JSON

This one only extract Part1-1 & Part1-2 cause part 2 is document's based (and I also got some prob for some reason that I can not explain)

In [638]:
import os
import json
os.makedirs('output/PaperD', exist_ok=True)

list_pdfs = [
    # (
    #     "../data/PaperD/answer/2021_PaperD_answers_EN.pdf.pdf",
    #     "../data/PaperD/question/2021_PaperD1-1_questions_EN.pdf",
    #     "../data/PaperD/question/2021_PaperD1-2_questions_EN.pdf",
    #     "../data/PaperD/question/2021_PaperD2_questions_EN.pdf",
    # ),

    (
        "../data/PaperD/answer/2022_PaperD_answers_EN.pdf",
        "../data/PaperD/question/2022_PaperD1-1_questions_EN.pdf",
        "../data/PaperD/question/2022_PaperD1-2_questions_EN.pdf",
        "../data/PaperD/question/2022_PaperD2_questions_EN.pdf",
    ),
    (
        "../data/PaperD/answer/2023_PaperD_answers_EN.pdf",
        "../data/PaperD/question/2023_PaperD1-1_questions_EN.pdf",
        "../data/PaperD/question/2023_PaperD1-2_questions_EN.pdf",
        "../data/PaperD/question/2023_PaperD2_questions_EN.pdf",
    ),
    (
        "../data/PaperD/answer/2024_PaperD_answers_EN.pdf",
        "../data/PaperD/question/2024_PaperD1-1_questions_EN.pdf",
        "../data/PaperD/question/2024_PaperD1-2_questions_EN.pdf",
        "../data/PaperD/question/2024_PaperD2_questions_EN.pdf",
    )
]




def extract_data(pdfs):
    # ans_pdf = pymupdf.open( pdfs[0])
    # que_p1_1_pdf = pymupdf.open( pdfs[1])
    # que_p1_2_pdf = pymupdf.open( pdfs[2])
    # que_p2_pdf = pymupdf.open( pdfs[3])


    questions = []

    questions.extend(extract_questionD1_data(pdfs[1]))
    questions.extend(extract_questionD1_data(pdfs[2]))

    questions2 = extract_questionD2_data(pdfs[3])

    answer = extract_answer_data(pdfs[0])
    # print(extract_questionD1_data(pdfs[1]))
    # print(extract_questionD1_data(pdfs[2]))
    print(questions)
    print('\n\n\n')

    output = {
        'exam_type': 'EQE',
        'year': os.path.basename(pdfs[0]).split('_')[0],
        'exam_name': 'PaperD',
        
        'exercices': questions
    }

    # for a in answer['part2']['answers']:
    #     print(a)

    # for q in questions2['questions']:
    #     print(q)

    # assert len(questions) == len(answer['part1']['answers'])
    # assert len(questions2) == len(answer['part2']['answers'])

    # for q in questions:
    #     for k,v in q.items():
    #         print(f"{k}: {v}")
            
        
        
    #     for s in q[2]:
    #         s_no = s[0]
    #         s_context = s[1]
            
            
    #         s_data = {
    #             'question_code': str(s_no).strip(),
    #             'question_text': s_context.strip(),
    #         }
            
    #         q_data['questions'].append(s_data)
        
    #     output['exercices'].append(q_data)

        
    # sort the questions by number
    output['exercices'] = sorted(output['exercices'], key=lambda x: int(x['question_number']))

    # dump the output to a json file
    with open(f"output/PaperD/EQE_{output['year']}_{output['exam_name']}.json", 'w') as f:
        json.dump(output, f, indent=4)

    with open(f"output/PaperD/EQE_{output['year']}_{output['exam_name']}_com.json", 'w') as f:
        json.dump(answer['part1']['comments'], f, indent=4)

    with open(f"output/PaperD/EQE_{output['year']}_{output['exam_name']}_ans.json", 'w') as f:
        json.dump(answer['part1']['answers'], f, indent=4)



    for idx, a in enumerate(answer['part1']['answers']):
        a_no = a[1]
        answers = a[2]
        
        # print(f'answer nb : {a_no}')
        # find current question (by number)
        current_question = None
        q_idx = -1
        for idx, q in enumerate(output['exercices']):
            if q['question_number'] == a_no:
                current_question = q
                q_idx = idx
                break
            
        assert current_question is not None
        assert q_idx != -1
        

        # remove any answre that is empty
        # for q in current_question['questions']:
        #     print(q)
        #     print("\n\n")
        current_question['questions'] = [q for q in current_question['questions'] if q['question_text'].strip() != '']
        
        # for k,v in answers:
        #     print(f"{k}: {v}")

        answers = [aa for aa in answers if aa['text'].strip() != '']
        for aa in answers:
            print(aa)
            print("\n\n")
    
        print(f'current_question: {current_question["question_number"]}')
        print(f'current answer: {a_no}')
        print(f'len of questions: {len(current_question["questions"])}')
        print(f'len of answers: {len(answers)}')
        print(answers[0])
        assert len(current_question['questions']) == len(answers)
        
        for idx, q in enumerate(current_question['questions']):
            q['answer'] = answers[idx]['text']
        
        current_question['examiner_note'] = answer['part1']['comments'][q_idx][2]
        output['exercices'][q_idx] = current_question
            

    with open(f"output/PaperD/EQE_{output['year']}_{output['exam_name']}_step2.json", 'w') as f:
        json.dump(output, f, indent=4)

    output['global_comment'] = answer['global_comment'].strip()


    with open(f"output/PaperD/EQE_{output['year']}_{output['exam_name']}_final_documentLess.json", 'w') as f:
        json.dump(output, f, indent=4)

    # with open(f"EQE_{output['year']}_{output['exam_name']}_q2.json", 'w') as f:
    #     json.dump(questions2, f, indent=4)
        
    # print('=====================\n\n')
    # print(questions2)
    # for q in questions2['questions']:
    #     print(q)
    # questions2 = sorted(questions2['questions'], key=lambda x: int(x['question_number']))

    # for idx, a in enumerate(answer['part2']['answers']):
    #     a_no = a[1]
    #     answers = a[2]
        
    #     print(f'answer nb : {a_no}')
    #     # find current question (by number)
    #     current_question = None
    #     q_idx = -1
    #     for idx, q in enumerate(questions2):
    #         if q['question_number'] == a_no:
    #             current_question = q
    #             q_idx = idx
    #             break
            
    #     assert current_question is not None
    #     assert q_idx != -1
    #     print('==============')

    #     # remove any answre that is empty
    #     for q in current_question['questions']:
    #         print(q)
    #         print("\n\n")
    #     current_question['questions'] = [q for q in current_question['questions'] if q['question_text'].strip() != '']
        
    #     print('---------------')
        
    #     for aa in answers:
    #         print(aa)
    #         print("\n\n")
    #     # for k,v in answers:
    #     #     print(f"{k}: {v}")

    #     answers = [aa for aa in answers if aa['text'].strip() != '']
    
    #     print(f'len of questions: {len(current_question["questions"])}')
    #     print(f'len of answers: {len(answers)}')
    #     assert len(current_question['questions']) == len(answers)

        
    #     for idx, q in enumerate(current_question['questions']):
    #         q['answer'] = answers[idx]['text']
        
    #     current_question['examiner_note'] = answer['part1']['comments'][q_idx][2]
    #     questions2[q_idx] = current_question
    

    # with open(f"EQE_{output['year']}_{output['exam_name']}_step3.json", 'w') as f:
    #     json.dump(questions2, f, indent=4)

for pdfs in list_pdfs:
    print('---------------------------------\n\n')
    print(pdfs)
    print('---------------------------------\n\n')
    extract_data(pdfs)

---------------------------------


('../data/PaperD/answer/2022_PaperD_answers_EN.pdf', '../data/PaperD/question/2022_PaperD1-1_questions_EN.pdf', '../data/PaperD/question/2022_PaperD1-2_questions_EN.pdf', '../data/PaperD/question/2022_PaperD2_questions_EN.pdf')
---------------------------------


===-----====---

[752 796]
===-----====---

[946]
[{'question_number': 1, 'question_type': 'OpenQuestion', 'context': 'European patent application EP-F was filed in January 2017 by applicant F. The EPO European patent application EP-F was filed in January 2017 by applicant F. The EPO issued a communication under Article 94(3) EPC dated 1 September 2020 and which sets a time limit of four months. Thereafter no acts, including the payment of fees, were performed with respect to EP-F. Since applicant F did not reply to the communication, the EPO issued a communication dated 4 March 2021 notifying a loss of rights under Rule 112(1) EPC.Despite exercising all due care required by the circumstance