In [None]:
import json
from typing import List

import pymupdf

from parser.elements.rectangle import Rectangle

In [None]:
import os

from parser.elements.pymupdf_integ import load_exam_and_extract_text_words

# Single-exam run using the implementation imported from
# parser.elements.pymupdf_integ in this cell. A later cell defines a local
# function with the same name, which replaces this binding once it is executed.
load_exam_and_extract_text_words("../fe_files/exams/FE-Jan24.pdf")

In [None]:
import json  # noqa: F811
import os  # noqa: F811

import pymupdf  # type: ignore # noqa: F811
import regex  # type: ignore

from parser.dataset.dataloader import DataLoader
from parser.elements.fill_in_the_blank import separate_underscores_from_text
from parser.elements.page import ExamResult, PageBlocks, PageResult, is_code_page
from parser.elements.pymupdf_integ import PyMuPDFRect, mark_rect, rect_to_bbox

# Module-level list of rectangles, initially empty. Nothing in the visible
# cells reads from or appends to it.
# NOTE(review): confirm whether this is still needed.
MARKED_RECTS: List[Rectangle] = []


def load_exam_and_extract_text_words(
    filepath: str, write_files: bool = False
) -> ExamResult:
    """Scan every page of an exam PDF, mark detected regions, and collect them.

    For each page the "words" text extraction is wrapped in ``PageBlocks``.
    Every block is passed to ``separate_underscores_from_text`` and any
    non-``None`` ``underscores`` result is queued. ``is_code_page`` then
    reports whether the page contains code areas. Code areas, their sub-area
    text regions, and all queued underscores are drawn onto the page with
    ``mark_rect``. Underscores that lie within a code area are dropped from
    the queue, so they do not appear in the page's ``fill_in_the_blank_areas``.

    Progress information is printed to stdout for every page.

    Args:
        filepath: Path of the PDF document to open.
        write_files: When true, write ``page-<i>-code_areas.json`` for every
            code page and save the annotated document as
            ``marked-<basename of filepath>``, both in the current working
            directory. The per-page file names do not include the exam name,
            so processing several exams in one directory overwrites them.

    Returns:
        An ``ExamResult`` holding one ``PageResult`` per page, in page order.
    """
    page_results: List[PageResult] = []

    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(filepath) as doc:
        for i, page in enumerate(doc):  # scan through the pages
            blocks: List[PyMuPDFRect] = page.get_text("words")

            page_blocks = PageBlocks.from_page_bboxes(blocks, page)

            underscore_marking_queue: List[Rectangle] = []

            for block in page_blocks.blocks:
                # The first four items are the bounding box, the fifth is the text.
                rectangle = Rectangle.from_points(*block[0:4])
                rect_text = block[4]
                separated = separate_underscores_from_text(page, rectangle, rect_text)
                if separated.underscores is not None:
                    underscore_marking_queue.append(separated.underscores)

            is_code, code_areas = is_code_page(page, page_blocks)
            print(f"is_code={is_code}")

            if is_code:
                assert code_areas is not None
                for code_area in code_areas:
                    assert len(code_area.sub_areas) > 0
                    print(f"underscore_marking_queue={underscore_marking_queue}")

                    mark_rect(page, code_area.rect, "entire_function_area")

                    # Build the surviving queue in a new list. Removing items
                    # from the list being iterated skips the element that
                    # follows each removal, leaving code underscores behind.
                    outside_code_area: List[Rectangle] = []
                    for underscore in underscore_marking_queue:
                        underscore_rect = Rectangle.from_points(
                            *rect_to_bbox(underscore)
                        )
                        if underscore_rect.is_within(code_area.rect):
                            mark_rect(page, underscore_rect, "code_underscores")
                            print(f"Removed underscore from queue: {underscore}")
                        else:
                            outside_code_area.append(underscore)
                    underscore_marking_queue = outside_code_area

                    for area in code_area.sub_areas:
                        mark_rect(page, area.code_textarea, "code_textarea")

                if write_files:
                    with open(f"page-{i}-code_areas.json", "w") as code_file:
                        json.dump(
                            [code_area.model_dump() for code_area in code_areas],
                            code_file,
                            indent=4,
                        )

            # Whatever is still queued lies outside every code area of this page.
            for underscore in underscore_marking_queue:
                mark_rect(
                    page,
                    Rectangle.from_points(*rect_to_bbox(underscore)),
                    "code_underscores",
                )

            page_results.append(
                PageResult(
                    page_number=i,
                    code_areas=code_areas if code_areas is not None else [],
                    fill_in_the_blank_areas=[
                        Rectangle.from_points(*rect_to_bbox(underscore))
                        for underscore in underscore_marking_queue
                    ],
                )
            )

        if write_files:
            doc.save("marked-" + os.path.basename(filepath))

    return ExamResult(page_results=page_results)

In [None]:
# Switch between processing every exam known to the DataLoader and a single
# hard-coded exam file. Annotated output files are written in both cases.
LOAD_ALL = True
if not LOAD_ALL:
    load_exam_and_extract_text_words("../fe_files/exams/FE-Jan20.pdf", write_files=True)
else:
    loader = DataLoader("../fe_files/exams/", "../fe_files/solutions/")
    exam_paths = loader.get_exam_paths()

    for exam_path in exam_paths:
        load_exam_and_extract_text_words(exam_path, write_files=True)

In [None]:
from typing import Dict

from parser.elements.code import CodeArea


def get_all_exam_results() -> Dict[str, ExamResult]:
    """Process every exam path the DataLoader reports and key results by path.

    Each exam is run through ``load_exam_and_extract_text_words`` with
    ``write_files=True``, so output files are written as a side effect.

    Returns:
        A dict mapping each exam path to the ``ExamResult`` produced for it.
    """
    exam_loader = DataLoader("../fe_files/exams/", "../fe_files/solutions/")
    return {
        path: load_exam_and_extract_text_words(path, write_files=True)
        for path in exam_loader.get_exam_paths()
    }


def serialize_all_code_areas(
    exam_results: Dict[str, ExamResult], exam_path: str
) -> None:
    """Write one exam's code areas to ``test_data/<exam name>-code_areas.json``.

    The JSON object maps each page number to the list of ``model_dump()``
    results of that page's code areas. ``json.dump`` writes the integer page
    numbers as string keys.

    Args:
        exam_results: Results keyed by exam path.
        exam_path: Key to look up in ``exam_results``. Its base name, minus a
            trailing ``.pdf``, becomes the output file's name prefix.

    Raises:
        KeyError: If ``exam_path`` is not a key of ``exam_results``.
    """
    exam_result = exam_results[exam_path]

    code_areas: Dict[int, List[CodeArea]] = {
        page_result.page_number: page_result.code_areas
        for page_result in exam_result.page_results
    }

    # Strip only a trailing ".pdf". A plain str.replace would also delete the
    # substring from the middle of the name (e.g. "FE.pdf-scan.pdf").
    exam_name = os.path.basename(exam_path)
    if exam_name.endswith(".pdf"):
        exam_name = exam_name[: -len(".pdf")]

    os.makedirs("test_data", exist_ok=True)
    out_path = os.path.join("test_data", f"{exam_name}-code_areas.json")
    with open(out_path, "w") as code_file:
        json.dump(
            {
                page_number: [area.model_dump() for area in areas]
                for page_number, areas in code_areas.items()
            },
            code_file,
            indent=4,
        )


# Run the extraction over every exam once, then serialize the code areas of
# each exam in turn.
exam_results = get_all_exam_results()
for exam_path in exam_results:
    serialize_all_code_areas(exam_results, exam_path)

In [None]:
# Matches a brace pair whose body contains only whitespace, `//` comments, and
# empty `for`/`while` loops. The trailing `|` inside the inner group is an
# empty alternative; it is what allows whitespace-only bodies such as "{ }"
# to match.
pattern = regex.compile(
    r"""
        \{                          # Opening brace
        (?:
            \s*                     # Optional leading whitespace
            (?:
                //.*(?:\n|$)        | # Single-line comment
                for\s*\([^)]*\)\s*\{\s*\}\s*(?:\n|$)  | # Empty for loop
                while\s*\([^)]*\)\s*\{\s*\}\s*(?:\n|$) | # Empty while loop
            )
        )*                           # Repeat for multiple lines
        \}                          # Closing brace
        """,
    regex.VERBOSE | regex.MULTILINE,
)

inputs = [
    "{ //complete this function \n}",
    "{ //complete this function \n\n}",
    "{ //complete this function \n\n\n}",
    "{ //complete this function \n\n\n\n}",
    "{ }",
    "int kClosePerm(int* perm, int* used, int n, int maxgap, int k) {\nif (n == k)  \n ______________________ ; \n int res = 0; \n for (int i=0; i<n; i++) { \n} \n return res; \n}",  # should match ONLY the text within the empty for loop
]

# The loop variable is not called `input`, which would shadow the builtin for
# the rest of the kernel session.
for sample_text in inputs:
    # Materialize the iterator so the header line shows the matches themselves
    # instead of the repr of a one-shot iterator object.
    matches = list(pattern.finditer(sample_text))
    print(f"matches={matches}")
    for match in matches:
        print(match)

In [None]:
# Two spellings of the same C function header: pointer star attached to the
# return type versus attached to the function name.
inputs = [
    """node* deleteMe(node* head, node* me) { \n}""",
    """node *deleteMe(node* head, node* me) { \n}""",
]


# Captures a C-style function definition in the named groups return_type,
# function_name, arguments and body. The body group recurses into itself
# to handle nested braces.
my_pattern = regex.compile(
    r"""
    ^\s*                                   # Ensure the function definition starts at a line (after optional indentation)
    (?P<return_type>
        [A-Za-z_*\&\[\]]+                   # The first part of the return type (e.g., int, void, etc.), excluding digits
        (?:
            (?:\s+|/\*.*?\*/|//.*?$)+        # Allow whitespace or comments between return type parts
            [A-Za-z_*\&\[\]]+               # Additional part of the return type (e.g., static, inline)
        )*
    )
    \s*                                    # Some whitespace after the return type before the function name
    (?P<function_name>\w+)                 # The function name
    \s*                                    # Optional whitespace before the arguments
    \(
        (?P<arguments>
            (?:[^()]*|\((?:[^()]|\([^()]*\))*\))*
        )
    \)\s*
    (?P<body>
        \{
            (?:
                [^{}]+                     # Non-brace characters
                |
                (?&body)                   # Recursively match nested braces
            )*
        \}
    )
    """,
    regex.MULTILINE | regex.DOTALL | regex.VERBOSE,
)

# The loop variable is not called `input`, which would shadow the builtin for
# the rest of the kernel session.
for sample_text in inputs:
    # Materialize the iterator so the header line shows the matches themselves
    # instead of the repr of a one-shot iterator object.
    matches = list(my_pattern.finditer(sample_text))
    print(f"matches={matches}")
    for match in matches:
        print(match)