In [None]:
import json
from typing import List, Literal, Tuple

import pymupdf

from parser.elements.rectangle import Rectangle

# Tuple layout used for text items in this notebook: four bounding-box
# coordinates (x0, y0, x1, y1), the item's text, then three trailing numeric
# fields. The same alias is used to annotate the result of
# page.get_text("words") further down.
PyMuPDFRect = Tuple[float, float, float, float, str, float, float, float]

# Labels accepted by mark_rect(); the label selects the fill colour of the
# rectangle annotation that gets drawn.
InputAreaKind = Literal[
    "underscores",
    "code_underscores",
    "textarea",
    "code_textarea",
    "entire_function_area",
]


def rect_to_bbox(rect: PyMuPDFRect) -> Tuple[float, float, float, float]:
    """Return only the bounding-box part ``(x0, y0, x1, y1)`` of a text tuple.

    Args:
        rect: Indexable item whose first four entries are the box coordinates.

    Returns:
        A plain 4-tuple with those coordinates, in the same order.
    """
    left, top = rect[0], rect[1]
    right, bottom = rect[2], rect[3]
    return (left, top, right, bottom)


def mark_rect(page: pymupdf.Page, rect: PyMuPDFRect | Rectangle, kind: InputAreaKind):
    """Draw a dashed, half-transparent rectangle annotation on ``page``.

    Args:
        page: Page that receives the annotation.
        rect: Area to mark, either a ``Rectangle`` or a text tuple whose first
            four entries are the box coordinates.
        kind: Label of the area; it selects the fill colour.

    Raises:
        ValueError: If ``kind`` is not one of the known labels. The check runs
            before the page is touched, so a bad label leaves no annotation
            behind.
    """
    # Fill colour per label. Some labels intentionally share a colour, exactly
    # as in the previous if/elif chain.
    fills = {
        "underscores": (153 / 255, 240 / 255, 234 / 255),
        "textarea": (234 / 255, 240 / 255, 153 / 255),
        "code_textarea": (234 / 255, 153 / 255, 240 / 255),
        "code_underscores": (153 / 255, 240 / 255, 234 / 255),
        "entire_function_area": (234 / 255, 153 / 255, 240 / 255),
    }
    try:
        fill = fills[kind]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which previously also ended up
        # in the ValueError branch.
        raise ValueError(f"Invalid kind: {kind}") from None

    if isinstance(rect, Rectangle):
        bbox = rect.as_tuple()
    else:
        bbox = rect_to_bbox(rect)

    blue = (0, 0, 1)
    annot = page.add_rect_annot(bbox)
    annot.set_border(width=1, dashes=[1, 2])
    annot.set_colors(stroke=blue, fill=fill)
    annot.update(opacity=0.5)


def serialize_blocks_to_json(
    blocks: List[PyMuPDFRect],
) -> str:
    """Serialise text tuples to a JSON array of box/text records.

    Each record carries the four box coordinates, the text, and the box's
    width and height. Width and height are derived from the coordinates: the
    tuple slots after the text hold PyMuPDF bookkeeping numbers (block/line/
    word indices or block type), not dimensions, so they must not be reported
    under those keys.

    Args:
        blocks: Items laid out as ``(x0, y0, x1, y1, text, ...)``.

    Returns:
        The JSON text, indented by four spaces.
    """
    blocks_list = [
        {
            "x0": block[0],
            "y0": block[1],
            "x1": block[2],
            "y1": block[3],
            "text": block[4],
            "width": block[2] - block[0],
            "height": block[3] - block[1],
        }
        for block in blocks
    ]
    return json.dumps(blocks_list, indent=4)

In [None]:
import os
from typing import List, Tuple


def load_exam_and_extract_text_words(filepath: str):
    """Mark every text block of a PDF and dump the blocks of each page to JSON.

    For each page, the text blocks are outlined with ``mark_rect`` and written
    to ``page-<index>-blocks.json`` in the current directory. The annotated
    document is saved as ``marked-<original file name>``, also in the current
    directory.

    Args:
        filepath: Path of the PDF to process.
    """
    # The context manager closes the document even if a page fails.
    with pymupdf.open(filepath) as doc:
        for i, page in enumerate(doc):  # scan through the pages
            blocks: List[Tuple[float, float, float, float, str, float, float]] = (
                page.get_text("blocks")
            )

            for block in blocks:
                # mark_rect() requires a kind; the call used to omit it and
                # failed with TypeError on the first block.
                # NOTE(review): "textarea" is used as the generic label for
                # plain text blocks -- confirm the preferred colour.
                mark_rect(page, block, "textarea")

            json_data = serialize_blocks_to_json(blocks)
            with open(f"page-{i}-blocks.json", "w") as json_file:
                json_file.write(json_data)

        doc.save("marked-" + os.path.basename(filepath))


# Run the block-level marking pass on one sample exam (relative path).
load_exam_and_extract_text_words("../fe_files/exams/FE-Jan24.pdf")

In [None]:
import json  # noqa: F811
import os  # noqa: F811
from typing import Literal

import pymupdf  # noqa: F811
import regex
from pydantic import BaseModel

from parser.dataset.dataloader import DataLoader
from parser.elements.rectangle import (
    Rectangle,
    build_connectivity_graph,
    find_connected_components,
)
from parser.question_extraction import FUNCTION_EXTRACTION_PATTERN

# Module-level list of rectangles; nothing in the visible cells reads or
# appends to it. NOTE(review): possibly a leftover -- confirm before removing.
MARKED_RECTS: List[Rectangle] = []


class SeparatedRects(BaseModel):
    """Result of splitting a text item into its underscore run and the rest.

    Either field is ``None`` when that part was not produced.
    """

    # Box and text of the underscore run, in PyMuPDFRect layout.
    underscores: PyMuPDFRect | None
    # Box and text of what is left once the underscore run is removed.
    remaining: PyMuPDFRect | None


def separate_underscores_from_text(
    page: pymupdf.Page, rect: Rectangle, text: str
) -> SeparatedRects:
    """Locate the longest underscore run of ``text`` and the text around it.

    A run only counts as a blank when it is at least four underscores long;
    shorter runs yield a result with both fields ``None``. Otherwise the run
    and the text with that run removed are each searched on the page, limited
    to ``rect``, and the first hit of each search is reported.

    Args:
        page: Page to search on.
        rect: Area the searches are clipped to.
        text: Text of the item being examined.

    Returns:
        A ``SeparatedRects`` whose fields are ``None`` for anything not found.
    """
    # Length of the longest unbroken run of "_".
    best_run = 0
    run = 0
    for ch in text:
        run = run + 1 if ch == "_" else 0
        if run > best_run:
            best_run = run

    if best_run < 4:
        return SeparatedRects(underscores=None, remaining=None)

    blank = "_" * best_run
    leftover_text = text.replace(blank, "")

    def first_hit(needle: str):
        """Return the first search hit for ``needle`` as a text tuple, or None."""
        hits = page.search_for(needle, clip=rect.as_tuple())
        if not hits:
            return None
        top = hits[0]
        # The three trailing slots are placeholders; only box and text matter.
        return (top[0], top[1], top[2], top[3], needle, 0, 0, 0)

    remaining = first_hit(leftover_text)
    underscores = first_hit(blank)

    return SeparatedRects(underscores=underscores, remaining=remaining)


def merge_rectangles(
    rectangles: List[Rectangle], max_distance: float = 2000000
) -> Rectangle:
    """Group nearby rectangles, merge each group, and return the largest result.

    Rectangles are linked through ``build_connectivity_graph`` and grouped
    with ``find_connected_components``. Every group is folded into one
    rectangle via ``Rectangle.merge_with``. Despite the plural name, only the
    merged rectangle with the greatest ``area()`` is returned.

    Args:
        rectangles: Rectangles to merge; must not be empty.
        max_distance: Distance threshold handed to
            ``build_connectivity_graph``. The default keeps the previously
            hard-coded value. NOTE(review): that value looks large enough to
            link everything on a page -- confirm against the graph builder.

    Returns:
        The largest merged rectangle.

    Raises:
        ValueError: If ``rectangles`` is empty. This used to surface as an
            opaque ``max()`` error.
    """
    if not rectangles:
        raise ValueError("merge_rectangles() needs at least one rectangle")

    # Index the rectangles so graph nodes can refer to them by number.
    rect_indices = dict(enumerate(rectangles))

    graph = build_connectivity_graph(rect_indices, max_distance)
    components = find_connected_components(graph, rect_indices)

    merged_rectangles: List[Rectangle] = []
    for component in components:
        merged_rect = rect_indices[component[0]]
        for idx in component[1:]:
            merged_rect = merged_rect.merge_with(rect_indices[idx])
        merged_rectangles.append(merged_rect)

    return max(merged_rectangles, key=lambda r: r.area())


class PageBlocks(BaseModel):
    """Text items of one page together with their dominant merged area."""

    page_number: int
    blocks: List[PyMuPDFRect]
    merged: Rectangle
    merged_text: str

    @classmethod
    def from_page_bboxes(
        cls, page_bboxes: List[PyMuPDFRect], pymypdfPage: pymupdf.Page
    ):
        """Build the model from a page's text tuples.

        Args:
            page_bboxes: Text tuples whose first four entries are box
                coordinates.
            pymypdfPage: Page the tuples were taken from; used to read the
                text inside the merged area and the page number.

        Returns:
            A ``PageBlocks`` whose ``merged`` is the result of
            ``merge_rectangles`` over all boxes.
        """
        word_rects: List[Rectangle] = []
        for entry in page_bboxes:
            word_rects.append(Rectangle.from_points(*entry[0:4]))

        dominant: Rectangle = merge_rectangles(word_rects)
        text_inside = pymypdfPage.get_text(clip=dominant.as_tuple())

        return cls(
            page_number=pymypdfPage.number,
            blocks=page_bboxes,
            merged=dominant,
            merged_text=text_inside,
        )


class CodeInputSubArea(BaseModel):
    """One place inside a code listing where an answer is expected."""

    # How the answer is expected to be given. In the visible cells only
    # "free_response" and "partially_filled_free_response" are ever assigned.
    kind: Literal[
        "fill_in_the_blank", "free_response", "partially_filled_free_response"
    ]
    # Area on the page reserved for the answer, if one was located.
    code_textarea: Rectangle | None
    # Text associated with the sub-area.
    text: str


class CodeArea(BaseModel):
    """A function listing found on a page plus the answer areas inside it."""

    # Area covering the whole matched function text.
    rect: Rectangle
    # Answer areas detected inside that function.
    sub_areas: List[CodeInputSubArea]
    # The matched function text.
    text: str


def filter_empty_lines(text: str) -> str:
    """Return ``text`` with every space and newline character removed.

    Note that this strips those characters everywhere, not only on blank
    lines; other whitespace such as tabs is kept.
    """
    kept = [ch for ch in text if ch != " " and ch != "\n"]
    return "".join(kept)


# Matches a brace pair whose content is only whitespace, `//` comments, and
# empty `for (...) {}` / `while (...) {}` loops.
#
# The pattern is compiled with VERBOSE, so whitespace and `# ...` text inside
# the string are ignored. Two consequences worth knowing before editing:
#   * the lines starting with `#int` and `#return` are comments, i.e. those
#     alternatives are switched off;
#   * the `|` that follows the `while` alternative is therefore the last token
#     of the alternation and leaves an empty alternative, which is what lets
#     braces containing only whitespace match.
# All groups are non-capturing, so findall() yields whole-match strings.
EMPTY_BRANCH_PATTERN = regex.compile(
    r"""
        \{                          # Opening brace
        (?:
            \s*                     # Optional leading whitespace
            (?:
                //.*(?:\n|$)        | # Single-line comment
                for\s*\([^)]*\)\s*\{\s*\}\s*(?:\n|$)  | # Empty for loop
                while\s*\([^)]*\)\s*\{\s*\}\s*(?:\n|$) | # Empty while loop
                #int\s+\w+\s*=\s*\d+;\s*(?:\n|$)        | # Integer declaration
                #return\s*[^;]*;\s*(?:\n|$)             # Return statement
            )
        )*                           # Repeat for multiple lines
        \}                          # Closing brace
        """,
    regex.VERBOSE | regex.MULTILINE,
)


def detect_empty_branches(code: str) -> List[str]:
    """Return every substring of ``code`` that EMPTY_BRANCH_PATTERN matches.

    The input, the list of hits, and each individual hit are printed for
    debugging.

    Args:
        code: Source text to scan.

    Returns:
        The matched substrings, in the order ``findall`` reports them.
    """
    print(f"code={code}")

    found = EMPTY_BRANCH_PATTERN.findall(code)

    print(f"matches={found}")
    for hit in found:
        print(f"match={hit}")

    return found


def _locate_function_rect(
    page: pymupdf.Page, raw: str, page_blocks: PageBlocks
) -> Rectangle:
    """Merge every search hit for ``raw`` inside the page's merged area into one rectangle."""
    return merge_rectangles(
        [
            Rectangle.from_points(*x[0:4])
            for x in page.search_for(raw, clip=page_blocks.merged.as_tuple())
        ]
    )


def extract_code_inputs(page: pymupdf.Page, page_blocks: PageBlocks) -> List[CodeArea]:
    """Find function listings on a page and the answer areas inside them.

    Every match of ``FUNCTION_EXTRACTION_PATTERN`` in the page's merged text
    is examined with two strategies:

    1. Empty branches: each brace pair reported by ``detect_empty_branches``
       for the function body becomes a ``"free_response"`` sub-area.
    2. Fallback, used only when strategy 1 produced nothing: the last two
       lines of the matched text are located on the page, and if the vertical
       gap between them exceeds 200 the gap becomes a
       ``"partially_filled_free_response"`` sub-area.

    Functions without any sub-area are dropped. Progress is printed for
    debugging.

    Args:
        page: Page to search on.
        page_blocks: Pre-computed merged area and text of that page.

    Returns:
        One ``CodeArea`` per function that has at least one sub-area.

    Raises:
        ValueError: If an empty branch cannot be located on the page, or the
            function text itself yields no search hits (``merge_rectangles``
            cannot work on an empty list).
    """
    # Find all matches
    matches = FUNCTION_EXTRACTION_PATTERN.finditer(page_blocks.merged_text)

    print(f"matches={matches}")
    code_areas: List[CodeArea] = []

    print(f"page_number={page_blocks.page_number}")
    for match in matches:
        code_sub_areas: List[CodeInputSubArea] = []

        raw = match.group(0)
        print(f"raw={raw}")
        print(f"match={match}")
        function_name = match.group("function_name")
        print(f"parsing function_name='{function_name}'")
        body = match.group("body")

        # Strategy 1: brace pairs with nothing (or only comments) inside.
        empty_branches: List[str] = detect_empty_branches(body)
        if len(empty_branches) > 0:
            # Restrict branch searches to the function's own area.
            parent_rect = _locate_function_rect(page, raw, page_blocks)

            print(f"empty_branches={empty_branches}")
            for branch in empty_branches:
                code_bboxes = page.search_for(branch, clip=parent_rect.as_tuple())
                if not code_bboxes:
                    raise ValueError(
                        f"No code bbox found for body='{body}', branch={branch}"
                    )

                code_textarea_rect = merge_rectangles(
                    [Rectangle.from_points(*x[0:4]) for x in code_bboxes]
                )

                code_sub_areas.append(
                    CodeInputSubArea(
                        kind="free_response",
                        code_textarea=code_textarea_rect,
                        text=body,
                    )
                )

        # Strategy 2: look for a large vertical gap before the last line.
        if len(code_sub_areas) == 0:
            # Fallback to use distance-based approach... very ugly
            print(
                f"No code areas found for page {page_blocks.page_number}. Reverting to distance-based approach."
            )

            # Find second to last line and last line of input code
            code_lines = raw.split("\n")

            print(f"code_lines={code_lines}")
            if len(code_lines) < 2:
                print(
                    f"Aborting distance-based approach, only found {len(code_lines)} lines of code. Need 2."
                )
                continue
            second_to_last_line = code_lines[-2]
            last_line = code_lines[-1]

            # Find locations of second to last line and last line
            second_to_last_line_bbox = page.search_for(second_to_last_line)
            last_line_bbox = page.search_for(last_line)

            # search_for() reports "not found" as an empty list, so test for
            # emptiness; a None comparison would let the [0] below blow up.
            if not second_to_last_line_bbox:
                print(
                    f"Aborting distance-based approach, could not find locations of second to last line='{second_to_last_line}'."
                )
                continue

            if not last_line_bbox:
                print(
                    f"Aborting distance-based approach, could not find locations of last line='{last_line}'."
                )
                continue

            # Ambiguous locations make the gap measurement meaningless.
            if len(second_to_last_line_bbox) > 1 or len(last_line_bbox) > 1:
                print(
                    "Aborting distance-based approach, found more than one location for second to last line and last line."
                )
                continue

            second_to_last_line_rect = Rectangle.from_points(
                *second_to_last_line_bbox[0][0:4]
            )
            last_line_rect = Rectangle.from_points(*last_line_bbox[0][0:4])

            # Find vertical distance between second to last line and last line
            vertical_distance = last_line_rect.y0 - second_to_last_line_rect.y1

            print(f"vertical_distance={vertical_distance}")
            # NOTE(review): 200 is in page coordinate units (presumably
            # points) -- confirm the threshold.
            if vertical_distance > 200:
                print(
                    "Found candidate for partially filled in code area question. Creating Question now."
                )

                merged_code_area = _locate_function_rect(page, raw, page_blocks)

                # The answer area spans the function's width and the gap
                # between the two lines.
                code_area_bbox = Rectangle.from_points(
                    merged_code_area.x0,
                    second_to_last_line_rect.y1,
                    merged_code_area.x1,
                    last_line_rect.y0,
                )

                # Find text
                code_area_text = page.get_text(clip=code_area_bbox.as_tuple())

                code_sub_areas.append(
                    CodeInputSubArea(
                        kind="partially_filled_free_response",
                        code_textarea=code_area_bbox,
                        text=code_area_text,
                    )
                )

        if len(code_sub_areas) > 0:
            merged_code_area = _locate_function_rect(page, raw, page_blocks)

            code_area = CodeArea(
                rect=merged_code_area, text=raw, sub_areas=code_sub_areas
            )
            code_areas.append(code_area)

    return code_areas


def is_code_page(
    page: pymupdf.Page, page_blocks: PageBlocks
) -> Tuple[bool, List[CodeArea] | None]:
    """Report whether a page contains code answer areas, and return them.

    Args:
        page: Page to inspect.
        page_blocks: Pre-computed merged area and text of that page.

    Returns:
        ``(True, areas)`` when ``extract_code_inputs`` found at least one code
        area, otherwise ``(False, None)``. The second element is ``None``, not
        an empty list, in the negative case.
    """
    code: List[CodeArea] = extract_code_inputs(page, page_blocks)
    if len(code) > 0:
        return True, code
    return False, None


def load_exam_and_extract_text_words(filepath: str):
    """Annotate the answer areas of every page of an exam PDF.

    Per page: words containing a run of four or more underscores are queued.
    If the page contains code areas, each one is outlined, queued underscores
    lying inside it are marked and taken off the queue, the code answer areas
    are marked, and the areas are written to ``page-<index>-code_areas.json``.
    Underscores still queued afterwards are marked last. The annotated
    document is saved as ``marked-<original file name>`` in the current
    directory.

    This definition shares its name with the function in an earlier cell and
    replaces it once this cell has run.

    Args:
        filepath: Path of the PDF to process.
    """
    # The context manager closes the document even if a page fails.
    with pymupdf.open(filepath) as doc:
        for i, page in enumerate(doc):  # scan through the pages
            blocks: List[PyMuPDFRect] = page.get_text("words")

            if not blocks:
                # A page without words has nothing to mark, and the merge
                # step cannot work on an empty list.
                continue

            page_blocks = PageBlocks.from_page_bboxes(blocks, page)

            # Holds text tuples (not Rectangle objects) of underscore runs.
            underscore_marking_queue: List[PyMuPDFRect] = []

            for block in page_blocks.blocks:
                rectangle = Rectangle.from_points(*block[0:4])
                rect_text = block[4]
                separated = separate_underscores_from_text(page, rectangle, rect_text)
                if separated.underscores is not None:
                    underscore_marking_queue.append(separated.underscores)

            is_code, code_areas = is_code_page(page, page_blocks)
            print(f"is_code={is_code}")

            if is_code:
                assert code_areas is not None
                for code_area in code_areas:
                    assert len(code_area.sub_areas) > 0
                    print(f"underscore_marking_queue={underscore_marking_queue}")

                    mark_rect(page, code_area.rect, "entire_function_area")

                    # Build the remaining queue separately: removing items
                    # from a list while iterating over it skips the element
                    # that follows each removal.
                    still_queued: List[PyMuPDFRect] = []
                    for underscore in underscore_marking_queue:
                        underscore_rect = Rectangle.from_points(
                            *rect_to_bbox(underscore)
                        )
                        if underscore_rect.is_within(code_area.rect):
                            mark_rect(page, underscore_rect, "code_underscores")
                            print(f"Removed underscore from queue: {underscore}")
                        else:
                            still_queued.append(underscore)
                    underscore_marking_queue = still_queued

                    for area in code_area.sub_areas:
                        # code_textarea is optional on the model.
                        if area.code_textarea is None:
                            continue
                        mark_rect(page, area.code_textarea, "code_textarea")

                with open(f"page-{i}-code_areas.json", "w") as code_file:
                    json.dump(
                        [code_area.model_dump() for code_area in code_areas],
                        code_file,
                        indent=4,
                    )

            # NOTE(review): these are the underscores outside any code area,
            # yet they are labelled "code_underscores"; "underscores" may
            # have been intended -- confirm.
            for underscore in underscore_marking_queue:
                mark_rect(
                    page,
                    Rectangle.from_points(*rect_to_bbox(underscore)),
                    "code_underscores",
                )

        doc.save("marked-" + os.path.basename(filepath))


# Driver: with LOAD_ALL set, annotate every exam path the DataLoader reports;
# otherwise annotate a single hard-coded exam.
LOAD_ALL = True
if LOAD_ALL:
    loader = DataLoader("../fe_files/exams/", "../fe_files/solutions/")
    exam_paths = loader.get_exam_paths()

    for exam_path in exam_paths:
        load_exam_and_extract_text_words(exam_path)
else:
    load_exam_and_extract_text_words("../fe_files/exams/FE-Sept20.pdf")

In [None]:
# Scratch copy of the empty-branch expression for trying it against the sample
# inputs below. Compiled with VERBOSE, so whitespace and `# ...` text inside
# the string are ignored: the `#int` and `#return` lines are disabled, and the
# `|` after the `while` alternative leaves an empty alternative that lets
# whitespace-only braces match.
pattern = regex.compile(
    r"""
    \{                          # Opening brace
    (?:
        \s*                     # Optional leading whitespace
        (?:
            //.*(?:\n|$)        | # Single-line comment
            for\s*\([^)]*\)\s*\{\s*\}\s*(?:\n|$)  | # Empty for loop
            while\s*\([^)]*\)\s*\{\s*\}\s*(?:\n|$) | # Empty while loop
            #int\s+\w+\s*=\s*\d+;\s*(?:\n|$)        | # Integer declaration
            #return\s*[^;]*;\s*(?:\n|$)             # Return statement
        )
    )*                           # Repeat for multiple lines
    \}                          # Closing brace
    """,
    regex.VERBOSE | regex.MULTILINE,
)


# Sample texts for exercising `pattern`: brace pairs holding only a comment
# and a varying number of newlines, an empty brace pair, a function with one
# empty for loop, and a fully written function body.
inputs = [
    "{ //complete this function  }",
    "{ //complete this function \n}",
    "{ //complete this function \n\n}",
    "{ //complete this function \n\n\n}",
    "{ //complete this function \n\n\n\n}",
    "{ }",
    "int kClosePerm(int* perm, int* used, int n, int maxgap, int k) {\nif (n == k)  \n ______________________ ; \n int res = 0; \n for (int i=0; i<n; i++) { \n} \n return res; \n}",  # should match ONLY the text within the empty for loop
    """
    { 
  int i, len = strlen(str); 
  char *new_string = malloc(sizeof(char) * (len + 1)); 
  Stack s1, s2; 
  init(&s1); // initializes stack s1 to be empty 
  init(&s2); // initializes stack s2 to be empty 
  for (i = 0; i < len; i++) { 
     push(&s1, str[i]);  // this pushes onto stack s1 
     push(&s2, str[i]);  // this pushes onto stack s2 
  } 
  for (i = 0; i < len; i++) { 
     if (i % 2 == 0) { 
        // Note: pop() returns the character being removed from the stack. 
        if (!isEmpty(&s1)) 
           new_string[i] = pop(&s1); 
        if (!isEmpty(&s1)) 
           push(&s2, pop(&s1)); 
     } 
     else { 
        pop(&s2); 
        new_string[i] = pop(&s2); 
     } 
  } 
  new_string[len] = '\0'; 
  printf("%s\n", new_string); 
  free(new_string); 
}""",
]

# Print every match object the pattern yields for each sample text.
# The loop variable is not called `input`, so the builtin of that name stays
# usable in the kernel after this cell has run.
for sample in inputs:
    matches = pattern.finditer(sample)
    for match in matches:
        print(match)