In [None]:
import regex

# Pattern for C-style function definitions.  Named groups:
#   return_type   - everything before the function name (qualifiers, type,
#                   pointer marks), always ending in whitespace
#   function_name - the identifier directly before the opening parenthesis
#   arguments     - the parameter list, with parentheses nested up to two
#                   levels deep
#   body          - a balanced {...} block, matched recursively via (?&body)
#
# The repetitions are written so that a failed attempt cannot be retried in
# exponentially many ways:
#   * return_type is one run of type characters followed by a whitespace
#     character.  Because \s is itself in the character class, this accepts
#     exactly the same text as a repeated "token + whitespace" group, but
#     without nested quantifiers.
#   * arguments wraps its alternatives in an atomic group (?>...), the same
#     idiom the body group uses, so a run of non-parenthesis characters is
#     consumed once and never re-split.
pattern = regex.compile(
    r"""
    (?P<return_type>
        [\w\s\*\&\[\]]+\s
    )
    (?P<function_name>\w+)\s*
    \(
        (?P<arguments>
            (?>
                [^()]+
                |
                \((?:[^()]|\([^()]*\))*\)
            )*
        )
    \)\s*
    (?P<body>
        \{
            (?>
                [^{}]+
                |
                (?&body)
            )*
        \}
    )
    """,
    # Only VERBOSE changes how this pattern is read: it has no "." wildcard
    # and no "^" / "$" anchors, so DOTALL and MULTILINE are currently no-ops.
    regex.VERBOSE | regex.MULTILINE | regex.DOTALL,
)

# C snippets used to exercise `pattern`: plain functions, storage-class
# qualifiers, pointer return types, a function-pointer declarator, a
# multi-line parameter list, array parameters and nested braces.
code = """
int add(int a, int b) {
    return a + b;
}

void swap(int *a, int *b) {
    int temp = *a;
    *a = *b;
    *b = temp;
}

static inline int max(int a, int b) {
    return a > b ? a : b;
}

struct node* create_node(int data) {
    struct node* new_node = (struct node*)malloc(sizeof(struct node));
    new_node->data = data;
    new_node->next = NULL;
    return new_node;
}

void (*signal(int sig, void (*func)(int)))(int) {
    // function body
}

void complex_function(
    int a,
    char *b,
    double (*c)(int)
) {
    // code
}

void process_array(int arr[], size_t size) {
    for (size_t i = 0; i < size; ++i) {
        // process arr[i]
    }
}

void no_arguments(void) {
    // code
}

const char* const* get_strings(void) {
    static const char* strs[] = {"hello", "world", NULL};
    return strs;
}

int compute(int **matrix, int rows, int cols) {
    int sum = 0;
    for(int i = 0; i < rows; ++i) {
        for(int j = 0; j < cols; ++j) {
            sum += matrix[i][j];
        }
    }
    return sum;
}
"""

# Lazily iterate over every non-overlapping match in the sample
matches = pattern.finditer(code)


# Report the name, argument list and body of each matched definition,
# followed by a 50-dash separator line.
for match in matches:
    function_name, arguments, body = match.group(
        "function_name", "arguments", "body"
    )
    print(
        f"Function name: {function_name}",
        f"Arguments: {arguments.strip()}",
        f"Body:\n{body.strip()}",
        "-" * 50,
        sep="\n",
    )

In [None]:
from parser.dataset.dataloader import DataLoader
from parser.model import Semester

# Apply the function-definition pattern to the text of a real exam question.
data_loader = DataLoader("../fe_files/exams/", None)
data_loader.load_data()

exam = data_loader.get_exam(semester=Semester.FALL, year=2017)

# Last section of the exam, then the last question of that section.
section = exam.sections[-1]

# NOTE(review): elsewhere in this notebook `original_text` is populated with a
# `Text` object and read back through `.text`; confirm that a question's
# `original_text` is a plain string here, otherwise `finditer` needs `.text`.
text = section.questions[-1].original_text

matches = pattern.finditer(text)

for match in matches:
    function_name = match.group("function_name")
    arguments = match.group("arguments")
    body = match.group("body")
    print(f"Function name: {function_name}")
    print(f"Arguments: {arguments.strip()}")
    print(f"Body:\n{body.strip()}")
    print("-" * 50)

In [None]:
from typing import List

from parser.model import ExtractionType, Metadata, SubQuestion, Text

In [None]:
def _build_fill_in_the_blank_sub_question(
    text: str, start_index: int, end_index: int
) -> SubQuestion:
    """Wrap ``text[start_index:end_index]`` in a fill-in-the-blank SubQuestion.

    The slice is turned into a ``Text`` via
    ``Text.from_string(slice, text, start_index)`` (presumably: the slice, the
    text it was cut from, and its offset in that text -- confirm against
    ``Text.from_string``).  The same ``Text`` object is used for both
    ``original_text`` and ``filtered_text``; the identifier is empty, there
    are no points and no nested sub-questions.
    """
    question_text_obj = Text.from_string(
        text[start_index:end_index], text, start_index
    )
    return SubQuestion(
        identifier="",
        points=None,
        original_text=question_text_obj,
        filtered_text=question_text_obj,
        sub_questions=[],
        metadata=Metadata(extraction_type=ExtractionType.FILL_IN_THE_BLANKS),
    )


def extract_fill_in_the_blank_sub_questions(text: str) -> List[SubQuestion]:
    """Extract fill-in-the-blank sub-questions from ``text``.

    The text is scanned line by line (spaces are ignored when classifying a
    line):

    * A line containing ``:``, ``=`` or ``;`` opens a new candidate.  If a
      candidate was already open it is closed first.
    * While a candidate is open, empty lines and lines containing at least
      five consecutive underscores extend it; any other line closes it.
    * A candidate is emitted only if at least one of its lines (including the
      opening line) contained five consecutive underscores.

    Each emitted sub-question covers the text from the start of its opening
    line up to the start of the line that closed it (or the end of ``text``),
    so blank lines that trail the blanks are part of the sub-question.

    Args:
        text: The question text to scan.

    Returns:
        The extracted sub-questions in order of appearance; an empty list if
        nothing qualifies.
    """
    sub_questions: List[SubQuestion] = []

    # Offset in `text` where the open candidate starts; None while no
    # candidate is open.  Offset 0 is valid, hence the `is not None` checks.
    start_index = None
    # Whether the open candidate has shown a run of underscores yet.
    found_underscore = False
    # Offset in `text` of the line about to be processed.
    offset = 0

    # keepends=True keeps the line terminators, so summing the line lengths
    # yields exact offsets into `text`.
    for line in text.splitlines(keepends=True):
        line_start = offset
        offset += len(line)

        line_without_spaces = line.replace(" ", "")
        contains_underscore = "_____" in line_without_spaces
        starts_fill_in = any(
            marker in line_without_spaces for marker in (":", "=", ";")
        )

        if starts_fill_in:
            # A new prompt closes whatever candidate was open before it.
            if start_index is not None and found_underscore:
                sub_questions.append(
                    _build_fill_in_the_blank_sub_question(
                        text, start_index, line_start
                    )
                )
            start_index = line_start
            found_underscore = contains_underscore
        elif start_index is not None:
            if contains_underscore:
                found_underscore = True
            elif line.strip():
                # Ordinary content ends the candidate; keep it only if it
                # actually had a blank to fill in.
                if found_underscore:
                    sub_questions.append(
                        _build_fill_in_the_blank_sub_question(
                            text, start_index, line_start
                        )
                    )
                start_index = None
                found_underscore = False
            # Empty lines fall through: the candidate simply stays open.

    # A candidate still open at the end of the text runs to the very end.
    if start_index is not None and found_underscore:
        sub_questions.append(
            _build_fill_in_the_blank_sub_question(text, start_index, offset)
        )

    return sub_questions

In [None]:
# Input: two prompts with the blank on the same line, then a prompt whose
# underscore lines follow on separate lines with an empty line between them.
text = """
Partition Element Index: ______

Partition Element Value: ______

Reason it was the Partition Element:
__________________________________________________________________________________

__________________________________________________________________________________
"""

sub_questions = extract_fill_in_the_blank_sub_questions(text)
# One header, the extracted text and a 50-dash rule per sub-question.
for sq in sub_questions:
    print("Extracted SubQuestion:", sq.original_text.text, "-" * 50, sep="\n")

In [None]:
# Input: a single prompt separated from its underscore line by several
# empty lines.
text = """
Explain your reasoning:




_____
"""

sub_questions = extract_fill_in_the_blank_sub_questions(text)
# One header, the extracted text and a 50-dash rule per sub-question.
for sq in sub_questions:
    print("Extracted SubQuestion:", sq.original_text.text, "-" * 50, sep="\n")

In [None]:
# Input: bulleted prompts on consecutive lines (no empty lines between
# them), the last one followed by two underscore lines.
text = """
- Partition Element Index: ______
- Partition Element Value: ______
- Reason it was the Partition Element:
________________________________________________________________________________

________________________________________________________________________________
"""

sub_questions = extract_fill_in_the_blank_sub_questions(text)
# One header, the extracted text and a 50-dash rule per sub-question.
for sq in sub_questions:
    print("Extracted SubQuestion:", sq.original_text.text, "-" * 50, sep="\n")