In [1]:
from itertools import zip_longest


def _parse_answer_line(line: str) -> list:
    """Split one already-trimmed answer line into ``(answer, condition)`` pairs.

    Answers are the spans wrapped in ``[[`` ... ``]]``. Free text that follows
    an answer, up to the next ``[[`` on the same line, is that answer's
    condition; lone ``[`` / ``]`` characters inside that text are dropped.

    Returns:
        A list of 2-tuples in order of appearance. ``condition`` is ``None``
        when no text follows the answer before the next ``[[``. Text that
        precedes the first answer of the line has no answer to attach to and
        is reported as ``(None, text)``. Text after the last answer of the
        line is discarded.

    Raises:
        AssertionError: if a non-empty line does not start with ``[`` or, when
            longer than one character, does not end with ``]``.
    """
    if not line:
        return []
    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
    # AssertionError (and the messages) are kept so existing handlers still work.
    if line[0] != "[":
        raise AssertionError("Answer should start with [[")
    if len(line) > 1 and line[-1] != "]":
        raise AssertionError("Answer should end with ]]")

    pairs = []
    answer_chars = []
    condition_chars = []
    inside_answer = False
    # The first and last characters are the outer delimiters validated above,
    # so only the interior is scanned; idx - 1 and idx + 1 are always in range.
    for idx in range(1, len(line) - 1):
        char = line[idx]
        if char == "[" and line[idx - 1] == "[":
            # "[[" opens an answer; whatever text was collected since the
            # previous answer is that previous answer's condition.
            inside_answer = True
            if condition_chars:
                condition = "".join(condition_chars).strip()
                condition_chars.clear()
                if pairs:
                    pairs[-1] = (pairs[-1][0], condition)
                else:
                    pairs.append((None, condition))
        elif inside_answer and char == "]" and line[idx + 1] == "]":
            # "]]" closes the open answer. The ``inside_answer`` guard stops a
            # stray "]]" outside an answer from emitting an empty answer.
            inside_answer = False
            pairs.append(("".join(answer_chars), None))
            answer_chars.clear()
        elif inside_answer:
            answer_chars.append(char)
        elif char not in "[]":
            # Outside an answer and not a bracket: part of a condition.
            condition_chars.append(char)
    return pairs


def parse_response(text: str) -> tuple:
    """Parse a ``SOURCE-TEXT`` / ``ANSWER`` formatted response.

    The text is processed line by line. A line starting with ``SOURCE-TEXT``
    switches to source-text mode and a line starting with ``ANSWER`` switches
    to answer mode; the mode stays active for the following lines until the
    other header appears. Lines before the first header are ignored.

    Args:
        text: The raw response text.

    Returns:
        A 2-tuple ``(source_text, pairs)``:

        * ``source_text`` - all source-text lines concatenated *without* a
          separator (the ``SOURCE-TEXT`` / ``": "`` prefix removed), then
          stripped of surrounding whitespace.
        * ``pairs`` - a list of ``(answer, condition)`` tuples, one per
          ``[[...]]`` span found on the answer lines. ``condition`` is the
          text that follows that answer on the same line (up to the next
          ``[[``) or ``None``.

    Raises:
        AssertionError: if a non-empty answer line is not wrapped in brackets.
    """
    source_lines = []
    pairs = []
    section = None
    for line in text.split("\n"):
        if line.startswith("SOURCE-TEXT"):
            section = "source"
            line = line.removeprefix("SOURCE-TEXT").removeprefix(": ")
        elif line.startswith("ANSWER"):
            section = "answer"
            line = line.removeprefix("ANSWER").removeprefix(":")

        if section == "source":
            source_lines.append(line)
        elif section == "answer":
            # Trim surrounding whitespace (including a "\r" left by CRLF input)
            # and one trailing full stop before checking the bracket delimiters.
            cleaned = line.strip().removesuffix(".").rstrip()
            pairs.extend(_parse_answer_line(cleaned))
    # Existing behaviour: source lines are joined with no separator.
    return "".join(source_lines).strip(), pairs

In [2]:
# Run the parser on a sample response: a multi-line SOURCE-TEXT section
# followed by three ANSWER lines, each holding a single [[...]] answer.
sample_response = """
SOURCE-TEXT: The inclusion criteria for the clinical trial protocol specify the following cancer biomarkers for breast cancer:

Estrogen receptor (ER) and progesterone receptor (PR) =< 1% by immunohistochemistry
Her-2/neu negative (0 or 1+ by immunohistochemistry OR not amplified by College of American Pathologists/American Society of Clinical Oncology [CAP/ASCO] standards) 1 .

ANSWER: [[Estrogen receptor =< 1%]]
ANSWER: [[Progesterone receptor =< 1%]]
ANSWER: [[Her-2/neu negative]]
"""
parse_response(sample_response)

('The inclusion criteria for the clinical trial protocol specify the following cancer biomarkers for breast cancer:Estrogen receptor (ER) and progesterone receptor (PR) =< 1% by immunohistochemistryHer-2/neu negative (0 or 1+ by immunohistochemistry OR not amplified by College of American Pathologists/American Society of Clinical Oncology [CAP/ASCO] standards) 1 .',
 [('Estrogen receptor =< 1%', None),
  ('Progesterone receptor =< 1%', None),
  ('Her-2/neu negative', None)])