In [None]:
import re
import pandas as pd
import random
import os
from nltk.tokenize import sent_tokenize
from lxml import etree as ET

In [None]:
def _text_with_offsets(container):
    """Collect the text under ``container`` and where each descendant starts in it.

    Walks the subtree in document order, gathering every element's ``text``
    and every child's ``tail`` (the container's own tail is left out).

    Returns:
        tuple: ``(text, starts)`` where ``text`` is the concatenated content
        and ``starts`` maps each visited node to the index in ``text`` at
        which that node's content begins.
    """
    chunks = []
    starts = {}
    position = 0

    def visit(node):
        nonlocal position
        starts[node] = position
        # Nodes whose tag is not a string (comments, processing instructions)
        # contribute no content of their own; their tail is added by the
        # parent's loop below.
        if not isinstance(node.tag, str):
            return
        if node.text:
            chunks.append(node.text)
            position += len(node.text)
        for child in node:
            visit(child)
            if child.tail:
                chunks.append(child.tail)
                position += len(child.tail)

    visit(container)
    return "".join(chunks), starts


def get_sentence_containing_reference(root, reference_ids):
    """Map reference ids to the sentence of the document that contains them.

    The searchable text is the content of the ``TEXT`` child of ``root``
    followed by the content of its ``DCT`` child (a trailing period is added
    to the DCT content so that it forms a sentence of its own). The text is
    split with ``sent_tokenize``.

    A reference is the first element under ``root``, in document order, whose
    ``eid`` or ``tid`` attribute equals the id. Its sentence is chosen by the
    element's position in the text, so a word that occurs in several
    sentences resolves to the sentence the element is really in. Only when
    the position cannot be determined does the function fall back to the
    first sentence that contains the element's text.

    Args:
        root: Root element of the parsed document. Any ElementTree-compatible
            element works (``find``, ``iter``, ``itertext``).
        reference_ids: One id or a list of ids.

    Returns:
        dict: ``{reference_id: sentence}``. The value is ``"Unknown reference"``
        when no element carries the id, or when the document has no text at
        all. An id whose element exists but cannot be matched to any sentence
        is left out of the dict, so callers should use ``.get`` with a default.
    """
    if not isinstance(reference_ids, list):
        reference_ids = [reference_ids]

    # Build the searchable text and remember where every element starts in it.
    text_content_list = []
    offsets = {}
    base = 0

    for tag in ['TEXT', 'DCT']:
        element = root.find(tag)
        if element is None:
            continue

        text_content = "".join(element.itertext())
        walked_text, starts = _text_with_offsets(element)

        if text_content_list:
            base += 1  # the single space that joins the two parts below

        # Trust the offsets only if the manual walk reproduced itertext()
        # exactly; otherwise the substring fallback further down is used.
        if walked_text == text_content:
            for node, start in starts.items():
                offsets.setdefault(node, base + start)

        # Add a period at the end of the 'DCT' content if it doesn't already have one
        if tag == 'DCT' and not text_content.endswith('.'):
            text_content += '.'

        text_content_list.append(text_content)
        base += len(text_content)

    combined_text_content = ' '.join(text_content_list)

    # Nothing to search: report every id as unknown, using the same string
    # value type as every other path of this function.
    if not combined_text_content:
        return {ref_id: "Unknown reference" for ref_id in reference_ids}

    sentences = sent_tokenize(combined_text_content)

    # Recover each sentence's (start, end) span by locating it in the text,
    # searching forward from the end of the previous sentence.
    spans = []
    search_from = 0
    for sentence in sentences:
        start = combined_text_content.find(sentence, search_from)
        if start == -1:
            spans.append(None)  # tokenizer altered the text; position unknown
            continue
        end = start + len(sentence)
        spans.append((start, end))
        search_from = end

    # First element in document order for every eid / tid value. Scanning
    # attributes avoids building an XPath expression out of the id string.
    first_by_id = {}
    for node in root.iter():
        if not isinstance(node.tag, str):
            continue
        for attribute in ('eid', 'tid'):
            value = node.get(attribute)
            if value is not None:
                first_by_id.setdefault(value, node)

    sentences_containing_references = {}

    for reference_id in reference_ids:
        reference_element = first_by_id.get(reference_id)
        if reference_element is None:
            sentences_containing_references[reference_id] = "Unknown reference"
            continue

        found = None
        offset = offsets.get(reference_element)
        if offset is not None:
            found = next(
                (
                    sentence
                    for sentence, span in zip(sentences, spans)
                    if span is not None and span[0] <= offset < span[1]
                ),
                None,
            )

        if found is None:
            # Position unavailable (element outside TEXT/DCT, or between
            # sentences): use the first sentence containing its text.
            reference_element_text = "".join(reference_element.itertext())
            found = next(
                (sentence for sentence in sentences if reference_element_text in sentence),
                None,
            )

        if found is not None:
            sentences_containing_references[reference_id] = found.strip()

    return sentences_containing_references

In [None]:
def generate_questions_from_file(filepath, rng=None):
    """Build three-option multiple-choice questions from the TLINKs of one file.

    Every ``TLINK`` element yields one row. The question asks for the relation
    between the link's source event and its target (an event or a time), and
    is prefixed with the sentence(s) returned by
    ``get_sentence_containing_reference`` for both ends. The correct answer is
    the link's ``relType``; two distractors are drawn at random from the
    option list.

    Args:
        filepath: Path of the XML file to parse.
        rng: Optional ``random.Random``-like object providing ``sample`` and
            ``shuffle``. Defaults to the ``random`` module, so results are
            only reproducible when a seeded generator is passed (or the
            module is seeded by the caller).

    Returns:
        pandas.DataFrame: Columns ``Question``, ``Option A``, ``Option B``,
        ``Option C`` and ``Answer`` (the letter of the correct option). The
        columns are present even when the file has no TLINK.

    Raises:
        KeyError: If a TLINK has no ``relType`` attribute.
    """
    if rng is None:
        rng = random

    root = ET.parse(filepath).getroot()

    events = {
        event.get('eid'): event
        for event in root.iter('EVENT')
        if event.get('eid') is not None
    }
    timex3s = {
        timex3.get('tid'): timex3
        for timex3 in root.iter('TIMEX3')
        if timex3.get('tid') is not None
    }
    # Event instance id -> id of the event it instantiates.
    event_id_by_instance = {
        mi.get('eiid'): mi.get('eventID')
        for mi in root.iter('MAKEINSTANCE')
        if mi.get('eiid') is not None
    }
    tlinks = root.findall(".//TLINK")

    def get_text_from_reference(reference_id):
        """Return the text of the EVENT / TIMEX3 with this id, always as a string."""
        reference_element = events.get(reference_id)
        if reference_element is None:
            reference_element = timex3s.get(reference_id)
        if reference_element is None:
            return "Unknown reference"
        return " ".join(reference_element.itertext())

    # First pass: resolve both ends of every link and note which ids need a
    # context sentence, so the document is tokenised once instead of once per
    # lookup.
    # NOTE(review): links whose source is not an event instance (presumably
    # time-to-time links using a `timeID` attribute - confirm against the
    # data) still come out as "Unknown event", as before.
    links = []
    wanted_ids = []

    for tlink in tlinks:
        reltype = tlink.attrib['relType']
        eiid = tlink.attrib.get('eventInstanceID')
        related_to_eiid = tlink.attrib.get('relatedToEventInstance')
        related_to_time = tlink.attrib.get('relatedToTime')

        # A link that names an instance with no MAKEINSTANCE resolves to None
        # and is reported as unknown instead of aborting the whole file.
        source_id = event_id_by_instance.get(eiid) if eiid else None

        if related_to_eiid:
            target_kind = "event"
            target_id = event_id_by_instance.get(related_to_eiid)
        elif related_to_time:
            target_kind = "time"
            target_id = related_to_time
        else:
            target_kind = None
            target_id = None

        links.append((reltype, source_id, target_kind, target_id))
        wanted_ids.extend(ref for ref in (source_id, target_id) if ref is not None)

    unique_ids = list(dict.fromkeys(wanted_ids))  # de-duplicate, keep order
    contexts = get_sentence_containing_reference(root, unique_ids) if unique_ids else {}

    def describe(reference_id):
        """Return (text, context sentence) for a resolved id, or the unknown pair."""
        if reference_id is None:
            return "Unknown event", "Unknown context"
        context = contexts.get(reference_id, "Unknown context")
        # The lookup may hand back a non-string value when the document has
        # no body; never let that leak into the question text.
        if not isinstance(context, str):
            context = "Unknown context"
        return get_text_from_reference(reference_id), context

    options = ["BEFORE", "AFTER", "INCLUDES", "IS-INCLUDED", "DURING", "SIMULTANEOUS", "IMMEDIATELY AFTER", "IMMEDIATELY BEFORE", "IDENTITY", "BEGINS", "ENDS", "BEGUN-BY", "ENDED-BY"]

    # relType values are matched to the option labels ignoring the
    # hyphen / space / underscore difference, plus the two short forms below,
    # so the correct relation can never also be drawn as a distractor under a
    # different spelling. Values with no matching option are used verbatim.
    label_by_key = {opt.replace("-", "_").replace(" ", "_"): opt for opt in options}
    label_by_key["IAFTER"] = "IMMEDIATELY AFTER"
    label_by_key["IBEFORE"] = "IMMEDIATELY BEFORE"

    def display_label(reltype):
        key = reltype.strip().upper().replace("-", "_").replace(" ", "_")
        return label_by_key.get(key, reltype)

    columns = ["Question", "Option A", "Option B", "Option C", "Answer"]
    data = []

    for reltype, source_id, target_kind, target_id in links:
        event_text, event_context = describe(source_id)

        if target_kind is None:
            question_text = "Unknown relationship"
            context = "Unknown context"
        else:
            related_to_event_text, related_to_event_context = describe(target_id)
            question_text = f"What is the relationship between the event '{event_text}' and the {target_kind} '{related_to_event_text}'?"
            # Do not repeat the sentence when both ends share it.
            if event_context != related_to_event_context:
                context = f"{event_context} {related_to_event_context}"
            else:
                context = f"{event_context}"

        full_question_text = f"{context} {question_text}"

        # Two random distractors, never the correct answer itself.
        answer_label = display_label(reltype)
        wrong_options = rng.sample([opt for opt in options if opt != answer_label], 2)

        # Mix the correct answer in and randomise the order.
        all_options = wrong_options + [answer_label]
        rng.shuffle(all_options)

        # Position of the correct answer as "A", "B" or "C" (65 is ASCII "A").
        answer_letter = chr(65 + all_options.index(answer_label))

        data.append({
            "Question": full_question_text,
            "Option A": all_options[0],
            "Option B": all_options[1],
            "Option C": all_options[2],
            "Answer": answer_letter,
        })

    # Explicit columns keep the frame's shape stable when there are no rows.
    return pd.DataFrame(data, columns=columns)

In [None]:
# Folder holding the .tml files. Set the TE3_SILVER_DIR environment variable
# to point the notebook at another location without editing this cell; the
# original local path remains the default.
path_to_xml_files = os.environ.get(
    "TE3_SILVER_DIR",
    "C:\\Users\\XXXXX\\Desktop\\Research\\Benchmark_Temporal_Reasoning_LLM\\TE3-Silver-data",
)

# Get a list of all XML files in the folder. os.listdir() gives no ordering
# guarantee, so sort: the file order decides the row order of the final
# DataFrame and which copy of a duplicated question is kept.
xml_files = sorted(
    os.path.join(path_to_xml_files, f)
    for f in os.listdir(path_to_xml_files)
    if f.endswith('.tml')
)

# Create a list to store DataFrames
dfs = []

In [None]:
# Start from an empty list so that re-running this cell does not append every
# file's questions a second time.
dfs = []
for xml_file in xml_files:
    df = generate_questions_from_file(xml_file)
    dfs.append(df)

# Concatenate all individual DataFrames to create a single DataFrame.
# Empty frames are left out: pd.concat() raises on an empty list, and a frame
# without a 'Question' column would make drop_duplicates() fail.
non_empty_dfs = [frame for frame in dfs if not frame.empty]
if non_empty_dfs:
    final_df = pd.concat(non_empty_dfs, ignore_index=True)
    final_df = final_df.drop_duplicates(subset=['Question'])
else:
    final_df = pd.DataFrame(columns=["Question", "Option A", "Option B", "Option C", "Answer"])