In [8]:
import ast

def load_txt_to_dict_list(filepath):
    """Load a text file of dict literals into instruction-tuning records.

    Every non-blank line is parsed with ``ast.literal_eval`` and must be a
    dict literal. Its ``'sentence'`` value (default ``''``) becomes the
    record's ``"input"``; its ``'triple'`` value (default ``[]``) is
    stringified into ``"output"``.

    Lines that fail to parse, or that parse to something other than a dict,
    are reported on stdout and skipped.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        list[dict]: One dict with ``"instruction"``, ``"input"`` and
        ``"output"`` keys per valid line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Only the parse is guarded, and only against the errors
            # ast.literal_eval is documented to raise for malformed input, so
            # unrelated bugs are no longer silently swallowed.
            try:
                record = ast.literal_eval(line)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                print(f" Error on line:\n{line}\n{e}")
                continue

            # A valid literal that is not a dict (e.g. a bare list) used to
            # surface as an opaque "'list' object has no attribute 'get'".
            if not isinstance(record, dict):
                print(
                    f" Error on line:\n{line}\n"
                    f"expected a dict literal, got {type(record).__name__}"
                )
                continue

            # Extract 'sentence' and 'triple' keys
            sentence = record.get('sentence', '')
            triples = record.get('triple', [])

            data.append({
                "instruction": "Extract all subject–predicate–object triples from the sentence.",
                "input": sentence,
                "output": str(triples)
            })

    return data

#  Usage
file_path = "train.txt" 
formatted_data = load_txt_to_dict_list(file_path)

# Optional: check a sample
print("Sample formatted data:")
print(formatted_data[0])


Sample formatted data:
{'instruction': 'Extract all subject–predicate–object triples from the sentence.', 'input': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers.', 'output': "[['Palestinian', 'job_title', 'olive farmers']]"}


In [9]:
import ast

def load_txt_to_dict_list(filepath):
    """Read one dict literal per line and turn it into a prompt record.

    NOTE(review): this re-declares the loader already defined in the previous
    cell; the later definition is the one in effect from here on.

    Blank lines are ignored. Each remaining line is evaluated with
    ``ast.literal_eval``; its ``'sentence'`` (default ``''``) becomes
    ``"input"`` and the string form of its ``'triple'`` (default ``[]``)
    becomes ``"output"``. Any exception raised while handling a line is
    printed together with the line, and the line is skipped.

    Returns the list of records in file order.
    """
    examples = []

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw in handle:
            text = raw.strip()
            if text:
                try:
                    # literal_eval only accepts Python literals, never code
                    parsed = ast.literal_eval(text)
                    examples.append({
                        "instruction": "Extract all subject–predicate–object triples from the sentence.",
                        "input": parsed.get('sentence', ''),
                        "output": str(parsed.get('triple', [])),
                    })
                except Exception as err:
                    print(f" Error on line:\n{text}\n{err}")

    return examples

#  Usage
file_path = "test.txt" 
formatted_data_test = load_txt_to_dict_list(file_path)

# Optional: check a sample
print("Sample formatted data:")
print(formatted_data_test[0])


Sample formatted data:
{'instruction': 'Extract all subject–predicate–object triples from the sentence.', 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.', 'output': "[['Nashville North', 'known_as', 'outdoor concert'], ['Tim McGraw', 'job_title', 'musician'], ['Toby Keith', 'job_title', 'musician']]"}


In [3]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Using cached numpy-2.2.5-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading mu

In [10]:
from datasets import Dataset

dataset_train = Dataset.from_list(formatted_data)


In [11]:
dataset_test = Dataset.from_list(formatted_data_test)

In [12]:
dataset_train

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 414
})

In [13]:
dataset_test

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 271
})

In [14]:
dataset_train[0]

{'instruction': 'Extract all subject–predicate–object triples from the sentence.',
 'input': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers.',
 'output': "[['Palestinian', 'job_title', 'olive farmers']]"}

In [15]:
dataset_test[0]

{'instruction': 'Extract all subject–predicate–object triples from the sentence.',
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "[['Nashville North', 'known_as', 'outdoor concert'], ['Tim McGraw', 'job_title', 'musician'], ['Toby Keith', 'job_title', 'musician']]"}

In [16]:
import ast

def validate_dataset(dataset):
    """Check each row's ``output`` field and print a validation summary.

    A row is valid when it has the ``instruction``, ``input`` and ``output``
    keys and its stripped ``output`` string is a bracketed literal that
    ``ast.literal_eval`` parses to a list made only of 3-element lists. Any
    exception raised while checking a row marks that row invalid instead of
    aborting the run.

    Prints the row count, how many valid rows hold more than one triple, the
    number of invalid rows, and up to five sample problems. Returns nothing.
    """
    required_fields = ['instruction', 'input', 'output']
    problems = []        # (row index, reason) pairs
    rows_with_many = 0

    def inspect(entry):
        """Return ``(reason, triple_count)``; ``reason`` is None for a valid row."""
        if any(field not in entry for field in required_fields):
            return "Missing required keys", 0

        text = entry['output'].strip()
        # Cheap bracket check before attempting a full parse
        if not text.startswith("[") or not text.endswith("]"):
            return "Output format invalid or not properly closed", 0

        parsed = ast.literal_eval(text)
        if not isinstance(parsed, list):
            return "Output is not a list", 0

        if any(not isinstance(item, list) or len(item) != 3 for item in parsed):
            return "Each triple must be a list of 3 elements", 0

        return None, len(parsed)

    for row_no, entry in enumerate(dataset):
        try:
            reason, triple_count = inspect(entry)
        except Exception as exc:
            reason, triple_count = f"Error parsing output: {exc}", 0

        if reason is not None:
            problems.append((row_no, reason))
        elif triple_count > 1:
            rows_with_many += 1

    print(f"\n Validation completed.")
    print(f"Total entries: {len(dataset)}")
    print(f"Total entries with multiple triples: {rows_with_many}")
    print(f" Total invalid entries: {len(problems)}")

    if not problems:
        print(" All entries are valid.")
    else:
        print("\n Sample invalid entries:")
        # Only the first five problems are shown
        for row_no, reason in problems[:5]:
            print(f" - Row {row_no}: {reason}")

# Example usage:
# validate_dataset(dataset)


In [17]:
validate_dataset(dataset_train)


 Validation completed.
Total entries: 414
Total entries with multiple triples: 96
 Total invalid entries: 0
 All entries are valid.


In [18]:
validate_dataset(dataset_test)


 Validation completed.
Total entries: 271
Total entries with multiple triples: 161
 Total invalid entries: 0
 All entries are valid.


In [2]:
%pip install -U spacy
%python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%python` not found (But cell magic `%%python` exists, did you mean that instead?).


In [5]:
import spacy
print(spacy.info())  # This will show available models


{'spacy_version': '3.8.5', 'location': 'c:\\Vamsi\\Study\\SS25\\Track2\\Exercise\\Ex2\\.conda\\Lib\\site-packages\\spacy', 'platform': 'Windows-11-10.0.26100-SP0', 'python_version': '3.12.9', 'pipelines': {}}


In [None]:
import spacy
import ast
from datasets import Dataset

# Load English spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_subject_entities(text):
    """
    Return the surface text of every named entity spaCy finds in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline and each span in
    ``doc.ents`` is treated as a candidate subject. Entities are returned in
    the order the pipeline reports them.
    """
    candidates = []
    for entity in nlp(text).ents:
        candidates.append(entity.text)
    return candidates

def build_subject_conditioned_dataset(dataset):
    """Build one subject-conditioned example per gold triple spaCy can anchor.

    For each row, the gold triples are parsed from ``row['output']`` with
    ``ast.literal_eval`` and the sentence's named entities are collected via
    ``extract_subject_entities``. A gold triple is kept only when its subject
    string is exactly one of those entity strings.

    Returns a ``Dataset`` with ``instruction``, ``input`` and ``output`` fields.
    """
    conditioned = []

    for record in dataset:
        text = record['input']
        gold = ast.literal_eval(record['output'])
        detected = extract_subject_entities(text)

        for subj, pred, obj in gold:
            # Skip gold subjects that spaCy's NER did not surface verbatim
            if subj not in detected:
                continue
            conditioned.append({
                "instruction": f"Given the subject entity '{subj}', extract the full triple from the sentence.",
                "input": text,
                "output": str([subj, pred, obj])
            })

    return Dataset.from_list(conditioned)

# Example usage:
# subject_conditioned_dataset = build_subject_conditioned_dataset(dataset)


In [36]:
ner_train_dataset = build_subject_conditioned_dataset(dataset_train)

In [22]:
ner_train_dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 409
})

In [37]:
ner_train_dataset[0]

{'instruction': "Given the subject entity 'Palestinian', extract the full triple from the sentence.",
 'input': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers.'}

In [27]:
import spacy
from spacy.lang.en.examples import sentences 

nlp = spacy.load("en_core_web_sm")
doc = nlp("The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers")
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers
The DET det
worst ADJ amod
hotspot NOUN nsubj
for ADP prep
violence NOUN pobj
has AUX aux
been AUX ROOT
in ADP prep
the DET det
Palestinian ADJ amod
village NOUN pobj
of ADP prep
Beita PROPN pobj
, PUNCT punct
where SCONJ advmod
locals NOUN nsubj
began VERB relcl
organising VERB xcomp
the DET det
recent ADJ amod
protests NOUN dobj
after SCONJ mark
hard ADJ amod
- PUNCT punct
line NOUN nmod
Jewish ADJ amod
settlers NOUN nsubj
set VERB advcl
up ADP prt
a DET det
new ADJ amod
outpost NOUN dobj
on ADP prep
land NOUN pobj
claimed VERB acl
by ADP agent
Palestinian ADJ amod
olive NOUN compound
farmers NOUN pobj


In [28]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def extract_spo_triples(sentence):
    """Extract rough subject-predicate-object triples from a dependency parse.

    Two rules are applied to every token of the parsed ``sentence``:

    1. Active voice: a token tagged ``nsubj`` whose head is a ``VERB`` yields
       ``[subject text, verb text, object]``. The object comes from the verb's
       children: the text of a ``dobj``/``attr``/``pobj``/``oprd`` child, or
       ``"<prep> <pobj>"`` for a ``prep`` child. When several children match,
       the last one visited is kept.
    2. Passive agent: an ``agent`` token whose head is an ``acl`` token yields
       ``[first child of the agent, "<head text>_by", head's head text]``.

    Args:
        sentence: Text to parse with the module-level ``nlp`` pipeline.

    Returns:
        list[list[str]]: Triples in token order; empty when no rule matches.
    """
    doc = nlp(sentence)
    triples = []

    for token in doc:
        # Rule 1: Find subject → verb → object
        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
            subject = token.text
            verb = token.head
            predicate = verb.text

            # Look for direct object (dobj) or object of preposition (pobj)
            obj = None
            for child in verb.children:
                if child.dep_ in ("dobj", "attr", "pobj", "oprd"):
                    obj = child.text
                elif child.dep_ == "prep":  # handle prep + pobj
                    for subchild in child.children:
                        if subchild.dep_ == "pobj":
                            obj = f"{child.text} {subchild.text}"

            if obj:
                triples.append([subject, predicate, obj])

        # Rule 2: Handle passive voice (agent phrases)
        if token.dep_ == "agent" and token.head.dep_ == "acl":
            agent = list(token.children)
            if agent:
                subject = agent[0].text
                # Derive the predicate from the participle itself. It used to
                # be the constant "claimed_by", which mislabelled every other
                # passive verb (e.g. "built by" was reported as "claimed_by").
                predicate = f"{token.head.text}_by"
                obj = token.head.head.text  # e.g. land claimed by X
                triples.append([subject, predicate, obj])

    return triples


In [29]:
sentence = "The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers"

In [31]:
extract_spo_triples(sentence)

[['settlers', 'set', 'outpost'], ['farmers', 'claimed_by', 'land']]

In [33]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_all_spo_triples(text):
    """Extract lemma-normalised SPO triples from ``text`` with two parse rules.

    * Active: an ``nsubj`` token under a ``VERB`` gives
      ``[subject text, verb lemma, object text]``, where the object is a
      ``dobj`` child of the verb or the ``pobj`` under one of its ``prep``
      children.
    * Passive: an ``agent`` token under an ``acl`` head gives
      ``[agent noun, "<lemma>_by", head's head text]``.

    NOTE(review): when several candidates exist, the last one visited wins,
    so a prepositional object that follows a direct object replaces it.
    Confirm this is intended.
    """
    found = []

    for tok in nlp(text):
        head = tok.head

        # -------- Rule 1: active voice --------
        if tok.dep_ == "nsubj" and head.pos_ == "VERB":
            object_texts = []
            for dependent in head.children:
                if dependent.dep_ == "dobj":
                    object_texts.append(dependent.text)
                elif dependent.dep_ == "prep":
                    object_texts.extend(
                        grandchild.text
                        for grandchild in dependent.children
                        if grandchild.dep_ == "pobj"
                    )

            # Keep the last candidate seen
            target = object_texts[-1] if object_texts else None
            if target:
                found.append([tok.text, head.lemma_, target])

        # -------- Rule 2: passive voice ("land claimed by farmers") --------
        if tok.dep_ == "agent" and head.dep_ == "acl":
            nominal_texts = [
                dependent.text
                for dependent in tok.children
                if dependent.pos_ in ["NOUN", "PROPN"]
            ]
            doer = nominal_texts[-1] if nominal_texts else None

            if doer:
                found.append([doer, f"{head.lemma_}_by", head.head.text])

    return found


In [34]:
extract_all_spo_triples(sentence)

[['settlers', 'set', 'outpost'], ['farmers', 'claim_by', 'land']]

In [50]:
import spacy
import ast
from datasets import Dataset

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_spo_triples(sentence):
    """
    Pull subject-predicate-object triples out of ``sentence`` via its dependency parse.

    Active rule: an ``nsubj`` token whose head is a ``VERB`` produces
    ``[subject text, verb lemma, object text]``; the object is a
    ``dobj``/``attr``/``oprd`` child of the verb or a ``pobj`` under one of
    its ``prep`` children (the last match visited is kept).

    Passive rule: an ``agent`` token whose head is an ``acl`` token produces
    ``[agent text, "<head lemma>_by", head's head text]``, using the last
    NOUN/PROPN child of the agent token.
    """
    object_deps = ("dobj", "attr", "oprd")
    results = []

    for word in nlp(sentence):
        governor = word.head
        role = word.dep_

        # Active: subject → verb → object
        if role == "nsubj" and governor.pos_ == "VERB":
            chosen = None
            for kid in governor.children:
                if kid.dep_ in object_deps:
                    chosen = kid.text
                    continue
                if kid.dep_ != "prep":
                    continue
                for grandkid in kid.children:
                    if grandkid.dep_ == "pobj":
                        chosen = grandkid.text

            if chosen:
                results.append([word.text, governor.lemma_, chosen])

        # Passive: object ← verb ← agent
        if role == "agent" and governor.dep_ == "acl":
            nominal_texts = [
                kid.text for kid in word.children if kid.pos_ in ["NOUN", "PROPN"]
            ]
            doer = nominal_texts[-1] if nominal_texts else None
            if doer:
                results.append([doer, f"{governor.lemma_}_by", governor.head.text])

    return results

def build_subject_conditioned_dataset(dataset):
    """
    Creates dataset with subject-conditioned prompts from gold and extracted triples.

    For every row, one example is emitted per gold triple parsed from
    ``row["output"]``, followed by one example per triple returned by
    ``extract_spo_triples(row["input"])`` that is not already a gold triple.
    Rows whose ``"output"`` is missing or cannot be parsed are treated as
    having no gold triples.

    Args:
        dataset: Iterable of mapping rows with an ``"input"`` sentence and an
            ``"output"`` string holding a list-of-triples literal.

    Returns:
        Dataset: examples with ``instruction``, ``input`` and ``output`` fields.
    """
    # Errors a missing key or a malformed literal can produce. A bare
    # ``except:`` would also swallow KeyboardInterrupt/SystemExit and hide
    # genuine bugs.
    parse_errors = (KeyError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError)

    def make_example(sentence, subj, pred, obj):
        """Format one subject-conditioned training example."""
        return {
            "instruction": f"Given the subject entity '{subj}', extract the full triple from the sentence.",
            "input": sentence,
            "output": str([subj, pred, obj])
        }

    new_examples = []

    for row in dataset:
        sentence = row["input"]

        # 1. Gold triples
        try:
            gold_triples = ast.literal_eval(row["output"])
        except parse_errors:
            gold_triples = []

        for subj, pred, obj in gold_triples:
            new_examples.append(make_example(sentence, subj, pred, obj))

        # 2. Extracted triples
        for subj, pred, obj in extract_spo_triples(sentence):
            # Avoid duplicate gold triples
            if [subj, pred, obj] not in gold_triples:
                new_examples.append(make_example(sentence, subj, pred, obj))

    return Dataset.from_list(new_examples)


In [51]:
new_set = build_subject_conditioned_dataset(dataset_train)

In [52]:
new_set

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1027
})

In [61]:
for i in new_set:
    print(i)

{'instruction': "Given the subject entity 'Palestinian', extract the full triple from the sentence.", 'input': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers.', 'output': "['Palestinian', 'job_title', 'olive farmers']"}
{'instruction': "Given the subject entity 'settlers', extract the full triple from the sentence.", 'input': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers.', 'output': "['settlers', 'set', 'outpost']"}
{'instruction': "Given the subject entity 'farmers', extract the full triple from the sentence.", 'input': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the r

In [62]:
new_set_test = build_subject_conditioned_dataset(dataset_test)

In [63]:
new_set_test

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 973
})

In [64]:
new_set_test

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 973
})

In [65]:
new_set_test[0]

{'instruction': "Given the subject entity 'Nashville North', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Nashville North', 'known_as', 'outdoor concert']"}

In [66]:
new_set_test[1]

{'instruction': "Given the subject entity 'Tim McGraw', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Tim McGraw', 'job_title', 'musician']"}

In [67]:
new_set_test[2]

{'instruction': "Given the subject entity 'Toby Keith', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Toby Keith', 'job_title', 'musician']"}

In [68]:
new_set_test[3]

{'instruction': "Given the subject entity 'people', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['people', 'sip', 'Canadian']"}

In [69]:
new_set_test[4]

{'instruction': "Given the subject entity 'musicians', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['musicians', 'belt', 'covers']"}

In [70]:
new_set_test[5]

{'instruction': "Given the subject entity 'Strieck', extract the full triple from the sentence.",
 'input': 'Strieck moved to Calgary and 1992 and landed at the airport on a Stampede Saturday and fell in love with her new home then and there.',
 'output': "['Strieck', 'resident_of', 'Calgary']"}

In [97]:
import spacy
import ast
from datasets import Dataset

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Util: Reconstruct compound subject/object
def get_compound_noun(token):
    """Join a token with its left-hand ``compound``/``amod`` children.

    The matching children from ``token.lefts`` are kept in their original
    order and followed by the token's own text, separated by single spaces.
    """
    modifiers = [
        left.text for left in token.lefts if left.dep_ in ("compound", "amod")
    ]
    return " ".join(modifiers + [token.text])

# Extract full triples with compound nouns
def extract_clean_triples(sentence):
    """Extract SPO triples whose subject and object include their modifiers.

    Rule 1 (active): an ``nsubj`` token under a ``VERB`` yields
    ``[subject phrase, verb lemma, object phrase]``. The object token is a
    ``dobj``/``attr``/``oprd`` child of the verb or a ``pobj`` under one of
    its ``prep`` children; the last candidate visited is used. Phrases are
    built with ``get_compound_noun``.

    Rule 2 (passive): for an ``agent`` token under an ``acl`` head, every
    NOUN/PROPN child yields ``[child phrase, "<head lemma>_by", head's head
    text]``. The object here is the bare token text, not a compound phrase.
    """
    collected = []

    for tok in nlp(sentence):
        head = tok.head

        # Rule 1: subject → verb → object
        if tok.dep_ == "nsubj" and head.pos_ == "VERB":
            subject_phrase = get_compound_noun(tok)

            candidates = []
            for dependent in head.children:
                if dependent.dep_ in ("dobj", "attr", "oprd"):
                    candidates.append(dependent)
                elif dependent.dep_ == "prep":
                    candidates.extend(
                        grandchild
                        for grandchild in dependent.children
                        if grandchild.dep_ == "pobj"
                    )

            target = candidates[-1] if candidates else None
            if target:
                collected.append([subject_phrase, head.lemma_, get_compound_noun(target)])

        # Rule 2: passive (e.g. land claimed by farmers)
        if tok.dep_ == "agent" and head.dep_ == "acl":
            collected.extend(
                [get_compound_noun(noun), f"{head.lemma_}_by", head.head.text]
                for noun in tok.children
                if noun.pos_ in ("NOUN", "PROPN")
            )

    return collected

# Build dataset with both gold and extracted triples
def build_triple_dataset(dataset):
    """Build subject-conditioned examples from gold plus spaCy-extracted triples.

    For every row, one example is emitted per gold triple parsed from
    ``row["output"]``, followed by one example per triple from
    ``extract_clean_triples(row["input"])`` that is not already a gold
    triple. Rows whose ``"output"`` is missing or cannot be parsed are
    treated as having no gold triples.

    Args:
        dataset: Iterable of mapping rows with an ``"input"`` sentence and an
            ``"output"`` string holding a list-of-triples literal.

    Returns:
        Dataset: examples with ``instruction``, ``input`` and ``output`` fields.
    """
    # Errors a missing key or a malformed literal can produce. A bare
    # ``except:`` would also swallow KeyboardInterrupt/SystemExit and hide
    # genuine bugs.
    parse_errors = (KeyError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError)

    def make_example(sentence, subj, pred, obj):
        """Format one subject-conditioned training example."""
        return {
            "instruction": f"Given the subject entity '{subj}', extract the full triple from the sentence.",
            "input": sentence,
            "output": str([subj, pred, obj])
        }

    examples = []

    for row in dataset:
        sentence = row["input"]

        # 1. Get gold triples (if any)
        try:
            gold_triples = ast.literal_eval(row["output"])
        except parse_errors:
            gold_triples = []

        # Tuples are hashable, which gives a cheap duplicate lookup below
        gold_set = {tuple(trip) for trip in gold_triples}

        # Add gold triples
        for subj, pred, obj in gold_triples:
            examples.append(make_example(sentence, subj, pred, obj))

        # 2. Get extracted triples from spaCy, skipping gold duplicates
        for subj, pred, obj in extract_clean_triples(sentence):
            if (subj, pred, obj) not in gold_set:
                examples.append(make_example(sentence, subj, pred, obj))

    return Dataset.from_list(examples)


In [None]:
new_trained = build_triple_dataset(dataset_train)

In [98]:
new_tested = build_triple_dataset(dataset_test)

In [99]:
new_tested

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 973
})

In [79]:
new_tested[0]

{'instruction': "Given the subject entity 'Nashville North', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Nashville North', 'known_as', 'outdoor concert']"}

In [80]:
new_tested[1]

{'instruction': "Given the subject entity 'Tim McGraw', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Tim McGraw', 'job_title', 'musician']"}

In [81]:
new_tested[2]

{'instruction': "Given the subject entity 'Toby Keith', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Toby Keith', 'job_title', 'musician']"}

In [82]:
new_tested[3]

{'instruction': "Given the subject entity 'people', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['people', 'sip', 'Molson Canadian']"}

In [83]:
new_tested[4]

{'instruction': "Given the subject entity 'musicians', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['musicians', 'belt', 'covers']"}

In [85]:
import spacy
import ast
from datasets import Dataset

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# -------- Utility: Reconstruct compound noun phrases --------
def get_compound_noun(token):
    """Rebuild a noun phrase from a token and its left modifiers.

    Left children of ``token`` whose dependency label is ``compound`` or
    ``amod`` are placed, in order, before the token's own text; the pieces
    are joined with single spaces.
    """
    wanted_deps = ("compound", "amod")
    words = [*(kid.text for kid in token.lefts if kid.dep_ in wanted_deps), token.text]
    return " ".join(words)

# -------- Triple Extraction Function (with conjunct handling) --------
def extract_clean_triples(sentence):
    """Extract compound-aware SPO triples, expanding coordinated objects.

    Rule 1 (active): an ``nsubj`` token under a ``VERB`` yields
    ``[subject phrase, verb lemma, object phrase]`` for the selected object
    token and then for each of its ``conjuncts`` (e.g. "X and Y"). The object
    token is a ``dobj``/``attr``/``oprd`` child of the verb or a ``pobj``
    under one of its ``prep`` children; the last candidate visited is used.

    Rule 2 (passive): for an ``agent`` token under an ``acl`` head, each
    NOUN/PROPN child yields ``[child phrase, "<head lemma>_by", head's head
    text]``.

    Subject and object phrases are built with ``get_compound_noun``.
    """
    direct_object_deps = ("dobj", "attr", "oprd")
    output = []

    for tok in nlp(sentence):
        head = tok.head

        # Rule 1: subject → verb → object
        if tok.dep_ == "nsubj" and head.pos_ == "VERB":
            who = get_compound_noun(tok)
            action = head.lemma_

            picked = None
            for dependent in head.children:
                if dependent.dep_ in direct_object_deps:
                    picked = dependent
                    continue
                if dependent.dep_ != "prep":
                    continue
                for inner in dependent.children:
                    if inner.dep_ == "pobj":
                        picked = inner

            if picked:
                # Main object first, then any coordinated objects
                for object_token in (picked, *picked.conjuncts):
                    output.append([who, action, get_compound_noun(object_token)])

        # Rule 2: passive voice agent pattern
        if tok.dep_ == "agent" and head.dep_ == "acl":
            for noun in tok.children:
                if noun.pos_ not in ("NOUN", "PROPN"):
                    continue
                output.append(
                    [get_compound_noun(noun), f"{head.lemma_}_by", head.head.text]
                )

    return output

# -------- Final Dataset Builder: Combine Extracted + Gold Triples --------
def build_combined_triple_dataset(dataset):
    """Combine gold-labelled and spaCy-extracted triples into one dataset.

    For every row, one example is emitted per gold triple parsed from
    ``row["output"]``, followed by one example per triple from
    ``extract_clean_triples(row["input"])`` that is not already a gold
    triple. Rows whose ``"output"`` is missing or cannot be parsed are
    treated as having no gold triples.

    Args:
        dataset: Iterable of mapping rows with an ``"input"`` sentence and an
            ``"output"`` string holding a list-of-triples literal.

    Returns:
        Dataset: examples with ``instruction``, ``input`` and ``output`` fields.
    """
    # Errors a missing key or a malformed literal can produce. A bare
    # ``except:`` would also swallow KeyboardInterrupt/SystemExit and hide
    # genuine bugs.
    parse_errors = (KeyError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError)

    def make_example(sentence, subj, pred, obj):
        """Format one subject-conditioned training example."""
        return {
            "instruction": f"Given the subject entity '{subj}', extract the full triple from the sentence.",
            "input": sentence,
            "output": str([subj, pred, obj])
        }

    examples = []

    for row in dataset:
        sentence = row["input"]

        # Get gold triples (if available)
        try:
            gold_triples = ast.literal_eval(row["output"])
        except parse_errors:
            gold_triples = []

        # Tuples are hashable, which gives a cheap duplicate lookup below
        gold_set = {tuple(trip) for trip in gold_triples}

        # Add gold-labeled triples
        for subj, pred, obj in gold_triples:
            examples.append(make_example(sentence, subj, pred, obj))

        # Extracted triples using spaCy, skipping gold duplicates
        for subj, pred, obj in extract_clean_triples(sentence):
            if (subj, pred, obj) not in gold_set:
                examples.append(make_example(sentence, subj, pred, obj))

    return Dataset.from_list(examples)


In [86]:
new_tested = build_combined_triple_dataset(dataset_test)

In [95]:
new_tested

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 991
})

In [87]:
new_tested[0]

{'instruction': "Given the subject entity 'Nashville North', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Nashville North', 'known_as', 'outdoor concert']"}

In [88]:
new_tested[1]

{'instruction': "Given the subject entity 'Tim McGraw', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Tim McGraw', 'job_title', 'musician']"}

In [89]:
new_tested[2]

{'instruction': "Given the subject entity 'Toby Keith', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['Toby Keith', 'job_title', 'musician']"}

In [90]:
new_tested[3]

{'instruction': "Given the subject entity 'people', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['people', 'sip', 'Molson Canadian']"}

In [91]:
new_tested[4]

{'instruction': "Given the subject entity 'people', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['people', 'sip', 'Budweiser']"}

In [92]:
new_tested[5]

{'instruction': "Given the subject entity 'musicians', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['musicians', 'belt', 'covers']"}

In [93]:
new_tested[5]

{'instruction': "Given the subject entity 'musicians', extract the full triple from the sentence.",
 'input': 'The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.',
 'output': "['musicians', 'belt', 'covers']"}