In [1]:
import torch
import spacy
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM
from maverick import Maverick
from spacy.lang.en.stop_words import STOP_WORDS

  from .autonotebook import tqdm as notebook_tqdm
  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.


In [2]:

# === Load All Models Once ===
print("Loading Maverick coreference model...")
coref_model = Maverick(
    hf_name_or_path="sapienzanlp/maverick-mes-ontonotes",
    device="cpu"  # Use "cuda:0" if you have a GPU
)

print("Loading spaCy model...")
nlp = spacy.load("en_core_web_sm")

print("Loading tokenizer...")
model_id = "TheBloke/OpenHermes-2-Mistral-7B-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)

print("Loading quantized Mistral model with AutoGPTQ...")
llm_model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device_map="auto",
    use_safetensors=True,
    trust_remote_code=True,
    quantize_config=None
)

print("Creating LLM pipeline...")
text_generator = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer,
)

# === Coreference Resolution ===
def resolve_coref(text):
    output = coref_model.predict(text)
    chars = list(text)
    clusters = output.get("clusters_char_offsets", [])
    all_mentions = []
    for cluster in clusters:
        main_mention_span = cluster[0]
        main_mention_text = text[main_mention_span[0]:main_mention_span[1] + 1]
        for mention_span in cluster[1:]:
            all_mentions.append((mention_span[0], mention_span[1], main_mention_text))
    all_mentions.sort(key=lambda x: x[0], reverse=True)
    for start, end, main_text in all_mentions:
        for i in range(start, end + 1):
            chars[i] = ''
        chars[start] = main_text
    return ''.join(chars)

# === Concept & Relation Extraction ===
def extract_concepts(text):
    doc = nlp(text)
    concepts = set()
    for chunk in doc.noun_chunks:
        t = chunk.text.lower().strip()
        if len(t) > 2 and t not in STOP_WORDS and not t.startswith(("the ", "this ", "a ", "an ")):
            concepts.add(chunk.text.strip())
    for ent in doc.ents:
        if len(ent.text.strip()) > 2:
            concepts.add(ent.text.strip())
    return sorted(concepts)

def extract_relations(text):
    doc = nlp(text)
    relations = set()
    for token in doc:
        if token.pos_ == "VERB":
            subj = [child for child in token.lefts if child.dep_ in ("nsubj", "nsubjpass")]
            obj = [child for child in token.rights if child.dep_ in ("dobj", "obj", "pobj")]
            if subj and obj:
                aux = [child.text for child in token.lefts if child.dep_ in ("aux", "auxpass", "neg", "modal")]
                rel = ' '.join(aux + [token.lemma_]).strip()
                if rel:
                    relations.add(rel)
    for token in doc:
        if token.pos_ == "VERB":
            relations.add(token.lemma_)
    return sorted(relations)

# === Prompt Creation for LLM ===
def create_llm_prompt(text, concepts, relations):
    concepts_str = '\n- '.join(concepts)
    relations_str = '\n- '.join(relations)

    prompt = (
    "You are an intelligent system designed to extract structured knowledge from text in the form of "
    "Subject-Predicate-Object (SPO) triples.\n\n"
    "Given the following input text:\n"
    f"\"\"\"{text}\"\"\"\n\n"
    "The following candidate concepts and relations have been extracted from the text to guide you:\n\n"
    "Candidate Concepts (examples from the text):\n"
    f"{concepts_str}\n\n"
    "Candidate Relations (example verbs from the text):\n"
    f"{relations_str}\n\n"
    "Your task is to extract **all accurate and meaningful SPO triples** from the text using natural language understanding.\n"
    "- You may use the candidate concepts and relations as guidance, **but you are not limited to them**.\n"
    "- You can include **any valid predicate/verb** but it must be found in the input text that accurately represents a relationship.\n"
    "- Do not generate duplicate or redundant triples.\n\n"
    "Return the output only in the following format:\n"
    "spo triples:\n(subject, predicate, object)\n\n"
    "Only return the list of SPO triples. No extra commentary or explanation."
)

    return prompt
import re

def extract_spo_triples(text: str) -> list[tuple[str, str, str]]:
    """
    Extract only the SPO triples from the generated text, removing duplicates.

    Parameters:
        text (str): The raw LLM output that contains SPO triple sections.

    Returns:
        List[Tuple[str, str, str]]: A list of unique (subject, predicate, object) triples.
    """
    spo_pattern = re.compile(r"\(.*?\)")
    matches = spo_pattern.findall(text)

    # Parse each triple string into tuple and remove duplicates
    spo_triples = set()
    for match in matches:
        # Clean and split into parts
        parts = [part.strip() for part in match.strip("()").split(",")]
        if len(parts) == 3:
            spo_triples.add(tuple(parts))

    return sorted(spo_triples)



Loading Maverick coreference model...
sapienzanlp/maverick-mes-ontonotes loading


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loading spaCy model...
Loading tokenizer...


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.


Loading quantized Mistral model with AutoGPTQ...


INFO - The layer lm_head is not quantized.
Device set to use cuda:0
The model 'MistralGPTQForCausalLM' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForCondit

Creating LLM pipeline...


In [3]:
def create_llm_prompt_with_examples(text, concepts, relations, example_spo_triples):
    # Convert concepts and relations to comma-separated lowercase strings (if needed)
    concepts_str = ', '.join(concepts)
    relations_str = ', '.join(relations)

    # Format triples as comma-separated string of (sub, pred, obj)
    examples_str = ', '.join(f"('{s}', '{p}', '{o}')" for s, p, o in example_spo_triples)

    prompt = (
        "You are a knowledge extraction system.\n\n"
        "Given the following input text (with coreferences resolved):\n"
        f"\"\"\"{text}\"\"\"\n\n"
        "The candidate concepts extracted from the text are:\n"
        f"{concepts_str}\n\n"
        "The candidate relations extracted from the text are:\n"
        f"{relations_str}\n\n"
        "Here are some example Subject-Predicate-Object (SPO) triples previously generated from this text:\n"
        f"{examples_str}\n\n"
        "Your task is to carefully analyze the input text, concepts, relations, and example SPO triples, "
        "and produce a clean, consistent, and high-quality set of SPO triples suitable for building a knowledge graph.\n\n"
        "You may add new concepts and relations if you find them in the text but not previously included.\n"
        "Note that backtracking or revisiting previous steps is NOT possible.\n\n"
        "Please output all SPO triples in a clear, structured list format. Each triple must be on its own line and must follow this format:\nTest Output: "
        "predicate(subject, object)\n\n"
        "Only return the list of all SPO triples exist in input text. No extra commentary or explanation.\n\n"
        "Begin extraction now."
    )
    return prompt


In [3]:
def create_llm_prompt_with_examples(text, concepts, relations, example_spo_triples):
    concepts_str = ', '.join(concepts)
    relations_str = ', '.join(relations)

    # Format example SPO triples as lines (not tuples) to match your style
    example_lines = '\n'.join(
        f"{p}({s}, {o})" for s, p, o in example_spo_triples
    )

    prompt = (
        f'Given the following input text (with coreferences resolved):\n"""'
        f'{text}"""\n'
        f'And the candidate concepts extracted from the text: - {concepts_str}\n'
        f'And the candidate relations extracted from the text: - {relations_str}\n'
        f'Here are some example Subject-Predicate-Object (SPO) triples previously generated from this text:\n'
        f'{example_lines}\n'
        f'Your task is to carefully analyze the input text, concepts, relations, and example SPO triples, '
        f'and produce a clean, consistent, and high-quality set of SPO triples suitable for building a knowledge graph.\n'
        f'You may add new concepts and relations if you find them in the text but not previously included.\n'
        f'Note that backtracking or revisiting previous steps is NOT possible.\n'
        f'Please output all SPO triples in a clear, structured list format, one triple per line, like  predicate(subject, object:\n'
        f'Only return the list of SPO triples. No extra commentary or explanation.'
        f'Test Output: '
       
    )
    return prompt


In [10]:
def create_llm_prompt_with_examples(text, concepts, relations, example_spo_triples):
    # Convert concepts and relations to comma-separated lowercase strings (if needed)
    concepts_str = ', '.join(concepts)
    relations_str = ', '.join(relations)

    # Format triples as comma-separated string of (sub, pred, obj)
    examples_str = ', '.join(f"('{s}', '{p}', '{o}')" for s, p, o in example_spo_triples)
    prompt = (
        "Given the following input text (with coreferences resolved):\\n"
        f'"""' + text + '"""\\n\\n'
        "And the candidate concepts extracted from the text:\\n"
        f"{concepts_str}\\n"
        "And the candidate relations extracted from the text:\\n"
        f"{relations_str}\\n\\n"
        "Here are some example Subject-Predicate-Object (SPO) triples previously generated from this text:\\n"
        f"{examples_str}\\n\\n"
        "Your task is to carefully analyze the input text, concepts, relations, and example SPO triples, "
        "and produce a clean, consistent, and high-quality set of SPO triples suitable for building a knowledge graph.\\n"
        "You may add new concepts and relations if you find them in the text but not previously included.\\n"
        "Note that backtracking or revisiting previous steps is NOT possible.\\n\\n"
        "Please output all SPO triples in a clear, structured list format, one triple per line, as:\\n"
        "Test Output:  predicate(subject, object)\\n\\n"
        "Only return the list of SPO triples. No extra commentary or explanation."
    )
    return prompt

In [4]:
def create_llm_prompt_with_examples(text, concepts, relations, example_spo_triples):
    concepts_str = ', '.join(concepts)
    relations_str = ', '.join(relations)
    example_lines = '\n'.join(
        f"{p}({s}, {o})" for s, p, o in example_spo_triples
    )

    prompt = (
        "### INSTRUCTION:\n"
        "Your task is to carefully analyze the input text, concepts, relations, and example SPO triples, "
        "and produce a clean, consistent, and high-quality set of SPO triples suitable for building a knowledge graph.\n"
        "You may add new concepts and relations if you find them in the text but not previously included.\n"
        "Note that backtracking or revisiting previous steps is NOT possible.\n"
        "Please output all SPO triples in a clear, structured list format, one triple per line, like: predicate(subject, object).\n"
        "Only return the list of SPO triples. No extra commentary or explanation.\n\n"

        f"### ONTOLOGY CONCEPTS:\n{concepts_str}\n\n"
        f"### ONTOLOGY RELATIONS:\n{relations_str}\n\n"
        
        f"### EXAMPLES:\n{example_lines}\n\n"

        f"### TASK:\nInput Sentence: {text}\nOutput Triples:\n"
    )
    return prompt


In [5]:

# === Updated Full Pipeline Function to use examples ===
def extract_spo_triples_with_refinement(input_text):
    resolved_text = resolve_coref(input_text)
    concepts = extract_concepts(resolved_text)
    print("concepts is here \n",concepts)
    relations = extract_relations(resolved_text)
    print("relations is here \n",relations)
    prompt_initial = create_llm_prompt(resolved_text, concepts, relations)
    outputs = text_generator(prompt_initial, max_new_tokens=1000, do_sample=True, temperature=0.7)
    hermes_output = outputs[0]['generated_text']
    triples = extract_spo_triples(hermes_output)
    print("triples is here \n",triples)

   # print("\n=== Generating Refined Prompt with Examples ===")
    prompt_refined = create_llm_prompt_with_examples(resolved_text, concepts, relations, triples)
    print("Refined prompt is here:\n", prompt_refined)
    return prompt_refined




In [None]:

# # === Example Usage with refinement ===
# if __name__ == "__main__":
#     raw_text = (
#         "The recent advances in large language models (LLM) and foundation models with emergent "
#         "capabilities have been shown to improve the performance of many NLP tasks. LLMs and Knowledge "
#         "Graphs (KG) can complement each other such that LLMs can be used for KG construction or "
#         "completion while existing KGs can be used for different tasks such as making LLM outputs "
#         "explainable or fact-checking in Neuro-Symbolic manner. In this paper, we present Text2KGBench, a "
#         "benchmark to evaluate the capabilities of language models to generate KGs from natural language text "
#         "guided by an ontology."
#     )
#     extract_spo_triples_with_refinement(raw_text)

In [6]:
import json

# === Load JSONL, Process, Save ===
def process_jsonl_and_generate_prompts(input_path: str, output_path: str):
    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            try:
                obj = json.loads(line)
                sent_id = obj.get("id")
                sent_text = obj.get("sent")

                if not sent_id or not sent_text:
                    print(f"Skipping line due to missing 'id' or 'sent': {line}")
                    continue

                # Generate refined prompt
                prompt = extract_spo_triples_with_refinement(sent_text)

                # Write result to output file
                json.dump({"id": sent_id, "prompt": prompt}, outfile)
                outfile.write("\n")
                print(f"Processed ID: {sent_id}")

            except Exception as e:
                print(f"Error processing line: {e}")
                continue

In [None]:
JSONL_FILEPATH = 'text_prompt.jsonl'
output_filepath='LLM_text_Response.jsonl'

In [None]:
# === Example Usage ===
if __name__ == "__main__":
    JSONL_FILEPATH = "ont_7_company_test.jsonl"
    output_filepath = "ont_7_company_prompts.jsonl"
    process_jsonl_and_generate_prompts(JSONL_FILEPATH, output_filepath)

concepts is here 
 ['Chinabank', 'Manila', 'the Insular Government of the Philippine Islands']
relations is here 
 ['found']
triples is here 
 [('Chinabank', 'founded at', 'Insular Government of the Philippine Islands'), ('Chinabank', 'type', 'public company'), ('Insular Government of the Philippine Islands', 'located in', 'Manila'), ('subject', 'predicate', 'object')]
Refined prompt is here:
 ### INSTRUCTION:
Your task is to carefully analyze the input text, concepts, relations, and example SPO triples, and produce a clean, consistent, and high-quality set of SPO triples suitable for building a knowledge graph.
You may add new concepts and relations if you find them in the text but not previously included.
Note that backtracking or revisiting previous steps is NOT possible.
Please output all SPO triples in a clear, structured list format, one triple per line, like: predicate(subject, object).
Only return the list of SPO triples. No extra commentary or explanation.

### ONTOLOGY CONCEP

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


concepts is here 
 ['295', '295 locations', 'Chinabank', 'Manila', 'Philippine', 'the insular Government of']
relations is here 
 ['found', 'have']
triples is here 
 [('brown', 'color', 'fox'), ('dog', 'jumped', 'fox'), ('fox', 'jumps', 'over'), ('fox', 'over', 'dog'), ('lazy', 'characteristic', 'dog'), ('quick', 'characteristic', 'fox'), ('subject', 'predicate', 'object'), ('the', 'article', 'dog'), ('the', 'article', 'fox')]
Refined prompt is here:
 ### INSTRUCTION:
Your task is to carefully analyze the input text, concepts, relations, and example SPO triples, and produce a clean, consistent, and high-quality set of SPO triples suitable for building a knowledge graph.
You may add new concepts and relations if you find them in the text but not previously included.
Note that backtracking or revisiting previous steps is NOT possible.
Please output all SPO triples in a clear, structured list format, one triple per line, like: predicate(subject, object).
Only return the list of SPO triple