In [1]:
from llama_cpp import Llama
import re
import json
from IPython.core.display import display, HTML
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from transformers import pipeline


callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

  from IPython.core.display import display, HTML
  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


## Loading LLM locally

In [30]:
llm = Llama(model_path="/Users/ananyahooda/.cache/lm-studio/models/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q4_K_S.gguf",  
n_ctx=2048,
n_gpu_layers=-1,
n_batch=512,
callback_manager=callback_manager,
verbose=True,)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/ananyahooda/.cache/lm-studio/models/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q4_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dime

## Our Triple extraction tool (based on REBEL)

In [88]:
# Initialize the pipeline and tokenizer once
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')

def extract_text_triplets(input_text):
    """
    Extracts triplets from the given text.

    Parameters:
    input_text (str): The text from which to extract triplets.

    Returns:
    list: A list of dictionaries, each representing a triplet with 'head', 'type', and 'tail'.
    """
    # Use the tokenizer manually since we need special tokens
    extracted_text = triplet_extractor.tokenizer.batch_decode([
        triplet_extractor(input_text, return_tensors=True, return_text=False)[0]["generated_token_ids"]
    ])

    # Function to parse the generated text and extract the triplets
    def extract_triplets(text):
        triplets = []
        relation, subject, object_ = '', '', ''
        text = text.strip()
        current = 'x'
        for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
            if token == "<triplet>":
                current = 't'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                    relation = ''
                subject = ''
            elif token == "<subj>":
                current = 's'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                object_ = ''
            elif token == "<obj>":
                current = 'o'
                relation = ''
            else:
                if current == 't':
                    subject += ' ' + token
                elif current == 's':
                    object_ += ' ' + token
                elif current == 'o':
                    relation += ' ' + token
        if subject != '' and relation != '' and object_ != '':
            triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
        return triplets

    extracted_triplets = extract_triplets(extracted_text[0])
    return extracted_triplets

In [89]:
import json

# Function to extract triples for each context in eval.json and create pred.json
def generate_pred_json(eval_file_path, pred_file_path):
    # Load the evaluation data from eval.json
    with open(eval_file_path, 'r') as file:
        eval_data = json.load(file)
    
    # Initialize a list to hold the modified data with extracted triples
    modified_data = []
    
    # Iterate over each item in the evaluation data
    for item in eval_data:
        context = item['context']
        # Prepare the command with the contex
        # Use the process_command function to predict the extracted triples
        extracted_t = extract_text_triplets(context)
        # Append the extracted triples to the item under the 'triples' key
        item['triples'] =  extracted_t 
        # Append the modified item to the modified_data list
        modified_data.append(item)
    
    # Write the modified data with extracted triples to pred.json
    with open(pred_file_path, 'w') as file:
        json.dump(modified_data, file, indent=4)

# Example usage
eval_file_path = '/Users/ananyahooda/Desktop/final/data/evaluation_data/conll04_eval.json' # Replace with the actual path to your eval.json file
pred_file_path = '/Users/ananyahooda/Desktop/final/pred_conll04.json' # The output file path
generate_pred_json(eval_file_path, pred_file_path)

In [86]:
extract_text_triplets("An art exhibit at the Hakawati Theatre in Arab east Jerusalem was a series of portraits of Palestinians killed in the rebellion .")

[{'head': 'Hakawati Theatre',
  'type': 'located in the administrative territorial entity',
  'tail': 'Jerusalem'}]

In [None]:

Below are two examples where context is the text and triples are the corresponding extracted triples:
{{
        "context": "John Wilkes Booth , who assassinated President Lincoln , was an actor .
                "triples": [
            {{"head": "John Wilkes Booth","type": "killed by","tail": "President Lincoln"}}
        ]
    }},
{{
        "context": "Marie Magdefrau Ferraro , 50 , of Bethany , Conn. , was shot to death Thursday when two bandits armed with assault rifles emerged from nearby bushes and began firing at a van carrying a Connecticut Audubon Society wildlife wild tour group .",
        "triples": [
            {{"head": "Marie Magdefrau Ferraro", "type": "residence","tail": "Bethany"}},
            {{"head": "Marie Magdefrau Ferraro", "type": "residence", "tail": "Conn."}},
            {{"head": "Bethany","type": "location", "tail": "Conn."}}
        ]
}}

Here are the types/relations which are allowed: {{"killed by", "residence", "location", "headquarters", "location", "employer"}}

## LLM prompt

In [82]:
prompt_template = '''<s>[INST] <<SYS>>
Assistant is an expert JSON builder designed to assist with a wide range of tasks.

Assistant is able to trigger actions for User by responding with JSON strings that contain "action" and "action_input" parameters.

The available action to Assistant is:
- "extract_text_triplets": Useful for when Assistant is asked to extract triplets from a given text.
  - To use the extract_triplets tool, Assistant should respond like so:
    {{"action": "extract_text_triplets", "action_input": "Your text here"}}

Assistant will only output the following relations: "killed by", "residence", "location", "headquarters location", "employer".

Here are some previous conversations between the Assistant and User:

User: Hey how are you today?
Assistant: I'm good thanks, how are you?
User: Can you extract all the triplets from this text: "Gràcia is a district of the city of Barcelona, Spain."
Assistant: {{"action": "extract_text_triplets", "action_input": "Gràcia is a district of the city of Barcelona, Spain."}}
User: Also give triples for "obama was US president"
Assistant: {{"action": "extract_text_triplets", "action_input": "obama was US president"}}


<</SYS>>

{0}[/INST]'''

## Integrating single-tool with LLM

In [83]:
def process_command(command):
    # Put user command into prompt
    prompt = prompt_template.format("User: " + command)
    # Send command to the model
    output = llm(prompt, max_tokens=2000, stop=["User:"])
    response = output['choices'][0]['text']

    # try to find json in the response
    try:
        # Extract json from model response by finding first and last brackets {}
        firstBracketIndex = response.index("{")
        lastBracketIndex = len(response) - response[::-1].index("}")
        jsonString = response[firstBracketIndex:lastBracketIndex]
        responseJson = json.loads(jsonString)
        if responseJson['action'] == 'extract_text_triplets':
            extracted_triplets = extract_text_triplets(responseJson['action_input'])
            return extracted_triplets   
    except Exception as e:
        print(e)
    # No json match, just return response
    return response

In [84]:
process_command("Extract triples for:\"Ananya is working at IIT Bhilai\"")

Llama.generate: prefix-match hit

llama_print_timings:        load time =    4964.28 ms
llama_print_timings:      sample time =       7.61 ms /    30 runs   (    0.25 ms per token,  3942.70 tokens per second)
llama_print_timings: prompt eval time =    2304.51 ms /   205 tokens (   11.24 ms per token,    88.96 tokens per second)
llama_print_timings:        eval time =    2463.19 ms /    29 runs   (   84.94 ms per token,    11.77 tokens per second)
llama_print_timings:       total time =    4895.35 ms /   234 tokens


[{'head': 'Ananya', 'type': 'educated at', 'tail': 'IIT Bhilai'}]

## Generating Prediction files for evaluation

In [51]:
extracted_triplets = process_command("Can you please give triple for \"Ananya works for IIT Bhilai.\"")

# Save the extracted triples to a JSON file
output_file_path = 'out.json'  # Define the output file path

# Write the extracted triples to the output file
with open(output_file_path, 'w') as file:
    json.dump(extracted_triplets, file, indent=4)

# Print a message to indicate that the file has been saved
print(f"Extracted triples have been saved to {output_file_path}")

Llama.generate: prefix-match hit

llama_print_timings:        load time =    4964.28 ms
llama_print_timings:      sample time =      17.49 ms /    62 runs   (    0.28 ms per token,  3544.48 tokens per second)
llama_print_timings: prompt eval time =     428.34 ms /    21 tokens (   20.40 ms per token,    49.03 tokens per second)
llama_print_timings:        eval time =    5295.25 ms /    61 runs   (   86.81 ms per token,    11.52 tokens per second)
llama_print_timings:       total time =    6008.08 ms /    82 tokens


Extra data: line 2 column 1 (char 84)
Extracted triples have been saved to out.json


In [6]:
import json

# Function to extract triples for each context in eval.json and create pred.json
def generate_pred_json(eval_file_path, pred_file_path):
    # Load the evaluation data from eval.json
    with open(eval_file_path, 'r') as file:
        eval_data = json.load(file)
    
    # Initialize a list to hold the modified data with extracted triples
    modified_data = []
    
    # Iterate over each item in the evaluation data
    for item in eval_data:
        context = item['context']
        # Prepare the command with the context
        command = f"Can you please give triples for {context}"
        # Use the process_command function to predict the extracted triples
        extracted_triplets = process_command(command)
        # Append the extracted triples to the item under the 'triples' key
        item['triples'] = extracted_triplets
        # Append the modified item to the modified_data list
        modified_data.append(item)
    
    # Write the modified data with extracted triples to pred.json
    with open(pred_file_path, 'w') as file:
        json.dump(modified_data, file, indent=4)

# Example usage
eval_file_path = '/Users/ananyahooda/Desktop/final/data/evaluation_data/conll04_eval.json' # Replace with the actual path to your eval.json file
pred_file_path = '/Users/ananyahooda/Desktop/final/pred_conll04.json' # The output file path
generate_pred_json(eval_file_path, pred_file_path)

KeyError: '"Loc"'

In [93]:
import json

# Load the JSON data from the two files
with open('pred_conll04_Mistral.json', 'r') as file:
    data1 = json.load(file)

with open('pred_conll04.json', 'r') as file:
    data2 = json.load(file)

# Determine the length of both JSON files
length_data1 = len(data1)
length_data2 = len(data2)

# Print the lengths
print(f"Length of the golden truth JSON file: {length_data1}")
print(f"Length of the prediction JSON file: {length_data2}")

# Convert the lists to dictionaries indexed by the 'id' attribute
data1_dict = {item['id']: item for item in data1}
data2_dict = {item['id']: item for item in data2}

# Find the common IDs
common_ids = set(data1_dict.keys()) & set(data2_dict.keys())

# Extract the common data points
common_data1 = [data1_dict[id] for id in common_ids]
common_data2 = [data2_dict[id] for id in common_ids]

# Save the common data points to new JSON files
with open('pred_conll04_Mistral.json', 'w') as file:
    json.dump(common_data1, file, indent=4)

with open('pred_conll04.json', 'w') as file: 
    json.dump(common_data2, file, indent=4)

# Print a message to indicate that the new files have been created
print(f"Created new JSON files with common data points: 'common_file1.json' and 'common_file2.json'")

Length of the golden truth JSON file: 288
Length of the prediction JSON file: 288
Created new JSON files with common data points: 'common_file1.json' and 'common_file2.json'


In [95]:
import json

# Load the JSON data from the pred.json file
with open('pred_conll04_Mistral.json', 'r') as file:
    pred_data = json.load(file)

# Initialize a counter for entries with "triples" as a string
string_triples_count = 0
string_triples = []

# Iterate over the entries and check the type of "triples"
for entry in pred_data:
    if 'triples' in entry and isinstance(entry['triples'], str):
        string_triples_count += 1
        string_triples.append(entry['triples'])

# Print the count of such entries

print(f"Number of entries with 'triples' as a string: {string_triples_count}")

for i, string in enumerate(string_triples, start=1):
    print(f"String {i}: {string}")

Number of entries with 'triples' as a string: 26
String 1:  {"action": "extract_text_triplets", "action_input": "Of 40 million party line calls logged by New England Telephone over a 2-year period, at least 10 percent were made by those who dial services like XBT Telecom's 'Talkabout' teen line, a group conversation designed for people under age 16, said John Johnson, a spokesman for the telephone company.}

Possible triplets from the text:

1. ["New England Telephone", "logged", "40 million party line calls"]
2. ["Over a 2-year period", "at least", "10 percent"]
3. ["People who dial services like XBT Telecom's 'Talkabout'", "made", "party line calls"]
4. ["XBT Telecom's 'Talkabout'", "is", "a group conversation"]
5. ["People under age 16", "designed for", "XBT Telephone's 'Talkabout'"]
6. ["John Johnson", "said", "spokesman for the telephone company"]
String 2:  {"action": "extract_text_triplets", "action_input": "They plan to resubmit their proposal, and for the moment have pledged t

In [97]:
import json

# Load the JSON data from the two files
with open('pred_conll04.json', 'r') as file:
    data1 = json.load(file)

with open('pred_conll04_Mistral.json', 'r') as file:
    data2 = json.load(file)

# Find the IDs of entries with "triples" as a string in file1
ids_to_remove = [entry['id'] for entry in data2 if 'triples' in entry and isinstance(entry['triples'], str)]

# Remove the entries from both files
filtered_data1 = [entry for entry in data1 if entry['id'] not in ids_to_remove]
filtered_data2 = [entry for entry in data2 if entry['id'] not in ids_to_remove]

# Save the filtered data back to new JSON files
with open('golden_truth.json', 'w') as file:
    json.dump(filtered_data1 , file, indent=4)

with open('prediction.json', 'w') as file:
    json.dump(filtered_data2, file, indent=4)

# Print a message to indicate that the entries have been removed
print(f"Entries with 'triples' as a string have been removed. New files created: 'filtered_file1.json' and 'filtered_file2.json'")

Entries with 'triples' as a string have been removed. New files created: 'filtered_file1.json' and 'filtered_file2.json'


## Code for calculating Scores

In [100]:
import json

# Function to calculate precision, recall, and F1 score
def calculate_scores(tp, total_golden, total_prediction):
    precision = tp / total_prediction if total_prediction > 0 else 0
    recall = tp / total_golden if total_golden > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1

# Function to process the files and calculate the scores, considering extras
def evaluate_predictions_corrected(golden_file, prediction_file):
    # Load the golden truths and predictions
    with open(golden_file, 'r') as f:
        golden_data = json.load(f)
    with open(prediction_file, 'r') as f:
        prediction_data = json.load(f)

    tp = 0
    extras = 0

    # Convert golden data and prediction data into dictionaries for easier access
    golden_dict = {item['id']: set(tuple(triple.items()) for triple in item['triples']) for item in golden_data}
    prediction_dict = {item['id']: set(tuple(triple.items()) for triple in item['triples']) for item in prediction_data}

    # Iterate over each instance in the golden data to calculate true positives
    for id, golden_triples in golden_dict.items():
        prediction_triples = prediction_dict.get(id, set())
        tp += len(golden_triples & prediction_triples)

    # Calculate extras in prediction
    for id, prediction_triples in prediction_dict.items():
        if id not in golden_dict:
            extras += len(prediction_triples)
        else:
            unmatched_triples = prediction_triples - golden_dict[id]
            print(unmatched_triples)
            extras += len(unmatched_triples)

    # Calculate micro scores
    total_golden = sum(len(triples) for triples in golden_dict.values())
    total_prediction = sum(len(triples) for triples in prediction_dict.values())
    precision_micro, recall_micro, f1_micro = calculate_scores(tp, total_golden, total_prediction)

    # Calculate macro scores
    total_items = len(golden_dict)
    precision_macro, recall_macro, f1_macro = 0, 0, 0
    for id, golden_triples in golden_dict.items():
        prediction_triples = prediction_dict.get(id, set())
        tp = len(golden_triples & prediction_triples)
        precision, recall, _ = calculate_scores(tp, len(golden_triples), len(prediction_triples))
        precision_macro += precision
        recall_macro += recall
    precision_macro /= total_items
    recall_macro /= total_items
    f1_macro = 2 * (precision_macro * recall_macro) / (precision_macro + recall_macro) if (precision_macro + recall_macro) > 0 else 0

    return {
        'micro': {
            'precision': precision_micro,
            'recall': recall_micro,
            'f1': f1_micro
        },
        'macro': {
            'precision': precision_macro,
            'recall': recall_macro,
            'f1': f1_macro
        },
        'true_positives': tp,
        'extras': extras
    }


In [101]:
scores = evaluate_predictions_corrected('golden_truth.json', 'prediction.json')
print("Micro Scores:", scores['micro'])
print("Macro Scores:", scores['macro'])
print(scores['extras'])

set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
{(('head', 'West Germany'), ('type', 'member of'), ('tail', 'AP'))}
set()
set()
set()
set()
set()
set()
set()
set()
set()
{(('head', 'Bingham County'), ('type', 'office held by head of government'), ('tail', "Sheriff's")), (('head', "Sheriff's"), ('type', 'applies to jurisdiction'), ('tail', 'Bingham County'))}
{(('head', 'Yerevan'), ('type', 'located in the administrative territorial entity'), ('tail', 'Armenian')), (('head', 'Armenian'), ('type', 'capital'), ('tail', 'Yerevan'))}
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
{(('head', "Maryland's House of Delegates"), ('type', 'chairperson'), ('tail', 'Judith C. Toth'))}
set()
set()
set()
set()
