In [3]:
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is available
print(torch.cuda.current_device())  # Should print the current GPU device index

True
0


In [4]:
torch.cuda.empty_cache()

In [5]:
!nvidia-smi

Tue Sep 10 19:32:01 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1080 Ti     On  |   00000000:81:00.0 Off |                  N/A |
| 20%   47C    P5             24W /  250W |       3MiB /  11264MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!kill -9 2965937

/bin/bash: line 1: kill: (2965937) - No such process


# FOR QUANTATIZED MODEL

In [6]:
#!pip install transformers bitsandbytes accelerate

In [7]:
import torch

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer


In [9]:
from transformers import pipeline

2024-09-10 19:32:08.818710: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 19:32:08.844179: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 19:32:08.852016: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-10 19:32:08.871737: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
import bitsandbytes as bnb

In [11]:
model_id = "meta-llama/Meta-Llama-3-8B"

In [12]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [13]:
# Load the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,  # Enable 4-bit quantization
    device_map="auto",  # Automatically place model layers on GPU
    torch_dtype=torch.float16
    #quantization_config=bnb.BnbQuantizationConfig(load_in_4bit=True)
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
# Initialize the text-generation pipeline
text_generator = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer
    #device=0  # Use GPU 0
)

In [18]:
# Check if the model is on GPU or CPU
for param in model.parameters():
    print(f"Model is on: {param.device}")
    break  # Check one parameter

Model is on: cuda:0


In [63]:
import json

In [64]:
import jsonlines


In [65]:
import re

In [66]:
def load_prompts_data(filepath):
    with jsonlines.open(filepath) as reader:
        return list(reader)

In [67]:
def extract_prompt_info(item):
    prompt_id = item.get('id')
    prompt_text = item.get('prompt')
    print("prompt_text",prompt_text)
    return prompt_id, prompt_text

In [68]:
# Assuming 'text_generator' is already defined somewhere in your environment
def generate_text(prompt_text, max_length=500, num_return_sequences=1):
    response = text_generator(prompt_text, max_length=max_length, num_return_sequences=num_return_sequences)
    return response

In [69]:
def extract_test_output(response):
    print("response",response)
    if response and len(response) > 0:
        generated_text = response[0].get('generated_text', '')
        match = re.search(r'Test Output:\s*(.*?)\s*(?:\n|$)', generated_text, re.DOTALL)
        if match:
            return match.group(1).strip()
    return 'Output not found'

In [70]:
def parse_model_output(model_output):
    # Initialize an empty list to store triples
    triples = []
    
    # Remove leading/trailing whitespace and split the input data into lines
    lines = [line.strip() for line in model_output.strip().split('\n')]
    
    # Define the regex pattern to match triples in the format: relation(subject, object)
    pattern = re.compile(r'(.+?)\s*\(([^,]+),\s*([^)]+)\)')
    
    for line in lines:
        # Find all matches for the pattern in the line
        matches = pattern.findall(line)
        
        for match in matches:
            relation, subject, obj = match
            # Clean up subject and object values
            subject = subject.strip()
            obj = obj.strip()
            
            # Append the extracted triple to the list
            triples.append({"sub": subject, "rel": relation, "obj": obj})
    
    # Return the list of triples
    return triples

In [71]:
#JSONL_FILEPATH = 'ont_1_movie_prompts.jsonl'
#output_filepath='LLM_Response.jsonl'
JSONL_FILEPATH = 'Wikidata/Input_Prompts/ont_4_book_prompts.jsonl'
output_filepath='Wikidata/Response/ont_4_book_llm_response.jsonl'

#'Wikidata/Evaluation_Statistics/ont_6_computer_llm_stats.jsonl'

In [72]:
def main(filepath, output_filepath, num_prompts=4):
    prompts_data = load_prompts_data(filepath)
    processed_data = []
    for i in range(min(num_prompts, len(prompts_data))):
        item = prompts_data[i]
        prompt_id, prompt_text = extract_prompt_info(item)
        response = generate_text(prompt_text)
        test_output = extract_test_output(response)
        
        # Debugging: Print the test output to ensure it's correctly extracted
        print(f"Test Output for ID {prompt_id}: {test_output}")
        
        # Parse the test output into triples
        triples = parse_model_output(test_output)
        
        # Debugging: Print the parsed triples to ensure they're correct
        print(f"Parsed Triples for ID {prompt_id}: {triples}")
        
        processed_entry = {
            "id": prompt_id,
            "triples": triples
        }
        
        processed_data.append(processed_entry)
    
    # Save the processed data into a new jsonl file
    save_triples_to_jsonl(processed_data, output_filepath)
    print(f"Processed triples saved to {output_filepath}")

In [74]:
# Replace 'your_input_filepath.jsonl' with your actual input file path
# Replace 'your_output_filepath.jsonl' with your desired output file path
main(JSONL_FILEPATH, output_filepath, num_prompts=1)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


prompt_text 
Given the following ontology and sentences, please extract the triples from the sentence according to the relations in the ontology. In the output, only include the triples in the given output format.
CONTEXT:
Ontology Concepts: book, literary work, author, publisher, International Standard Book Number, library, calender date, film, fictional character, writer, scientific journal, article, human, literary genre, publication, trade magazine, language, intellectual work, country, territory,
Ontology Relations: illustrator(,human), followed_by(,), publication_date(book,), author(book,human), publisher(book,publisher), characters(literary work,fictional character), editor(trade magazine,human), place_of_publication(,), narrative_location(literary work,territory), genre(literary work,literary genre), language_of_work_or_name(literary work,language), depicts(,)

Example Sentence: The Foundling and Other Tales of Prydain is a collection of short high fantasy stories for children 

In [None]:
# Ignore the below cells

In [227]:
def load_jsonl(file_path):
    """
    Load data from a JSONL file.
    
    :param file_path: Path to the JSONL file.
    :return: A list of dictionaries representing each line of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    return data

In [228]:
# Load the data
ground_truth = load_jsonl('ont_1_movie_prompts.jsonl')
system_predicted = load_jsonl('LLM_Response.jsonl')

In [231]:
# Normalize and print system predicted triples
print("\nNormalized System Predicted Triples:")
for entry in system_predicted:
    print(f"ID: {entry['id']}")
    for triple in entry['triples']:
        norm_triple = normalize_triple(triple['sub'], triple['rel'], triple['obj'])
        print(f"Original Triple: {triple}")
        print(f"Normalized Triple: {norm_triple}")


Normalized System Predicted Triples:
ID: ont_1_movie_test_1
Original Triple: {'sub': 'Bleach: Hell Verse', 'rel': 'director', 'obj': 'Noriyuki Abe'}
Normalized Triple: bleachhellversedirectornoriyukiabe
ID: ont_1_movie_test_2
Original Triple: {'sub': 'Keyboard Cat', 'rel': 'director', 'obj': 'Charlie Schmidt'}
Normalized Triple: keyboardcatdirectorcharlieschmidt
ID: ont_1_movie_test_3
Original Triple: {'sub': 'Tenchi Forever! The Movie', 'rel': 'director', 'obj': 'Mitsuko Kase'}
Normalized Triple: tenchiforeverthemoviedirectormitsukokase
Original Triple: {'sub': 'Tenchi Forever! The Movie', 'rel': ',director', 'obj': 'Takashi Imanishi'}
Normalized Triple: tenchiforeverthemoviedirectortakashiimanishi
ID: ont_1_movie_test_4


In [None]:
# Normalize and print ground truth triples
print("Normalized Ground Truth Triples:")
for entry in ground_truth:
    print(f"ID: {entry['id']}")
    for triple in entry['triples']:
        norm_triple = normalize_triple(triple['sub'], triple['rel'], triple['obj'])
        print(f"Original Triple: {triple}")
        print(f"Normalized Triple: {norm_triple}")

In [239]:
def calculate_precision(ground_truth, system_predicted):
    """
    Calculate the precision of the system-predicted triples against the ground truth.
    
    :param ground_truth: List of ground truth entries.
    :param system_predicted: List of system-predicted entries.
    :return: Precision value as a float.
    """
    correct_predictions = 0
    total_predictions = 0
    
    # Convert ground truth data into a dictionary for quick lookup by ID
    ground_truth_dict = {entry['id']: entry['triples'] for entry in ground_truth}
    
    # Loop through each system-predicted entry
    for entry in system_predicted:
        predicted_id = entry['id']
        predicted_triples = entry.get('triples', [])
        
        if predicted_id in ground_truth_dict:
            ground_truth_triples = ground_truth_dict[predicted_id]
            
            # Normalize the ground truth triples
            normalized_ground_truth = set(
                normalize_triple(triple['sub'], triple['rel'], triple['obj']) 
                for triple in ground_truth_triples
            )
            
            # Normalize the predicted triples
            normalized_predictions = set(
                normalize_triple(triple['sub'], triple['rel'], triple['obj']) 
                for triple in predicted_triples
            )
            
            # Count correctly predicted triples
            correct_predictions += len(normalized_predictions & normalized_ground_truth)
        
        # Update total predictions
        total_predictions += len(predicted_triples)
    
    # Calculate precision
    precision = correct_predictions / total_predictions if total_predictions > 0 else 0.0
    
    return precision

In [240]:
def main(ground_truth_file, system_predicted_file):
    # Load data from JSONL files
    ground_truth = load_jsonl(ground_truth_file)
    system_predicted = load_jsonl(system_predicted_file)
    
    # Calculate precision
    precision = calculate_precision(ground_truth, system_predicted)
    
    print(f"Precision: {precision:.4f}")

# Example usage:
# main('ground_truth.jsonl', 'LLM_Response.jsonl')


In [241]:
main(('ont_1_movie_prompts.jsonl'), 'LLM_Response.jsonl')


KeyError: 'triples'

In [245]:
def calculate_precision_recall_f1(gold: set, pred: set) -> (float, float, float):
    """
    Method to calculate precision, recall and f1:
        Precision is calculated as correct_triples/predicted_triples and
        Recall as correct_triples/gold_triples
        F1 as the harmonic mean of precision and recall.
    :param gold: items in the gold standard
    :param pred: items in the system prediction
    :return:
        p: float - precision
        r: float - recall
        f1: float - F1
    """
    if not pred:
        # If there are no predictions, precision and F1 are 0
        return 0, 0, 0

    # Calculate precision
    correct = len(gold.intersection(pred))
    p = correct / len(pred) if len(pred) > 0 else 0

    if not gold:
        # If there are no gold triples, recall and F1 are 0
        return p, 0, 0

    # Calculate recall
    r = correct / len(gold) if len(gold) > 0 else 0

    # Calculate F1 score
    if p + r > 0:
        f1 = 2 * (p * r) / (p + r)
    else:
        f1 = 0

    return p, r, f1


In [257]:
def main(ground_truth_file, system_predicted_file):
    # Load the ground truth and system predicted data
    ground_truth = load_jsonl(ground_truth_file)

    system_predicted = load_jsonl(system_predicted_file)
    
    # Calculate precision, recall, and F1-score
    precision, recall, f1 = calculate_precision_recall_f1(
        extract_normalized_triples(ground_truth),
         extract_normalized_triples(system_predicted)
    )
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

def extract_normalized_triples(data):
    """
    Extract and normalize triples from the dataset.
    :param data: List of dictionaries containing triples.
    :return: A set of normalized triples.
    """
    normalized_triples = set()
    for entry in data:
        for triple in entry.get('triples', []):
            norm_triple = normalize_triple(triple['sub'], triple['rel'], triple['obj'])
            normalized_triples.add(norm_triple)
    return normalized_triples


In [278]:
#main('ont_1_movie_prompts.jsonl', 'LLM_Response.jsonl')

In [279]:
# Record the start time
start_time = time.time()
# Execute the pipeline

result = text_generator("Hey how are you doing today?", max_length=50, num_return_sequences=1)

# Record the end time
end_time = time.time()

# Calculate the duration
execution_time = end_time - start_time

# Print the result and execution time
print("Result:", result)
print(f"Execution Time: {execution_time:.2f} seconds")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Result: [{'generated_text': "Hey how are you doing today? I'm doing great! I'm so happy that you have decided to join me on my journey to being more healthy and happy. This is a journey that I will be taking with you for the next 21"}]
Execution Time: 3.08 seconds


In [282]:
# Extract the first prompt
item = prompts_data[0]
prompt_id = item['id']
prompt_text = item['prompt']

In [283]:
# Generate output using the Llama model
response = text_generator(prompt_text, max_length=500, num_return_sequences=1)
response

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': '\nGiven the following ontology and sentences, please extract the triples from the sentence according to the relations in the ontology. In the output, only include the triples in the given output format.\nCONTEXT:\nOntology Concepts: human, city, country, film, film genre, genre, film production company, film award, award, written work, film character, film organization,\nOntology Relations: director(film,human), screenwriter(film,human), genre(film,genre), based_on(film,written work), cast_member(film,human), award_received(film,award), production_company(film,film production company), country_of_origin(film,country), publication_date(film,), characters(film,film character), narrative_location(film,city), filming_location(film,city), main_subject(film,), nominated_for(film,award), cost(film,)\n\nExample Sentence: Resident Evil: Damnation, known as Biohazard: Damnation ( , BaiohazÄ\x81do: DamunÄ\x93shon) in Japan, is a 2012 Japanese adult animated biopunk horror act