In [1]:
from langchain.llms import Ollama

In [2]:
import json

In [3]:
import jsonlines   

In [4]:
# Configuration
LOCAL_MODEL = "llama3:latest"
RDF_FILEPATH = r"D:\New folder\RDF_Extractor_Llama3\Barack-Obama.txt"  # Update this with your Text file path

In [5]:
# Update paths for input and output files
JSONL_FILEPATH = r"D:\New folder\RDF_Extractor_Llama3\Data\ont_1_movie_prompts.JSONL"
OUTPUT_FILEPATH = r"D:\New folder\RDF_Extractor_Llama3\Data\Llama3_Output\generated_output.jsonl"

In [6]:
# Initialize the language model
llm = Ollama(base_url="http://localhost:11434", model=LOCAL_MODEL, verbose=True)

In [7]:
# Load the JSONL file containing prompts
with jsonlines.open(JSONL_FILEPATH) as reader:
    prompts_data = list(reader)

In [8]:
# List to store the output results
results = []
# Process each prompt
for item in prompts_data:
    prompt_id = item['id']
    prompt_text = item['prompt']
    #print(prompt_id)
    # Generate output using the Llama model
    response = llm.generate([prompt_text], max_length=900)
    # Extract generated text
    if response and response.generations:
        generated_text = response.generations[0][0].text
    else:
        generated_text = "No text generated."
    # Store the result
    result = {
        "id": prompt_id,
        "prompt": prompt_text,
        "generated_text": generated_text
    }
    results.append(result)

    # Print the generated summary (optional)
    print(f"ID: {prompt_id}\nGenerated Summary:\n{generated_text}\n{'-'*50}")

ID: ont_1_movie_test_1
Generated Summary:
I'd be happy to help!

Here are the triples extracted from the test sentence according to the relations in the ontology:

* director(Bleach: Hell Verse,Noriyuki Abe)

Let me know if you have any further questions or if there's anything else I can help with!
--------------------------------------------------
ID: ont_1_movie_test_2
Generated Summary:
Here are the extracted triples:

* screenwriter(Keyboard Cat,Charlie Schmidt)
* director(Keyboard Cat,Charlie Schmidt)

Note that there is no specific film mentioned in the sentence, so I only extracted relations related to the human (Charlie Schmidt) and the work being described (Keyboard Cat). If you'd like me to extract more information or clarify anything, feel free to ask!
--------------------------------------------------
ID: ont_1_movie_test_3
Generated Summary:
Here are the triples extracted from the sentence according to the relations in the ontology:

* director(series,Hiroshi Negishi)
* di

KeyboardInterrupt: 

In [None]:
# Save the results to a new JSONL file
with jsonlines.open(OUTPUT_FILEPATH, mode='w') as writer:
    writer.write_all(results)

print(f"Results saved to {OUTPUT_FILEPATH}")

In [80]:
Llama_output = """
ID: ont_1_movie_test_1
Generated Summary:
I'd be happy to help!

Here are the triples extracted from the test sentence according to the relations in the ontology:

* director(Bleach: Hell Verse,Noriyuki Abe)

Let me know if you have any further questions or if there's anything else I can help with!
--------------------------------------------------
ID: ont_1_movie_test_2
Generated Summary:
Here are the extracted triples:

* screenwriter(Keyboard Cat,Charlie Schmidt)
* director(Keyboard Cat,Charlie Schmidt)

Note that there is no specific film mentioned in the sentence, so I only extracted relations related to the human (Charlie Schmidt) and the work being described (Keyboard Cat). If you'd like me to extract more information or clarify anything, feel free to ask!
--------------------------------------------------
ID: ont_1_movie_test_3
Generated Summary:
Here are the triples extracted from the sentence according to the relations in the ontology:

* director(series,Hiroshi Negishi)
* director(series,Mitsuko Kase)
* director(series,Takashi Imanishi)
--------------------------------------------------
ID: ont_1_movie_test_4
Generated Summary:
Here are the extracted triples in the given output format:

* written_and_directed_by(Spirited Away,Hayao Miyazaki)
* production_company(Spirited Away,Studio Ghibli)
* production_company(Spirited Away,Tokuma Shoten)
* production_company(Spirited Away,Nippon Television Network)
* production_company(Spirited Away,Dentsu)
* production_company(Spirited Away,Buena Vista Home Entertainment)
* production_company(Spirited Away,Tohokushinsha Film)
* production_company(Spirited Away,Mitsubishi)
* distribution_channel(Spirited Away,Toho)

Note: Since there are multiple production companies and a distribution channel mentioned in the sentence, I extracted separate triples for each.
--------------------------------------------------
ID: ont_1_movie_test_5
Generated Summary:
Based on the given ontology and sentence, I will extract the triples according to the relations:

Looney Tunes: Back in Action is a 2003 American live-action/animated comedy film directed by Joe Dante and written by Larry Doyle.

Output:

* director(Looney Tunes: Back in Action,Joe Dante)
* screenwriter(Looney Tunes: Back in Action,Larry Doyle)

Note that I only extracted the relations mentioned in the sentence, which are "directed by" and "written by". If there were other relationships mentioned, such as genre or production company, those would also be included.
--------------------------------------------------
ID: ont_1_movie_test_6
Generated Summary:
Here are the extracted triples from the test sentence:

* screenwriter(The Life on the Earth - The Summer of Dioxin,Satoshi Dezaki)
* production_company(The Life on the Earth - The Summer of Dioxin,Magic Bus)
* production_company(The Life on the earth - The Summer of Dioxin,GoGo Visual Planning)
* narrative_location(The Life on the Earth - The Summer of Dioxin,Seveso)
* country_of_origin(The Life on the Earth - The Summer of Dioxin,Italy)
--------------------------------------------------
ID: ont_1_movie_test_7
Generated Summary:
Here are the extracted triples in the given output format:

* director(The Flame of Love,Richard Eichberg)
* director(The Flame of Love,Walter Summers)
* cast_member(The Flame of Love,Anna May Wong)
* cast_member(The Flame of Love,John Longden)
--------------------------------------------------
ID: ont_1_movie_test_8
Generated Summary:
Based on the ontology and the sentence, I can extract the following triple:

* director(Metal Skin Panic MADOX-01,Shinji Aramaki)
* screenwriter(Metal Skin Panic MADOX-01,Shinji Aramaki)

Let me know if you'd like me to help with anything else!
--------------------------------------------------
ID: ont_1_movie_test_9
Generated Summary:
Based on the given ontology and sentence, I extracted the following triples:

* director(Bleach: Memories of Nobody,Noriyuki Abe)

Note that I assumed "Bleach: Memories of Nobody" as the film, as it is not explicitly mentioned in the test sentence. If you'd like to extract the full output format, including the publication date and release locations, here are the additional triples:

* director(Bleach: Memories of Nobody,Noriyuki Abe)
* screenwriter(Bleach: Memories of Nobody,,) (Note: the screenwriter is not explicitly mentioned in the sentence, so I left it blank)
* genre(Bleach: Memories of Nobody,) (Note: the film genre is not explicitly mentioned in the sentence, so I left it blank)
* publication_date(Bleach: Memories of Nobody,2008-12-13)
* release_location1(Bleach: Memories of Nobody,Japan)
* release_location2(Bleach: Memories of Nobody,US)
--------------------------------------------------
ID: ont_1_movie_test_10
Generated Summary:
Here are the triples extracted from the test sentence according to the relations in the ontology:

* director(Fullmetal Alchemist the Movie: Conqueror of Shamballa,Seiji Mizushima)
* screenwriter(Fullmetal Alchemist the Movie: Conqueror of Shamballa,Sho Aikawa)
--------------------------------------------------
ID: ont_1_movie_test_11
Generated Summary:
Here are the triples extracted from the test sentence:

* director(Alice's Wonderland,Walt Disney)
* production_company(Alice's Wonderland,Walt Disney)
* narrative_location(Alice's Wonderland,Kansas City)
--------------------------------------------------
ID: ont_1_movie_test_12
Generated Summary:
Based on the provided ontology, I will extract the triples from the given sentence.

The sentence is: "Welcome to Eltingville was written by Dorkin and Chuck Sheetz and received a positive reception."

From this sentence, we can extract the following triples:

* based_on(Welcome to Eltingville,written work)
* screenwriter(Welcome to Eltingville,Dorkin)
* screenwriter(Welcome to Eltingville,Chuck Sheetz)

Note that there is no mention of an award or a specific reception date, so those relationships cannot be inferred from this sentence.

The output in the desired format would be:

based_on(Welcome to Eltingville,written work)
screenwriter(Welcome to Eltingville,Dorkin)
screenwriter(Welcome to Eltingville,Chuck Sheetz)
--------------------------------------------------
ID: ont_1_movie_test_13
Generated Summary:
Based on the ontology concepts and relations, I can extract the triples from the sentence as follows:

* director(The Night of Taneyamagahara,Kazuo Oga)
* production_company(The Night of Taneyamagahara,Studio Ghibli)

Note that there is no mention of screenwriter in the sentence, so no triple can be extracted for that relation.
--------------------------------------------------
ID: ont_1_movie_test_14
Generated Summary:
Based on the given ontology and sentence, I extracted the following triples:

* director(The Magnificent World of El-Hazard,Hiroki Hayashi)
* screenwriter(The Magnificent World of El-Hazard,Ryoe Tsukimura)

Let me know if you'd like me to clarify or extend my response!
--------------------------------------------------
ID: ont_1_movie_test_15
Generated Summary:
Based on the ontology and sentence, I extracted the following triples:

* screenwriter(Daicon III,Hideaki Anno)
* screenwriter(Daicon III,Hiroyuki Yamaga)
* screenwriter(Daicon III,Takami Akai)
* director(Daicon IV,Yamaga)
* animation_supervisor(Daicon IV,Anno)
* animation_supervisor(Daicon IV,Akai)
--------------------------------------------------
ID: ont_1_movie_test_16
Generated Summary:
Here are the extracted triples from the test sentence according to the relations in the ontology:

* director(The Disappearance of Haruhi Suzumiya,Tatsuya Ishihara)
* screenwriter(The Disappearance of Haruhi Suzumiya,Fumihiko Shimo)
* screenwriter(The Disappearance of Haruhi Suzumiya,Yasuhiro Takemoto)
* production_company(The Disappearance of Haruhi Suzumiya,Kyoto Animation)
--------------------------------------------------
ID: ont_1_movie_test_17
Generated Summary:
Here are the extracted triples:

* director(Who Framed Roger Rabbit,Robert Zemeckis)
* screenwriter(Who Framed Roger Rabbit,Jeffrey Price)
* screenwriter(Who Framed Roger Rabbit,Peter S. Seaman)
* production_company(Who Framed Roger Rabbit,Frank Marshall)
* production_company(Who Framed Roger Rabbit,Robert Watts)
--------------------------------------------------
ID: ont_1_movie_test_18
Generated Summary:
Here are the extracted triples:

* director(Beyond the Clouds, Makoto Shinkai)
* written_by(Beyond the Clouds, Makoto Shinkai)
* produced_by(Beyond the Clouds, Makoto Shinkai)
* cinematographed_by(Beyond the Clouds, Makoto Shinkai)
* edited_by(Beyond the Clouds, Makoto Shinkai)
* storyboarded_by(Beyond the Clouds, Makoto Shinkai)

Note that there is no specific mention of genre, award, or country_of_origin in this sentence, so those triples are not included.
--------------------------------------------------
ID: ont_1_movie_test_19
Generated Summary:
Based on the ontology and the test sentence, I will extract the triples according to the relations in the ontology. Here are the extracted triples:

* director(Arcadia of My Youth,Tomoharu Katsumata)
* animation_director(Arcadia of My Youth,Kazuo Komatsubara)

Note that I did not include any genre or award information, as it was not present in the test sentence.
--------------------------------------------------
ID: ont_1_movie_test_20
Generated Summary:
director(Friday the 13th Part 2,Raja Gosnell)
"""

In [81]:
def process_llama_output(output):
    # Split the output by ID segments
    segments = output.strip().split('\n--------------------------------------------------\n')
    
    results = []
    for segment in segments:
        # Extract ID and summary
        id_match = re.search(r'ID: (\S+)', segment)
        if id_match:
            id_value = id_match.group(1)
        else:
            continue
        
        # Extract the summary part of the segment
        summary_match = re.search(r'Generated Summary:\n([\s\S]+)', segment)
        if summary_match:
            summary_text = summary_match.group(1)
            
            # Extract the triples
            triples = re.findall(r'\* (\w+)\(([^,]+),([^)]*)\)', summary_text)
            
            # Format the response
            formatted_response = {
                "id": id_value,
                "response": '\n'.join(f"{triple[0]}({triple[1]},{triple[2]})" for triple in triples),
                "triples": [[triple[1], triple[0], triple[2]] for triple in triples]
            }
            
            results.append(formatted_response)
    
    return results


In [90]:
# Process the raw output
formatted_LLM_Out_results = process_llama_output(Llama_output)

# Print formatted results as JSON
print(json.dumps(formatted_LLM_Out_results, indent=2))

[
  {
    "id": "ont_1_movie_test_1",
    "response": "director(Bleach: Hell Verse,Noriyuki Abe)",
    "triples": [
      [
        "Bleach: Hell Verse",
        "director",
        "Noriyuki Abe"
      ]
    ]
  },
  {
    "id": "ont_1_movie_test_2",
    "response": "screenwriter(Keyboard Cat,Charlie Schmidt)\ndirector(Keyboard Cat,Charlie Schmidt)",
    "triples": [
      [
        "Keyboard Cat",
        "screenwriter",
        "Charlie Schmidt"
      ],
      [
        "Keyboard Cat",
        "director",
        "Charlie Schmidt"
      ]
    ]
  },
  {
    "id": "ont_1_movie_test_3",
    "response": "director(series,Hiroshi Negishi)\ndirector(series,Mitsuko Kase)\ndirector(series,Takashi Imanishi)",
    "triples": [
      [
        "series",
        "director",
        "Hiroshi Negishi"
      ],
      [
        "series",
        "director",
        "Mitsuko Kase"
      ],
      [
        "series",
        "director",
        "Takashi Imanishi"
      ]
    ]
  },
  {
    "id": "ont_

In [66]:
import re
import json

In [83]:
# Function to read the JSONL file and extract ground truth data
def load_ground_truth(file_path, max_id=20):
    ground_truth = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            # Extract the numeric part of the ID and check if it's within the range
            numeric_id = int(data["id"].split("_")[-1])
            if numeric_id <= max_id:
                ground_truth.append(data)
    return ground_truth

In [84]:
def normalize_string(s):
    return s.replace(" :", ":").replace(": ", ":").strip()

In [85]:
# Function to calculate precision
def calculate_precision(llm_output, ground_truth):
    total_correct_matches = 0
    total_predicted = 0
    matching_triples = []

    # Convert ground truth triples to a dictionary for easier comparison
    ground_truth_dict = {}
    for entry in ground_truth:
        ground_truth_dict[entry["id"]] = [(normalize_string(t["sub"]), t["rel"].strip(), t["obj"].strip()) for t in entry["triples"]]

    for entry in llm_output:
        predicted_triples = [(normalize_string(t[0]), t[1].strip(), t[2].strip()) for t in entry["triples"]]
        total_predicted += len(predicted_triples)

        ground_truth_triples = ground_truth_dict.get(entry["id"], [])

        for triple in predicted_triples:
            if triple in ground_truth_triples:
                total_correct_matches += 1
                matching_triples.append(triple)

    # Calculate precision
    print("Total Correct Matches:", total_correct_matches)
    print("Total Predicted Triples:", total_predicted)
    print("Matching Triples:", matching_triples)
    
    precision = total_correct_matches / total_predicted if total_predicted > 0 else 0
    return precision

In [86]:
# File path to the ground truth JSONL file
file_path = r'D:\New folder\RDF_Extractor_Llama3\Data\ont_1_movie_ground_truth.jsonl'

In [87]:
# Load the ground truth data for IDs 1-20
ground_truth = load_ground_truth(file_path, max_id=20)

In [91]:
# Calculate precision
precision = calculate_precision(formatted_LLM_Out_results, ground_truth)
print(f"Precision: {precision:.2f}")

Total Correct Matches: 13
Total Predicted Triples: 70
Matching Triples: [('Bleach:Hell Verse', 'director', 'Noriyuki Abe'), ('Keyboard Cat', 'director', 'Charlie Schmidt'), ('Looney Tunes:Back in Action', 'director', 'Joe Dante'), ('The Flame of Love', 'director', 'Richard Eichberg'), ('Metal Skin Panic MADOX-01', 'director', 'Shinji Aramaki'), ('Fullmetal Alchemist the Movie:Conqueror of Shamballa', 'director', 'Seiji Mizushima'), ('The Night of Taneyamagahara', 'director', 'Kazuo Oga'), ('The Disappearance of Haruhi Suzumiya', 'director', 'Tatsuya Ishihara'), ('The Disappearance of Haruhi Suzumiya', 'screenwriter', 'Fumihiko Shimo'), ('Who Framed Roger Rabbit', 'director', 'Robert Zemeckis'), ('Who Framed Roger Rabbit', 'screenwriter', 'Jeffrey Price'), ('Who Framed Roger Rabbit', 'screenwriter', 'Peter S. Seaman'), ('Arcadia of My Youth', 'director', 'Tomoharu Katsumata')]
Precision: 0.19


Please ignore below cells

In [None]:
# Provided LLM output and ground truth
llm_output = [
    {
        "id": "ont_1_movie_test_1",
        "response": "director(Bleach: Hell Verse,Noriyuki Abe)",
        "triples": [
            ["Bleach: Hell Verse", "director", "Noriyuki Abe"]
        ]
    },
    {
        "id": "ont_1_movie_test_2",
        "response": "screenwriter(Keyboard Cat,Charlie Schmidt)\ndirector(Keyboard Cat,Charlie Schmidt)",
        "triples": [
            ["Keyboard Cat", "screenwriter", "Charlie Schmidt"],
            ["Keyboard Cat", "director", "Charlie Schmidt"]
        ]
    },
    {
        "id": "ont_1_movie_test_3",
        "response": "director(series,Hiroshi Negishi)\ndirector(series,Mitsuko Kase)\ndirector(series,Takashi Imanishi)",
        "triples": [
            ["series", "director", "Hiroshi Negishi"],
            ["series", "director", "Mitsuko Kase"],
            ["series", "director", "Takashi Imanishi"]
        ]
    }
]




ground_truth = [
    {
        "id": "ont_1_movie_test_1",
        "triples": [
            {"sub": "Bleach : Hell Verse", "rel": "director", "obj": "Noriyuki Abe"},
            {"sub": "Bleach : Hell Verse", "rel": "publication date", "obj": "01 January 2010"}
        ]
    },
    {
        "id": "ont_1_movie_test_2",
        "triples": [
            {"sub": "Keyboard Cat", "rel": "cast member", "obj": "Fatso the Cat"},
            {"sub": "Keyboard Cat", "rel": "director", "obj": "Charlie Schmidt"}
        ]
    },
    {
        "id": "ont_1_movie_test_3",
        "triples": [
            {"sub": "Mobile Suit Gundam 0083 : Stardust Memory", "rel": "director", "obj": "Takashi Imanishi"}
        ]
    }
]



In [None]:
def normalize_string(s):
    return s.replace(" :", ":").replace(": ", ":").strip()

def calculate_precision(llm_output, ground_truth):
    total_correct_matches = 0
    total_predicted = 0
    matching_triples = []

    # Convert ground truth triples to a dictionary for easier comparison
    ground_truth_dict = {}
    for entry in ground_truth:
        ground_truth_dict[entry["id"]] = [(normalize_string(t["sub"]), t["rel"].strip(), t["obj"].strip()) for t in entry["triples"]]

    for entry in llm_output:
        predicted_triples = [(normalize_string(t[0]), t[1].strip(), t[2].strip()) for t in entry["triples"]]
        total_predicted += len(predicted_triples)

        ground_truth_triples = ground_truth_dict.get(entry["id"], [])

        for triple in predicted_triples:
            if triple in ground_truth_triples:
                total_correct_matches += 1
                matching_triples.append(triple)

    # Calculate precision
    print("Total Correct Matches:", total_correct_matches)
    print("Total Predicted Triples:", total_predicted)
    print("Matching Triples:", matching_triples)
    
    precision = total_correct_matches / total_predicted if total_predicted > 0 else 0
    return precision


In [None]:
precision = calculate_precision(llm_output, ground_truth)
print(f"Precision: {precision:.2f}")