In [1]:
%load_ext autoreload
%autoreload 2


import sys
sys.path.append("..")

The goal of this notebook is to evaluate the triplets returned by iText2KG against human-extracted triplets. 

To achieve this, we have enhanced the music and computer science datasets curated by Kabal et al. (Kabal, Othmane, et al. "Enhancing Domain-Independent Knowledge Graph Construction through OpenIE Cleaning and LLMs Validation." International Conference on Knowledge-Based and Intelligent Information & Engineering Systems, 2024) by grouping the triplets that share the same source sentence and by extracting triplets from each of those sentences using iText2KG.

Additionally, we incorporated factoids (Chen, Tong, et al. "Dense x retrieval: What retrieval granularity should we use?" arXiv preprint arXiv:2312.06648, 2023) using GPT-4o and re-extracted triplets from these factoids with iText2KG. The aim is to compare triplets extracted from raw documents (using iText2KG) and factoids (using iText2KG) against human-extracted triplets.

There is no need to re-run all cells, as the datasets are available in the datasets/curated folder.

## Loading the datasets
---

In [2]:
import pandas as pd 

# Gold-standard annotations, one spreadsheet per domain.
music = pd.read_excel(io="GS_Music.xlsx")
cs = pd.read_excel(io="GS_CS-corpus.xlsx")

## LLM Model and Embeddings Model
---

In [3]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

import os
from getpass import getpass

# Credentials must not live in the notebook source: read the key from the
# OPENAI_API_KEY environment variable and only fall back to an interactive,
# non-echoing prompt when that variable is unset or empty.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Chat model used for every LLM call in this notebook (gpt-4o, temperature 0,
# at most 2 retries). `llm` is bound to the same object as `openai_llm_model`.
openai_llm_model = llm = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embeddings model handed to iText2KG together with the chat model.
openai_embeddings_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-large",
)

In [4]:
from itext2kg import iText2KG

  warn(


In [5]:
# Graph builder wired to the chat model and the embeddings model created earlier.
itext2kg = iText2KG(
    llm_model=openai_llm_model,
    embeddings_model=openai_embeddings_model,
)

## Preprocessing of the two datasets, grouping the phrases and triplets. 
---

In [None]:
# Collapse the CS annotations to one row per distinct sentence, then give the
# aggregated columns names that reflect their new content.
grouped_cs = (
    cs.groupby('Sentences')
    .agg({
        'Sentences': "first",  # the sentence itself (identical for every row of a group)
        'Triplet': lambda triplets: '; '.join(triplets),  # all triplets of the sentence, "; "-separated
        'Topic': 'first',  # first topic of the group
        'Annotator 1': 'first',  # first value of annotator 1
        'Annotator 2': 'first',  # first value of annotator 2
        # NOTE(review): spelled differently from 'Triplet_id' in the music
        # dataset; presumably this matches the spreadsheet header - confirm.
        'Ttiple_id': 'first',  # first triplet id of the group
    })
    .reset_index(drop=True)  # drop the group-key index; 'Sentences' is still a regular column
    .rename(columns={
        'Sentences': 'Combined_Sentences',
        'Triplet': 'Combined_Triplets',
    })
)

In [6]:
# The music sheet stores subject / predicate / object in separate columns:
# render them as one "(subject, predicate, object)" string per row first.
music["Triplet"] = music.apply(
    lambda row: f"({row['Subject']}, {row['Predicate']}, {row['Object']})",
    axis=1,
)

# Collapse the annotations to one row per distinct sentence, then give the
# aggregated columns names that reflect their new content.
grouped_music = (
    music.groupby('Sentences')
    .agg({
        'Sentences': "first",  # the sentence itself (identical for every row of a group)
        'Triplet': lambda triplets: '; '.join(triplets),  # all triplets of the sentence, "; "-separated
        'Topic': 'first',  # first topic of the group
        'Annotator 1': 'first',  # first value of annotator 1
        'Annotator 2': 'first',  # first value of annotator 2
        'Triplet_id': 'first',  # first triplet id of the group
    })
    .reset_index(drop=True)  # drop the group-key index; 'Sentences' is still a regular column
    .rename(columns={
        'Sentences': 'Combined_Sentences',
        'Triplet': 'Combined_Triplets',
    })
)

## Adding factoids for each sentence
---

In [7]:
from itext2kg.documents_distiller import DocumentsDistiller
from pydantic import BaseModel, Field
from typing import List

# Distiller used by `factoid`; it runs on the same chat model as the graph builder.
document_distiller = DocumentsDistiller(
    llm_model=openai_llm_model,
)

# Output schema for a single proposition.
class SimplePhrase(BaseModel):
    # The text of the proposition; the description string is attached to the
    # field through pydantic's `Field`.
    phrase: str = Field(
        description="Propositions are defined as atomic expressions within text, each encapsulating a distinct factoid and presented in a concise, self-contained natural language format.",
    )

# Output schema for a whole distillation: the list of propositions.
class ListOfSimplePhrases(BaseModel):
    # Every proposition, each as a `SimplePhrase`.
    phrases: List[SimplePhrase] = Field(
        description="All Propositions",
    )

# Instruction prompt passed to the distiller as its `IE_query` argument by
# `factoid`. The text (including its leading whitespace) is sent as-is, so it
# must not be reformatted.
IE_query = '''
        # DIRECTIVES : 
        - Act like an experienced paraphraser.
        - Paraphrase the following context into a list of simple phrases.
    '''
        
# Turn one paragraph into its list of factoid phrases.
def factoid(paragraph: str):
    """Distill `paragraph` into a list of simple phrases.

    Prints a progress marker, then calls `document_distiller.distill` with the
    paragraph wrapped in a one-element document list, the module-level
    `IE_query` prompt and the `ListOfSimplePhrases` output schema.

    Args:
        paragraph: The text to distill.

    Returns:
        The "phrase" value of every entry found under the "phrases" key of the
        distiller's result, in the order they were returned.
    """
    print("distilling ..")
    distillation = document_distiller.distill(
        documents=[paragraph],
        IE_query=IE_query,
        output_data_structure=ListOfSimplePhrases,
    )
    phrases = []
    for entry in distillation["phrases"]:
        phrases.append(entry["phrase"])
    return phrases

In [8]:
# One distiller call per grouped sentence; the resulting phrase lists are stored per row.
grouped_music["Factoids"] = grouped_music["Combined_Sentences"].apply(factoid)

distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distil

In [171]:
# One distiller call per grouped sentence; the resulting phrase lists are stored per row.
grouped_cs["Factoids"] = grouped_cs["Combined_Sentences"].apply(factoid)

distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distilling ..
distil

## Triplets from factoids and from original sentences
---

In [22]:
def extract_triplets_from_original_sentences(original_sentences: str):
    """Extract triplets from a raw sentence group.

    The text is passed to `itext2kg.build_graph` as a single section.

    Args:
        original_sentences: The text to build the graph from.

    Returns:
        A list of `(start entity name, relation name, end entity name)` tuples,
        one per relationship of the built graph, in the graph's order.
    """
    graph = itext2kg.build_graph(sections=[original_sentences])
    triplets = []
    for relation in graph.relationships:
        triplets.append((relation.startEntity.name, relation.name, relation.endEntity.name))
    return triplets

def extract_triplets_from_factoids(factoids: List[str]):
    """Extract triplets from the factoids of one sentence group.

    The factoids are joined with single spaces into one text, which is passed
    to `itext2kg.build_graph` as a single section.

    Args:
        factoids: The factoid phrases (as produced by `factoid`). A bare string
            is also accepted and treated as a single factoid.

    Returns:
        A list of `(start entity name, relation name, end entity name)` tuples,
        one per relationship of the built graph, in the graph's order.
    """
    # Joining a plain string would interleave spaces between its characters,
    # so wrap it into a one-element list first.
    if isinstance(factoids, str):
        factoids = [factoids]
    kg = itext2kg.build_graph(sections=[" ".join(factoids)])
    return [(rel.startEntity.name, rel.name, rel.endEntity.name) for rel in kg.relationships]

In [23]:
# Build one graph per row from that row's factoid list and keep its triplets.
grouped_music["Triplets_From_Factoids"] = grouped_music["Factoids"].apply(extract_triplets_from_factoids)

[INFO] ------- Extracting Entities from the Document 1
{'entities': [{'name': 'Methodist', 'label': 'Religious Denomination'}, {'name': 'Holiness', 'label': 'Religious Movement'}, {'name': 'Baptist', 'label': 'Religious Denomination'}]}
[INFO] ------- Extracting Relations from the Document 1
{'relationships': [{'startNode': {'name': 'methodist', 'label': 'Religious_Denomination'}, 'endNode': {'name': 'holiness', 'label': 'Religious_Movement'}, 'name': 'is_a_part_of'}, {'startNode': {'name': 'baptist', 'label': 'Religious_Denomination'}, 'endNode': {'name': 'holiness', 'label': 'Religious_Movement'}, 'name': 'is_a_part_of'}]}
[INFO] Verification of invented entities
[INFO] ------- Extracting Entities from the Document 1
{'entities': [{'name': 'Progressive Bluegrass', 'label': 'Genre'}, {'name': 'Electric Instruments', 'label': 'Instrument'}, {'name': 'Rock & Roll', 'label': 'Genre'}, {'name': 'Cadillac Sky', 'label': 'Band'}, {'name': 'Bearfoot', 'label': 'Band'}]}
[INFO] ------- Extrac

In [None]:
# Build one graph per row from the raw grouped sentence and keep its triplets.
grouped_music["Triplets_From_Original_Sentences"] = grouped_music["Combined_Sentences"].apply(
    extract_triplets_from_original_sentences
)

## Save the two datasets
---

The same steps apply to the grouped_cs dataset.

In [24]:
import pickle

# Persist the enriched music frame as a pickle next to the notebook.
with open("./grouped_music_.pkl", mode="wb") as pickle_handle:
    pickle.dump(grouped_music, pickle_handle)

In [26]:
# Also export the frame as a spreadsheet, without the positional index column.
grouped_music.to_excel(excel_writer="./grouped_music.xlsx", index=False)