In [4]:
from dotenv import load_dotenv

load_dotenv()
from openai import OpenAI
import json

In [5]:
openai = OpenAI()

# The file holds one JSON object per line. JSON text is UTF-8, so pin the
# encoding instead of relying on the platform default.
with open("data/animals.txt", "r", encoding="utf-8") as f:
    raw = f.readlines()

print(len(raw))

# Build one record per line: a sequential id plus the "anchor" and "positive"
# fields, renamed to "question" and "answer". Whitespace-only lines are
# skipped so a stray blank line does not crash json.loads.
data = [
    {"id": idx, "question": record["anchor"], "answer": record["positive"]}
    for idx, record in enumerate(
        json.loads(line) for line in raw if line.strip()
    )
]

356


In [29]:
# System message sent with every entity-extraction request.
SYSTEM = "You are a ontologist specialist. You have the skills to identify and extract any entities and relations between any pieces of data."

# User-message template, filled in with str.format(data=...).
# Doubled braces ({{ and }}) are str.format escapes that render as literal
# braces; {data} is the only real placeholder.
# NOTE(review): the worked example below uses "anchor"/"positive" (plus
# "category"/"keyword") keys, whereas the records built from the data file
# carry "id"/"question"/"answer" -- confirm the mismatch is intentional.
PROMPT = """
Task: Given the following questions and answers, extract the entities found in all of them. Try to keep it synthetic, general, and simple.

The output format should be a json object list with the following structure:
{{
  id: {{ question_id }}, entities: {{ entity_list }}
}}

Example:
input: {{"id": 0, "anchor": "What is the policy for carrying a falcon on Qatar Airways?", "positive": "Yes falcon can be carried only when traveling in economy class. After the booking is completed. Kindly visit the link below to place the request https://www.qatarairways.com/en/help.html?iid=ALL75199970#avih", "category": "animals", "keyword": "falcon"}}
output: {{"id": 0, "entities": ["falcon", "Qatar Airways", "policy", "economy class", "booking", "link", "request"]}}

Data:

{data}
"""

In [30]:
def _request_entities(record):
    """Send one record to the chat model and return the completion it yields.

    The record is serialized with json.dumps and substituted into PROMPT;
    SYSTEM is sent as the system message.
    """
    chat = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": PROMPT.format(data=json.dumps(record))},
    ]
    return openai.chat.completions.create(model="gpt-4o", messages=chat)


# One request per record, issued sequentially in record order.
responses = list(map(_request_entities, data))

In [31]:
import re

# JSON keys of the reply structure; they are not entities themselves.
_STRUCTURAL_KEYS = ("id", "entities")


def _extract_entities(reply):
    """Return the entity names contained in one model reply.

    The reply is expected to be JSON (optionally wrapped in a Markdown code
    fence) holding one object, or a list of objects, each with an
    "entities" list. Only the members of those lists are returned, so the
    keys "id" and "entities" never leak into the result.

    If the reply is not valid JSON, fall back to collecting every
    double-quoted string except the structural keys. A missing reply
    (None or empty) yields an empty list.
    """
    text = reply or ""
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL)
    payload = fenced.group(1) if fenced else text
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        return [
            quoted
            for quoted in re.findall(r'\"(.*?)\"', text)
            if quoted not in _STRUCTURAL_KEYS
        ]

    items = decoded if isinstance(decoded, list) else [decoded]
    found = []
    for item in items:
        if not isinstance(item, dict):
            continue
        members = item.get("entities", [])
        if isinstance(members, list):
            found.extend(str(member) for member in members)
    return found


# Keep the reply texts under their own name: `data` still has to hold the
# question/answer records for any cell that reads it afterwards or is re-run.
response_texts = [
    r.choices[0].message.content
    for r in responses
]

# ['```json\n[\n  {\n    "id": 0,\n    "entities": ["policy", "falcon", "Qatar Airways", "economy class", "booking", "request"]\n  }\n]\n```',

# One list of entity names per reply.
parsed = [
    _extract_entities(text)
    for text in response_texts
]

# set

In [32]:

# Flatten the per-reply lists in `parsed` into a single set of distinct strings.
entities = {name for names in parsed for name in names}

In [14]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
import concurrent
from langchain.docstore.document import Document
from concurrent.futures import ThreadPoolExecutor

# Chat model used by the graph transformer; temperature is set to 0.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
)


# Transformer that turns documents into graph documents using `llm`.
# NOTE(review): allowed_nodes is given in lowercase, while the recorded
# output of this notebook shows node types such as 'Question' and 'Answer'.
# NOTE(review): allowed_relationships is an empty list, yet the recorded
# output contains an 'ANSWER' relationship -- presumably an empty list means
# "no restriction"; confirm against the library documentation.
llm_transformer = LLMGraphTransformer(
    llm=llm,
    node_properties=True,
    allowed_nodes=["pet", "fee", "geo", "policy", "question", "answer"],
    allowed_relationships=[],
)
futures = []
graph_document_list = []
# NOTE(review): `data` must be the list of {id, question, answer} records
# here, as in this notebook's recorded output; make sure no other cell has
# rebound it before this one runs.
combined_chunk_document_list = data


def run_graph():
    """Convert every chunk into a graph document using a thread pool.

    Each item of ``combined_chunk_document_list`` is serialized with
    json.dumps, wrapped in a ``Document`` and submitted to
    ``llm_transformer.convert_to_graph_documents`` on a pool of 10 workers.
    Results are gathered into the module-level ``graph_document_list`` in
    completion order, which is not necessarily input order.

    Both module-level lists are emptied first, so calling the function again
    does not re-collect results from the previous call's futures.

    Raises:
        Any exception raised by a conversion is re-raised by
        ``future.result()``.
    """
    futures.clear()
    graph_document_list.clear()

    with ThreadPoolExecutor(max_workers=10) as executor:
        for chunk in combined_chunk_document_list:
            chunk_doc = Document(page_content=json.dumps(chunk))
            futures.append(
                executor.submit(llm_transformer.convert_to_graph_documents, [chunk_doc])
            )

        for future in concurrent.futures.as_completed(futures):
            # Each call received a single document, so the result is expected
            # to hold one graph document; extend() also tolerates an empty
            # result instead of raising IndexError.
            graph_document_list.extend(future.result())


run_graph()


In [15]:
# Display the collected graph documents (the whole list is rendered).
graph_document_list

[GraphDocument(nodes=[Node(id='9', type='Question', properties={'text': 'How much does it cost to transport a falcon from Europe to the Americas?'}), Node(id='9_Answer', type='Answer', properties={'text': 'The cost to transport a falcon from Europe to the Americas is 630 USD (CAD 820).'})], relationships=[Relationship(source=Node(id='9', type='Question'), target=Node(id='9_Answer', type='Answer'), type='ANSWER')], source=Document(page_content='{"id": 9, "question": "How much does it cost to transport a falcon from Europe to the Americas?", "answer": "The cost to transport a falcon from Europe to the Americas is 630 USD (CAD 820)."}')),
 GraphDocument(nodes=[Node(id='What Is The Policy For Carrying A Falcon On Qatar Airways?', type='Question'), Node(id='Yes Falcon Can Be Carried Only When Traveling In Economy Class. After The Booking Is Completed. Kindly Visit The Link Below To Place The Request Https://Www.Qatarairways.Com/En/Help.Html?Iid=All75199970#Avih', type='Answer')], relationsh