In [None]:
from neo4j import GraphDatabase
from tqdm import tqdm
import config.EnvLoader as el

# Connection URI of the local Neo4j instance queried throughout this notebook.
URI = "neo4j://localhost"
# (user, password) pair; the password is read from the config.EnvLoader module
# imported above as `el`.
AUTH = ("neo4j", el.NEO4J_PWD)

In [None]:
from langchain_openai import AzureChatOpenAI

# Azure-hosted chat model; it is invoked below both to generate the multi-hop
# questions and to score them. Key and endpoint come from config.EnvLoader.
llm = AzureChatOpenAI(
    model="gpt-4o-global",
    azure_deployment="gpt-4o-global",
    api_key=el.OPENAI_API_KEY,
    azure_endpoint=el.AZURE_ENDPOINT,
    openai_api_version="2024-02-15-preview",
)

In [3]:
# Per-token cost table keyed by model family. Each rate is written as
# "cost per one million tokens" divided down to a single token.
cost_dict = {
    "gpt-4o": dict(
        prompt_tokens=2.5 / 1_000_000,
        completion_tokens=10 / 1_000_000,
    ),
}

def compute_costs(token_usage:dict, costs:dict):
    """Return the cost of a single LLM call.

    Args:
        token_usage: mapping with the "completion_tokens" and "prompt_tokens"
            counts of the call.
        costs: mapping with the per-token rate under the same two keys.

    Returns:
        completion_tokens * completion rate + prompt_tokens * prompt rate.

    Raises:
        KeyError: if either mapping lacks one of the two keys.
    """
    completion_cost = token_usage["completion_tokens"] * costs["completion_tokens"]
    prompt_cost = token_usage["prompt_tokens"] * costs["prompt_tokens"]
    return completion_cost + prompt_cost

# System Prompt

In [None]:
# System prompt for the question-generation step. It asks for a three-line reply
# (Question / Answer / Chunk used) or the literal "ND"; the generation loop below
# relies on that three-line shape when parsing.
# NOTE(review): the prompt text contains typos ("piace" for "piece",
# "[SOCCER TORUNAMENT]" in the first example answer). They are left untouched here
# because editing the prompt changes what the model receives.
system_prompt = """Imagine being a system made for building tests.
You are going to be given two or more chunks of text, and an entity recurring in each of these chunks.
Write a multi-hop question that needs at least two of the chunks to be answered AND its answer. Provide also the name of the chunks from which the question has been formulated.
The provided entity shouldn't either be the subject of the question nor the answer, but should be a piace needed to answer the question.
If no straightforward question can be extracted from the chunks, answer with "ND".
Here are some examples:

INPUT:
- chunk1: "The flag of [NATION] is [COLORS of FLAG]."
- chunk2: "[NATION] won the [SOCCER TOURNAMENT]."
- entity: [NATION]
OUTPUT:
Question: What color is the flag of the nation who won the [SOCCER TOURNAMENT]?
Answer: The flag of the nation that won the [SOCCER TORUNAMENT], [NATION], is [COLORS of FLAG].
Chunk used: chunk1, chunk2

INPUT:
- chunk1: "[PERSON1] is the sister of [PERSON2]"
- chunk2: "[PERSON2] is from [NATION]"
- chunk3: "[PERSON2] is the lead singer of [BAND]"
- entity: [PERSON2]
OUTPUT:
Question: Who is the sister of the lead singer of [BAND]?
Answer: The sister of the lead singer of [BAND], [PERSON2], is [PERSON1].
Chunk used: chunk1, chunk3

The question should not simply ask about information of the entity in the first and second chunk, but should ask for a link between them.
Here are some example of a good and bad question:
INPUT:
- chunk1: "In 1996 [ACTOR] performs in [FILM]."
- chunk2: "[FILM] talks about [TOPIC]."
- entity: [FILM]
BAD question: Who performed in [FILM] and what is [FILM] about?
GOOD question: What is the film [ACTOR] starred in 1996 about?

INPUT:
- chunk1: "[PERSON1] invented [INVENTION]."
- chunk2: "[PERSON1] suffered of [ILLNESS]."
- entity: [PERSON1]
BAD question: What did [PERSON1] invent and what illness was he suffering from?
GOOD question: What disease did the inventor of [INVENTION] suffer from?

You should make sure that the question has only one straightforward answer.
Here are some examples.
BAD question: Who is the actor who played Jack in one of the most influential movies of the 90's?
There could be more than one actor that played the role of a character named Jack in an influential movie of the 90's.
GOOD question: Who is the actor who played Jack in the movie that came out on February 11th 1996?
Making sure there is only ONE character named Jack in the movies that came out on that date.

If no GOOD question can be extracted from the context, answer with "ND".
"""

# Construction

In [46]:
# Sample candidate entities for question generation.
# The query keeps Page nodes whose text ends with a parenthesised suffix
# (regex `.*\(.*\)$`), that are linked through `cited_in` to at least two Chunk
# nodes, and returns up to 1000 of them together with their chunks.
# NOTE: `ORDER BY rand()` makes the sample differ on every run.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    results, summary, keys = driver.execute_query(
        """MATCH (p:Page)-[:cited_in]->(c:Chunk)
        WHERE p.text =~ ".*\\(.*\\)$"
        WITH p, COLLECT(c) AS chunks, COUNT(c) AS citationCount
        WHERE citationCount >= 2
        RETURN p.text, chunks
        ORDER BY rand()
        LIMIT 1000
        """
    )

In [47]:
# Sanity check: number of sampled entities (the query caps it at 1000).
len(results)

1000

In [48]:
# Generate multi-hop question/answer pairs: one LLM call per sampled entity,
# stopping as soon as 200 parsable answers have been collected.
tot_cost = 0
answers = []

for result in tqdm(results):
    # Build Context: keep at most one chunk per source page (the part of the
    # chunk uri before "#"), so the chunks shown to the LLM come from
    # different pages.
    context = "INPUT:\n"
    chunk_dict = {}  # prompt label ("chunk<i>") -> chunk uri
    page_lst = []
    for i, chunk in enumerate(result[1]):
        page_uri = chunk["uri"].split("#")[0]
        if page_uri in page_lst:
            continue
        page_lst.append(page_uri)
        context += f"- chunk{i}: {chunk['uri']}\n{chunk['text']}\n"
        chunk_dict[f"chunk{i}"] = chunk["uri"]
    # Skip entities whose chunks all belong to a single page.
    if len(page_lst) < 2:
        continue
    context += f"- entity: {result[0]}"
    context += "\nOUTPUT:\n"

    # Query the LLM
    message = [
        ("system", system_prompt),
        ("human", "CONTEXT:\n" + context),
    ]
    response = llm.invoke(message)
    # Compute costs. The last three "-"-separated parts of the reported model
    # name are dropped to obtain the cost_dict key.
    model_name = "-".join(response.response_metadata["model_name"].split("-")[:-3])
    tot_cost += compute_costs(response.response_metadata["token_usage"], cost_dict[model_name])

    # Save response
    answer = {
        "response": response.content,
        "context": context,
        "entity": result[0],
        "chunks_used": []
    }
    # Parse response. Blank lines and surrounding whitespace are dropped first so
    # that a well-formed reply with a trailing newline or blank separator lines
    # is not discarded.
    lines = [line.strip() for line in response.content.splitlines() if line.strip()]
    # If answer = "ND" or is not parsable, continue
    if len(lines) != 3:
        continue
    # Everything after the first ":" (the "Question:" / "Answer:" label).
    answer["question"] = lines[0].partition(":")[2]
    answer["answer"] = lines[1].partition(":")[2]
    # Comma-separated chunk labels after the last ":"; labels that were not
    # part of the prompt are ignored.
    chunk_used = lines[2].rpartition(":")[2]
    for label in chunk_used.split(","):
        chunk_uri = chunk_dict.get(label.strip())
        if chunk_uri:
            answer["chunks_used"].append(chunk_uri)
    answers.append(answer)
    if len(answers) == 200:
        break

 34%|███▎      | 336/1000 [06:05<12:03,  1.09s/it]


In [49]:
# Accumulated cost of the generation calls, as computed by compute_costs.
tot_cost

2.682875

In [50]:
# Inspect the generated observations (this displays the whole list).
answers

[{'response': 'Question: Which scientist advised against patenting the drug that had a major impact during World War II and also contributed to the understanding of chemical transmission in the brain?\nAnswer: The scientist who advised against patenting penicillin and contributed to the understanding of chemical transmission in the brain was Henry Hallett Dale.\nChunks used: chunk1, chunk2',
  'context': 'INPUT:\n- chunk0: List of Nobel laureates#List of laureates#1\nYear|Physics|Chemistry|Physiology|or Medicine|Literature|Peace|Economics|1901|Rntgen, WilhelmWilhelm Rntgen|Hoff, Jacobus Henricus van \'tJacobus Henricus van \'t Hoff|von Behring, Emil AdolfEmil Adolf von Behring|Prudhomme, SullySully Prudhomme|Dunant, HenryHenry Dunant;|Passy, FrdricFrdric Passy||1902|Lorentz, HendrikHendrik Lorentz;|Zeeman, PieterPieter Zeeman|Fischer, Hermann EmilHermann Emil Fischer|Ross, RonaldRonald Ross|Mommsen, TheodorTheodor Mommsen|Ducommun, lielie Ducommun;|Gobat, Charles AlbertCharles Albert G

# Evaluator
A first screening pass performed by the LLM.

In [51]:
# System prompt for the evaluation step: the model writes a short assessment
# followed by "FINAL SCORE: <n>", which the evaluation loop parses.
# NOTE(review): this rebinds `system_prompt`, so the generation prompt defined
# earlier is no longer available under that name after this cell runs.
# NOTE(review): the prompt text contains typos ("complexive", "questioned itself",
# "The question excessively convoluted", "evalation"); left untouched here because
# editing the prompt changes what the model receives.
system_prompt = """Imagine being an evaluator.
You are going to be given a question-answer pair of a synthetic multihop question answering dataset.
Give a complexive score from 1 to 10 to the observation.
The score should be lower if:
- It is not a multi-hop question: the questions asks something about an entity that is directly and explicitly mentioned in the questioned itself.
- The question excessively convoluted.
- There could be more than one answer to the question.
- There may be ambiguity in identifying the entities the question is referring to.
- The question asks multiple different things.

Write a VERY BRIEF evaluation of the observation based on this question.
Based on this evalation, write "FINAL SCORE: " and assign the score to the observation.
"""

In [55]:
# Score every generated observation with the LLM and store the result in
# obs["score"] (an int, or the string "ND" when no integer could be parsed).
for obs in tqdm(answers):
    context = f"QUESTION:{obs['question']}\nANSWER:{obs['answer']}"
    message = [
        ("system", system_prompt),
        ("human", context),
    ]
    response = llm.invoke(message)
    # Compute costs. The last three "-"-separated parts of the reported model
    # name are dropped to obtain the cost_dict key.
    model_name = "-".join(response.response_metadata["model_name"].split("-")[:-3])
    tot_cost += compute_costs(response.response_metadata["token_usage"], cost_dict[model_name])

    # The score is whatever follows the last "final score:" marker
    # (case-insensitive because the reply is lower-cased first).
    content = response.content.lower()
    score = content.split("final score:")[-1].strip()
    try:
        score = int(score)
    except ValueError:
        # Only a failed int() conversion means "not parsable"; a bare `except`
        # would also swallow KeyboardInterrupt and unrelated errors.
        score = "ND"
    obs["score"] = score

100%|██████████| 200/200 [05:01<00:00,  1.51s/it]


In [56]:
# Accumulated cost after the evaluation calls were added to the running total.
tot_cost

3.153654999999997

In [58]:
# Rank observations best-first by evaluator score. Unparsable scores are stored
# as the string "ND", which cannot be compared with ints; they are mapped to -1
# so they sink to the bottom instead of making sorted() raise TypeError.
sorted_list = sorted(
    answers,
    key=lambda x: x['score'] if isinstance(x['score'], int) else -1,
    reverse=True,
)


In [59]:
# Inspect the ranked observations (this displays the whole list).
sorted_list

[{'response': 'Question: Which song did Sam Smith contribute to with Band Aid 30 to raise money for the 2014 Ebola crisis?\nAnswer: Sam Smith contributed to the song "Do They Know It\'s Christmas?" with Band Aid 30 to raise money for the 2014 Ebola crisis.\nChunk used: chunk0, chunk2',
  'context': 'INPUT:\n- chunk0: UK Singles Chart records and statistics#Number ones by different artists#1\nCurrently two songs have reached number one four times by different artists: "Unchained Melody" and "Do They Know It\'s Christmas?". Three of the versions of "Unchained Melody" sold over a million copies, while two of the versions of "Do They Know It\'s Christmas?" achieved this. The lyrics of the Band Aid 30 version were changed to give it relevance to the 2014 Ebola crisis. Numerous artists appear on more than one version of "Do They Know It\'s Christmas?".\n- chunk2: Sam Smith (singer)#Music career#2014–2016: In the Lonely Hour and international success#4\nIn June 2014, Smith first appeared on t

In [60]:
# Spot-check the observation at rank index 100.
sorted_list[100]

{'response': 'Question: During which NFL Kickoff Game event did the artist who performed "Glory Days" in Cars 3 sing the National Anthem?\nAnswer: The artist who performed "Glory Days" in Cars 3, Andra Day, sang the National Anthem during the NFL Kickoff 2016 event.\nChunks used: chunk2, chunk4',
 'context': 'INPUT:\n- chunk0: Mandy Moore (choreographer)#Choreography for So You Think You Can Dance#1\nSeason|Week|Dancers|Style|Song|Note|3|2|Sabra Johnson|Dominic Sandoval|Contemporary|"I\'ll Stand by You"The Pretenders|3|Anya Garnis|Danny Tidwell|Hip-Hop|"Oh Timbaland"Timbaland|5|Sara Von Gillern|Pasha Kovalev|Jazz|"Body Language"Queen|4|1|Kourtni Lind|Matt Dorame|"Tainted Love"Soft Cell|2|Chelsea Traille|Thayne Jasperson|"Untouched"The Veronicas|Courtney Galiano|Gev Manoukian|Contemporary|"Lost"Anouk|4|Jessica King|William Wingfield|Lyrical Jazz|"Alone"Heart|Chelsie Hightower|Mark Kanemura|Jazz|"Kiss Kiss"Holly Valance (incorrectly listed as "Holly Vance")|5|Comfort Fedoke|Thayne Jasper

In [57]:
import pickle

# Persist the scored observations so the manual review below can start from the
# file instead of re-running the LLM calls.
with open('multihop.pkl', 'wb') as f:
    pickle.dump(answers, f)

In [1]:
import pickle 

# Reload the scored observations from disk (pickle files are read in binary mode).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('multihop.pkl', 'rb') as fh:
    multihop = pickle.load(fh)

## Manual Skimming

In [4]:
# Rank the reloaded observations best-first by evaluator score. Unparsable scores
# are stored as the string "ND", which cannot be compared with ints; they are
# mapped to -1 so they sink to the bottom instead of making sorted() raise TypeError.
sorted_list = sorted(
    multihop,
    key=lambda x: x['score'] if isinstance(x['score'], int) else -1,
    reverse=True,
)

In [10]:
# Indices into sorted_list rejected during the manual review below.
skip = []

In [199]:
# Manual review step: move to the next candidate and show its question and answer.
# NOTE(review): `i` is not explicitly initialised for this review in the cells
# shown; the cell relies on whatever value is left in the kernel — confirm and
# add an explicit starting value.
i += 1
print(sorted_list[i]['question'])
print(sorted_list[i]['answer'])

 Who represented Nevada in the House of Representatives along with Joe Heck in the 114th United States Congress?
 In the 114th United States Congress, Joe Heck represented Nevada along with Dina Titus, Mark Amodei, and Cresent Hardy.


In [195]:
# Run manually to reject the candidate currently shown (index i).
skip.append(i)

In [196]:
# Current index minus the number of rejected candidates.
i - len(skip)

98

In [202]:
# Build the evaluation set: walk the ranked list from the top and keep the first
# 101 observations whose index was not rejected during manual review.
eval_df = []
i = 0

# The second condition stops cleanly when sorted_list has fewer than 101
# non-skipped entries, instead of raising IndexError.
while len(eval_df) < 101 and i < len(sorted_list):
    if i not in skip:
        eval_df.append(sorted_list[i])
    i += 1

In [204]:
import pickle

# Persist the manually reviewed evaluation set; the pipeline section reloads it.
with open('multihop_eval_df.pkl', 'wb') as f:
    pickle.dump(eval_df, f)

# Run pipeline

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
import Levenshtein
import pandas as pd
from tqdm import tqdm
import time
from neo4j import GraphDatabase
from utils.GraphTraverser import CypherSearch

# Neo4j connection settings for the pipeline section (same values as at the top
# of the notebook). `el` must still be the config.EnvLoader module here.
URI = "neo4j://localhost"
AUTH = ("neo4j", el.NEO4J_PWD)

In [None]:
# Initialize Azure Embeddings: used below to embed each question before the
# vector-index lookups.
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=el.OPENAI_API_KEY,
    azure_endpoint=el.AZURE_ENDPOINT,
    openai_api_version="2023-03-15-preview",
)

# Chat model passed to CypherSearch in the graph-traverser step; configured with
# the same arguments as the `llm` created at the top of the notebook.
llm = AzureChatOpenAI(
    model="gpt-4o-global",
    azure_deployment="gpt-4o-global",
    api_key=el.OPENAI_API_KEY,
    azure_endpoint=el.AZURE_ENDPOINT,
    openai_api_version="2024-02-15-preview",
)

In [29]:
import pickle 

# Reload the reviewed evaluation set (pickle files are read in binary mode).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('multihop_eval_df.pkl', 'rb') as fh:
    q_diz = pickle.load(fh)

In [30]:
# One record per evaluation question: the question text, its embedding, and an
# initially empty per-chunk "observations" mapping that the retrieval cells fill.
df = []
for obs in q_diz:
    diz = {
        "question": obs['question'],
        # The question is stripped of surrounding whitespace before embedding.
        "embedding": embeddings.embed_query(obs['question'].strip()),
        "observations": {},
    }
    df.append(diz)

In [31]:
# Keep only the first 100 questions.
df = df[:100]

In [32]:
# GRAPH TRAVERSER
# Run CypherSearch for every question and flag the chunks it selected.
# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
for row in tqdm(df):
    selected_chunks, cost, answer, subC_list = CypherSearch(row["embedding"], row["question"], llm)
    row["cost"] = cost
    for chunk_id in selected_chunks:
        # Replaces any previous entry for this chunk with the traverser flag only.
        row["observations"][chunk_id] = {
            "graph_traverser": 1
        }
    row["graph_traverser_output"] = {
        "answer": answer,
        "subC_list": subC_list
    }

100%|██████████| 100/100 [03:44<00:00,  2.24s/it]


In [33]:
# CHUNK RETRIEVAL
# One driver is opened for the whole cell and reused for every query, instead of
# creating a new driver per question.
# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for row in tqdm(df):
        # Search for top 10 chunk
        retrieved_chunks, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("vector", 100, $embedding)
                YIELD node, score
                RETURN ID(node), score
                LIMIT 10""",
            embedding=row["embedding"]
        )
        for chunk in retrieved_chunks:
            # Create the chunk's observation entry on first sight, then record the score.
            features = row["observations"].setdefault(chunk["ID(node)"], {})
            features["similarity_score"] = float(chunk["score"])

100%|██████████| 100/100 [00:03<00:00, 25.48it/s]


In [34]:
# PARENT/PAGE RETRIEVAL
# One driver is opened for the whole cell and reused for every query, instead of
# creating a new driver for each of the nested lookups.
# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for row in tqdm(df):
        observations = row["observations"]

        # Search for top 3 pages
        retrieved_OP, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("original_page_vector", 100, $embedding)
            YIELD node, score
            RETURN ID(node), score
            LIMIT 3""",
            embedding=row["embedding"]
        )
        for page in retrieved_OP:
            connected_chunks, _, _ = driver.execute_query(
                """MATCH (a:OriginalPage)<-[:is_chunk_of]-(b:Chunk)
                WHERE ID(a) = $id_a
                RETURN ID(b)""",
                id_a = page["ID(node)"]
            )
            # Every chunk of the page inherits the page's similarity score.
            for chunk in connected_chunks:
                features = observations.setdefault(chunk["ID(b)"], {})
                features["page_similarity_score"] = float(page["score"])

        # Search for top 5 SubChapters (the index also returns OriginalPage nodes)
        retrieved_subC, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("op_subc_vector", 100, $embedding)
            YIELD node, score
            WHERE node:OriginalPage OR node:SubChapter
            RETURN ID(node), score
            LIMIT 5""",
            embedding=row["embedding"]
        )
        for subC in retrieved_subC:
            connected_chunks, _, _ = driver.execute_query(
                """MATCH (a)-[:has_chunk]->(b:Chunk)
                WHERE ID(a) = $id_a
                RETURN ID(b)""",
                id_a = subC["ID(node)"]
            )
            # Every chunk under the parent inherits the parent's similarity score;
            # a later parent overwrites an earlier one for the same chunk.
            for chunk in connected_chunks:
                features = observations.setdefault(chunk["ID(b)"], {})
                features["parent_similarity_score"] = float(subC["score"])

100%|██████████| 100/100 [00:08<00:00, 11.17it/s]


In [35]:
# ENTITY RETRIEVAL
# One driver is opened for the whole cell and reused for every query, instead of
# creating a new driver for each of the nested lookups.
# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for row in tqdm(df):
        # Search for top 5 entities
        retrieved_entities, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("page_vector", 100, $embedding)
            YIELD node, score
            RETURN ID(node), score
            LIMIT 5""",
            embedding=row["embedding"]
        )
        for entity in retrieved_entities:
            connected_chunks, _, _ = driver.execute_query(
                """MATCH (a:Page)-[:cited_in]->(b:Chunk)
                WHERE ID(a) = $id_a
                RETURN ID(b)""",
                id_a = entity["ID(node)"]
            )
            for chunk in connected_chunks:
                features = row["observations"].setdefault(chunk["ID(b)"], {})
                # Keep the best score among the retrieved entities citing this chunk...
                features["entity_similarity_score"] = max(
                    float(entity["score"]),
                    features.get("entity_similarity_score", 0),
                )
                # ...and count how many of them cite it.
                features["n_relevant_entities"] = features.get("n_relevant_entities", 0) + 1

100%|██████████| 100/100 [00:14<00:00,  6.79it/s]


In [36]:
import json
# Checkpoint the retrieval features to disk as JSON.
with open('Dataset_multihop.json', 'w') as f:
    json.dump(df, f)

In [46]:
# Reload the checkpoint. JSON object keys are always strings, so the observation
# keys come back as str; the IS ANSWER cell accordingly looks them up with str(...).
with open('Dataset_multihop.json', 'r') as file:
    df = json.load(file)

In [47]:
# IS ANSWER
# Flag, for every question, the chunks that were used to generate it.
# q_diz and df are paired positionally; zip() stops at the shorter of the two, so
# no question count is hard-coded. One driver is reused for all lookups.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for source_obs, row in zip(q_diz, df):
        chunk_used, _, _ = driver.execute_query(
            """WITH $chunk_uris AS list
            MATCH (a:Chunk)
            WHERE a.uri IN list
            RETURN ID(a)""",
            chunk_uris = source_obs["chunks_used"]
        )
        for chunk in chunk_used:
            # Observation keys are strings here because df was reloaded from JSON.
            features = row["observations"].setdefault(str(chunk["ID(a)"]), {})
            features["is_answer"] = 1

In [49]:
import json
# Overwrite the checkpoint, now including the is_answer flags.
with open('Dataset_multihop.json', 'w') as f:
    json.dump(df, f)

## Normalization

In [50]:
# Print the smallest similarity score found anywhere in the dataset.
# NOTE: list.copy() is shallow; normalized_df shares its row dicts with df.
normalized_df = df.copy()
similarity_lst = []

# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
for row in normalized_df:
    # Collect every similarity score that is present
    for value in row["observations"].values():
        for key in (
            "similarity_score",
            "page_similarity_score",
            "entity_similarity_score",
            "parent_similarity_score",
        ):
            if value.get(key):
                similarity_lst.append(value[key])

print(min(similarity_lst))

0.8770968914031982


In [51]:
# NORMALIZE CONTINUOUS VARIABLES
#
# For each question, z-score the four similarity features using the mean and
# (population) standard deviation of the distinct scores found for that question,
# plus the placeholder used for missing scores.
#
# NOTE: list.copy() is shallow, so normalized_df shares its row dicts with df and
# this cell rewrites the scores inside df as well. Re-running the cell therefore
# normalizes already-normalized values; reload df from disk first.

normalized_df = df.copy()

score_keys = (
    "similarity_score",
    "page_similarity_score",
    "entity_similarity_score",
    "parent_similarity_score",
)
# Placeholder for a score a chunk does not have.
# NOTE(review): presumably chosen to sit below the minimum observed score printed
# by the previous cell — confirm.
missing_score = 0.75

# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
for row in normalized_df:
    observations = row["observations"]
    # Collect every similarity score that is present for this question
    similarity_lst = []
    for value in observations.values():
        for key in score_keys:
            if value.get(key):
                similarity_lst.append(value[key])
    similarity_lst.append(missing_score)
    similarity_lst = list(set(similarity_lst))
    # Compute mean and std_dev
    mean = sum(similarity_lst) / len(similarity_lst)
    variance = sum([((x - mean) ** 2) for x in similarity_lst]) / len(similarity_lst)
    # A zero spread (every score equal to the placeholder) would divide by zero;
    # fall back to 1.0 so such scores normalize to 0.
    std_dev = variance ** 0.5 or 1.0
    for value in observations.values():
        for key in score_keys:
            raw_score = value[key] if value.get(key) else missing_score
            value[key] = (raw_score - mean)/std_dev

In [52]:
# RESOLVE BINARY/DISCRETE VARIABLES
# Give every chunk observation an explicit 0 for each count/flag it lacks.

# The loop variable is `row`, not `el`: `el` is the config.EnvLoader module alias
# and rebinding it would break any later use of `el.<SETTING>`.
for row in normalized_df:
    for value in row["observations"].values():
        for flag in ("n_relevant_entities", "is_answer", "graph_traverser"):
            # Missing (or falsy) entries are set to 0.
            if not value.get(flag):
                value[flag] = 0

In [53]:
import json
# Persist the normalized dataset. Dump normalized_df explicitly: writing `df` here
# only produced normalized data because the two lists share the same row dicts.
with open('Dataset_multihop_normalized.json', 'w') as f:
    json.dump(normalized_df, f)