In [None]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
import pandas as pd
from tqdm import tqdm
from neo4j import GraphDatabase
from utils.GraphTraverser import CypherSearch
import config.EnvLoader as el
import pickle, json

URI = "neo4j://localhost"
AUTH = ("neo4j", el.NEO4J_PWD)

In [None]:
# Initialize Azure Embeddings
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=el.OPENAI_API_KEY,
    azure_endpoint=el.AZURE_ENDPOINT,
    openai_api_version="2023-03-15-preview",
)


llm = AzureChatOpenAI(
    model="gpt-4o-global",
    azure_deployment="gpt-4o-global",
    api_key=el.OPENAI_API_KEY,
    azure_endpoint=el.OPENAI_API_KEY,
    openai_api_version="2024-02-15-preview",
)

In [None]:
# To generate this file run Indexing.ipynb
with open('embedded_questions.pkl', 'rb') as file: 
	q_diz = pickle.load(file) 

# Initialize the dataframe

The training dataframe consists of 5223 observations: one for each question.
For each of the questions, it retains:
- The text of the question.
- The embedding of the question.
- The "observations", i.e. all the chunks that have been retrieved by at least one of the retrieval methods.<br>
The observation variable will be a python dictionary that contains:
- As **keys**: the ID of one chunk that have been retrieved by at least one of the retrieval methods.
- As **values**: a dictionary containing, for each of the retrieval methods, the value 1 if they have been retrieved by that method or 0 otherwise.

In [None]:
# Initialize training dataframe
df = []
for question, embedding in q_diz.items():
    diz = {}
    diz["question"] = question
    diz["embedding"] = embedding
    diz["observations"] = {}
    df.append(diz)

In [5]:
len(df)

5223

# Populate the Training Dataframe

## Path Retrieval
Is the chunk linked to the relevant KG path?

In [6]:
for el in tqdm(df):
    selected_chunks, cost, answer, subC_list = CypherSearch(el["embedding"], el["question"], llm)
    el["cost"] = cost
    for chunk_id in selected_chunks:
        el["observations"][chunk_id] = {
            "graph_traverser": 1
        }
    el["graph_traverser_output"] = {
        "answer": answer,
        "subC_list": subC_list
    }

100%|██████████| 5223/5223 [2:00:32<00:00,  1.38s/it]  


In [None]:
with open('Dataset.json', 'w') as f:
    json.dump(df, f)

## Vector Retrieval
How relevant is the chunk’s content?

In [8]:
for el in tqdm(df):
    # Search for top 10 chunk
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        retrieved_chunks, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("vector", 100, $embedding)
                YIELD node, score
                RETURN ID(node), score
                LIMIT 10""",
            embedding=el["embedding"]
        )
    for chunk in retrieved_chunks:
        if chunk["ID(node)"] not in el["observations"].keys():
            el["observations"][chunk["ID(node)"]] = {}
        el["observations"][chunk["ID(node)"]]["similarity_score"] = float(chunk["score"])

100%|██████████| 5223/5223 [02:53<00:00, 30.12it/s]


## Page and Section Retrieval
How relevant is the document/section to which the chunk belongs?

In [None]:
# Create OriginalPage index
# Create nodes vector index for vector RAG
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    result, summary, keys = driver.execute_query(
        """CREATE VECTOR INDEX original_page_vector IF NOT EXISTS
        FOR (a:OriginalPage) ON (a.embedding)
        OPTIONS {indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
        }}"""
    )

In [None]:
# Create OriginalPage index
# Create nodes vector index for vector RAG
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    result, summary, keys = driver.execute_query(
        """CALL apoc.periodic.commit("
            MATCH (a:OriginalPage)
            WHERE NOT a:OpSubc
            WITH a LIMIT 1000
            SET a :OpSubc
            RETURN count(a)
        ")"""
    )

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    result, summary, keys = driver.execute_query(
        """CALL apoc.periodic.commit("
            MATCH (a:SubChapter)
            WHERE NOT a:OpSubc
            WITH a LIMIT 1000
            SET a :OpSubc
            RETURN count(a)
        ")"""
    )

In [None]:
# Create OpSubc index
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    result, summary, keys = driver.execute_query(
        """CREATE VECTOR INDEX op_subc_vector IF NOT EXISTS
        FOR (a:OpSubc) ON (a.embedding)
        OPTIONS {indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
        }}"""
    )

In [10]:
for el in tqdm(df):
    # Search for top 3 pages
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        retrieved_OP, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("original_page_vector", 100, $embedding)
            YIELD node, score
            RETURN ID(node), score
            LIMIT 3""",
            embedding=el["embedding"]
        )
    for OriginalPage in retrieved_OP:
        with GraphDatabase.driver(URI, auth=AUTH) as driver:
            connected_chunks, _, _ = driver.execute_query(
                """MATCH (a:OriginalPage)<-[:is_chunk_of]-(b:Chunk)
                WHERE ID(a) = $id_a
                RETURN ID(b)""",
                id_a = OriginalPage["ID(node)"]
            )
        for chunk in connected_chunks:
            if chunk["ID(b)"] not in el["observations"].keys():
                el["observations"][chunk["ID(b)"]] = {}
            el["observations"][chunk["ID(b)"]]["page_similarity_score"] = float(OriginalPage["score"])
    # Search for top 5 SubChapters
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        retrieved_subC, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("op_subc_vector", 100, $embedding)
            YIELD node, score
            WHERE node:OriginalPage OR node:SubChapter
            RETURN ID(node), score
            LIMIT 5""",
            embedding=el["embedding"]
        )
    for subC in retrieved_subC:
        with GraphDatabase.driver(URI, auth=AUTH) as driver:
            connected_chunks, _, _ = driver.execute_query(
                """MATCH (a)-[:has_chunk]->(b:Chunk)
                WHERE ID(a) = $id_a
                RETURN ID(b)""",
                id_a = subC["ID(node)"]
            )
        for chunk in connected_chunks:
            if chunk["ID(b)"] not in el["observations"].keys():
                el["observations"][chunk["ID(b)"]] = {}
            el["observations"][chunk["ID(b)"]]["parent_similarity_score"] = float(subC["score"])

100%|██████████| 5223/5223 [11:36<00:00,  7.50it/s]


## Entity Retrieval
How relevant are the entities that the chunk cites?

In [None]:
# Create OpSubc index
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    result, summary, keys = driver.execute_query(
        """CREATE VECTOR INDEX page_vector IF NOT EXISTS
        FOR (a:Page) ON (a.embedding)
        OPTIONS {indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
        }}"""
    )

In [11]:
for el in tqdm(df):
    # Search for top 5 entities
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        retrieved_entities, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("page_vector", 100, $embedding)
            YIELD node, score
            RETURN ID(node), score
            LIMIT 5""",
            embedding=el["embedding"]
        )
    for entity in retrieved_entities:
        with GraphDatabase.driver(URI, auth=AUTH) as driver:
            connected_chunks, _, _ = driver.execute_query(
                """MATCH (a:Page)-[:cited_in]->(b:Chunk)
                WHERE ID(a) = $id_a
                RETURN ID(b)""",
                id_a = entity["ID(node)"]
            )
        for chunk in connected_chunks:
            if chunk["ID(b)"] not in el["observations"].keys():
                el["observations"][chunk["ID(b)"]] = {}
            entity_similarity_score = el["observations"][chunk["ID(b)"]].get("entity_similarity_score", 0)
            el["observations"][chunk["ID(b)"]]["entity_similarity_score"] = max(float(entity["score"]), entity_similarity_score)
            n_relevant_entities = el["observations"][chunk["ID(b)"]].get("n_relevant_entities", 0)
            el["observations"][chunk["ID(b)"]]["n_relevant_entities"] = n_relevant_entities + 1

100%|██████████| 5223/5223 [07:33<00:00, 11.52it/s]


## Ground Truth
Does the chunk contain the actual answer to the question? (Ground truth from the NQ dataset)

In [12]:
for el in tqdm(df):
    # Search for top 3 pages
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        answers, _, _ = driver.execute_query(
            """MATCH (a:Chunk {is_answer_of: $question})
            RETURN ID(a)""",
            question=el["question"]
        )
    for chunk in answers:
        if chunk["ID(a)"] not in el["observations"].keys():
            el["observations"][chunk["ID(a)"]] = {}
        el["observations"][chunk["ID(a)"]]["is_answer"] = 1

100%|██████████| 5223/5223 [01:55<00:00, 45.37it/s]


In [None]:
with open('Dataset.json', 'w') as f:
    json.dump(df, f)

# Dataset exploration
This section of the notebook was used to compute the metrics for each of the individual retrieval systems (section 4.3.2 of Master Thesis).

- **Precision**: how many of the selected chunks contain the answer?
- **Recall**: what percentage of chunks containing the answer have been found?
- **Detection**: (Bool) does at least one of the collected chunks contain the answer?

In [None]:
import json

# Open and read the JSON file
with open('Dataset.json', 'r') as file:
    df = json.load(file)

## Vector Retrieval

In [None]:
def get_similarity_score(item):
    return item.get('similarity_score', 0)

# Precision, recall and detection for top 3, top 5 and top 10 relevant chunks
precision_lst = []
recall_lst = []
detection_lst = []

for question in df:
    sorted_lst = sorted(question["observations"].values(), key=get_similarity_score, reverse=True)[:10]
    precision_diz = {}
    recall_diz = {}
    detection_diz = {}
    for n in [3,5,10]:
        chunk_lst = sorted_lst[:n]
        total_answer = 0
        answers_found = sum([chunk.get("is_answer", 0) for chunk in chunk_lst])
        total_answer = sum([chunk.get("is_answer", 0) for chunk in question["observations"].values()])
        precision_diz[n] = answers_found/n
        recall_diz[n] = answers_found/total_answer
        detection_diz[n] = min(answers_found, 1)
    precision_lst.append(precision_diz)
    recall_lst.append(recall_diz)
    detection_lst.append(detection_diz)
        

In [15]:
length = len(df)

for n in [3,5,10]:
    print(f"TOP {n} CHUNKS")
    print(f"Average Precision: {round(sum([x[n] for x in precision_lst])/length,3)}")
    print(f"Average Recall: {round(sum([x[n] for x in recall_lst])/length,3)}")
    print(f"Average Detection: {round(sum([x[n] for x in detection_lst])/length,3)}")

TOP 3 CHUNKS
Average Precision: 0.237
Average Recall: 0.528
Average Detection: 0.617
TOP 5 CHUNKS
Average Precision: 0.172
Average Recall: 0.626
Average Detection: 0.708
TOP 10 CHUNKS
Average Precision: 0.103
Average Recall: 0.739
Average Detection: 0.801


## Page Retrieval

In [None]:
# Precision, recall and detection for top 1, top 2 and top 3 relevant pages
precision_lst = []
recall_lst = []
detection_lst = []

for question in df:
    pages_similarity_scores = list(set([x.get('page_similarity_score', 0) for x in question["observations"].values()]))
    pages_similarity_scores.sort(reverse=True)
    precision_diz = {}
    recall_diz = {}
    detection_diz = {}
    for n in [1,2,3]:
        similarity_lst = pages_similarity_scores[:n]
        chunk_lst = [x for x in question["observations"].values() if x.get('page_similarity_score', 0) in similarity_lst]
        total_answer = 0
        answers_found = sum([chunk.get("is_answer", 0) for chunk in chunk_lst])
        total_answer = sum([chunk.get("is_answer", 0) for chunk in question["observations"].values()])
        precision_diz[n] = answers_found/len(chunk_lst)
        recall_diz[n] = answers_found/total_answer
        detection_diz[n] = min(answers_found, 1)
    precision_lst.append(precision_diz)
    recall_lst.append(recall_diz)
    detection_lst.append(detection_diz)

In [17]:
length = len(df)

for n in [1,2,3]:
    print(f"TOP {n} Relevant Pages")
    print(f"Average Precision: {round(sum([x[n] for x in precision_lst])/length,3)}")
    print(f"Average Recall: {round(sum([x[n] for x in recall_lst])/length,3)}")
    print(f"Average Detection: {round(sum([x[n] for x in detection_lst])/length,3)}")

TOP 1 Relevant Pages
Average Precision: 0.071
Average Recall: 0.789
Average Detection: 0.789
TOP 2 Relevant Pages
Average Precision: 0.029
Average Recall: 0.861
Average Detection: 0.861
TOP 3 Relevant Pages
Average Precision: 0.018
Average Recall: 0.885
Average Detection: 0.885


## Section Retrieval

In [None]:
# Precision, recall and detection for top 1,2,3,4,5 relevant sections.
precision_lst = []
recall_lst = []
detection_lst = []

for question in df:
    pages_similarity_scores = list(set([x.get('parent_similarity_score', 0) for x in question["observations"].values()]))
    pages_similarity_scores.sort(reverse=True)
    precision_diz = {}
    recall_diz = {}
    detection_diz = {}
    for n in [1,2,3,4,5]:
        similarity_lst = pages_similarity_scores[:n]
        chunk_lst = [x for x in question["observations"].values() if x.get('parent_similarity_score', 0) in similarity_lst]
        total_answer = 0
        answers_found = sum([chunk.get("is_answer", 0) for chunk in chunk_lst])
        total_answer = sum([chunk.get("is_answer", 0) for chunk in question["observations"].values()])
        precision_diz[n] = answers_found/len(chunk_lst)
        recall_diz[n] = answers_found/total_answer
        detection_diz[n] = min(answers_found, 1)
    precision_lst.append(precision_diz)
    recall_lst.append(recall_diz)
    detection_lst.append(detection_diz)
        

In [19]:
length = len(df)

for n in [1,2,3,4,5]:
    print(f"TOP {n} Relevant Parents")
    print(f"Average Precision: {round(sum([x[n] for x in precision_lst])/length,3)}")
    print(f"Average Recall: {round(sum([x[n] for x in recall_lst])/length,3)}")
    print(f"Average Detection: {round(sum([x[n] for x in detection_lst])/length,3)}")

TOP 1 Relevant Parents
Average Precision: 0.207
Average Recall: 0.352
Average Detection: 0.421
TOP 2 Relevant Parents
Average Precision: 0.144
Average Recall: 0.513
Average Detection: 0.579
TOP 3 Relevant Parents
Average Precision: 0.113
Average Recall: 0.604
Average Detection: 0.663
TOP 4 Relevant Parents
Average Precision: 0.088
Average Recall: 0.676
Average Detection: 0.727
TOP 5 Relevant Parents
Average Precision: 0.058
Average Recall: 0.798
Average Detection: 0.829


## Entity Retrieval

In [None]:
# Precision, recall and detection for top 1,2,3,4,5 relevant entities.
precision_lst = []
recall_lst = []
detection_lst = []

i = 0
for question in df:
    pages_similarity_scores = list(set([x.get('entity_similarity_score', 0) for x in question["observations"].values()]))
    pages_similarity_scores.sort(reverse=True)
    pages_similarity_scores = pages_similarity_scores[:-1]
    precision_diz = {}
    recall_diz = {}
    detection_diz = {}
    if not pages_similarity_scores:
        print(i)
        continue
    for n in [1,2,3,4,5]:
        similarity_lst = pages_similarity_scores[:n]
        chunk_lst = [x for x in question["observations"].values() if x.get('entity_similarity_score', 0) in similarity_lst]
        total_answer = 0
        answers_found = sum([chunk.get("is_answer", 0) for chunk in chunk_lst])
        total_answer = sum([chunk.get("is_answer", 0) for chunk in question["observations"].values()])
        if len(chunk_lst) == 0:
            print(i)
        precision_diz[n] = answers_found/len(chunk_lst)
        recall_diz[n] = answers_found/total_answer
        detection_diz[n] = min(answers_found, 1)
    precision_lst.append(precision_diz)
    recall_lst.append(recall_diz)
    detection_lst.append(detection_diz)
    i += 1
        

165
839


In [21]:
length = len(df)

for n in [1,2,3,4,5]:
    print(f"TOP {n} Relevant Entities")
    print(f"Average Precision: {round(sum([x[n] for x in precision_lst])/length,3)}")
    print(f"Average Recall: {round(sum([x[n] for x in recall_lst])/length,3)}")
    print(f"Average Detection: {round(sum([x[n] for x in detection_lst])/length,3)}")

TOP 1 Relevant Entities
Average Precision: 0.059
Average Recall: 0.096
Average Detection: 0.118
TOP 2 Relevant Entities
Average Precision: 0.051
Average Recall: 0.161
Average Detection: 0.198
TOP 3 Relevant Entities
Average Precision: 0.046
Average Recall: 0.205
Average Detection: 0.249
TOP 4 Relevant Entities
Average Precision: 0.042
Average Recall: 0.233
Average Detection: 0.281
TOP 5 Relevant Entities
Average Precision: 0.041
Average Recall: 0.248
Average Detection: 0.299


## Graph Traverser

In [None]:
precision_lst = []
recall_lst = []
detection_lst = []

minus = 0
for question in df:
    chunk_lst = [x for x in question["observations"].keys() if "graph_traverser" in question["observations"][x].keys()]
    if len(chunk_lst) == 0:
        minus += 1
        continue
    answers_found = sum([question["observations"][chunk].get("is_answer", 0) for chunk in chunk_lst])
    total_answer = sum([chunk.get("is_answer", 0) for chunk in question["observations"].values()])
    precision_lst.append(answers_found/len(chunk_lst))
    recall_lst.append(answers_found/total_answer)
    detection_lst.append(min(answers_found, 1))

In [23]:
length = len(df) - minus

print(f"Cypher Search")
print(f"Average Precision: {round(sum([x for x in precision_lst])/length,3)}")
print(f"Average Recall: {round(sum([x for x in recall_lst])/length,3)}")
print(f"Average Detection: {round(sum([x for x in detection_lst])/length,3)}")

Cypher Search
Average Precision: 0.266
Average Recall: 0.423
Average Detection: 0.512


# Dataset cleaning
Apply local standardization and handle missing values, as reported in section 3.2.2 of Master Thesis.

In [24]:
import json

# Open and read the JSON file
with open('Dataset.json', 'r') as file:
    df = json.load(file)

In [None]:
# NORMALIZE CONTINUOUS VARIABLES

normalized_df = df.copy()

for el in normalized_df:
    el = el["observations"]
    # Prendiamo tutti i similarity score presenti
    similarity_lst = []
    for value in el.values():
        if value.get("similarity_score"):
            similarity_lst.append(value["similarity_score"])
        if value.get("page_similarity_score"):
            similarity_lst.append(value["page_similarity_score"])
        if value.get("entity_similarity_score"):
            similarity_lst.append(value["entity_similarity_score"])
        if value.get("parent_similarity_score"):
            similarity_lst.append(value["parent_similarity_score"])
    similarity_lst.append(0.75)
    similarity_lst = list(set(similarity_lst))
    # Compute mean and std_dev
    mean = sum(similarity_lst) / len(similarity_lst)
    variance = sum([((x - mean) ** 2) for x in similarity_lst]) / len(similarity_lst)
    std_dev = variance ** 0.5
    # Apply local standardization
    for value in el.values():
        if value.get("similarity_score"):
            value["similarity_score"] = (value["similarity_score"] - mean)/std_dev
        else:
            value["similarity_score"] = (0.75 - mean)/std_dev
        if value.get("page_similarity_score"):
            value["page_similarity_score"] = (value["page_similarity_score"] - mean)/std_dev
        else:
            value["page_similarity_score"] = (0.75 - mean)/std_dev
        if value.get("entity_similarity_score"):
            value["entity_similarity_score"] = (value["entity_similarity_score"] - mean)/std_dev
        else:
            value["entity_similarity_score"] = (0.75 - mean)/std_dev
        if value.get("parent_similarity_score"):
            value["parent_similarity_score"] = (value["parent_similarity_score"] - mean)/std_dev
        else:
            value["parent_similarity_score"] = (0.75 - mean)/std_dev

In [None]:
# RESOLVE BINARY/DISCRETE VARIABLES: report 0 when the value is not present
for el in normalized_df:
    el = el["observations"]
    for value in el.values():
        if not value.get("n_relevant_entities"):
            value["n_relevant_entities"] = 0
        if not value.get("is_answer"):
            value["is_answer"] = 0
        if not value.get("graph_traverser"):
            value["graph_traverser"] = 0

In [None]:
# SPLIT INTO TRAIN AND TEST SET
import random, json
random.seed(42)
split = int(len(normalized_df) * 0.2)

random.shuffle(normalized_df)

test_data = normalized_df[:split]
train_data = normalized_df[split:]

print(f"LENGTH TRAIN DATA: {len(train_data)}")
print(f"LENGTH TEST DATA: {len(test_data)}")

with open('train_data.json', 'w') as f:
    json.dump(train_data, f)
with open('test_data.json', 'w') as f:
    json.dump(test_data, f)

LENGTH TRAIN DATA: 4179
LENGTH TEST DATA: 1044


In [None]:
# EXPAND THE DATASET
# Instead of having one observation for each question, we expand it to have one observation for each chunk
tot_lst = []
for el in train_data:
    tot_lst += list(el["observations"].values())

dataframe = pd.DataFrame(tot_lst)
dataframe.head(25)

Unnamed: 0,graph_traverser,page_similarity_score,parent_similarity_score,entity_similarity_score,n_relevant_entities,similarity_score,is_answer
0,1,0.334291,0.334291,0.334291,3,-4.42001,0
1,1,0.334291,0.334291,-4.42001,0,0.637866,1
2,1,0.334291,0.334291,0.175379,1,-4.42001,0
3,1,0.334291,0.334291,-4.42001,0,-4.42001,0
4,1,0.334291,0.334291,-4.42001,0,0.291659,0
5,1,0.334291,0.334291,-4.42001,0,-4.42001,0
6,0,-4.42001,-4.42001,0.334291,1,0.467902,0
7,0,-4.42001,-4.42001,-4.42001,0,0.369943,0
8,0,-4.42001,0.122613,0.175379,1,0.319289,0
9,0,-4.42001,-4.42001,-4.42001,0,0.295008,0


In [None]:
# Subset the df to balance classes
positive_obs = dataframe[dataframe["is_answer"] == 1]
num_positive_obs = len(positive_obs)
print(f"LENGHT OF POSITIVE OBSERVATIONS: {num_positive_obs}")

negative_obs = dataframe[dataframe["is_answer"] == 0]
negative_obs = negative_obs.sample(num_positive_obs)

final_df = pd.concat([positive_obs, negative_obs])
# Shuffle
final_df = final_df.sample(frac=1)

LENGHT OF POSITIVE OBSERVATIONS: 6245


In [None]:
print(f"LENGTH OF THE FINAL TRAINING DF: {len(final_df)}")

LENGTH OF THE FINAL TRAINING DF: 12490


In [39]:
final_df.to_csv("training_dataframe.csv")