In [None]:
from tqdm import tqdm
import io, json
import pandas as pd
import config.EnvLoader as el

In [None]:
# To get the test data run Training DF contrustion ipynb
with open('test_data.json', 'r') as file:
    test_data = json.load(file)

In [None]:
question_lst = [x["question"] for x in test_data]
len(question_lst)

1044

In [7]:
ground_truth = {}
test_data_short_answers = []

with open("v1.0-simplified_nq-dev-all.jsonl") as file:
    for line in tqdm(file):
        line = io.StringIO(line)
        jsonObj = pd.read_json(path_or_buf=line, lines=True)
        question = jsonObj['question_text'][0]
        # Check if question is in test data
        if question not in question_lst: continue
        # Check if observation has a short answer
        short_answers_spans = []
        for ann in jsonObj['annotations'][0]:
            ann_sa = ann.get("short_answers", [])
            if ann_sa:
                short_answers_spans.append(ann_sa)
        if not short_answers_spans: continue
        # Extract the short answer
        short_answers = []
        for short_answer_span_lst in short_answers_spans:
            candidate_answers = set()
            for short_answer_span in short_answer_span_lst:
                candidate_answers.add(' '.join([x['token'] for x in jsonObj['document_tokens'][0][short_answer_span['start_token']:short_answer_span['end_token']]]))
            if candidate_answers not in short_answers:
                short_answers.append(candidate_answers)
        ground_truth[question] = short_answers

7830it [01:07, 115.57it/s]


In [8]:
len(ground_truth)

803

In [9]:
ground_truth.keys()

dict_keys(['what do the 3 dots mean in math', 'who won the 2017 sports personality of the year', 'who was the first chief minister of west bengal', 'a good that can be used in place of another good', "who plays unis in she's the man", 'who is recognized as the founder of islam', 'who plays emma in air bud world pup', 'how old was sasuke when his clan died', "who hosted they think it's all over", "where did the saying monkey's uncle come from", 'where does us highway 1 start and end', 'what type of fuel goes in a zippo', 'who played the mad hatter in the batman tv show', 'in photosynthesis the carbon in co2 is initially fixed to what molecule', 'how many countries does cadbury sell its products', 'who wrote the book my country my life', 'what does it mean to be on parole', 'who is the founder of google and when was it founded', 'how did early humans make use of stones during the prehistoric period', 'who won the champions league final in 2016', 'what do the economic systems of the uk ge

In [10]:
ground_truth['a good that can be used in place of another good']

[{'substitute good'}, {'A substitute good'}]

In [32]:
import pickle
with open("ground_truth.pkl", 'wb') as file:
    # Serialize and write the variable to the file
    pickle.dump(ground_truth, file)

# Dataset Construction
## Extraction of context chunks

In [None]:
from neo4j import GraphDatabase
from tqdm import tqdm

URI = "neo4j://localhost"
AUTH = ("neo4j", el.NEO4J_PWD)

In [12]:
import pickle 

# Open the file in binary mode 
with open('embedded_questions.pkl', 'rb') as file: 
	q_diz = pickle.load(file) 

In [None]:
extracted_chunks_naive = {}

for question, gt in tqdm(ground_truth.items()):
    # Search for top 3 chunk
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        retrieved_chunks, _, _ = driver.execute_query(
            """CALL db.index.vector.queryNodes("vector", 100, $embedding)
                YIELD node, score
                RETURN ID(node), node.text
                LIMIT 3""",
            embedding=q_diz[question]
        )
    top_3_chunks = [x["ID(node)"] for x in retrieved_chunks]
    context = [x["node.text"] for x in retrieved_chunks]
    extracted_chunks_naive[question] = {
        'top_3_chunks': top_3_chunks,
        'context': context
    }

In [14]:
with open('extracted_chunks_hybrid.json', 'r') as file:
    extracted_chunks_hybrid_tot = json.load(file)

In [None]:
extracted_chunks_hybrid = {}

for question, gt in ground_truth.items():
    top_3_chunks = extracted_chunks_hybrid_tot[question]
    # Search for top 10 chunk
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        retrieved_texts, _, _ = driver.execute_query(
            """WITH $top_3_chunks AS list
            MATCH (a:Chunk)
            WHERE ID(a) IN list
            RETURN a.text""",
            top_3_chunks=top_3_chunks
        )
    context = [x["a.text"] for x in retrieved_texts]
    extracted_chunks_hybrid[question] = {
        'top_3_chunks': top_3_chunks,
        'context': context
    }

## Answers extraction from retrieved context

In [None]:
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    model="gpt-4o-global",
    azure_deployment="gpt-4o-global",
    api_key=el.OPENAI_API_KEY,
    azure_endpoint=el.AZURE_ENDPOINT,
    openai_api_version="2024-02-15-preview",
)

In [17]:
cost_dict = {
    "gpt-4o": {
        "prompt_tokens": 2.5/1000000,
        "completion_tokens": 10/1000000,
    }
}

def compute_costs(token_usage:dict, costs:dict):
    return token_usage["completion_tokens"]*costs["completion_tokens"] + token_usage["prompt_tokens"]*costs["prompt_tokens"]

In [18]:
system_prompt = """Imagine being an extractive Question Answering System.
Respond to the user's question extracting the smallest possible slice of text from the context, without changing anything.
If the answers are multiple, separate the extracted slices of text with a forward slash ("/").
If no answer to the user's query can be found in the context, output "ND".
In any case, do not add any comments to your answer."""

In [19]:
tot_cost = 0

for key, value in tqdm(extracted_chunks_naive.items()):
    context = "\n\n".join(value['context'])
    # Query the LLM
    message = [
        ("system", system_prompt),
        ("human", "QUERY:\n" + key + "\n\nCONTEXT:\n" + context),
    ]
    response = llm.invoke(message)
    # Compute costs
    model_name = "-".join(response.response_metadata["model_name"].split("-")[:-3])
    tot_cost += compute_costs(response.response_metadata["token_usage"], cost_dict[model_name])
    value['response'] = response.content

print(f"TOTAL COSTS: {round(tot_cost,3)}€")

100%|██████████| 803/803 [05:52<00:00,  2.28it/s]

TOTAL COSTS: 1.4€





In [23]:
tot_cost = 0

for key, value in tqdm(extracted_chunks_hybrid.items()):
    if value.get('response'):
        continue
    context = "\n\n".join(value['context'])
    # Query the LLM
    message = [
        ("system", system_prompt),
        ("human", "QUERY:\n" + key + "\n\nCONTEXT:\n" + context),
    ]
    response = llm.invoke(message)
    # Compute costs
    model_name = "-".join(response.response_metadata["model_name"].split("-")[:-3])
    tot_cost += compute_costs(response.response_metadata["token_usage"], cost_dict[model_name])
    value['response'] = response.content

print(f"TOTAL COSTS: {round(tot_cost,3)}€")

100%|██████████| 803/803 [03:04<00:00,  4.35it/s] 

TOTAL COSTS: 0.764€





In [24]:
with open('extracted_chunks_hybrid_full.json', 'w') as f:
    json.dump(extracted_chunks_hybrid, f)

with open('extracted_chunks_naive_full.json', 'w') as f:
    json.dump(extracted_chunks_naive, f)

In [25]:
with open("gt_full.pkl", 'wb') as file:
    pickle.dump(ground_truth, file)

# Evaluation
## Exact Match

In [26]:
for question, gts in ground_truth.items():
    naive_answer = extracted_chunks_naive[question]['response']
    naive_answer = "".join([x.replace(" ", "").lower() for x in naive_answer.split("/")])
    hybrid_answer = extracted_chunks_hybrid[question]['response']
    hybrid_answer = "".join([x.replace(" ", "").lower() for x in hybrid_answer.split("/")])
    formatted_gts = []
    for gt in gts:
        gt = "".join([x.replace(" ", "").lower() for x in gt])
        formatted_gts.append(gt)
    if naive_answer in formatted_gts:
        extracted_chunks_naive[question]["evaluation"] = True
    else:
        extracted_chunks_naive[question]["evaluation"] = False
    if hybrid_answer in formatted_gts:
        extracted_chunks_hybrid[question]["evaluation"] = True
    else:
        extracted_chunks_hybrid[question]["evaluation"] = False

In [31]:
correct = 0

for question, values in extracted_chunks_naive.items():
    if values['evaluation']:
        correct += 1

print(f"Naive Evaluation: {round(correct/len(ground_truth),3)}")

Naive Evaluation: 0.355


In [32]:
correct = 0

for question, values in extracted_chunks_hybrid.items():
    if values['evaluation']:
        correct += 1

print(f"Hybrid Evaluation: {round(correct/len(ground_truth),3)}")

Hybrid Evaluation: 0.399


## ROUGE

In [None]:
from ragas import EvaluationDataset, evaluate
from ragas.metrics import RougeScore

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# DATASET for naive retrieval

dataset = []

for question, value in ground_truth.items():
    observation = {
        'user_input': question,
        'retrieved_contexts': extracted_chunks_naive[question]['context'],
        'response': extracted_chunks_naive[question]['response'],
        'reference': " ".join(value[0])
    }
    dataset.append(observation)
eval_dataset_naive = EvaluationDataset.from_dict(dataset)

# DATASET for hybrid retrieval
dataset = []

for question, value in ground_truth.items():
    observation = {
        'user_input': question,
        'retrieved_contexts': extracted_chunks_hybrid[question]['context'],
        'response': extracted_chunks_hybrid[question]['response'],
        'reference': " ".join(value[0])
    }
    dataset.append(observation)
eval_dataset_hybrid = EvaluationDataset.from_dict(dataset)

In [39]:
metrics = [
    RougeScore()
]
results_naive = evaluate(dataset=eval_dataset_naive, metrics=metrics)
results_hybrid = evaluate(dataset=eval_dataset_hybrid, metrics=metrics)

Evaluating: 100%|██████████| 803/803 [00:00<00:00, 6448.76it/s]
Evaluating: 100%|██████████| 803/803 [00:00<00:00, 6856.97it/s]


In [None]:
print(f"RESULTS NAIVE RETRIEVAL: {results_naive['rouge_score']}")
print(f"RESULTS HYBRID RETRIEVAL: {results_hybrid['rouge_score']}")

RESULTS NAIVE RETRIEVAL: 0.4606
RESULTS HYBRID RETRIEVAL: 0.5148


## Cosine Similarity

In [None]:
from ragas.metrics import SemanticSimilarity
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import AzureOpenAIEmbeddings

evaluator_embeddings = LangchainEmbeddingsWrapper(AzureOpenAIEmbeddings(
    openai_api_version="2023-03-15-preview",
    azure_endpoint=el.AZURE_ENDPOINT,
    model="text-embedding-ada-002",
    api_key=el.OPENAI_API_KEY,
))

metrics = [
    SemanticSimilarity(embeddings=evaluator_embeddings)
]

In [43]:
results_naive = evaluate(dataset=eval_dataset_naive, metrics=metrics)
results_hybrid = evaluate(dataset=eval_dataset_hybrid, metrics=metrics)

Evaluating: 100%|██████████| 803/803 [01:09<00:00, 11.57it/s]
Evaluating: 100%|██████████| 803/803 [01:11<00:00, 11.21it/s]


In [None]:
print(f"RESULTS NAIVE RETRIEVAL: {results_naive['semantic_similarity']}")
print(f"RESULTS HYBRID RETRIEVAL: {results_hybrid['semantic_similarity']}")

RESULTS NAIVE RETRIEVAL: 0.8907
RESULTS HYBRID RETRIEVAL: 0.9030


## RAGAS Factual Correctness

In [None]:
from langchain_openai import AzureChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness

evaluator_llm = LangchainLLMWrapper(AzureChatOpenAI(
    openai_api_version="2024-02-15-preview",
    azure_endpoint=el.AZURE_ENDPOINT,
    azure_deployment="gpt-4o-global",
    model="gpt-4o-global",
    validate_base_url=False,
    api_key=el.OPENAI_API_KEY,
))

In [None]:
# DATASET for naive retrieval
dataset = []
for question, value in ground_truth.items():
    if question[-1] != "?":
        formatted_question = question + "?"
    else:
        formatted_question = question
    observation = {
        'user_input': formatted_question,
        'retrieved_contexts': extracted_chunks_naive[question]['context'],
        'response': formatted_question + " " + extracted_chunks_naive[question]['response'],
        'reference': formatted_question + " " + " ".join(value[0])
    }
    dataset.append(observation)
eval_dataset_naive = EvaluationDataset.from_dict(dataset)

# DATASET for hybrid retrieval
dataset = []
for question, value in ground_truth.items():
    if question[-1] != "?":
        formatted_question = question + "?"
    else:
        formatted_question = question
    observation = {
        'user_input': formatted_question,
        'retrieved_contexts': extracted_chunks_hybrid[question]['context'],
        'response': formatted_question + " " + extracted_chunks_hybrid[question]['response'],
        'reference': formatted_question + " " + " ".join(value[0])
    }
    dataset.append(observation)
eval_dataset_hybrid = EvaluationDataset.from_dict(dataset)

In [None]:
metrics = [
    FactualCorrectness(llm=evaluator_llm),
]

results_naive = evaluate(dataset=eval_dataset_naive, metrics=metrics)
results_hybrid = evaluate(dataset=eval_dataset_hybrid, metrics=metrics)

Evaluating:   2%|▏         | 13/803 [00:06<04:39,  2.82it/s]ERROR:ragas.executor:Exception raised in Job[16]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:   9%|▉         | 76/803 [00:23<03:11,  3.79it/s]ERROR:ragas.executor:Exception raised in Job[78]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  15%|█▍        | 119/803 [00:36<03:09,  3.61it/s]ERROR:ragas.executor:Exception raised in Job[129]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  18%|█▊        | 142/803 [00:42<05:12,  2.12it/s]ERROR:ragas.executor:Exception raised in Job[144]: TypeError(ufunc 'invert' not supported for the input types

In [None]:
print(f"RESULTS NAIVE RETRIEVAL: {results_naive['factual_correctness']}")
print(f"RESULTS HYBRID RETRIEVAL: {results_hybrid['factual_correctness']}")

RESULTS NAIVE RETRIEVAL: 0.6356
RESULTS HYBRID RETRIEVAL: 0.6722
