In [None]:
import pandas as pd
import os
import pprint

In [None]:
# Locate the augmented dataset in a folder that sits next to the current working directory.
# NOTE(review): this depends on the directory the kernel was started from — confirm the
# notebook is always launched from its own folder.
path = os.getcwd()
csv_input_path = os.path.join(
    os.path.dirname(path),
    "Doc_Panthera_Augmented",
    "augmented_dataset_final_outputs.csv",
)
# Read the CSV file into a DataFrame
df = pd.read_csv(csv_input_path, encoding='utf-8')

# Display the first few rows of the DataFrame to check the contents
display(df.head())

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader

# Wrap every DataFrame row in a LangChain Document whose page_content is the "Text" column.
# NOTE(review): the remaining columns are presumably carried over as metadata — confirm.
loader = DataFrameLoader(
    df,
    page_content_column="Text",
)
docs_data = loader.load()

In [None]:
import importlib
import Data_preprocessing
# Reload the local module so edits to Data_preprocessing are picked up without a kernel restart.
importlib.reload(Data_preprocessing)

# Build the text cleaner from the freshly reloaded module.
preprocessing = Data_preprocessing.Preprocessing()

# Clean every document's text in place.
# NOTE: this mutates docs_data, so re-running the cell cleans already-cleaned text again.
for doc in docs_data:
    cleaned_text = preprocessing.clean_text_template(doc.page_content)
    doc.page_content = cleaned_text

In [None]:
# Splitter used for all chunking below: chunk_size 1500 with an overlap of 150.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
)

In [None]:
# Split every document and tag each resulting chunk with a global, 1-based
# running number stored in metadata['chunk_number'].
# NOTE(review): the evaluation cells compare these numbers with the 'chunk_num'
# column of the evaluation CSV — confirm both were produced with the same splitter settings.
docs_processed = []
chunk_number = 1  # number assigned to the next chunk

for doc in docs_data:
    # Documents are split one at a time; chunks keep their source order.
    for split_doc in text_splitter.split_documents([doc]):
        split_doc.metadata['chunk_number'] = chunk_number
        docs_processed.append(split_doc)
        chunk_number = len(docs_processed) + 1

# Show the first six chunks followed by the total chunk count
pprint.pprint(docs_processed[:6])
print(len(docs_processed))

In [None]:
from FlagEmbedding import BGEM3FlagModel
from langchain_community.vectorstores import FAISS

# BGE-M3 embedding model, loaded with the half-precision flag enabled.
model_fp16 = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)

class M3EmbeddingFP16:
    """Adapter that exposes the module-level ``model_fp16`` to LangChain vector stores.

    It provides the two method names of the LangChain embeddings interface
    (``embed_documents`` and ``embed_query``) and stays callable, so it also
    works where a plain embedding function is expected.
    """

    def embed_documents(self, texts):
        """Return the ``'dense_vecs'`` entry of ``model_fp16.encode(texts)``."""
        return model_fp16.encode(texts)['dense_vecs']

    def embed_query(self, text):
        """Return the dense vector of a single query string.

        The query is encoded as a one-element batch and unwrapped, so the
        result does not depend on how the model treats a bare string.
        """
        return self.embed_documents([text])[0]

    def __call__(self, texts):
        """Callable form kept for backward compatibility; delegates to ``embed_documents``."""
        return self.embed_documents(texts)


embd = M3EmbeddingFP16()

In [None]:
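# One-time index build, kept for reference: uncomment to embed docs_processed and save the
# FAISS index to the folder that is loaded in the next cell.
#vectorstore = FAISS.from_documents(documents=docs_processed, embedding=embd)
#vectorstore.save_local("cleaned_whitespaces_recursive_augmented_faiss_index_1500_with_chunk_number")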

In [None]:
# Alternative index (disabled); presumably the one built from documents without any
# data preprocessing steps:
#vectorstore = FAISS.load_local("cleaned_recursive_augmented_faiss_index_with_chunk_number", embd, allow_dangerous_deserialization=True)

# Active index.
# SECURITY: allow_dangerous_deserialization=True opts in to pickle-based loading,
# which can execute arbitrary code — only load index folders you created yourself.
vectorstore = FAISS.load_local(
    "cleaned_whitespaces_recursive_augmented_faiss_index_1500_with_chunk_number",
    embd,
    allow_dangerous_deserialization=True,
)
# Show the store together with the number of vectors held by the index
vectorstore, vectorstore.index.ntotal

In [None]:
import pandas as pd

# Load the evaluation questions. Later cells read the 'question', 'context'
# and 'chunk_num' columns of this frame.
eval_df = pd.read_csv('filtered_matching_questions.csv')

# Display the first few rows of the loaded DataFrame
display(eval_df.head())

# Prepare the ground-truth and retrieved-chunk dictionaries used to evaluate the results

In [None]:
from ranx import Qrels, Run, evaluate, compare

qrels = {}  # Ground truth: {query_id: {chunk_id: relevance}}

# Build one ground-truth entry per evaluation question
for idx, row in eval_df.iterrows():
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.

    # 'chunk_num' is a bracketed list stored as text, e.g. "[12 13]".
    # Commas are turned into spaces first so "[12, 13]" yields the same IDs
    # instead of leaving a trailing comma on each chunk id.
    chunk_list_text = row['chunk_num'].strip('[]').replace(',', ' ')
    ground_truth_chunks = chunk_list_text.split()
    qrels[query_id] = {chunk: 1 for chunk in ground_truth_chunks}  # Mark all chunks as relevant

qrels_eval = Qrels(qrels)
pprint.pprint(qrels_eval)

In [None]:
from ranx import Qrels, Run, evaluate, compare

# Alternative ground truth keyed by the context text itself:
# {query_id: {context_text: 1}}.
# NOTE: this rebinds `qrels`; `qrels_eval` from the previous cell is left untouched.
qrels = {
    f"q{idx}": {row['context']: 1}  # Ground truth relevance is 1
    for idx, row in eval_df.iterrows()
}

# Convert qrels and run to ranx objects
#qrels_eval = Qrels(qrels)  # Ground truth relevance set
pprint.pprint(qrels)

# Run 1 - sparse retriever BM25

In [None]:
from ranx import Run, evaluate, compare
from langchain.retrievers import BM25Retriever

# Initialize the BM25 retriever
retriever = BM25Retriever.from_documents(docs_processed)
retriever.k = 4

run = {}    # Retrieved chunks: {query_id: {chunk_id: score}}

# Iterate through each question in the evaluation dataframe
for idx, row in eval_df.iterrows():
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.

    # Retrieve the top-k documents for this question with BM25
    question = row['question']
    retrieved_docs = retriever.get_relevant_documents(question)

    # Prepare Run: map each retrieved chunk number to a score
    retrieved_chunks_scores = {}
    for rank, doc in enumerate(retrieved_docs, start=1):
        chunk_id = str(doc.metadata.get('chunk_number'))  # Extract chunk_number from metadata
        # Use a 'score' stored in metadata when present. Otherwise fall back to the
        # reciprocal of the retrieval position, so the order returned by the retriever
        # survives in the run (a constant default would tie every document).
        score = doc.metadata.get('score', 1.0 / rank)
        # Keep the first (best-ranked) entry if a chunk id appears more than once
        retrieved_chunks_scores.setdefault(chunk_id, score)

    run[query_id] = retrieved_chunks_scores  # Store retrieved chunks and scores for this query

run1 = Run(run)

# Run 2 - dense retriever

In [None]:
from ranx import Run, evaluate, compare

# Initialize the dense retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

run = {}    # Retrieved chunks: {query_id: {chunk_id: score}}

# Iterate through each question in the evaluation dataframe
for idx, row in eval_df.iterrows():
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.

    # Retrieve the top-k documents for this question from the vector store
    question = row['question']
    retrieved_docs = retriever.get_relevant_documents(question)

    # Prepare Run: map each retrieved chunk number to a score
    retrieved_chunks_scores = {}
    for rank, doc in enumerate(retrieved_docs, start=1):
        chunk_id = str(doc.metadata.get('chunk_number'))  # Extract chunk_number from metadata
        # Use a 'score' stored in metadata when present. Otherwise fall back to the
        # reciprocal of the retrieval position, so the order returned by the retriever
        # survives in the run (a constant default would tie every document).
        score = doc.metadata.get('score', 1.0 / rank)
        # Keep the first (best-ranked) entry if a chunk id appears more than once
        retrieved_chunks_scores.setdefault(chunk_id, score)

    run[query_id] = retrieved_chunks_scores  # Store retrieved chunks and scores for this query

run2 = Run(run)

# Run 3 - hybrid retriever 

In [None]:
from ranx import Run, evaluate, compare
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Initialize the dense retriever
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Initialize the BM25 retriever
# NOTE(review): k is 3 here while the dense retriever uses 4 — confirm this is intended.
bm25_retriever = BM25Retriever.from_documents(docs_processed)
bm25_retriever.k = 3

# Initialize the ensemble retriever
retriever = EnsembleRetriever(retrievers=[bm25_retriever, dense_retriever], weights=[0.4, 0.6])

run = {}    # Retrieved chunks: {query_id: {chunk_id: score}}

# Iterate through each question in the evaluation dataframe
for idx, row in eval_df.iterrows():
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.

    # Retrieve relevant documents using EnsembleRetriever
    question = row['question']
    retrieved_docs = retriever.get_relevant_documents(question)

    # Prepare Run: map each retrieved chunk number to a score
    retrieved_chunks_scores = {}
    for rank, doc in enumerate(retrieved_docs, start=1):
        chunk_id = str(doc.metadata.get('chunk_number'))  # Extract chunk_number from metadata
        # Use a 'score' stored in metadata when present. Otherwise fall back to the
        # reciprocal of the retrieval position, so the order returned by the retriever
        # survives in the run (a constant default would tie every document).
        score = doc.metadata.get('score', 1.0 / rank)
        # Keep the first (best-ranked) entry if a chunk id appears more than once
        retrieved_chunks_scores.setdefault(chunk_id, score)

    run[query_id] = retrieved_chunks_scores  # Store retrieved chunks and scores for this query

run3 = Run(run)

# Variant for ground truth that does not share the same chunking - retrieved content is compared with the ground-truth context by semantic similarity

In [None]:
# Empty results table; each retriever evaluated below appends one row of metrics.
retrieval_results = pd.DataFrame(
    columns=[
        "Model",
        "Hit Rate",
        "Precision",
        "Recall",
        "F1-score",
        "MAP",
        "MRR",
        "NDCG",
    ]
)

# Sparse retriever

In [None]:
from ranx import Run, evaluate, compare
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from sentence_transformers import SentenceTransformer, util
import tqdm 

# Initialize the BM25 retriever
retriever = BM25Retriever.from_documents(docs_processed)
retriever.k = 4

# Initialize a semantic similarity model
similarity_model = SentenceTransformer('BAAI/bge-m3')  # Use a suitable pre-trained model

# Semantic similarity threshold
similarity_threshold = 0.7  # Adjust based on experimentation

# Match flags per query: {query_id: {retrieved_text: 1 (match) or 0 (no match)}}.
# Keys are the retrieved texts, so two retrieved documents with identical text share one entry.
run_sparse = {}

# Iterate through each question in the evaluation dataframe
for idx, row in tqdm.tqdm(eval_df.iterrows(), total=len(eval_df), desc="Processing queries"):
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.
    question = row['question']
    eval_context = row['context']  # The ground truth context from the evaluation dataframe

    # Retrieve the top-k documents for this question with BM25
    retrieved_docs = retriever.get_relevant_documents(question)

    # The ground-truth embedding is identical for every retrieved document of this
    # query, so it is encoded once here instead of once per document.
    eval_embedding = similarity_model.encode(eval_context, convert_to_tensor=True)

    # Prepare Run: compute semantic similarity and flag matches, in retrieval order
    retrieved_chunks_scores = {}
    for doc in retrieved_docs:
        retrieved_context = doc.page_content  # Extract document text
        score = util.pytorch_cos_sim(
            eval_embedding,
            similarity_model.encode(retrieved_context, convert_to_tensor=True),
        ).item()  # Cosine similarity between ground truth and retrieved text

        # 1 when the similarity reaches the threshold (match), otherwise 0
        retrieved_chunks_scores[retrieved_context] = 1 if score >= similarity_threshold else 0

    run_sparse[query_id] = retrieved_chunks_scores  # Store the flags for this query

In [None]:
import csv

# Destination of the per-query match flags
file_path = 'run_sparse_data.csv'

# One column per retrieved context; the first column holds the query id
header_contexts = ['context1', 'context2', 'context3', 'context4']
header = ['Question'] + header_contexts

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # One row per query: the query id followed by its flags in stored order
    for question, contexts in run_sparse.items():
        row = [question] + list(contexts.values())
        writer.writerow(row)

print(f"Data saved to {file_path}")

In [None]:
# Read the per-query match flags back from disk
file_path = 'run_sparse_data.csv'
context_columns = ['context1', 'context2', 'context3', 'context4']

# Rows with fewer flags than header columns are read as NaN; count them as non-matches (0)
df = pd.read_csv(file_path).fillna(0)

# Store the flag columns as integers
df[context_columns] = df[context_columns].astype(int)

# Display the DataFrame
print(df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Relevance matrix: one row per query, one 0/1 column per retrieved context
# (the first CSV column, the query id, is dropped).
binary_relevance = df.iloc[:, 1:].values

### HIT RATE (HR) ###
# Share of queries with at least one matching context
hit_rate = np.mean(np.any(binary_relevance, axis=1))

### PRECISION, RECALL, AND F1 ###
# Each query is scored with its 0/1 flags as y_true against an all-ones y_pred,
# i.e. every retrieved context counts as a positive prediction.
# NOTE(review): an all-ones prediction has no false negatives, so per-query recall
# is 1 whenever any flag is set — confirm this is the intended definition.
precision_per_q, recall_per_q, f1_per_q = [], [], []
for relevance_row in binary_relevance:
    all_retrieved = np.ones_like(relevance_row)
    precision_per_q.append(precision_score(relevance_row, all_retrieved, zero_division=0))
    recall_per_q.append(recall_score(relevance_row, all_retrieved, zero_division=0))
    f1_per_q.append(f1_score(relevance_row, all_retrieved, zero_division=0))

# Average the per-query values over all queries
precision_avg = np.mean(precision_per_q)
recall_avg = np.mean(recall_per_q)
f1_avg = np.mean(f1_per_q)

### MEAN AVERAGE PRECISION (MAP) ###
def average_precision(row):
    """Average precision of one ranked row of 0/1 relevance flags.

    Precision is taken at every position whose flag equals 1 and the values
    are averaged. Returns 0 when the row sums to zero or holds no flag equal to 1.
    """
    if np.sum(row) == 0:
        return 0
    precisions = []
    for position, flag in enumerate(row, start=1):
        if flag == 1:
            # Precision over the first `position` results
            precisions.append(np.sum(row[:position]) / position)
    if not precisions:
        return 0
    return np.mean(precisions)

# MAP: mean of the per-query average precision
map_score = np.mean(list(map(average_precision, binary_relevance)))

### MEAN RECIPROCAL RANK (MRR) ###
def reciprocal_rank(row):
    """Return 1 / (1-based position of the first flag equal to 1), or 0 if there is none."""
    return next(
        (1 / position for position, val in enumerate(row, start=1) if val == 1),
        0,
    )

# MRR: mean of the per-query reciprocal rank
mrr_score = np.mean(list(map(reciprocal_rank, binary_relevance)))

### NORMALIZED DISCOUNTED CUMULATIVE GAIN (NDCG) ###
def dcg(row):
    """Discounted cumulative gain: each gain is divided by log2(position + 1), positions 1-based."""
    discounts = np.log2(np.arange(2, len(row) + 2))
    return np.sum(row / discounts)

def ndcg(row):
    """DCG of ``row`` divided by the DCG of the same values sorted in descending order.

    A row without any gain uses 1 as the normalizer, which avoids a division by zero.
    """
    best_order = np.sort(row)[::-1]  # Ideal ranking
    normalizer = 1
    if np.sum(best_order) > 0:
        normalizer = dcg(best_order)
    if normalizer > 0:
        return dcg(row) / normalizer
    return 0

# NDCG: mean of the per-query normalized DCG
ndcg_score = np.mean([ndcg(row) for row in binary_relevance])

# Add results for the Sparse Retriever
model_label = "Sparse Retriever"
model_metrics = pd.DataFrame([{
    "Model": model_label,
    "Hit Rate": hit_rate,
    "Precision": precision_avg,
    "Recall": recall_avg,
    "F1-score": f1_avg,
    "MAP": map_score,
    "MRR": mrr_score,
    "NDCG": ndcg_score
}])

# Drop a row left by an earlier run of this cell, so re-executing it replaces
# this model's metrics instead of appending a duplicate row.
other_models = retrieval_results[retrieval_results["Model"] != model_label]
retrieval_results = pd.concat([other_models, model_metrics], ignore_index=True)

display(retrieval_results)

# Dense retriever

In [None]:
from ranx import Run, evaluate, compare
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from sentence_transformers import SentenceTransformer, util
import tqdm 

# Initialize the dense retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Initialize a semantic similarity model
similarity_model = SentenceTransformer('BAAI/bge-m3')  # Use a suitable pre-trained model

# Semantic similarity threshold
similarity_threshold = 0.7  # Adjust based on experimentation

# Match flags per query: {query_id: {retrieved_text: 1 (match) or 0 (no match)}}.
# Keys are the retrieved texts, so two retrieved documents with identical text share one entry.
run_dense = {}

# Iterate through each question in the evaluation dataframe
for idx, row in tqdm.tqdm(eval_df.iterrows(), total=len(eval_df), desc="Processing queries"):
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.
    question = row['question']
    eval_context = row['context']  # The ground truth context from the evaluation dataframe

    # Retrieve the top-k documents for this question from the vector store
    retrieved_docs = retriever.get_relevant_documents(question)

    # The ground-truth embedding is identical for every retrieved document of this
    # query, so it is encoded once here instead of once per document.
    eval_embedding = similarity_model.encode(eval_context, convert_to_tensor=True)

    # Prepare Run: compute semantic similarity and flag matches, in retrieval order
    retrieved_chunks_scores = {}
    for doc in retrieved_docs:
        retrieved_context = doc.page_content  # Extract document text
        score = util.pytorch_cos_sim(
            eval_embedding,
            similarity_model.encode(retrieved_context, convert_to_tensor=True),
        ).item()  # Cosine similarity between ground truth and retrieved text

        # 1 when the similarity reaches the threshold (match), otherwise 0
        retrieved_chunks_scores[retrieved_context] = 1 if score >= similarity_threshold else 0

    run_dense[query_id] = retrieved_chunks_scores  # Store the flags for this query

In [None]:
import csv

# Destination of the per-query match flags
file_path = 'run_dense_data.csv'

# One column per retrieved context; the first column holds the query id
header_contexts = ['context1', 'context2', 'context3', 'context4']
header = ['Question'] + header_contexts

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # One row per query: the query id followed by its flags in stored order
    for question, contexts in run_dense.items():
        row = [question] + list(contexts.values())
        writer.writerow(row)

print(f"Data saved to {file_path}")

In [None]:
# Read the per-query match flags back from disk
file_path = 'run_dense_data.csv'
context_columns = ['context1', 'context2', 'context3', 'context4']

# Rows with fewer flags than header columns are read as NaN; count them as non-matches (0)
df = pd.read_csv(file_path).fillna(0)

# Store the flag columns as integers
df[context_columns] = df[context_columns].astype(int)

# Display the DataFrame
print(df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Relevance matrix: one row per query, one 0/1 column per retrieved context
# (the first CSV column, the query id, is dropped).
binary_relevance = df.iloc[:, 1:].values

### HIT RATE (HR) ###
# Share of queries with at least one matching context
hit_rate = np.mean(np.any(binary_relevance, axis=1))

### PRECISION, RECALL, AND F1 ###
# Each query is scored with its 0/1 flags as y_true against an all-ones y_pred,
# i.e. every retrieved context counts as a positive prediction.
# NOTE(review): an all-ones prediction has no false negatives, so per-query recall
# is 1 whenever any flag is set — confirm this is the intended definition.
precision_per_q, recall_per_q, f1_per_q = [], [], []
for relevance_row in binary_relevance:
    all_retrieved = np.ones_like(relevance_row)
    precision_per_q.append(precision_score(relevance_row, all_retrieved, zero_division=0))
    recall_per_q.append(recall_score(relevance_row, all_retrieved, zero_division=0))
    f1_per_q.append(f1_score(relevance_row, all_retrieved, zero_division=0))

# Average the per-query values over all queries
precision_avg = np.mean(precision_per_q)
recall_avg = np.mean(recall_per_q)
f1_avg = np.mean(f1_per_q)

### MEAN AVERAGE PRECISION (MAP) ###
def average_precision(row):
    """Average precision of one ranked row of 0/1 relevance flags.

    Precision is taken at every position whose flag equals 1 and the values
    are averaged. Returns 0 when the row sums to zero or holds no flag equal to 1.
    """
    if np.sum(row) == 0:
        return 0
    precisions = []
    for position, flag in enumerate(row, start=1):
        if flag == 1:
            # Precision over the first `position` results
            precisions.append(np.sum(row[:position]) / position)
    if not precisions:
        return 0
    return np.mean(precisions)

# MAP: mean of the per-query average precision
map_score = np.mean(list(map(average_precision, binary_relevance)))

### MEAN RECIPROCAL RANK (MRR) ###
def reciprocal_rank(row):
    """Return 1 / (1-based position of the first flag equal to 1), or 0 if there is none."""
    return next(
        (1 / position for position, val in enumerate(row, start=1) if val == 1),
        0,
    )

# MRR: mean of the per-query reciprocal rank
mrr_score = np.mean(list(map(reciprocal_rank, binary_relevance)))

### NORMALIZED DISCOUNTED CUMULATIVE GAIN (NDCG) ###
def dcg(row):
    """Discounted cumulative gain: each gain is divided by log2(position + 1), positions 1-based."""
    discounts = np.log2(np.arange(2, len(row) + 2))
    return np.sum(row / discounts)

def ndcg(row):
    """DCG of ``row`` divided by the DCG of the same values sorted in descending order.

    A row without any gain uses 1 as the normalizer, which avoids a division by zero.
    """
    best_order = np.sort(row)[::-1]  # Ideal ranking
    normalizer = 1
    if np.sum(best_order) > 0:
        normalizer = dcg(best_order)
    if normalizer > 0:
        return dcg(row) / normalizer
    return 0

# NDCG: mean of the per-query normalized DCG
ndcg_score = np.mean([ndcg(row) for row in binary_relevance])

# Add results for the Dense Retriever
model_label = "Dense Retriever"
model_metrics = pd.DataFrame([{
    "Model": model_label,
    "Hit Rate": hit_rate,
    "Precision": precision_avg,
    "Recall": recall_avg,
    "F1-score": f1_avg,
    "MAP": map_score,
    "MRR": mrr_score,
    "NDCG": ndcg_score
}])

# Drop a row left by an earlier run of this cell, so re-executing it replaces
# this model's metrics instead of appending a duplicate row.
other_models = retrieval_results[retrieval_results["Model"] != model_label]
retrieval_results = pd.concat([other_models, model_metrics], ignore_index=True)

display(retrieval_results)

# Hybrid retriever

In [None]:
from ranx import Run, evaluate, compare
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from sentence_transformers import SentenceTransformer, util
import tqdm 

# Initialize the dense retriever
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Initialize the BM25 retriever
bm25_retriever = BM25Retriever.from_documents(docs_processed)
bm25_retriever.k = 4

# Initialize the ensemble retriever
retriever = EnsembleRetriever(retrievers=[bm25_retriever, dense_retriever], weights=[0.4, 0.6])

# Initialize a semantic similarity model
similarity_model = SentenceTransformer('BAAI/bge-m3')  # Use a suitable pre-trained model

# Semantic similarity threshold
similarity_threshold = 0.7  # Adjust based on experimentation

# Match flags per query: {query_id: {retrieved_text: 1 (match) or 0 (no match)}}.
# Keys are the retrieved texts, so two retrieved documents with identical text share one entry.
run = {}

# Iterate through each question in the evaluation dataframe
for idx, row in tqdm.tqdm(eval_df.iterrows(), total=len(eval_df), desc="Processing queries"):
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.
    question = row['question']
    eval_context = row['context']  # The ground truth context from the evaluation dataframe

    # Retrieve relevant documents using EnsembleRetriever
    retrieved_docs = retriever.get_relevant_documents(question)

    # The ground-truth embedding is identical for every retrieved document of this
    # query, so it is encoded once here instead of once per document.
    eval_embedding = similarity_model.encode(eval_context, convert_to_tensor=True)

    # Prepare Run: compute semantic similarity and flag matches, in retrieval order
    retrieved_chunks_scores = {}
    for doc in retrieved_docs:
        retrieved_context = doc.page_content  # Extract document text
        score = util.pytorch_cos_sim(
            eval_embedding,
            similarity_model.encode(retrieved_context, convert_to_tensor=True),
        ).item()  # Cosine similarity between ground truth and retrieved text

        # 1 when the similarity reaches the threshold (match), otherwise 0
        retrieved_chunks_scores[retrieved_context] = 1 if score >= similarity_threshold else 0

    run[query_id] = retrieved_chunks_scores  # Store the flags for this query

In [None]:
import csv

# Destination of the per-query match flags
file_path = 'run_hybrid_data.csv'

# One column per retrieved context; the first column holds the query id
header_contexts = ['context1', 'context2', 'context3', 'context4', 'context5', 'context6']
header = ['Question'] + header_contexts

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # One row per query: the query id followed by its flags in stored order.
    # Rows are cut to the header width (7 fields), so flags beyond the sixth are dropped.
    for question, contexts in run.items():
        row = [question] + list(contexts.values())
        writer.writerow(row[:len(header)])

print(f"Data saved to {file_path}")

In [None]:
# Read the per-query match flags back from disk
file_path = 'run_hybrid_data.csv'
context_columns = ['context1', 'context2', 'context3', 'context4', 'context5', 'context6']

# Rows with fewer flags than header columns are read as NaN; count them as non-matches (0)
df = pd.read_csv(file_path).fillna(0)

# Store the flag columns as integers
df[context_columns] = df[context_columns].astype(int)

# Display the DataFrame
print(df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Relevance matrix: one row per query, one 0/1 column per retrieved context
# (the first CSV column, the query id, is dropped).
binary_relevance = df.iloc[:, 1:].values

### HIT RATE (HR) ###
# Share of queries with at least one matching context
hit_rate = np.mean(np.any(binary_relevance, axis=1))

### PRECISION, RECALL, AND F1 ###
# Each query is scored with its 0/1 flags as y_true against an all-ones y_pred,
# i.e. every retrieved context counts as a positive prediction.
# NOTE(review): an all-ones prediction has no false negatives, so per-query recall
# is 1 whenever any flag is set — confirm this is the intended definition.
precision_per_q, recall_per_q, f1_per_q = [], [], []
for relevance_row in binary_relevance:
    all_retrieved = np.ones_like(relevance_row)
    precision_per_q.append(precision_score(relevance_row, all_retrieved, zero_division=0))
    recall_per_q.append(recall_score(relevance_row, all_retrieved, zero_division=0))
    f1_per_q.append(f1_score(relevance_row, all_retrieved, zero_division=0))

# Average the per-query values over all queries
precision_avg = np.mean(precision_per_q)
recall_avg = np.mean(recall_per_q)
f1_avg = np.mean(f1_per_q)

### MEAN AVERAGE PRECISION (MAP) ###
def average_precision(row):
    """Average precision of one ranked row of 0/1 relevance flags.

    Precision is taken at every position whose flag equals 1 and the values
    are averaged. Returns 0 when the row sums to zero or holds no flag equal to 1.
    """
    if np.sum(row) == 0:
        return 0
    precisions = []
    for position, flag in enumerate(row, start=1):
        if flag == 1:
            # Precision over the first `position` results
            precisions.append(np.sum(row[:position]) / position)
    if not precisions:
        return 0
    return np.mean(precisions)

# MAP: mean of the per-query average precision
map_score = np.mean(list(map(average_precision, binary_relevance)))

### MEAN RECIPROCAL RANK (MRR) ###
def reciprocal_rank(row):
    """Return 1 / (1-based position of the first flag equal to 1), or 0 if there is none."""
    return next(
        (1 / position for position, val in enumerate(row, start=1) if val == 1),
        0,
    )

# MRR: mean of the per-query reciprocal rank
mrr_score = np.mean(list(map(reciprocal_rank, binary_relevance)))

### NORMALIZED DISCOUNTED CUMULATIVE GAIN (NDCG) ###
def dcg(row):
    """Discounted cumulative gain: each gain is divided by log2(position + 1), positions 1-based."""
    discounts = np.log2(np.arange(2, len(row) + 2))
    return np.sum(row / discounts)

def ndcg(row):
    """DCG of ``row`` divided by the DCG of the same values sorted in descending order.

    A row without any gain uses 1 as the normalizer, which avoids a division by zero.
    """
    best_order = np.sort(row)[::-1]  # Ideal ranking
    normalizer = 1
    if np.sum(best_order) > 0:
        normalizer = dcg(best_order)
    if normalizer > 0:
        return dcg(row) / normalizer
    return 0

# NDCG: mean of the per-query normalized DCG
ndcg_score = np.mean([ndcg(row) for row in binary_relevance])

# Add results for the Hybrid Retriever
model_label = "Hybrid Retriever"
model_metrics = pd.DataFrame([{
    "Model": model_label,
    "Hit Rate": hit_rate,
    "Precision": precision_avg,
    "Recall": recall_avg,
    "F1-score": f1_avg,
    "MAP": map_score,
    "MRR": mrr_score,
    "NDCG": ndcg_score
}])

# Drop a row left by an earlier run of this cell, so re-executing it replaces
# this model's metrics instead of appending a duplicate row.
other_models = retrieval_results[retrieval_results["Model"] != model_label]
retrieval_results = pd.concat([other_models, model_metrics], ignore_index=True)

display(retrieval_results)


# Use created retrieval evaluation class

In [None]:
from langchain.retrievers import BM25Retriever
#from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from retrieval_evaluation import RetrievalEvaluator

# Dense half of the hybrid: vector-store retriever with k = 2
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# Sparse half of the hybrid: BM25 over the processed chunks with k = 2
bm25_retriever = BM25Retriever.from_documents(docs_processed)
bm25_retriever.k = 2

# Ensemble of both retrievers, weighted 0.2 (BM25) / 0.8 (dense)
retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, dense_retriever],
    weights=[0.2, 0.8],
)

# Evaluator from the local retrieval_evaluation module, given the retriever
# and a freshly loaded similarity model
evaluator = RetrievalEvaluator(
    retriever=retriever,
    similarity_model=SentenceTransformer('BAAI/bge-m3'),
)

# Run the evaluation on the evaluation dataframe (eval_df)
evaluator.evaluate(eval_df)

In [None]:
# Write the evaluator's results to CSV; the next cell passes the same file name to calculate_metrics.
# NOTE(review): the "Top hybrid configuration" cells further down write to this same path
# and will overwrite this file — confirm that is intended.
evaluator.save_to_csv('run_hybrid2_data.csv')

In [None]:
# Compute the metrics from the CSV written by the evaluator and label them with the model name
model_name = "Hybrid retriever 2"
file_path = "run_hybrid2_data.csv"
results = evaluator.calculate_metrics(
    file_path=file_path,
    model_name=model_name,
)
display(results)

# Top hybrid configuration

In [None]:
# Initialize the dense retriever
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# Initialize the BM25 retriever
bm25_retriever = BM25Retriever.from_documents(docs_processed)
bm25_retriever.k = 2

# Initialize the ensemble retriever
retriever = EnsembleRetriever(retrievers=[bm25_retriever, dense_retriever], weights=[0.2, 0.8])

# Initialize a semantic similarity model
similarity_model = SentenceTransformer('BAAI/bge-m3')  # Use a suitable pre-trained model

# Semantic similarity threshold
similarity_threshold = 0.7  # Adjust based on experimentation

# Match flags per query: {query_id: {retrieved_text: 1 (match) or 0 (no match)}}.
# Keys are the retrieved texts, so two retrieved documents with identical text share one entry.
run_hybrid_top = {}

# Iterate through each question in the evaluation dataframe
for idx, row in tqdm.tqdm(eval_df.iterrows(), total=len(eval_df), desc="Processing queries"):
    query_id = f"q{idx}"  # Query IDs: q0, q1, etc.
    question = row['question']
    eval_context = row['context']  # The ground truth context from the evaluation dataframe

    # Retrieve relevant documents using EnsembleRetriever
    retrieved_docs = retriever.get_relevant_documents(question)

    # The ground-truth embedding is identical for every retrieved document of this
    # query, so it is encoded once here instead of once per document.
    eval_embedding = similarity_model.encode(eval_context, convert_to_tensor=True)

    # Prepare Run: compute semantic similarity and flag matches, in retrieval order
    retrieved_chunks_scores = {}
    for doc in retrieved_docs:
        retrieved_context = doc.page_content  # Extract document text
        score = util.pytorch_cos_sim(
            eval_embedding,
            similarity_model.encode(retrieved_context, convert_to_tensor=True),
        ).item()  # Cosine similarity between ground truth and retrieved text

        # 1 when the similarity reaches the threshold (match), otherwise 0
        retrieved_chunks_scores[retrieved_context] = 1 if score >= similarity_threshold else 0

    run_hybrid_top[query_id] = retrieved_chunks_scores  # Store the flags for this query

In [None]:
import csv

# Destination of the per-query match flags
file_path = 'run_hybrid2_data.csv'

# One column per retrieved context; the first column holds the query id
header_contexts = ['context1', 'context2', 'context3', 'context4']
header = ['Question'] + header_contexts

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    # One row per query: the query id followed by its flags in stored order.
    # Rows are cut to the header width (5 fields).
    for question, contexts in run_hybrid_top.items():
        row = [question] + list(contexts.values())
        writer.writerow(row[:len(header)])

print(f"Data saved to {file_path}")

In [None]:
# Read the per-query match flags back from disk
file_path = 'run_hybrid2_data.csv'
context_columns = ['context1', 'context2', 'context3', 'context4']

# Rows with fewer flags than header columns are read as NaN; count them as non-matches (0)
df = pd.read_csv(file_path).fillna(0)

# Store the flag columns as integers
df[context_columns] = df[context_columns].astype(int)

# Display the DataFrame
print(df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Relevance matrix: one row per query, one 0/1 column per retrieved context
# (the first CSV column, the query id, is dropped).
binary_relevance = df.iloc[:, 1:].values

### HIT RATE (HR) ###
# Share of queries with at least one matching context
hit_rate = np.mean(np.any(binary_relevance, axis=1))

### PRECISION, RECALL, AND F1 ###
# Each query is scored with its 0/1 flags as y_true against an all-ones y_pred,
# i.e. every retrieved context counts as a positive prediction.
# NOTE(review): an all-ones prediction has no false negatives, so per-query recall
# is 1 whenever any flag is set — confirm this is the intended definition.
precision_per_q, recall_per_q, f1_per_q = [], [], []
for relevance_row in binary_relevance:
    all_retrieved = np.ones_like(relevance_row)
    precision_per_q.append(precision_score(relevance_row, all_retrieved, zero_division=0))
    recall_per_q.append(recall_score(relevance_row, all_retrieved, zero_division=0))
    f1_per_q.append(f1_score(relevance_row, all_retrieved, zero_division=0))

# Average the per-query values over all queries
precision_avg = np.mean(precision_per_q)
recall_avg = np.mean(recall_per_q)
f1_avg = np.mean(f1_per_q)

### MEAN AVERAGE PRECISION (MAP) ###
def average_precision(row):
    """Average precision of one ranked row of 0/1 relevance flags.

    Precision is taken at every position whose flag equals 1 and the values
    are averaged. Returns 0 when the row sums to zero or holds no flag equal to 1.
    """
    if np.sum(row) == 0:
        return 0
    precisions = []
    for position, flag in enumerate(row, start=1):
        if flag == 1:
            # Precision over the first `position` results
            precisions.append(np.sum(row[:position]) / position)
    if not precisions:
        return 0
    return np.mean(precisions)

# MAP: mean of the per-query average precision
map_score = np.mean(list(map(average_precision, binary_relevance)))

### MEAN RECIPROCAL RANK (MRR) ###
def reciprocal_rank(row):
    """Return 1 / (1-based position of the first flag equal to 1), or 0 if there is none."""
    return next(
        (1 / position for position, val in enumerate(row, start=1) if val == 1),
        0,
    )

# MRR: mean of the per-query reciprocal rank
mrr_score = np.mean(list(map(reciprocal_rank, binary_relevance)))

### NORMALIZED DISCOUNTED CUMULATIVE GAIN (NDCG) ###
def dcg(row):
    """Discounted cumulative gain: each gain is divided by log2(position + 1), positions 1-based."""
    discounts = np.log2(np.arange(2, len(row) + 2))
    return np.sum(row / discounts)

def ndcg(row):
    """DCG of ``row`` divided by the DCG of the same values sorted in descending order.

    A row without any gain uses 1 as the normalizer, which avoids a division by zero.
    """
    best_order = np.sort(row)[::-1]  # Ideal ranking
    normalizer = 1
    if np.sum(best_order) > 0:
        normalizer = dcg(best_order)
    if normalizer > 0:
        return dcg(row) / normalizer
    return 0

# NDCG: mean of the per-query normalized DCG
ndcg_score = np.mean([ndcg(row) for row in binary_relevance])

# Add results for the Hybrid2 Retriever
model_label = "Hybrid2 Retriever"
model_metrics = pd.DataFrame([{
    "Model": model_label,
    "Hit Rate": hit_rate,
    "Precision": precision_avg,
    "Recall": recall_avg,
    "F1-score": f1_avg,
    "MAP": map_score,
    "MRR": mrr_score,
    "NDCG": ndcg_score
}])

# Drop a row left by an earlier run of this cell, so re-executing it replaces
# this model's metrics instead of appending a duplicate row.
other_models = retrieval_results[retrieval_results["Model"] != model_label]
retrieval_results = pd.concat([other_models, model_metrics], ignore_index=True)

display(retrieval_results)