## Install dependencies

In [None]:
!pip install langchain-community langchain-core pinecone fastembed google-cloud-translate langchain-google-genai langchain-pinecone bert_score

In [None]:
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_google_genai import GoogleGenerativeAI
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings
from google.colab import files
from bert_score import score

import pandas as pd
import json
from google.colab import userdata

## Implement the Retrieval Chain

In [None]:
class RetrievalChain:
    """Retrieve context ids for a query and score them against a ground truth.

    The chain queries a vector store through its
    ``similarity_search_with_score`` method and reports Recall@k hit flags
    for one question at a time.
    """

    def __init__(self, embedding_model, vector_store, top_k=15):
        """Store the collaborators and the retrieval depth.

        Args:
            embedding_model: Stored on the instance; this class never calls
                it directly (searches go through ``vector_store``).
            vector_store: Object exposing
                ``similarity_search_with_score(query, k=...)`` whose results
                are ``(document, score)`` pairs.
            top_k: Number of results requested per query.
        """
        self.embedding_model = embedding_model
        self.vector_store = vector_store
        self.top_k = top_k

    def get_retrieved_context_ids(self, query):
        """Return the ``context_id`` of each hit, in the store's result order.

        Args:
            query: Query passed unchanged to the vector store.

        Returns:
            A list with at most ``top_k`` entries, read from each returned
            document's ``metadata["context_id"]``.
        """
        hits = self.vector_store.similarity_search_with_score(
            query,
            k=self.top_k,
        )

        # The similarity score is not needed; ranking is the result order.
        return [doc.metadata["context_id"] for doc, _ in hits]

    def compute_recall_flags(self, gt_context_id, retrieved_ids, ks=(1, 3, 5, 10, 15)):
        """Compute Recall@k hit flags for a single question.

        Args:
            gt_context_id: Ground-truth context id.
            retrieved_ids: Ranked list of retrieved context ids.
            ks: Cutoffs to evaluate (positive integers). Defaults to the
                cutoffs reported by this notebook.

        Returns:
            A dict mapping ``"r@<k>"`` to 1 if the ground truth appears among
            the first ``k`` retrieved ids, else 0, in the order of ``ks``.
            A cutoff larger than ``len(retrieved_ids)`` simply checks the
            whole list.
        """
        return {
            f"r@{k}": int(gt_context_id in retrieved_ids[:k])
            for k in ks
        }

    def run(self, question_id, query, gt_context_id):
        """Retrieve contexts for ``query`` and build one result record.

        Args:
            question_id: Identifier copied into the record.
            query: Query text to search with.
            gt_context_id: Ground-truth context id to score against.

        Returns:
            A dict with ``question_id``, the default Recall@k flags,
            ``contexts`` (retrieved ids joined with ``|``) and
            ``ground_truth``.
        """
        retrieved_ids = self.get_retrieved_context_ids(query)
        recall_flags = self.compute_recall_flags(gt_context_id, retrieved_ids)

        return {
            "question_id": question_id,
            **recall_flags,
            "contexts": "|".join(map(str, retrieved_ids)),
            "ground_truth": gt_context_id,
        }


## Initialize the RetrievalChain

In [None]:
class MultilingualE5Embeddings(Embeddings):
    """``Embeddings`` adapter around a SentenceTransformer E5 model.

    Adds the ``passage: `` / ``query: `` prefixes to the input text before
    encoding it.
    """

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        """Load the SentenceTransformer model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a collection of documents.

        Args:
            texts: Iterable of document strings.

        Returns:
            One embedding (a list of floats) per input text, in input order;
            an empty list when ``texts`` is empty.
        """
        # E5 requires "passage:" prefix for documents
        passages = [f"passage: {text}" for text in texts]
        if not passages:
            return []

        # Encode the whole list in one call so the model can batch the work
        # instead of running a separate forward pass per text.
        return self.model.encode(passages).tolist()

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        # E5 requires "query:" prefix for queries
        return self.model.encode(f"query: {text}").tolist()


In [None]:
# Experiment tag; interpolated into the names of the CSV files this notebook writes.
experiment_id = "cle_me5"

In [None]:
# Embedding model handed to both the vector store and the retrieval chain.
embedding_model = MultilingualE5Embeddings()

# Open a handle to the Pinecone index named below; the API key is read
# through google.colab's userdata helper.
pc_index_name = "gic-me5"
pc = Pinecone(api_key=userdata.get("PINECONE_API_KEY"))
index = pc.Index(pc_index_name)
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

# top_k is left at the RetrievalChain default.
retrieval_chain = RetrievalChain(embedding_model, vector_store)

## Load test data

In [None]:
import pandas as pd
# Load the QA evaluation set. The evaluation loops in this notebook read its
# question_id, question_ta, question_si and context_id columns.
qa_data = pd.read_csv("gic_qa_with_ids.csv")
# Preview the first row.
qa_data.head(1)

## Tamil questions

In [None]:
# Run every Tamil question through the retrieval chain, collecting one
# result record per question.
results_ta = []

for _, row in qa_data.iterrows():
    print(f"Processing question: {row['question_id']}")
    # Positional order: question_id, query, gt_context_id.
    result = retrieval_chain.run(
        row["question_id"],
        row["question_ta"],
        row["context_id"],
    )
    results_ta.append(result)


In [None]:
# One row per Tamil question, built from the records collected in results_ta.
df = pd.DataFrame(results_ta)
# Show the last five rows.
df.tail(5)

In [None]:
# Write the per-question Tamil results to a CSV named after the experiment id (row index omitted).
df.to_csv(f"result_ta_{experiment_id}.csv", index=False)

In [None]:
# The mean of each 0/1 hit flag is the fraction of Tamil questions whose
# ground-truth context appeared within the first k results.
summary_baseline_ta = {
    f"Recall@{k}": df[f"r@{k}"].mean()
    for k in (1, 3, 5, 10, 15)
}

summary_baseline_ta

In [None]:
# Express each Tamil recall fraction as a percentage.
summary_pct_baseline_ta = {
    metric: fraction * 100 for metric, fraction in summary_baseline_ta.items()
}

summary_pct_baseline_ta

In [None]:
# Tag the Tamil summary with its language code so its row can be identified
# once it is combined with the other language.
summary_pct_baseline_ta['language'] = "ta"
summary_pct_baseline_ta

## Sinhala questions

In [None]:
# Run every Sinhala question through the retrieval chain, collecting one
# result record per question.
results_si = []

for _, row in qa_data.iterrows():
    print(f"Processing question: {row['question_id']}")
    # Positional order: question_id, query, gt_context_id.
    result = retrieval_chain.run(
        row["question_id"],
        row["question_si"],
        row["context_id"],
    )
    results_si.append(result)

In [None]:
# One row per Sinhala question, built from the records collected in results_si.
# NOTE: this rebinds `df`, replacing the Tamil frame created earlier.
df = pd.DataFrame(results_si)
# Show the last five rows.
df.tail(5)

In [None]:
# Write the per-question Sinhala results to a CSV named after the experiment id (row index omitted).
df.to_csv(f"result_si_{experiment_id}.csv", index=False)

In [None]:
# The mean of each 0/1 hit flag is the fraction of Sinhala questions whose
# ground-truth context appeared within the first k results.
summary_baseline_si = {
    f"Recall@{k}": df[f"r@{k}"].mean()
    for k in (1, 3, 5, 10, 15)
}

summary_baseline_si

In [None]:
# Express each Sinhala recall fraction as a percentage.
summary_pct_baseline_si = {
    metric: fraction * 100 for metric, fraction in summary_baseline_si.items()
}

summary_pct_baseline_si

In [None]:
# Tag the Sinhala summary with its language code so its row can be identified
# once it is combined with the other language.
summary_pct_baseline_si['language'] = "si"
summary_pct_baseline_si

## Overall summary

In [None]:
# Combine the per-language percentage summaries into one table, one row per language.
summary_baseline = pd.DataFrame([summary_pct_baseline_ta, summary_pct_baseline_si])
summary_baseline

In [None]:
# Move "language" to the front, keeping the other columns in their current order.
cols = ["language", *(c for c in summary_baseline.columns if c != "language")]
summary_baseline = summary_baseline[cols]
summary_baseline

In [None]:
# Write the combined summary to a CSV named after the experiment id (row index omitted).
summary_baseline.to_csv(f"summary_{experiment_id}.csv", index=False)