In [32]:
# pip install pandas numpy sentence-transformers tensorflow tensorflow_hub scipy transformers langchain_openai ragas openai python-dotenv

In [17]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer
import scipy.special
from dotenv import load_dotenv
import os

### Ragas libraries

In [18]:
from ragas.metrics import ContextEntityRecall
from ragas.metrics import Faithfulness
from ragas import SingleTurnSample
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import ContextEntityRecall

In [19]:
# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail fast with an actionable message. The previous self-assignment
# `os.environ[k] = os.getenv(k)` was a no-op when the key was set and raised an
# opaque "TypeError: str expected, not NoneType" when it was missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )

# Initialize the OpenAI LLM and wrap it
llm = ChatOpenAI(model="gpt-4o-mini")
wrapped_llm = LangchainLLMWrapper(llm)

In [41]:
async def calculate_context_precision(Question, ground_truth_Context, Context):
    """Score one retrieved context with ragas' LLMContextPrecisionWithReference.

    Args:
        Question: Passed to the sample as ``user_input``.
        ground_truth_Context: Passed to the sample as ``reference``.
        Context: The retrieved context; wrapped in a one-element list.

    Returns:
        The value returned by the metric's ``single_turn_ascore`` (also printed).
    """
    # The metric is built per call and uses the notebook-wide wrapped LLM.
    metric = LLMContextPrecisionWithReference(llm=wrapped_llm)

    sample_fields = {
        "user_input": Question,
        "reference": ground_truth_Context,
        "retrieved_contexts": [Context],
    }
    turn = SingleTurnSample(**sample_fields)

    result = await metric.single_turn_ascore(turn)
    print(f"LLM-based context precision with reference: {result}")
    return result

In [42]:
async def evaluate_context_entity_recall(ground_truth_Context, Context):
    """Score one retrieved context with ragas' ContextEntityRecall.

    Args:
        ground_truth_Context: Passed to the sample as ``reference``.
        Context: The retrieved context; wrapped in a one-element list.

    Returns:
        The value returned by the metric's ``single_turn_ascore`` (also printed).
    """
    # Build the evaluation sample first, then the scorer (same order as before).
    sample_fields = {
        "reference": ground_truth_Context,
        "retrieved_contexts": [Context],
    }
    turn = SingleTurnSample(**sample_fields)

    # The scorer uses the notebook-wide wrapped LLM.
    metric = ContextEntityRecall(llm=wrapped_llm)

    result = await metric.single_turn_ascore(turn)
    print(f"LLM-based context entities recall with reference answer: {result}")
    return result

In [43]:
async def evaluate_faithfulness(Question, Answer, Context):
    """Score one answer with ragas' Faithfulness metric.

    Args:
        Question: Passed to the sample as ``user_input``.
        Answer: Passed to the sample as ``response``.
        Context: The retrieved context; wrapped in a one-element list.

    Returns:
        The value returned by the metric's ``single_turn_ascore`` (also printed).
    """
    sample_fields = {
        "user_input": Question,
        "response": Answer,
        "retrieved_contexts": [Context],
    }
    turn = SingleTurnSample(**sample_fields)

    # The scorer uses the notebook-wide wrapped LLM.
    metric = Faithfulness(llm=wrapped_llm)
    result = await metric.single_turn_ascore(turn)
    print(f"Faithfulness: {result}")
    return result

In [44]:
class ParticipantVisibleError(Exception):
    """Error raised by the scoring code for invalid or unscorable submissions.

    Used for missing columns, no matching questions, and per-row scoring
    failures; the message carries the details.
    """

In [45]:
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of ``a`` and ``b``.

    A small constant (1e-8) is added to the product of the norms so that a
    zero-length vector yields 0 instead of a division by zero.
    """
    dot_product = np.dot(a, b)
    scale = norm(a) * norm(b) + 1e-8  # epsilon keeps the denominator non-zero
    return dot_product / scale

In [73]:
# Cache for the heavyweight models so they are loaded once per kernel instead
# of once per scored submission file.
_SCORING_MODELS = {}


def _load_scoring_models():
    """Return (sentence embedder, BEM model, BERT tokenizer), loading them on first use."""
    if not _SCORING_MODELS:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
        bem = hub.load('https://www.kaggle.com/models/google/bert/TensorFlow2/answer-equivalence-bem/1')
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # Populate the cache only after every load succeeded, so a partial
        # failure never leaves a half-filled cache behind.
        _SCORING_MODELS.update(embedder=embedder, bem=bem, tokenizer=tokenizer)
    return _SCORING_MODELS["embedder"], _SCORING_MODELS["bem"], _SCORING_MODELS["tokenizer"]


async def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> tuple:
    """Score a submission against the ground-truth solution.

    Rows are matched on the ``Question`` column. For every matched row five
    component scores are computed, each clipped to [0, 1]:

    * context matching    - cosine similarity of Question and Context embeddings
    * context precision   - ``calculate_context_precision``
    * context entity recall - ``evaluate_context_entity_recall``
    * faithfulness        - ``evaluate_faithfulness``
    * answer correctness  - BEM probability of class 1

    They are combined with weights 0.1 / 0.2 / 0.2 / 0.2 / 0.3.

    Args:
        solution: Ground-truth frame; must contain ``row_id_column_name``,
            ``Question`` and the ``ground_truth_*`` columns.
        submission: Participant frame; must contain ``row_id_column_name``,
            ``Question``, ``Context``, ``Answer``, ``Sections`` and ``Pages``.
        row_id_column_name: Name of the ID column dropped from both frames.

    Returns:
        A 7-tuple ``(context_matching_scores, context_precision_scores,
        context_entity_recall_scores, faithfulness_scores,
        answer_correctness_scores, final_scores, Total_Score)``. The first six
        are numpy arrays with one entry per matched row, in submission row
        order; ``Total_Score`` is ``final_scores.mean() * 90``.

    Raises:
        ParticipantVisibleError: On missing columns, when no questions match,
            or when scoring any row fails.
    """
    # Drop the row ID column
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    # Validate columns
    required_submission_cols = {'Question', 'Context', 'Answer', 'Sections', 'Pages'}
    required_solution_cols = {'Question', 'ground_truth_Context', 'ground_truth_Answer', 'ground_truth_Sections', 'ground_truth_Pages'}

    missing = required_submission_cols - set(submission.columns)
    if missing:
        raise ParticipantVisibleError(f"Missing columns in submission: {missing}")

    missing = required_solution_cols - set(solution.columns)
    if missing:
        raise ParticipantVisibleError(f"Missing columns in solution: {missing}")

    # Merge on 'Question' with the submission as the LEFT frame: an inner merge
    # preserves the order of the left keys, so the per-row score arrays come
    # back in submission order, which is how the caller assigns them.
    # NOTE(review): submission questions with no match in the solution are
    # dropped here and therefore not penalized in the mean - confirm intended.
    merged = pd.merge(submission, solution, on='Question', how='inner')

    if merged.empty:
        raise ParticipantVisibleError("No matching questions between submission and solution.")

    # Initialize models (cached across calls)
    model, bem, tokenizer = _load_scoring_models()

    # Initialize scores
    context_matching_scores = []
    context_precision_scores = []
    context_entity_recall_scores = []
    faithfulness_scores = []
    answer_correctness_scores = []

    for _, row in merged.iterrows():
        try:
            question = row["Question"]
            context = row["Context"]
            ground_truth_context = row["ground_truth_Context"]
            ground_truth_answer = row["ground_truth_Answer"]
            answer = row["Answer"]

            q_emb = model.encode(question)
            ctx_emb = model.encode(context)
            context_matching_scores.append(cosine_sim(q_emb, ctx_emb))

            precision_score = await calculate_context_precision(question, ground_truth_context, context)
            entity_recall_score = await evaluate_context_entity_recall(ground_truth_context, context)
            faithfulness_score = await evaluate_faithfulness(question, answer, context)

            context_precision_scores.append(precision_score)
            context_entity_recall_scores.append(entity_recall_score)
            faithfulness_scores.append(faithfulness_score)

            # NOTE(review): the text already embeds literal [CLS]/[SEP] markers
            # and the tokenizer is called with default settings, which
            # presumably adds its own special tokens too; segment ids for a
            # single text are presumably all zero. Confirm this matches the
            # input format the BEM model expects.
            input_text = f"[CLS] {question} [SEP] {ground_truth_answer} [SEP] {answer} [SEP]"
            # Tokenize input
            encoded = tokenizer(
                input_text,
                return_tensors="tf",
                padding="max_length",
                truncation=True,
                max_length=512,
            )
            # Prepare input dict with correct types
            inputs = {
                "input_ids": tf.cast(encoded["input_ids"], tf.int64),
                "segment_ids": tf.cast(encoded["token_type_ids"], tf.int64)
            }
            # Run model
            raw_outputs = bem(inputs)
            # Convert logits to probabilities using softmax
            probabilities = scipy.special.softmax(raw_outputs.numpy(), axis=1)
            # BERT Answer Equivalence Score (Probability of class 1)
            equivalence_score = probabilities[0][1]
            answer_correctness_scores.append(equivalence_score)

        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise ParticipantVisibleError(f"Embedding computation failed for a row: {e}") from e

    # Convert to numpy arrays and clip values
    context_matching_scores = np.clip(np.array(context_matching_scores), 0, 1)
    context_precision_scores = np.clip(np.array(context_precision_scores), 0, 1)
    context_entity_recall_scores = np.clip(np.array(context_entity_recall_scores), 0, 1)
    faithfulness_scores = np.clip(np.array(faithfulness_scores), 0, 1)
    answer_correctness_scores = np.clip(np.array(answer_correctness_scores), 0, 1)

    # Compute weighted average (weights sum to 1.0)
    final_scores = (
        0.1 * context_matching_scores +
        0.2 * context_precision_scores +
        0.2 * context_entity_recall_scores +
        0.2 * faithfulness_scores +
        0.3 * answer_correctness_scores
    )

    # Scale the mean per-row score to a 0-90 total
    Total_Score = final_scores.mean() * 90
    print(f"Total score is {Total_Score} out of 90")

    return context_matching_scores,context_precision_scores,context_entity_recall_scores,faithfulness_scores,answer_correctness_scores,final_scores,Total_Score

In [74]:
# Load the ground-truth table that each submission is scored against,
# then display it as the cell output.
solution = pd.read_csv("Solution.csv")
solution

Unnamed: 0,Query ID,Question,ground_truth_Context,ground_truth_Answer,ground_truth_Sections,ground_truth_Pages
0,6,Why did the English begin to focus more on Sri...,The English began to pay more attention to Sri...,The English began to focus on Sri Lanka due to...,2.2 The British focusing their Attention on Sr...,20
1,7,What administrative practices did the British ...,When the English East India Trade Company gain...,The British East India Trade Company governed ...,Governance of the Coastal Areas under the East...,24


In [61]:
submissions_folder = "submissions"
for filename in os.listdir(submissions_folder):
    if filename.endswith(".csv"):
        submission_path = os.path.join(submissions_folder, filename)
        submission = pd.read_csv(submission_path)
        context_matching_scores,context_precision_scores,context_entity_recall_scores,faithfulness_scores,answer_correctness_scores,final_scores,Total_Score  = await score(solution, submission, "Query ID")
        submission['Context Matching Score'] = context_matching_scores
        submission['Context Precision Score'] = context_precision_scores
        submission['Context Entity Recall Score'] = context_entity_recall_scores
        submission['Faithfulness Score'] = faithfulness_scores
        submission['Answer Correctness Score'] = answer_correctness_scores
        submission['Final Score'] = final_scores
        submission.loc[0, 'Total Score'] = Total_Score
        submission.to_csv(submission_path, index=False)






done 1
LLM-based context precision with reference: 0.9999999999
LLM-based context entities recall with reference answer: 0.9999999985714286
Faithfulness: 1.0
done 4
done 5
done 1
LLM-based context precision with reference: 0.9999999999
LLM-based context entities recall with reference answer: 0.9999999992857143
Faithfulness: 0.7
done 4
done 5
Total score is 78.92337913358595 out of 90
