In [32]:
# pip install pandas numpy sentence-transformers tensorflow tensorflow_hub scipy transformers langchain_openai ragas openai load_dotenv

In [17]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer
import scipy.special
from dotenv import load_dotenv
import os

### Ragas libraries

In [111]:
from ragas.metrics import ContextEntityRecall
from ragas.metrics import Faithfulness
from ragas import SingleTurnSample
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import ContextEntityRecall

In [112]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
# Initialize the OpenAI LLM and wrap it
llm = ChatOpenAI(model="gpt-4o-mini")
wrapped_llm = LangchainLLMWrapper(llm)

In [113]:
async def calculate_context_precision(Question, ground_truth_Context, Context):
    context_precision = LLMContextPrecisionWithReference(llm=wrapped_llm)
    sample = SingleTurnSample(
        user_input=Question,
        reference=ground_truth_Context,
        retrieved_contexts=[Context],
    )
    score = await context_precision.single_turn_ascore(sample)
    return score

In [114]:
# async def evaluate_context_entity_recall(ground_truth_Context, Context):
#     # Create a sample for evaluation
#     sample = SingleTurnSample(
#         reference=ground_truth_Context,
#         retrieved_contexts=[Context],
#     )

#     # Initialize the ContextEntityRecall scorer with the wrapped LLM
#     scorer = ContextEntityRecall(llm=wrapped_llm)

#     # Calculate the score
#     score = await scorer.single_turn_ascore(sample)
#     print(f"LLM-based context entities recall with reference answer: {score}")
#     return score

In [115]:
# async def evaluate_faithfulness(Question, Answer, Context):
#     sample = SingleTurnSample(
#         user_input=Question,
#         response=Answer,
#         retrieved_contexts=[Context]
#     )

#     scorer = Faithfulness(llm=wrapped_llm)
#     score = await scorer.single_turn_ascore(sample)
#     print(f"Faithfulness: {score}")
#     return score

In [116]:
class ParticipantVisibleError(Exception):
    pass

In [117]:
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    return np.dot(a, b) / (norm(a) * norm(b) + 1e-8)  # Avoid division by zero

In [118]:
async def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    
    # Drop the row ID column
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    # Validate columns
    required_submission_cols = {'Question', 'Context', 'Answer', 'Sections', 'Pages'}
    required_solution_cols = {'Question', 'ground_truth_Context', 'ground_truth_Answer', 'ground_truth_Sections', 'ground_truth_Pages'}

    if not required_submission_cols.issubset(submission.columns):
        missing = required_submission_cols - set(submission.columns)
        raise ParticipantVisibleError(f"Missing columns in submission: {missing}")

    if not required_solution_cols.issubset(solution.columns):
        missing = required_solution_cols - set(solution.columns)
        raise ParticipantVisibleError(f"Missing columns in solution: {missing}")

    # Merge on 'question'
    merged = pd.merge(solution, submission, on='Question', how='inner')

    if merged.empty:
        raise ParticipantVisibleError("No matching questions between submission and solution.")

    # Initialize models
    model = SentenceTransformer('all-MiniLM-L6-v2')
    bem = hub.load('https://www.kaggle.com/models/google/bert/TensorFlow2/answer-equivalence-bem/1')
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Initialize scores
    context_matching_scores = []
    context_precision_scores = []
    answer_correctness_scores = []
    page_corectness_scores = []

    for index, row in merged.iterrows():
        try:
            if not row["ground_truth_Answer"]:
                if not row["Answer"]:
                    context_matching_scores.append(1)
                    context_precision_scores.append(1)
                    answer_correctness_scores.append(1)
                    page_corectness_scores.append(1)
                else:
                    context_matching_scores.append(0)
                    context_precision_scores.append(0)
                    answer_correctness_scores.append(0)
                    page_corectness_scores.append(0)
                    continue

            Context = row["Context"][:200]
            q_emb = model.encode(row["Question"])
            ctx_emb = model.encode(Context)
            sec_emb = model.encode(row["Sections"])
            groundsec_emb = model.encode(row["ground_truth_Sections"])
            context_matching_scores.append(cosine_sim(q_emb, ctx_emb) + cosine_sim(sec_emb, groundsec_emb))

            Question = row["Question"]
            ground_truth_Context = row["ground_truth_Context"]
            ground_truth_Answer = row["ground_truth_Answer"]
            Answer = row["Answer"]

            precision_score = await calculate_context_precision(Question, ground_truth_Context, Context)            
            context_precision_scores.append(precision_score)

            input_text = f"[CLS] {Question} [SEP] {ground_truth_Answer} [SEP] {Answer} [SEP]"
            # Tokenize input
            encoded = tokenizer(
                input_text,
                return_tensors="tf",
                padding="max_length",
                truncation=True,
                max_length=512,
            )
            # Prepare input dict with correct types
            inputs = {
                "input_ids": tf.cast(encoded["input_ids"], tf.int64),
                "segment_ids": tf.cast(encoded["token_type_ids"], tf.int64)
            }
            # Run model
            raw_outputs = bem(inputs)
            # Convert logits to probabilities using softmax
            probabilities = scipy.special.softmax(raw_outputs.numpy(), axis=1)
            # BERT Answer Equivalence Score (Probability of class 1)
            equivalence_score = probabilities[0][1]
            answer_correctness_scores.append(equivalence_score)

            # Extract ground truth pages and submission pages
            ground_truth_pages = set(row["ground_truth_Pages"].split(',')) if isinstance(row["ground_truth_Pages"], str) else {row["ground_truth_Pages"]}
            submission_pages = set(row["Pages"].split(',')) if isinstance(row["Pages"], str) else {row["Pages"]}
            # Calculate the number of correctly given pages
            correct_pages = ground_truth_pages.intersection(submission_pages)
            total_pages = len(ground_truth_pages)
            # Calculate the proportion of correctly given pages
            if total_pages > 0:
                page_proportion = len(correct_pages) / total_pages
            else:
                page_proportion = 0
            # Append the proportion to the scores list
            page_corectness_scores.append(page_proportion)
            print(f"Row {index} evaluated")

        except Exception as e:
            raise ParticipantVisibleError(f"Embedding computation failed for a row: {e}")

    # Convert to numpy arrays and clip values
    context_matching_scores = np.clip(np.array(context_matching_scores), 0, 1)
    context_precision_scores = np.clip(np.array(context_precision_scores), 0, 1)
    answer_correctness_scores = np.clip(np.array(answer_correctness_scores), 0, 1)
    page_corectness_scores = np.clip(np.array(page_corectness_scores), 0, 1)  # Corrected this line
    
    # Compute weighted average
    final_scores = (
        0.2 * context_matching_scores +
        0.3 * context_precision_scores +
        0.4 * answer_correctness_scores +
        0.1 * page_corectness_scores
    )

    # Add total score to submission DataFrame
    Total_Score = final_scores.mean() * 90
    print(f"Total score is {Total_Score} out of 90")

    return context_matching_scores, context_precision_scores, answer_correctness_scores, page_corectness_scores, final_scores, Total_Score

In [119]:
solution = pd.read_csv("Solution.csv")
solution

Unnamed: 0,Query ID,Question,ground_truth_Context,ground_truth_Answer,ground_truth_Sections,ground_truth_Pages
0,6,Why did the English begin to focus more on Sri...,The English began to pay more attention to Sri...,The English began to focus on Sri Lanka due to...,2.2 The British focusing their Attention on Sr...,20
1,7,What administrative practices did the British ...,When the English East India Trade Company gain...,The British East India Trade Company governed ...,Governance of the Coastal Areas under the East...,24
2,8,What was the significance of the establishment...,A landmark in Buddhist education field was the...,"The Parama Dhamma Chethiya Pirivena, founded b...",3.2 Buddhist Renaissance,43
3,9,What were the major contributions of Arumuga N...,There was a religious and a cultural renaissan...,Arumuga Navalar was a key leader of the Hindu ...,3.3 Hindu Religious Renaissance,48
4,10,What were the main reasons that led the upcoun...,Although the people of the upcountry could esc...,The people of the upcountry rose against the B...,2.4. Protests against Foreign Domination,32


In [120]:
submissions_folder = "submissions"
for filename in os.listdir(submissions_folder):
    if filename.endswith(".csv"):
        submission_path = os.path.join(submissions_folder, filename)
        submission = pd.read_csv(submission_path)
        print(f"Evaluating file: {filename}")
        context_matching_scores,context_precision_scores,answer_correctness_scores,page_corectness_scores,final_scores,Total_Score  = await score(solution, submission, "Query ID")
        submission['Context Matching Score'] = context_matching_scores
        submission['Context Precision Score'] = context_precision_scores
        submission['Answer Correctness Score'] = answer_correctness_scores
        submission['Page Corectness Score'] = page_corectness_scores
        submission['Final Score'] = final_scores
        submission.loc[0, 'Total Score'] = Total_Score
        submission.to_csv(submission_path, index=False)






Evaluating file: submission_1.csv
Row 0 evaluated
Row 1 evaluated
Row 2 evaluated
Row 3 evaluated
Row 4 evaluated
Total score is 84.41650503603825 out of 90
