# SQuAD-v1

In [None]:
import torch
import gc

# Collect unreachable Python objects first: tensors that are only kept alive by
# reference cycles still hold their GPU blocks until the collector frees them.
gc.collect()  # Garbage collect to free RAM (and drop dead tensor references)
torch.cuda.empty_cache()  # Then release the now-unused cached GPU memory


In [3]:
import torch
# Release cached, currently-unused GPU memory again (same call as in the first cell).
torch.cuda.empty_cache()

In [4]:
# Show the attached GPUs with driver/CUDA versions and current memory usage.
!nvidia-smi


Thu Dec 12 12:36:32 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P8             10W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [5]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the dataset
file_path = "/kaggle/input/eng-benchmark/SQuAD-v1.1.csv"  # Path to the dataset
NUM_SAMPLES = 50  # Number of leading rows used for this benchmark run
data = pd.read_csv(file_path)
benchmark_data = data.head(NUM_SAMPLES)

# Load the model and tokenizer
model_name = "/kaggle/input/gemma-2/transformers/gemma-2-2b/2"  # Replace with your model path if needed
device = "cuda" if torch.cuda.is_available() else "cpu"  # Fall back to CPU when no GPU is attached

print("Loading the model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model and tokenizer loaded successfully.")

# Evaluation function
def evaluate_model(row):
    """Generate the model's answer for one SQuAD row.

    Args:
        row: Mapping-like record providing the "question" and "context" fields.

    Returns:
        str: The text the model produced after the prompt, stripped of
        surrounding whitespace. The prompt itself is never part of the result.
    """
    question = row["question"]
    context = row["context"]

    # Explicit prompt to enforce answering from the context
    input_text = (
        f"Read the following context and answer the question strictly based on it, give me only one answer, do not give me A, B, C and D.\n\n"
        f"Context: {context}\n"
        f"Question: {question}\n"
        f"Answer:"
    )

    # Tokenize input
    # NOTE(review): truncation keeps the first 512 tokens, so a very long
    # context can push the question and the "Answer:" cue out of the prompt.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Generate output with controlled length
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # The returned sequence starts with the prompt tokens. Decode only what
    # comes after them: splitting the full text on "Answer:" would pick up a
    # later "Answer:" invented by the model, or return the whole prompt when
    # truncation removed the marker.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return generated_answer

# Evaluate the model: one record per benchmark row, keeping the reference
# answer next to the generated one so later cells can score them.
results = []
print("Evaluating model on the dataset...")
for _, row in benchmark_data.iterrows():
    generated_answer = evaluate_model(row)
    results.append({
        "question": row["question"],
        "context": row["context"],
        "expected_answer": row["answer"],
        "generated_answer": generated_answer
    })

# Save results to a CSV file
results_path = "squad_benchmark_results4.csv"
results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)

# Report the file that was actually written (the message used to name a different file).
print(f"Evaluation completed. Results saved to '{results_path}'.")


Loading the model and tokenizer...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model and tokenizer loaded successfully.
Evaluating model on the dataset...
Evaluation completed. Results saved to 'squad_benchmark_results.csv'.


# ROUGE & BLEU SCORE

In [None]:
!pip install rouge_score
!pip install bert_score

In [None]:
import pandas as pd
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Load original and generated data
original_data_path = "/kaggle/input/eng-benchmark/SQuAD-v1.1.csv"  # Original file path
# Must be the file written by the SQuAD generation cell ("squad_benchmark_results4.csv").
# Assumes the notebook's working directory is /kaggle/working - confirm on your setup.
results_data_path = "/kaggle/working/squad_benchmark_results4.csv"  # Your results file

# Load the generated answers first, then exactly as many reference rows.
# The answers were produced for the leading rows of the benchmark file, so
# taking the same number of leading rows keeps both lists aligned one-to-one
# instead of relying on zip() to silently drop the surplus.
generated_data = pd.read_csv(results_data_path)
original_data = pd.read_csv(original_data_path).head(len(generated_data))
assert len(generated_data) > 0, "results file is empty - nothing to score"
assert len(original_data) == len(generated_data), "results file has more rows than the reference data"

# Extract relevant columns
expected_answers = original_data['answer'].tolist()
generated_answers = generated_data['generated_answer'].tolist()

# Initialize ROUGE scorer
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores (str() also turns empty/NaN cells into text the scorer accepts)
rouge_scores = []
for expected, generated in zip(expected_answers, generated_answers):
    scores = rouge_scorer_obj.score(str(expected), str(generated))
    rouge_scores.append(scores)

# Calculate average ROUGE scores
avg_rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)
avg_rouge2 = sum(score['rouge2'].fmeasure for score in rouge_scores) / len(rouge_scores)
avg_rougeL = sum(score['rougeL'].fmeasure for score in rouge_scores) / len(rouge_scores)

print(f"Average ROUGE-1: {avg_rouge1:.2f}")
print(f"Average ROUGE-2: {avg_rouge2:.2f}")
print(f"Average ROUGE-L: {avg_rougeL:.2f}")


# BLEU Score Calculation
smooth_func = SmoothingFunction().method1  # Smoothing to handle short sentences

bleu_scores = []
for expected, generated in zip(expected_answers, generated_answers):
    reference = [str(expected).split()]  # BLEU expects a list of reference token lists
    hypothesis = str(generated).split()  # Hypothesis (generated answer) as tokens
    score = sentence_bleu(reference, hypothesis, smoothing_function=smooth_func)
    bleu_scores.append(score)

average_bleu = sum(bleu_scores) / len(bleu_scores)

print(f"Average BLEU Score: {average_bleu:.2f}")


In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_dataset

# Benchmark subset: the first 50 test questions of the MMLU "all" configuration
ds = load_dataset("cais/mmlu", "all")
test_data = ds["test"].select(range(50))

# Copy the three fields the evaluation needs into a DataFrame, one column each
benchmark_data = pd.DataFrame(
    {field: test_data[field] for field in ("question", "choices", "answer")}
)

# Placeholder context shared by every question; swap in real context if available
context = "This is a general knowledge context. Use it to answer the question accurately."

# Gemma checkpoint used for generation (replace with the correct path)
model_name = "/kaggle/input/gemma-2/transformers/gemma-2-2b/2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Evaluation function with context
def evaluate_model(row, context):
    """Ask the model one multiple-choice question and map its reply to a choice.

    Args:
        row: Mapping-like record with "question" and "choices" (the four
            option texts, in A-D order).
        context: Text placed at the top of the prompt.

    Returns:
        str: The first entry of ``choices`` whose text occurs in the model's
        continuation, or the raw continuation when no choice text is found
        (useful for debugging).
    """
    question = row["question"]
    choices = row["choices"]

    # Construct a clear input prompt with context
    input_text = (
        f"Context: {context}\n\n"
        f"Answer the following question by choosing the correct option.\n\n"
        f"Question: {question}\n"
        f"Choices:\n"
        f"A) {choices[0]}\n"
        f"B) {choices[1]}\n"
        f"C) {choices[2]}\n"
        f"D) {choices[3]}\n"
        "Answer:"
    )

    # Tokenize input and place it on the same device as the model
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(model.device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, eos_token_id=tokenizer.eos_token_id)

    # Decode only the newly generated tokens. The full sequence begins with the
    # prompt, which lists every choice, so searching it would always "find"
    # choices[0] regardless of what the model answered.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    generated_answer = None

    # Match the generated text to one of the choices
    for choice in choices:
        if choice in generated_text:
            generated_answer = choice
            break

    # If no match, return the generated continuation for debugging
    return generated_answer if generated_answer else generated_text

# Run every benchmark question through the model, pairing each generated
# answer with the text of the correct choice
results = []
for _, row in benchmark_data.iterrows():
    generated_answer = evaluate_model(row, context)
    correct_answer = row["choices"][row["answer"]]
    record = {
        "question": row["question"],
        "correct_answer": correct_answer,
        "generated_answer": generated_answer,
    }
    results.append(record)

# One DataFrame row per collected record, then persist it for the scoring cells
results_df = pd.DataFrame.from_records(results)
results_df.to_csv("gemma_mmlu_with_context_results.csv", index=False)
print("Evaluation completed. Results saved to 'gemma_mmlu_with_context_results.csv'.")


In [None]:
# Preview the first rows of the benchmark frame currently held in `benchmark_data`.
benchmark_data.head()

In [None]:
import pandas as pd
from rouge_score import rouge_scorer
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Reference side: the same first 50 MMLU test questions used for generation
ds = load_dataset("cais/mmlu", "all")
test_data = ds["test"].select(range(50))

original_data = pd.DataFrame(
    {field: test_data[field] for field in ("question", "choices", "answer")}
)

# Generated side: the answers saved by the MMLU evaluation cell
results_data_path = "/kaggle/working/gemma_mmlu_with_context_results.csv"
generated_data = pd.read_csv(results_data_path)

# The reference text is the choice selected by the integer answer index
correct_answers = [
    options[answer_index]
    for options, answer_index in zip(original_data['choices'], original_data['answer'])
]
generated_answers = generated_data['generated_answer'].tolist()

# ROUGE-1 / ROUGE-2 / ROUGE-L with stemming, one score dict per answer pair
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [
    rouge_scorer_obj.score(str(correct), str(generated))
    for correct, generated in zip(correct_answers, generated_answers)
]

# Mean F-measure per ROUGE variant
pair_count = len(rouge_scores)
avg_rouge1 = sum(entry['rouge1'].fmeasure for entry in rouge_scores) / pair_count
avg_rouge2 = sum(entry['rouge2'].fmeasure for entry in rouge_scores) / pair_count
avg_rougeL = sum(entry['rougeL'].fmeasure for entry in rouge_scores) / pair_count

print(f"Average ROUGE-1: {avg_rouge1:.2f}")
print(f"Average ROUGE-2: {avg_rouge2:.2f}")
print(f"Average ROUGE-L: {avg_rougeL:.2f}")


# Sentence-level BLEU on whitespace tokens; method1 smoothing handles short answers
smooth_func = SmoothingFunction().method1

bleu_scores = [
    sentence_bleu(
        [str(correct).split()],   # single reference, tokenized
        str(generated).split(),   # tokenized hypothesis
        smoothing_function=smooth_func,
    )
    for correct, generated in zip(correct_answers, generated_answers)
]

average_bleu = sum(bleu_scores) / len(bleu_scores)

# Print BLEU score
print(f"Average BLEU Score: {average_bleu:.2f}")


# LLM judge: ChatGPT

In [7]:
!pip install openai pandas tqdm

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openai
  Downloading openai-1.57.2-py3-none-any.whl.metadata (24 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading openai-1.57.2-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.9/389.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading jiter-0.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.0/345.0 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jiter, openai
Successfully installed jiter-0.8.2 openai-1.57.2


In [8]:
!pip install --upgrade openai


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
# Print the version of the openai package that the kernel actually imports.
import openai
print(openai.__version__)


In [None]:
# Relevance

In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# OpenAI API setup
# Read the key from the environment so it is never stored in the notebook.
# Set OPENAI_API_KEY before running; the client raises if no key is available.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load Dataset
file_path = "/kaggle/working/gemma_mmlu_with_context_results.csv"  # Replace with your dataset path
df = pd.read_csv(file_path)

# Function to generate a question from a given answer using ChatGPT
def generate_question_from_answer(answer):
    """Ask the chat model for a question that `answer` could be the answer to.

    Args:
        answer: The generated answer text. Missing values (None / NaN, which is
            how blank CSV cells are loaded) and whitespace-only text are
            treated as "no answer".

    Returns:
        str | None: The generated question, or None when there is no answer to
        work from or the API call fails (the error is printed).
    """
    # Skip the API call for missing answers; otherwise the literal text "nan"
    # would be sent to the model and scored as if it were a real answer.
    if pd.isna(answer) or not str(answer).strip():
        return None

    try:
        response = client.chat.completions.create(
            model="gpt-4",  # Use GPT model available to you
            messages=[
                {"role": "system", "content": "You are a helpful assistant generating questions."},
                {"role": "user", "content": f"Generate a question for which this could be an answer: {answer}"}
            ],
            max_tokens=50,
            temperature=0.0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: one failed row must not abort the whole batch.
        print(f"Error generating question: {e}")
        return None

# Function to generate embeddings for a text
def get_embeddings(text):
    """Return the embedding vector for `text`, or None if the request fails.

    The vector comes from the "text-embedding-3-small" model; any exception is
    printed and turned into a None result.
    """
    try:
        reply = client.embeddings.create(
            model="text-embedding-3-small",  # Embedding model
            input=text,
        )
        first_item = reply.data[0]
        return first_item.embedding
    except Exception as err:
        print(f"Error generating embeddings: {err}")
        return None

# Calculate Relevance
def calculate_relevance(question, generated_question):
    """Score how close a regenerated question is to the original question.

    Args:
        question: The original question text.
        generated_question: The question produced from the model's answer;
            may be None when question generation failed.

    Returns:
        The cosine similarity between the two questions' embeddings, or None
        when either text is missing/blank, an embedding could not be obtained,
        or an error occurs (the error is printed).
    """
    # Bail out before touching the embeddings API when there is nothing to
    # compare: otherwise a missing generated question still costs one embedding
    # call for `question` and then triggers a failing call for the other side.
    for text in (question, generated_question):
        if not isinstance(text, str) or not text.strip():
            return None

    try:
        # Generate embeddings
        question_emb = get_embeddings(question)
        generated_question_emb = get_embeddings(generated_question)

        if question_emb and generated_question_emb:
            # Calculate cosine similarity; inputs are wrapped as single-row
            # matrices, so the score is the only cell of the result
            similarity = cosine_similarity(
                [question_emb],
                [generated_question_emb]
            )[0][0]
            return similarity
        return None
    except Exception as e:
        print(f"Error calculating relevance: {e}")
        return None

# Process Dataset
tqdm.pandas()  # enables the progress_apply variants used below

# Step 1: turn every generated answer back into a question
generated_questions = df["generated_answer"].progress_apply(generate_question_from_answer)
df["generated_question"] = generated_questions

# Step 2: compare each regenerated question with the original, row by row
relevance_scores = df.progress_apply(
    lambda record: calculate_relevance(record["question"], record["generated_question"]),
    axis=1,
)
df["relevance_score"] = relevance_scores

# Save results
output_file = "relevance_results2.xlsx"
df.to_excel(output_file, index=False)
print(f"Relevance scores saved to {output_file}")

summary_columns = ["question", "generated_answer", "generated_question", "relevance_score"]
print(df[summary_columns])


# Faithfulness

In [9]:
import os
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm
import re

# OpenAI API setup
# Read the key from the environment so it is never stored in the notebook.
# Set OPENAI_API_KEY before running; the client raises if no key is available.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load Dataset
file_path = "/kaggle/working/squad_benchmark_results4.csv"  # Replace with your dataset path
df = pd.read_csv(file_path)

# Prompt template for claim extraction. It is sent as the *user* message (the
# system message is set separately in extract_claims); `{answer}` is filled in
# with str.format.
CLAIM_EXTRACTION_PROMPT = """
Extract all factual claims made in the following text. Present each claim as a concise statement:

Text: {answer}

Claims:
"""

# Prompt template for claim verification, also sent as the *user* message.
# `{claim}` and `{context}` are filled in with str.format; the model is asked
# to reply with one of "Yes", "Idk" or "No".
CLAIM_VERIFICATION_PROMPT = """
You will be given a factual claim and some context. Your task is to determine whether the claim matches the context.

For each claim:
- If the claim agrees with the context, respond "Yes".
- If the claim cannot be verified from the context, respond "Idk".
- If the claim contradicts the context, respond "No".

Claim: {claim}
Context: {context}

Answer (Yes, Idk, or No):
"""

# Function to extract claims from an answer
def extract_claims(answer):
    """Split an answer into its individual factual claims using the chat model.

    Args:
        answer: The generated answer text. Missing values (None / NaN, which is
            how blank CSV cells are loaded) and whitespace-only text yield no
            claims without calling the API.

    Returns:
        list[str]: One entry per non-empty line of the model's reply, with a
        leading list marker ("- ", "* ", "1. ", "2) ") removed. Empty when
        there is no answer or the API call fails (the error is printed).
    """
    # Nothing to extract from a missing answer; sending it anyway would make
    # the model comment on the literal text "nan".
    if pd.isna(answer) or not str(answer).strip():
        return []

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert at extracting factual claims."},
                {"role": "user", "content": CLAIM_EXTRACTION_PROMPT.format(answer=answer)}
            ],
            max_tokens=200,
            temperature=0.0
        )
        # Split extracted claims into a list, one claim per line
        claims = []
        for line in response.choices[0].message.content.strip().split("\n"):
            # Drop a leading bullet/number only when followed by whitespace, so
            # claims that start with a figure such as "1.5 million" stay intact.
            claim = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s+", "", line).strip()
            if claim:
                claims.append(claim)
        return claims
    except Exception as e:
        # Best effort: a failed extraction is reported and treated as "no claims".
        print(f"Error extracting claims: {e}")
        return []

# Function to verify claims against context
def verify_claim(claim, context):
    """Ask the chat model whether `claim` is supported by `context`.

    Returns the model's reply with surrounding whitespace removed (the prompt
    asks for "Yes", "Idk" or "No"). If anything goes wrong the error is
    printed and "Idk" is returned.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are a fact-checking assistant."},
            {"role": "user", "content": CLAIM_VERIFICATION_PROMPT.format(claim=claim, context=context)},
        ]
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
            max_tokens=10,
            temperature=0.0,
        )
        reply_text = completion.choices[0].message.content
        return reply_text.strip()
    except Exception as err:
        print(f"Error verifying claim: {err}")
        return "Idk"

# Function to calculate faithfulness score
def calculate_faithfulness(answer, context):
    """Compute the share of an answer's claims that the context does not contradict.

    Args:
        answer: The generated answer text.
        context: The passage the answer is supposed to be based on.

    Returns:
        A value between 0 and 1: the number of claims judged "Yes" or "Idk"
        divided by the number of extracted claims. Returns 0 when no claims
        were extracted or an error occurs (the error is printed).
    """
    try:
        # Step 1: Extract claims
        claims = extract_claims(answer)
        if not claims:
            return 0  # No claims found, faithfulness is 0

        # Step 2: Verify claims
        valid_claims = 0
        for claim in claims:
            result = verify_claim(claim, context)
            # Compare on the first word, case-insensitively and without
            # punctuation, so replies such as "Yes.", "yes" or "**Yes**" are
            # recognised instead of being counted as a contradiction.
            words = re.findall(r"[a-z]+", str(result).lower())
            verdict = words[0] if words else ""
            # NOTE(review): "Idk" (unverifiable, and also the fallback returned
            # when verification fails) is counted as faithful - confirm this
            # is the intended definition of the metric.
            if verdict in ("yes", "idk"):
                valid_claims += 1

        # Step 3: Calculate faithfulness score
        faithfulness_score = valid_claims / len(claims)
        return faithfulness_score
    except Exception as e:
        print(f"Error calculating faithfulness: {e}")
        return 0

# Process Dataset
tqdm.pandas()  # enables the progress_apply variant used below

# Score every row: generated answer checked against that row's own context
faithfulness_scores = df.progress_apply(
    lambda record: calculate_faithfulness(record["generated_answer"], record["context"]),
    axis=1,
)
df["faithfulness_score"] = faithfulness_scores

# Save results
output_file = "squad_faithfulness_results4.xlsx"
df.to_excel(output_file, index=False)
print(f"Faithfulness scores saved to {output_file}")

summary_columns = ["question", "generated_answer", "faithfulness_score"]
print(df[summary_columns])


  0%|          | 0/50 [00:00<?, ?it/s]

Faithfulness scores saved to squad_faithfulness_results4.xlsx
                                             question  \
0   To whom did the Virgin Mary allegedly appear i...   
1   What is in front of the Notre Dame Main Building?   
2   The Basilica of the Sacred heart at Notre Dame...   
3                   What is the Grotto at Notre Dame?   
4   What sits on top of the Main Building at Notre...   
5   When did the Scholastic Magazine of Notre dame...   
6    How often is Notre Dame's the Juggler published?   
7   What is the daily student paper at Notre Dame ...   
8   How many student news papers are found at Notr...   
9   In what year did the student paper Common Sens...   
10  Where is the headquarters of the Congregation ...   
11  What is the primary seminary of the Congregati...   
12        What is the oldest structure at Notre Dame?   
13  What individuals live at Fatima House at Notre...   
14         Which prize did Frederick Buechner create?   
15  How many BS level degr