In [1]:
import os
import nltk
import logging
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
import google.generativeai as genai
from google.oauth2 import service_account
from pydantic import BaseModel, Field
from rouge_score import rouge_scorer
from collections import Counter
import re
import sacrebleu
from collections import Counter
from docx import Document as DocxDocument
from tqdm.notebook import tqdm
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from bert_score import score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Download required NLTK resources
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# csv_path = "data/answers_rag1.csv"
# csv_path = "data/answers_rag2.csv"
csv_path = "data/answers_rag3.csv"

rag_number = re.findall(r'\d+', csv_path)[0]

In [4]:
try:
    # Load the CSV file with a specified encoding
    data = pd.read_csv(csv_path, encoding="utf-8")
except UnicodeDecodeError:
    # If utf-8 fails, try a different encoding
    data = pd.read_csv(csv_path, encoding="ISO-8859-1")

In [5]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english")
                 )
# Define text preprocessing function with lemmatization
def preprocess_text(text):
    # 1. Strip whitespace
    text = text.strip()

    # 2. Remove special characters and numbers (optional, depending on requirements)
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # 3. Tokenize the text
    tokens = nltk.word_tokenize(text)

    # 4. Remove stopwords and apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tqdm(tokens, desc="Processing tokens") if token.lower() not in stop_words]

    # 5. Join the tokens back into a string
    preprocessed_text = " ".join(lemmatized_tokens)

    return preprocessed_text

def evaluate_rouge(predicted, reference):
    if predicted is None:
        predicted = ""
    if reference is None:
        reference = ""
    scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
    return scorer.score(reference, predicted)


def evaluate_bleu(predicted, reference):
    if predicted is None:
        predicted = ""
    if reference is None:
        reference = ""
    bleu = sacrebleu.corpus_bleu([predicted], [[reference]])
    return bleu.score


def evaluate_f1(predicted, reference):
    if predicted is None:
        predicted = ""
    if reference is None:
        reference = ""
    predicted_tokens = nltk.word_tokenize(preprocess_text(predicted))
    reference_tokens = nltk.word_tokenize(preprocess_text(reference))
    predicted_counts = Counter(predicted_tokens)
    reference_counts = Counter(reference_tokens)

    common_tokens = sum((predicted_counts & reference_counts).values())

    precision = common_tokens / sum(predicted_counts.values()) if predicted_counts else 0
    recall = common_tokens / sum(reference_counts.values()) if reference_counts else 0

    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def evaluate_bert_score(predicted, reference):
    if predicted is None:
        predicted = ""
    if reference is None:
        reference = ""
    P, R, F1 = score([predicted], [reference], lang="en", verbose=False)
    return F1.mean().item()


# Initialize SentenceTransformer for embedding similarity
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

def evaluate_embedding_similarity(predicted, reference):
    if predicted is None:
        predicted = ""
    if reference is None:
        reference = ""
    predicted_embedding = sentence_model.encode(predicted)
    reference_embedding = sentence_model.encode(reference)
    similarity = cosine_similarity([predicted_embedding], [reference_embedding])[0][0]
    return similarity

In [6]:
# Initialize lists to hold metric scores
rouge_scores, bleu_scores, f1_scores, bert_scores, embedding_similarities = [], [], [], [], []

In [7]:
# First pass to collect metric values and determine min and max for normalization
for index, row in tqdm(data.iterrows(), total=len(data), desc="Processing questions"):
    question = row["Question"]
    reference_answer = row["Answer"]
    predicted_answer = row[f"RAG {rag_number}"]

    # Evaluate the metrics
    rouge = evaluate_rouge(predicted_answer, reference_answer)
    bleu = evaluate_bleu(predicted_answer, reference_answer)
    f1 = evaluate_f1(predicted_answer, reference_answer)
    bert_score = evaluate_bert_score(predicted_answer, reference_answer)
    embedding_similarity = evaluate_embedding_similarity(predicted_answer, reference_answer)

    # Append metric values to respective lists
    rouge_scores.append(rouge["rougeL"].fmeasure)
    bleu_scores.append(bleu)
    f1_scores.append(f1)
    bert_scores.append(bert_score)
    embedding_similarities.append(embedding_similarity)

Processing questions:   0%|          | 0/80 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/39 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/57 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Processing tokens:   0%|          | 0/62 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/156 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/59 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/43 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/45 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/67 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/119 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/80 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/60 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/214 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/276 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/88 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/27 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/80 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/77 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/327 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/88 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/42 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/23 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/426 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/73 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/43 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/83 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/22 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/85 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/34 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/78 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/41 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/38 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/29 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/43 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/57 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/29 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/35 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/51 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/21 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/75 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/48 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/84 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/34 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/84 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/44 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/129 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/28 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/56 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/39 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/43 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/41 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/69 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/40 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/66 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/35 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/48 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/37 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/86 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/39 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/67 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/227 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/491 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/81 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/133 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/74 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/37 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/45 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/35 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/62 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/174 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/99 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/16 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/144 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/246 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/57 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/80 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/66 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/54 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/62 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/40 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/57 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/37 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/55 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/61 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/35 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/65 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/11 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/45 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/42 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/39 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/9 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/260 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/81 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/86 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/32 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing tokens:   0%|          | 0/64 [00:00<?, ?it/s]

Processing tokens:   0%|          | 0/36 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Perform min-max normalization for each metric
def min_max_normalize(values):
    min_val, max_val = min(values), max(values)
    return [(val - min_val) / (max_val - min_val) if max_val != min_val else 0.5 for val in values]


rouge_scores_normalized = min_max_normalize(rouge_scores)
bleu_scores_normalized = min_max_normalize(bleu_scores)
f1_scores_normalized = min_max_normalize(f1_scores)
bert_scores_normalized = min_max_normalize(bert_scores)
embedding_similarities_normalized = min_max_normalize(embedding_similarities)

In [9]:
hybrid_scores = [
    (0.2 * rouge) + (0.2 * bleu) + (0.2 * f1) + (0.2 * bert) + (0.2 * embedding)
    for rouge, bleu, f1, bert, embedding in zip(
        rouge_scores_normalized, bleu_scores_normalized, f1_scores_normalized, bert_scores_normalized, embedding_similarities_normalized
    )
]

In [10]:
# Add the hybrid score to the DataFrame
data["Hybrid Score"] = hybrid_scores

# Print each result
for index in range(len(data)):
    print(f"[Processing question {index + 1}]:\n")
    print(f"ROUGE-L: {rouge_scores[index]}")
    print(f"BLEU: {bleu_scores[index]}")
    print(f"F1: {f1_scores[index]}")
    print(f"BERT Score: {bert_scores[index]}")
    print(f"Embedding Similarity: {embedding_similarities[index]}")
    print(f"Hybrid Score: {hybrid_scores[index]}")
    print("-" * 80)

[Processing question 1]:

ROUGE-L: 0.18749999999999997
BLEU: 4.733388305843651
F1: 0.26666666666666666
BERT Score: 0.8736918568611145
Embedding Similarity: 0.7417100071907043
Hybrid Score: 0.3304803442071542
--------------------------------------------------------------------------------
[Processing question 2]:

ROUGE-L: 0.2566371681415929
BLEU: 2.932606721642658
F1: 0.2782608695652174
BERT Score: 0.8739281296730042
Embedding Similarity: 0.64577317237854
Hybrid Score: 0.3241879730781364
--------------------------------------------------------------------------------
[Processing question 3]:

ROUGE-L: 0.1923076923076923
BLEU: 1.9917842709544666
F1: 0.16129032258064516
BERT Score: 0.8658549785614014
Embedding Similarity: 0.6244524717330933
Hybrid Score: 0.2742361633350776
--------------------------------------------------------------------------------
[Processing question 4]:

ROUGE-L: 0.06896551724137931
BLEU: 0.047246800770359
F1: 0.0
BERT Score: 0.8146019577980042
Embedding Similarit

In [11]:
# Save the evaluation metrics to a CSV file
evaluation_metrics = {
    "ROUGE-L": rouge_scores,
    "BLEU": bleu_scores,
    "F1": f1_scores,
    "BERT Score": bert_scores,
    "Embedding Similarity": embedding_similarities,
    "Hybrid Score": hybrid_scores
}
evaluation_df = pd.DataFrame(evaluation_metrics)

evaluation_df.to_csv(f"data/rag{rag_number}_evaluation_metrics.csv", index=False)

# Print evaluation summary
print("=== Evaluation Summary ===")
print(f"Average ROUGE-L: {sum(rouge_scores) / len(rouge_scores):.4f}")
print(f"Average BLEU: {sum(bleu_scores) / len(bleu_scores):.2f}")
print(f"Average F1: {sum(f1_scores) / len(f1_scores):.4f}")
print(f"Average BERT Score: {sum(bert_scores) / len(bert_scores):.4f}")
print(f"Average Embedding Similarity: {sum(embedding_similarities) / len(embedding_similarities):.4f}")
print(f"Average Hybrid Score: {sum(hybrid_scores) / len(hybrid_scores):.4f}")

=== Evaluation Summary ===
Average ROUGE-L: 0.3584
Average BLEU: 25.73
Average F1: 0.3599
Average BERT Score: 0.8896
Average Embedding Similarity: 0.6372
Average Hybrid Score: 0.4195
