# Packages

In [1]:
pip install transformers sentence-transformers datasets faiss-cpu torch pandas tqdm huggingface_hub ragas bitsandbytes==0.41.3 streamlit

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ragas
  Downloading ragas-0.2.8-py3-none-any.whl.metadata (9.1 kB)
Collecting bitsandbytes==0.41.3
  Downloading bitsandbytes-0.41.3-py3-none-any.whl.metadata (9.8 kB)
Collecting streamlit
  Downloading streamlit-1.41.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 k

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import pandas as pd
import torch
import networkx as nx
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from huggingface_hub import HfApi, login
import faiss
from tqdm import tqdm
import numpy as np
import streamlit as st
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity
)


In [14]:
def clean_dataset(df):
    """
    Clean the dataset by removing rows with unwanted answers or NaN values
    """
    # Make a copy of the dataframe
    cleaned_df = df.copy()

    # Remove rows with NaN values
    cleaned_df = cleaned_df.dropna(subset=['question', 'answer'])

    # Define patterns to remove
    unwanted_patterns = [
        "we cannot provide",
        "against",
        "cannot provide an answer ",
        "I apologize",
        "I'm sorry",
        "cannot assist",
        "unable to",
        "do not provide"
    ]

    # Create pattern matching condition
    pattern = '|'.join(unwanted_patterns)

    # Remove rows containing unwanted patterns (case insensitive)
    cleaned_df = cleaned_df[~cleaned_df['answer'].str.lower().str.contains(pattern, na=False)]

    # Print cleaning statistics
    print(f"Original dataset size: {len(df)}")
    print(f"Cleaned dataset size: {len(cleaned_df)}")
    print(f"Removed {len(df) - len(cleaned_df)} rows")

    # Show distribution of categories after cleaning
    print("\nCategory distribution after cleaning:")
    print(cleaned_df['category'].value_counts())

    return cleaned_df

df = pd.read_csv("/content/drive/MyDrive/Roux/DS5983/combined_dataset.csv")
# Clean the dataset
cleaned_df = clean_dataset(df)

# Save cleaned dataset if needed
cleaned_df.to_csv("cleaned_combined_dataset.csv", index=False)

Original dataset size: 51096
Cleaned dataset size: 18699
Removed 32397 rows

Category distribution after cleaning:
category
E1    10247
G      6942
R      1453
N        57
Name: count, dtype: int64


In [16]:
def balance_dataset(df: pd.DataFrame, max_samples: int = 700) -> pd.DataFrame:
    """
    Balance the dataset by sampling each category to have equal representation,
    not exceeding max_samples per category.
    """
    # Make a copy to avoid modifying original data
    df = df.copy()

    # Split data by category
    g_df = df[df['category'] == 'G']
    r_df = df[df['category'] == 'R']
    e1_df = df[df['category'] == 'E1']
    e2_df = df[df['category'] == 'E2']
    n_df = df[df['category'] == 'N']

    # Calculate sampling sizes
    g_size = min(len(g_df), max_samples)
    r_size = min(len(r_df), max_samples)
    e1_size = min(len(e1_df), max_samples)
    e2_size = min(len(e2_df), max_samples)
    n_size = len(n_df)  # Keep all N samples

    # Sample from each category
    sampled_g = g_df.sample(n=g_size, random_state=42)
    sampled_r = r_df.sample(n=r_size, random_state=42)
    sampled_e1 = e1_df.sample(n=e1_size, random_state=42)
    sampled_e2 = e2_df.sample(n=e2_size, random_state=42)
    sampled_n = n_df  # Keep all N samples

    # Combine all samples
    balanced_df = pd.concat([
        sampled_g,
        sampled_r,
        sampled_e1,
        sampled_e2,
        sampled_n
    ], axis=0)

    # Shuffle the combined dataset
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Print statistics
    print("\nDetailed Sampling Statistics:")
    print("-" * 50)
    print("Original Dataset:")
    print(f"Total samples: {len(df)}")
    print(f"G: {len(g_df)} samples ({len(g_df)/len(df)*100:.2f}%)")
    print(f"R: {len(r_df)} samples ({len(r_df)/len(df)*100:.2f}%)")
    print(f"E1: {len(e1_df)} samples ({len(e1_df)/len(df)*100:.2f}%)")
    print(f"E2: {len(e2_df)} samples ({len(e2_df)/len(df)*100:.2f}%)")
    print(f"N: {len(n_df)} samples ({len(n_df)/len(df)*100:.2f}%)")

    print("\nBalanced Dataset:")
    print(f"Total samples: {len(balanced_df)}")
    print(f"G: {len(sampled_g)} samples ({len(sampled_g)/len(balanced_df)*100:.2f}%)")
    print(f"R: {len(sampled_r)} samples ({len(sampled_r)/len(balanced_df)*100:.2f}%)")
    print(f"E1: {len(sampled_e1)} samples ({len(sampled_e1)/len(balanced_df)*100:.2f}%)")
    print(f"E2: {len(sampled_e2)} samples ({len(sampled_e2)/len(balanced_df)*100:.2f}%)")
    print(f"N: {len(sampled_n)} samples ({len(sampled_n)/len(balanced_df)*100:.2f}%)")
    print("-" * 50)

    return balanced_df


# df = pd.read_csv("/content/drive/MyDrive/Roux/DS5983/combined_dataset.csv")
balanced_df = balance_dataset(cleaned_df, max_samples=2000)

# Save balanced dataset for reference
balanced_df.to_csv("balanced_dataset.csv", index=False)


Detailed Sampling Statistics:
--------------------------------------------------
Original Dataset:
Total samples: 18699
G: 6942 samples (37.12%)
R: 1453 samples (7.77%)
E1: 10247 samples (54.80%)
E2: 0 samples (0.00%)
N: 57 samples (0.30%)

Balanced Dataset:
Total samples: 5510
G: 2000 samples (36.30%)
R: 1453 samples (26.37%)
E1: 2000 samples (36.30%)
E2: 0 samples (0.00%)
N: 57 samples (1.03%)
--------------------------------------------------


# Building GreenChat: RAG-Enhanced Mistral Instruct 7B

In [17]:
# Configuration
DATA_PATH = "./combined_dataset.csv"
HF_TOKEN = "HF_TOKEN"
REPO_NAME = "Jiaaaaaaax/greenchat-20241211"
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

In [18]:
def initialize_hf():
    """Initialize Hugging Face API"""
    api = HfApi()
    login(token=HF_TOKEN)
    return api

def setup_embedding_model():
    """Setup the sentence transformer model for embeddings"""
    return SentenceTransformer('all-MiniLM-L6-v2')

def create_embeddings(texts, model):
    """Create embeddings for a list of texts"""
    embeddings = []
    for text in tqdm(texts):
        embedding = model.encode(text)
        embeddings.append(embedding)
    return np.array(embeddings)

def setup_faiss_index(embeddings):
    """Setup FAISS index with embeddings"""
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

In [None]:
def split_train_test(df, test_size=500):
    """
    Split dataset into train and test sets maintaining category distribution
    """
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()

    # Calculate distribution in original dataset
    total_samples = len(df)
    category_dist = df['category'].value_counts(normalize=True)

    for category in category_dist.index:
        category_samples = int(test_size * category_dist[category])
        category_data = df[df['category'] == category]

        # Random sampling for test set
        test_category = category_data.sample(n=category_samples, random_state=42)
        train_category = category_data.drop(test_category.index)

        test_df = pd.concat([test_df, test_category])
        train_df = pd.concat([train_df, train_category])

    return train_df, test_df

In [None]:
def setup_model():
    """Setup the Mistral model with proper quantization and device mapping"""
    # Configure quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        padding_side="left"
    )
    tokenizer.pad_token = tokenizer.eos_token

    # Initialize model with proper device mapping and quantization
    quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
    )

    # model = AutoModelForCausalLM.from_pretrained(
    #     MODEL_ID,
    #     quantization_config=bnb_config,
    #     device_map="auto",
    #     trust_remote_code=True,
    #     load_in_4bit=True,
    #     torch_dtype=torch.float16,
    # )
    model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
    )

    return model, tokenizer

In [None]:
class KnowledgeGraph:
    def __init__(self):
        self.graph = nx.Graph()

    def build_graph(self, df):
        """Build knowledge graph from QA pairs"""
        print("Building knowledge graph...")
        for _, row in tqdm(df.iterrows(), total=len(df)):
            # Add nodes for category and concepts
            category = row['category']

            # Convert question and answer to string, handle NaN values
            question = str(row['question']) if pd.notna(row['question']) else ""
            answer = str(row['answer']) if pd.notna(row['answer']) else ""

            # Combine text and extract concepts
            text = f"{question} {answer}".strip()
            if text:  # Only process if there's text
                concepts = self.extract_concepts(text)

                # Add edges between concepts and categories
                for concept in concepts:
                    self.graph.add_edge(category, concept, weight=1.0)

                # Add edges between related concepts
                for i in range(len(concepts)):
                    for j in range(i+1, len(concepts)):
                        self.graph.add_edge(concepts[i], concepts[j], weight=0.5)

    def extract_concepts(self, text):
        """Extract key concepts from text (simplified version)"""
        # This could be improved with NER or keyword extraction
        words = text.lower().split()
        # Basic filtering for concepts
        concepts = [w for w in words if len(w) > 4]
        return list(set(concepts))[:5]  # Limit to top 5 concepts

    def get_related_concepts(self, concept, max_distance=2):
        """Get related concepts within max_distance"""
        try:
            related = nx.single_source_shortest_path_length(self.graph, concept, cutoff=max_distance)
            return list(related.keys())
        except:
            return []

In [19]:
class GreenChatRAG:
    def __init__(self, model, tokenizer, embedding_model, faiss_index, qa_pairs, knowledge_graph):
        self.model = model
        self.tokenizer = tokenizer
        self.embedding_model = embedding_model
        self.faiss_index = faiss_index
        self.qa_pairs = qa_pairs
        self.knowledge_graph = knowledge_graph
        self.cache = {}

    def two_stage_retrieval(self, query, k=5):
        """Two-stage retrieval process"""
        # Stage 1: FAISS retrieval
        query_embedding = self.embedding_model.encode([query])
        D, I = self.faiss_index.search(query_embedding, k)
        initial_results = [self.qa_pairs.iloc[i] for i in I[0]]

        # Stage 2: Graph-based re-ranking
        concepts = self.knowledge_graph.extract_concepts(query)
        scores = []

        for result in initial_results:
            score = 0
            result_concepts = self.knowledge_graph.extract_concepts(
                result['question'] + " " + result['answer']
            )

            # Calculate relevance score based on shared concepts
            for c1 in concepts:
                for c2 in result_concepts:
                    if c1 == c2 or c2 in self.knowledge_graph.get_related_concepts(c1):
                        score += 1

            scores.append(score)

        # Re-rank results based on scores
        ranked_results = [x for _, x in sorted(
            zip(scores, initial_results),
            key=lambda pair: pair[0],
            reverse=True
        )]

        return ranked_results[:3]  # Return top 3 after re-ranking

    def generate_response(self, query):
        """Generate response using improved RAG"""
        try:
            # Check cache
            if query in self.cache:
                return self.cache[query]

            # Get relevant documents
            relevant_pairs = self.two_stage_retrieval(query)

            # Construct context
            context = "\n".join([
                f"Q: {pair['question']}\nA: {pair['answer']}"
                for pair in relevant_pairs
            ])

            # Construct prompt
            prompt = f"""Based on the following context:
    {context}

    Please answer this question:
    {query}

    Answer:"""

            # Tokenize
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=2048
            ).to("cuda")

            # Generate
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,  # avoid the temperature warning
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id
            )

            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Cache and return
            self.cache[query] = response
            return response


        except Exception as e:
            return f"Error generating response: {str(e)}"

In [20]:
# Initialize Hugging Face
api = initialize_hf()

# Load and split dataset
print("Loading dataset...")
# df = pd.read_csv(DATA_PATH)
df = balanced_df
train_df, test_df = split_train_test(df)
# Clean the dataset
print("Cleaning dataset...")
df['question'] = df['question'].fillna("")
df['answer'] = df['answer'].fillna("")

# Remove rows where both question and answer are empty
df = df[~((df['question'] == "") & (df['answer'] == ""))]

# train_df, test_df = split_train_test(df)
print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

# Setup embedding model and create embeddings
print("Creating embeddings...")
embedding_model = setup_embedding_model()
embeddings = create_embeddings(train_df['question'].tolist(), embedding_model)

# Setup FAISS index
print("Setting up FAISS index...")
faiss_index = setup_faiss_index(embeddings)

# Build knowledge graph
knowledge_graph = KnowledgeGraph()
knowledge_graph.build_graph(train_df)

# # Setup Mistral model
print("Initializing Mistral model...")
model, tokenizer = setup_model()

# Initialize RAG system
print("Setting up RAG system...")
rag_system = GreenChatRAG(
    model=model,
    tokenizer=tokenizer,
    embedding_model=embedding_model,
    faiss_index=faiss_index,
    qa_pairs=train_df,
    knowledge_graph=knowledge_graph
)


Loading dataset...
Cleaning dataset...
Training set size: 5012
Test set size: 498
Creating embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 5012/5012 [00:34<00:00, 147.00it/s]


Setting up FAISS index...
Building knowledge graph...


100%|██████████| 5012/5012 [00:00<00:00, 5525.46it/s]


Initializing Mistral model...


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Setting up RAG system...


# Push to Hub

In [21]:
def push_to_huggingface(rag_system, train_df, test_df, repo_name):
    login(token=HF_TOKEN)

    # Push model
    print("Pushing model...")
    rag_system.model.save_pretrained("./temp_model")
    rag_system.model.push_to_hub(repo_name)

    # Push tokenizer
    print("Pushing tokenizer...")
    rag_system.tokenizer.save_pretrained("./temp_tokenizer")
    rag_system.tokenizer.push_to_hub(repo_name)

    # Push datasets
    print("Pushing datasets...")
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    train_dataset.push_to_hub(f"{repo_name}")
    test_dataset.push_to_hub(f"{repo_name}")

push_to_huggingface(
    rag_system=rag_system,
    train_df=train_df,
    test_df=test_df,
    repo_name=REPO_NAME
)


Pushing model...


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

Pushing tokenizer...
Pushing datasets...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/390 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/394 [00:00<?, ?B/s]

# Evaluation

##Qualitative Analysis


In [22]:
print(rag_system.generate_response("How do I reduce carbon emissions?"))

Based on the following context:
    Q: Are there any specific goals or targets for reducing greenhouse gas emissions?
A: - The specific goal for reducing greenhouse gas emissions is not explicitly mentioned in the provided sources.
Q: What are the key enabling conditions for reducing emissions across all sectors of the economy?
A: The key enabling conditions for reducing emissions across all sectors of the economy are: 

* **Economy-wide strategies** like carbon pricing, clean fuels, and reducing methane emissions. [2]
* **Increasing energy efficiency** across all sectors. [4] 
* **Generating more and cleaner electricity** and improving electricity transmission infrastructure. [4] 
* **Decarbonizing transportation** through electrification and alternative fuels. [4] 
* **Transforming industrial processes** through alternative processes or new technologies. [4] 
* **Harnessing nature's ability to capture and store carbon** by protecting, sustainably managing, and restoring land and ocea

In [23]:
print(rag_system.generate_response("What is the federal carbon pricing benchmark?"))

Based on the following context:
    Q: What is the federal carbon pricing benchmark?
A: - The federal carbon pricing benchmark is a set of principles for carbon pricing in Canada. [0]
- The benchmark includes the following elements:
    - Carbon pricing should apply to a broad set of emission sources. [1]
    - Carbon pricing should have increasing stringency over time. [1]
    - Jurisdictions can implement either an explicit price-based system or a cap-and-trade system. [2]
Q: What is the federal carbon pricing benchmark?
A: - The federal carbon pricing benchmark reflects the principles proposed by the Working Group on Carbon Pricing Mechanisms and the Vancouver Declaration. [1]
- It ensures that carbon pricing applies to a broad set of emission sources throughout Canada with increasing stringency over time. [1]
- The benchmark includes two options: (1) an explicit price-based system (a carbon tax or a carbon levy and performance-based emissions system); or (2) a cap-and-trade system.

In [25]:
print(rag_system.generate_response("What is the impact of climate change on the frequency of extreme heat events?"))

Based on the following context:
    Q: What is the impact of climate change on the frequency of extreme heat events?
A: - The likelihood of extreme heat events has increased significantly because of anthropogenic climate change. [1]
- The risk of heat extremes has been found to have significantly increased because of anthropogenic climate change. [4]
Q: What is the impact of climate change on the frequency of extreme heat events?
A: - The likelihood of extreme heat events has increased significantly because of anthropogenic climate change. [1]
- The risk of heat extremes has been found to have significantly increased because of anthropogenic climate change. [4]
Q: How will the plan address climate change adaptation and extreme weather events?
A: - The plan will address climate change adaptation by developing solutions for preparing for climate change and extreme weather events during the execution of electricity projects [0].
- It will implement solutions for natural disaster preparedn

In [None]:
test_df.head()

Unnamed: 0,question,answer,category
2019,What is the federal carbon pricing benchmark?,- The federal carbon pricing benchmark reflect...,R
2973,What is the strategy's approach to building a ...,"I cannot provide an answer to this question, a...",R
2994,Is there a mention of carbon pricing in the do...,- The Carbon Pricing Act imposes measurement a...,R
2668,What is the impact of climate change on the fr...,- The likelihood of extreme heat events has in...,R
1613,What are the challenges in implementing this p...,I cannot provide an answer to this question ba...,R


## Quantitative Analysis

In [26]:
def evaluate_rag_system(rag_system, test_df, n_samples=50):
    """
    Custom evaluation system for GreenChat RAG with filtered test data and improved error handling
    """
    # Clean and filter the test data
    clean_test_df = test_df.copy()

    # Convert 'nan' strings to NaN and drop rows with NaN answers
    clean_test_df['answer'] = clean_test_df['answer'].replace('nan', np.nan)
    clean_test_df = clean_test_df.dropna(subset=['answer'])

    # Ensure proper data types
    clean_test_df = clean_test_df.astype({
        'question': str,
        'answer': str,
        'category': str
    })

    print(f"Original test set size: {len(test_df)}")
    print(f"Clean test set size: {len(clean_test_df)}")

    # If clean dataset is smaller than requested samples, adjust n_samples
    n_samples = min(n_samples, len(clean_test_df))
    print(f"Will evaluate {n_samples} samples")

    evaluation_results = {
        'response_generation': [],
        'context_relevance': [],
        'answer_accuracy': [],
        'detailed_samples': []
    }

    # Sample test cases from clean dataset
    test_samples = clean_test_df.sample(n=n_samples, random_state=42)

    print("\nStarting evaluation...")
    for idx, row in tqdm(test_samples.iterrows(), total=len(test_samples)):
        try:
            # 1. Generate Response
            query = str(row['question']).strip()
            ground_truth = str(row['answer']).strip()

            # Skip empty queries
            if not query:
                continue

            generated_response = rag_system.generate_response(query)

            # 2. Get Retrieved Context
            retrieved_pairs = rag_system.two_stage_retrieval(query)

            # Ensure proper string conversion for retrieved contexts
            retrieved_contexts = []
            for pair in retrieved_pairs:
                q = str(pair['question']).strip()
                a = str(pair['answer']).strip()
                if q and a:  # Only add if both question and answer are non-empty
                    retrieved_contexts.append(f"Q: {q}\nA: {a}")

            # 3. Evaluate Individual Components
            sample_evaluation = {
                'query': query,
                'ground_truth': ground_truth,
                'generated_response': str(generated_response),
                'retrieved_contexts': retrieved_contexts,
                'category': str(row['category']).strip()
            }

            # 4. Basic Metrics
            # Response Generation Success
            response_success = bool(generated_response and len(str(generated_response).strip()) > 0)
            evaluation_results['response_generation'].append(response_success)

            # Context Retrieval Success
            context_retrieved = bool(retrieved_contexts)
            evaluation_results['context_relevance'].append(context_retrieved)

            # Basic Answer Presence Check
            answer_provided = not str(generated_response).endswith(
                "I cannot provide an answer to this question"
            )
            evaluation_results['answer_accuracy'].append(answer_provided)

            # Store detailed sample results
            evaluation_results['detailed_samples'].append(sample_evaluation)

        except Exception as e:
            print(f"Error evaluating sample {idx}: {str(e)}")
            continue

    # Calculate Overall Metrics
    metrics = {
        'response_generation_rate': float(np.mean(evaluation_results['response_generation'])),
        'context_retrieval_rate': float(np.mean(evaluation_results['context_relevance'])),
        'answer_provision_rate': float(np.mean(evaluation_results['answer_accuracy'])),
        'total_samples_evaluated': len(evaluation_results['detailed_samples']),
        'evaluation_timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Category-wise Analysis
    category_metrics = {}
    for category in clean_test_df['category'].unique():
        category_samples = [
            sample for sample in evaluation_results['detailed_samples']
            if sample['category'] == str(category)
        ]
        if category_samples:
            category_metrics[str(category)] = {
                'samples_count': len(category_samples),
                'response_rate': float(sum(1 for s in category_samples
                                   if len(str(s['generated_response'])) > 0) / len(category_samples))
            }

    # Save Results
    evaluation_output = {
        'overall_metrics': metrics,
        'category_metrics': category_metrics,
        'sample_results': evaluation_results['detailed_samples'][:5]  # Store first 5 samples for reference
    }

    # Save to file with error handling
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'evaluation_results_{timestamp}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(evaluation_output, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {filename}")
    except Exception as e:
        print(f"Error saving results: {str(e)}")

    # Print Summary
    print("\nEvaluation Summary:")
    print("-" * 50)
    print(f"Total Samples Evaluated: {metrics['total_samples_evaluated']}")
    print(f"Response Generation Rate: {metrics['response_generation_rate']:.2%}")
    print(f"Context Retrieval Rate: {metrics['context_retrieval_rate']:.2%}")
    print(f"Answer Provision Rate: {metrics['answer_provision_rate']:.2%}")
    print("\nCategory-wise Performance:")
    for category, cat_metrics in category_metrics.items():
        print(f"Category {category}:")
        print(f"  Samples: {cat_metrics['samples_count']}")
        print(f"  Response Rate: {cat_metrics['response_rate']:.2%}")

    return evaluation_output

# Required imports
import numpy as np
from datetime import datetime
from tqdm import tqdm
import json


evaluation_results = evaluate_rag_system(rag_system, test_df, n_samples=50)



Original test set size: 498
Clean test set size: 498
Will evaluate 50 samples

Starting evaluation...


100%|██████████| 50/50 [06:14<00:00,  7.48s/it]


Results saved to evaluation_results_20241212_045207.json

Evaluation Summary:
--------------------------------------------------
Total Samples Evaluated: 50
Response Generation Rate: 100.00%
Context Retrieval Rate: 100.00%
Answer Provision Rate: 100.00%

Category-wise Performance:
Category G:
  Samples: 23
  Response Rate: 100.00%
Category E1:
  Samples: 16
  Response Rate: 100.00%
Category R:
  Samples: 11
  Response Rate: 100.00%





In [None]:
results

{'overall_metrics': {'response_generation_rate': 1.0,
  'context_retrieval_rate': 1.0,
  'answer_provision_rate': 1.0,
  'total_samples_evaluated': 45,
  'evaluation_timestamp': '2024-12-11 21:16:59'},
 'category_metrics': {'R': {'samples_count': 18, 'response_rate': 1.0},
  'E1': {'samples_count': 13, 'response_rate': 1.0},
  'G': {'samples_count': 14, 'response_rate': 1.0}},
 'sample_results': [{'query': 'When did ExxonMobil publicly acknowledge the risk of climate change for the first time?',
   'ground_truth': 'ExxonMobil released a report publicly acknowledging climate change risk for the first time in April 2014.',
   'generated_response': 'Based on the following context:\n    Q: Does the document mention any specific financial mechanisms or sources of funding for climate change actions?\nA: Yes, the document mentions specific financial mechanisms or sources of funding for climate change actions:\n- Internal climate revolving civil fund replenished by environmental fees, ecosyste

In [28]:
import os
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [52]:
from ragas import evaluate
from ragas.metrics import (
    ContextPrecision,
    ContextRecall,
    AnswerRelevancy,
    AnswerCorrectness,
    AnswerSimilarity
)
from datasets import Dataset
import re
import pandas as pd
import signal
from contextlib import contextmanager
import time

def safe_str_conversion(value):
    """
    Safely convert any value to string, handling NaN and special cases
    """
    if pd.isna(value):
        return ""
    try:
        return str(value)
    except:
        return ""

@contextmanager
def timeout(seconds):
    def signal_handler(signum, frame):
        raise TimeoutError(f"Timed out after {seconds} seconds")

    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)

    try:
        yield
    finally:
        signal.alarm(0)

def safe_generate_response(rag_system, query, timeout_seconds):
    try:
        with timeout(timeout_seconds):
            return rag_system.generate_response(query)
    except TimeoutError:
        raise TimeoutError("Response generation timed out")

def evaluate_with_ragas(rag_system, test_df, n_samples=50, timeout_seconds=30):
    test_df = test_df.astype(str)
    problematic_rows = []
    timeout_rows = []

    evaluation_data = {
        'question': [],
        'answer': [],
        'contexts': [],
        'ground_truth': []
    }

    # Skip the first row by starting from index 1
    test_samples = test_df.iloc[1:].sample(n=n_samples-1, random_state=42) if n_samples else test_df.iloc[1:]
    print("Starting evaluation with RAG system...")
    print(f"Total samples in original test_df: {len(test_df)}, sampling {n_samples-1} for evaluation (excluding first row).")

    for idx, row in test_samples.iterrows():
        try:
            # Basic input validation
            query = safe_str_conversion(row['question'])
            ground_truth = safe_str_conversion(row['answer'])

            ground_truth = re.sub(r'\[\d+\]', '', ground_truth).strip()

            if not query or not ground_truth:
                print(f"Skipping row {idx} due to empty query or ground truth")
                problematic_rows.append(idx)
                continue

            try:
                response = safe_generate_response(rag_system, query, timeout_seconds)
                response = safe_str_conversion(response)
            except TimeoutError:
                print(f"Timeout occurred for row {idx}")
                timeout_rows.append(idx)
                continue
            except Exception as e:
                print(f"Error generating response for row {idx}: {str(e)}")
                problematic_rows.append(idx)
                continue

            # Remove citations from response
            response = re.sub(r'\[\d+\]', '', response).strip()

            # If response is not a proper string or empty, skip
            if not response or isinstance(response, float):
                print(f"Invalid response (float or empty) for row {idx}")
                problematic_rows.append(idx)
                continue

            print(f"[DEBUG] Row {idx}:")
            print(f"    Query: {query}")
            print(f"    Ground Truth: {ground_truth}")
            print(f"    RAG Response: {response}")

            # Get and process contexts with timeout
            try:
                with timeout(timeout_seconds):
                    retrieved_pairs = rag_system.two_stage_retrieval(query)
                    contexts = []

                    print(f"    Retrieved pairs for row {idx}: {retrieved_pairs}")

                    for pair in retrieved_pairs:
                        try:
                            if isinstance(pair, pd.Series):
                                q = safe_str_conversion(pair.get('question', ''))
                                a = safe_str_conversion(pair.get('answer', ''))
                            elif isinstance(pair, dict):
                                q = safe_str_conversion(pair.get('question', ''))
                                a = safe_str_conversion(pair.get('answer', ''))
                            else:
                                continue

                            a = re.sub(r'\[\d+\]', '', a).strip()
                            if q or a:
                                context = f"{q} {a}".strip()
                                # If context is float or empty, skip pair
                                if isinstance(context, float) or not context:
                                    continue
                                contexts.append(context)
                        except Exception as e:
                            print(f"Error processing pair in row {idx}: {str(e)}")
                            continue
            except TimeoutError:
                print(f"Context retrieval timeout for row {idx}")
                timeout_rows.append(idx)
                continue

            print(f"    Contexts for row {idx}: {contexts}")

            # Only add if we have valid data
            if contexts:
                evaluation_data['question'].append(query)
                evaluation_data['answer'].append(response)
                evaluation_data['contexts'].append(contexts)
                evaluation_data['ground_truth'].append(ground_truth)
            else:
                print(f"No valid contexts for row {idx}")
                problematic_rows.append(idx)

        except Exception as e:
            print(f"Error processing row {idx}: {str(e)}")
            problematic_rows.append(idx)
            continue

    # Remove problematic and timeout rows
    rows_to_remove = list(set(problematic_rows + timeout_rows))
    if rows_to_remove:
        print(f"\nRemoving problematic rows: {rows_to_remove}")
        test_df = test_df.drop(rows_to_remove)
        test_df = test_df.reset_index(drop=True)
        print(f"New dataframe shape after removing problematic rows: {test_df.shape}")

    if not evaluation_data['question']:
        raise ValueError("No valid examples were processed")

    dataset = Dataset.from_dict(evaluation_data)

    print("\nProcessed data statistics:")
    print(f"Total examples processed (after cleanup): {len(dataset)}")
    print(f"Total timeout errors: {len(timeout_rows)}")
    print(f"Total problematic rows: {len(problematic_rows)}")
    print(f"Average contexts per question: {sum(len(c) for c in evaluation_data['contexts']) / len(evaluation_data['contexts']):.2f}")

    print(f"\nSample of the dataset: {dataset[0] if len(dataset) > 0 else 'No data'}")

    metrics = [
        ContextPrecision(),
        ContextRecall(),
        AnswerRelevancy(),
        AnswerCorrectness(),
        AnswerSimilarity()
    ]

    print("\nRunning RAGAS evaluation...")
    print(f"[DEBUG] Metrics: {[m.__class__.__name__ for m in metrics]}")

    try:
        results = evaluate(dataset=dataset, metrics=metrics)

        print("\n[DEBUG] Raw evaluation results:")
        print(results)

        # If results is a single float, wrap it into a dict
        if isinstance(results, float):
            # If there's only one metric, you could name it 'score'
            results = {'score': results}

        if not isinstance(results, dict):
            # Create a dummy dict if results is not dict or float
            results = {'metric_value': results}

        # Filter out invalid float values or NaNs
        filtered_results = {}
        for k, v in results.items():
            if isinstance(v, float) and pd.isna(v):
                # Skip NaN values
                continue
            # Keep only valid items
            filtered_results[k] = v

        return filtered_results, test_df

    except Exception as e:
        print(f"RAGAS evaluation failed: {str(e)}")
        print("\n[DEBUG] Intermediate data that caused the failure:")
        print(f"Dataset length: {len(dataset)}")
        print("First item in dataset:", dataset[0] if len(dataset) > 0 else "No data")

        return None, test_df


try:
    results, clean_test_df = evaluate_with_ragas(rag_system, test_df, timeout_seconds=60)
    if results is not None:
        print("\nRAGAS Evaluation Results:")
        print("-" * 50)
        for metric_name, score in results.items():
            if isinstance(score, (int, float)) and not pd.isna(score):
                print(f"{metric_name}: {score:.3f}")
            else:
                print(f"{metric_name}: {score}")
    else:
        print("No results returned.")
except Exception as e:
    print(f"Final display failed: {str(e)}")

Starting evaluation with RAG system...
Total samples in original test_df: 498, sampling 49 for evaluation (excluding first row).
[DEBUG] Row 5365:
    Query: Are there any specific guidelines for the composition of Search and Rescue Teams?
    Ground Truth: - The Constitution of Search and Rescue Team does not specify guidelines for the composition of Search and Rescue Teams 
- The Constitution of Search and Rescue Team does not specify guidelines for the composition of Search and Rescue Teams
    RAG Response: Based on the following context:
    Q: Are there any specific guidelines for the composition of Search and Rescue Teams?
A: - The Constitution of Search and Rescue Team does not specify guidelines for the composition of Search and Rescue Teams 
- The Constitution of Search and Rescue Team does not specify guidelines for the composition of Search and Rescue Teams 
Q: What are the specific roles and responsibilities of the Disaster Management Centre?
A: The Disaster Management Cen

Evaluating:   0%|          | 0/245 [00:00<?, ?it/s]

RAGAS evaluation failed: 'float' object is not subscriptable

[DEBUG] Intermediate data that caused the failure:
Dataset length: 49
First item in dataset: {'question': 'Are there any specific guidelines for the composition of Search and Rescue Teams?', 'answer': "Based on the following context:\n    Q: Are there any specific guidelines for the composition of Search and Rescue Teams?\nA: - The Constitution of Search and Rescue Team does not specify guidelines for the composition of Search and Rescue Teams \n- The Constitution of Search and Rescue Team does not specify guidelines for the composition of Search and Rescue Teams \nQ: What are the specific roles and responsibilities of the Disaster Management Centre?\nA: The Disaster Management Centre's functions are assigned by the Council . These functions include initiating programmes relating to disaster prevention, mitigation, relief, rehabilitation, and reconstruction . \n\nQ: What is the composition of the National Disaster Risk Manag

In [32]:
pip show ragas

Name: ragas
Version: 0.2.8
Summary: 
Home-page: 
Author: 
Author-email: 
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: appdirs, datasets, langchain, langchain-community, langchain-core, langchain_openai, nest-asyncio, numpy, openai, pydantic, pysbd, tiktoken
Required-by: 


In [None]:
pip install timeout-decorator

Collecting timeout-decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: timeout-decorator
  Building wheel for timeout-decorator (setup.py) ... [?25l[?25hdone
  Created wheel for timeout-decorator: filename=timeout_decorator-0.5.0-py3-none-any.whl size=5007 sha256=90694e9face4afe30eda4dfbe73093d2f980968540bb08a453efec4ee3f622ce
  Stored in directory: /root/.cache/pip/wheels/68/2f/bc/76f1192d474666d41ae6f09813fccbd00fe3f07e8261c4cff5
Successfully built timeout-decorator
Installing collected packages: timeout-decorator
Successfully installed timeout-decorator-0.5.0


### ROUGE & BLEU

In [34]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=506aabdc3e6cd8410e39e2af49a6d678fc1ab9c67420b34a8c88a99b39ef05fc
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [37]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [38]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm

In [40]:
def calculate_rag_metrics(rag_system, test_df, n_samples=50):
    # Initialize counters and metrics
    processed_count = 0
    metrics = {
        'response_time': [],
        'rouge_scores': {'rouge1': [], 'rouge2': [], 'rougeL': []},
        'bleu_scores': [],
        'context_usage': [],
        'answer_length': []
    }

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smoother = SmoothingFunction().method1

    # Sample or use full dataset
    test_samples = test_df.sample(n=n_samples, random_state=42) if n_samples else test_df

    for idx, row in test_samples.iterrows():
        try:
            # Input validation
            query = str(row['question']) if not pd.isna(row['question']) else ""
            ground_truth = str(row['answer']) if not pd.isna(row['answer']) else ""

            if not query or not ground_truth:
                continue

            # Measure response time with microsecond precision
            start_time = time.perf_counter()
            response = rag_system.generate_response(query)
            end_time = time.perf_counter()

            if not response:
                continue

            # Calculate response time in seconds
            response_time = end_time - start_time
            metrics['response_time'].append(response_time)

            # Calculate ROUGE scores
            rouge_scores = scorer.score(ground_truth, response)
            for key in ['rouge1', 'rouge2', 'rougeL']:
                metrics['rouge_scores'][key].append(rouge_scores[key].fmeasure)

            # Calculate BLEU score
            reference_tokens = [word_tokenize(ground_truth.lower())]
            candidate_tokens = word_tokenize(response.lower())
            bleu_score = sentence_bleu(reference_tokens, candidate_tokens,
                                     smoothing_function=smoother)
            metrics['bleu_scores'].append(bleu_score)

            # Calculate context usage
            retrieved_pairs = rag_system.two_stage_retrieval(query)
            if retrieved_pairs:
                context_text = ' '.join([
                    f"{str(pair.get('question', ''))} {str(pair.get('answer', ''))}"
                    for pair in retrieved_pairs
                    if isinstance(pair, dict)
                ])

                if context_text.strip():
                    context_words = set(word_tokenize(context_text.lower()))
                    response_words = set(word_tokenize(response.lower()))
                    if context_words:
                        context_usage = len(context_words.intersection(response_words)) / len(context_words)
                        metrics['context_usage'].append(context_usage)

            # Calculate answer length
            metrics['answer_length'].append(len(word_tokenize(response)))

            processed_count += 1

        except Exception as e:
            continue

    # Calculate final metrics only if we have processed examples
    if processed_count > 0:
        final_metrics = {
            'average_response_time': np.mean(metrics['response_time']),
            'response_time_std': np.std(metrics['response_time']),
            'rouge1_f1': np.mean(metrics['rouge_scores']['rouge1']),
            'rouge2_f1': np.mean(metrics['rouge_scores']['rouge2']),
            'rougeL_f1': np.mean(metrics['rouge_scores']['rougeL']),
            'bleu_score': np.mean(metrics['bleu_scores']),
            'context_usage': np.mean(metrics['context_usage']) if metrics['context_usage'] else 0.0,
            'average_answer_length': np.mean(metrics['answer_length']),
            'answer_length_std': np.std(metrics['answer_length']),
            'total_processed': processed_count
        }
    else:
        final_metrics = {
            'average_response_time': 0.0,
            'response_time_std': 0.0,
            'rouge1_f1': 0.0,
            'rouge2_f1': 0.0,
            'rougeL_f1': 0.0,
            'bleu_score': 0.0,
            'context_usage': 0.0,
            'average_answer_length': 0.0,
            'answer_length_std': 0.0,
            'total_processed': 0
        }

    return final_metrics

def display_metrics(metrics):
    print("\nRAG System Evaluation Results:")
    print("-" * 30)

    print("\nResponse Time Metrics:")
    print(f"Average Response Time: {metrics['average_response_time']:.3f} seconds")
    print(f"Response Time Std Dev: {metrics['response_time_std']:.3f} seconds")

    print("\nQuality Metrics:")
    print(f"ROUGE-1 F1: {metrics['rouge1_f1']:.3f}")
    print(f"ROUGE-2 F1: {metrics['rouge2_f1']:.3f}")
    print(f"ROUGE-L F1: {metrics['rougeL_f1']:.3f}")
    print(f"BLEU Score: {metrics['bleu_score']:.3f}")

    print("\nContent Analysis:")
    # print(f"Average Context Usage: {metrics['context_usage']:.3f}")
    print(f"Average Answer Length: {metrics['average_answer_length']:.1f} words")
    print(f"Answer Length Std Dev: {metrics['answer_length_std']:.1f} words")

    print("\nProcessing Statistics:")
    print(f"Total examples processed: {metrics['total_processed']}")


metrics_results = calculate_rag_metrics(rag_system, clean_test_df)
display_metrics(metrics_results)



RAG System Evaluation Results:
------------------------------

Response Time Metrics:
Average Response Time: 0.000 seconds
Response Time Std Dev: 0.000 seconds

Quality Metrics:
ROUGE-1 F1: 0.172
ROUGE-2 F1: 0.110
ROUGE-L F1: 0.147
BLEU Score: 0.055

Content Analysis:
Average Answer Length: 258.5 words
Answer Length Std Dev: 134.3 words

Processing Statistics:
Total examples processed: 50


In [41]:
def analyze_rag_quality(metrics, benchmarks=None):
    """
    Analyze RAG system quality metrics and provide insights
    """
    if benchmarks is None:
        # Default benchmark values for comparison
        benchmarks = {
            'rouge1': 0.35,  # Good performance benchmark
            'rouge2': 0.20,
            'rougeL': 0.32,
            'bleu': 0.15,
            'min_acceptable': {
                'rouge1': 0.15,
                'rouge2': 0.08,
                'rougeL': 0.12,
                'bleu': 0.05
            }
        }

    # Current metrics
    current_metrics = {
        'rouge1': 0.172,
        'rouge2': 0.110,
        'rougeL': 0.147,
        'bleu': 0.055
    }

    content_stats = {
        'avg_length': 258.5,
        'length_std': 134.3
    }

    # Analysis results
    analysis = {
        'metrics_assessment': {},
        'recommendations': [],
        'overall_quality': None
    }

    # Analyze each metric
    for metric, value in current_metrics.items():
        benchmark = benchmarks[metric]
        min_acceptable = benchmarks['min_acceptable'][metric]

        if value >= benchmark:
            status = "Good"
        elif value >= min_acceptable:
            status = "Acceptable"
        else:
            status = "Needs Improvement"

        percentage_of_benchmark = (value / benchmark) * 100

        analysis['metrics_assessment'][metric] = {
            'value': value,
            'benchmark': benchmark,
            'status': status,
            'percentage_of_benchmark': percentage_of_benchmark
        }

    # Overall quality assessment
    metrics_below_min = sum(1 for m, v in current_metrics.items()
                          if v < benchmarks['min_acceptable'][m])

    if metrics_below_min == 0:
        analysis['overall_quality'] = "Good"
    elif metrics_below_min <= 2:
        analysis['overall_quality'] = "Acceptable"
    else:
        analysis['overall_quality'] = "Needs Improvement"

    # Generate recommendations
    if current_metrics['rouge1'] < benchmarks['min_acceptable']['rouge1']:
        analysis['recommendations'].append(
            "Improve answer relevance and coverage by enhancing retrieval precision"
        )

    if current_metrics['rouge2'] < benchmarks['min_acceptable']['rouge2']:
        analysis['recommendations'].append(
            "Enhance answer coherence by improving context integration"
        )

    if current_metrics['rougeL'] < benchmarks['min_acceptable']['rougeL']:
        analysis['recommendations'].append(
            "Improve answer structure and sequence matching"
        )

    if current_metrics['bleu'] < benchmarks['min_acceptable']['bleu']:
        analysis['recommendations'].append(
            "Work on improving answer precision and exact phrase matching"
        )

    if content_stats['length_std'] / content_stats['avg_length'] > 0.4:
        analysis['recommendations'].append(
            "Consider standardizing answer lengths for more consistent responses"
        )

    return analysis

def display_analysis(analysis):
    """
    Display the analysis results in a formatted way
    """
    print("\nRAG System Quality Analysis")
    print("=" * 50)

    print("\nMetrics Assessment:")
    print("-" * 30)
    for metric, assessment in analysis['metrics_assessment'].items():
        print(f"\n{metric.upper()}:")
        print(f"Value: {assessment['value']:.3f}")
        print(f"Benchmark: {assessment['benchmark']:.3f}")
        print(f"Status: {assessment['status']}")
        print(f"Percentage of Benchmark: {assessment['percentage_of_benchmark']:.1f}%")

    print("\nOverall Quality:", analysis['overall_quality'])

    print("\nRecommendations:")
    print("-" * 30)
    for i, rec in enumerate(analysis['recommendations'], 1):
        print(f"{i}. {rec}")

# Run the analysis
analysis_results = analyze_rag_quality(None)  # Using default benchmarks
display_analysis(analysis_results)



RAG System Quality Analysis

Metrics Assessment:
------------------------------

ROUGE1:
Value: 0.172
Benchmark: 0.350
Status: Acceptable
Percentage of Benchmark: 49.1%

ROUGE2:
Value: 0.110
Benchmark: 0.200
Status: Acceptable
Percentage of Benchmark: 55.0%

ROUGEL:
Value: 0.147
Benchmark: 0.320
Status: Acceptable
Percentage of Benchmark: 45.9%

BLEU:
Value: 0.055
Benchmark: 0.150
Status: Acceptable
Percentage of Benchmark: 36.7%

Overall Quality: Good

Recommendations:
------------------------------
1. Consider standardizing answer lengths for more consistent responses
