# Evaluation - Advanced RAG

##### Advanced RAG - Fusion, HyDE, Multi-query, Stepback hepsi tek hücrede

In [27]:
# Import necessary libraries
import os
import time
import pandas as pd
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
import prompts as prompts
import initials as initials
import indexing
import evaluation
import chromadb

VECTORSTORE_WEIGHT = 0.5

test_directory = '/Users/taha/Desktop/rag/test_data_routing/Mobilfunk'
# Define input CSV path
input_csv_path = '/Users/taha/Desktop/rag/test_data_routing/Mobilfunk/_testset_advanced_routing_semantic.csv'  # Input CSV file path

# Define output CSV path including the filename
output_csv_path = 'test_data_routing/Mobilfunk/_evaluation_advanced_routing_densex_semantic_fusion.csv'  # Output file will be created here

# Function to create the output CSV file at the beginning
def initialize_output_csv(output_path):
    # Directly create the file with the correct header
    with open(output_path, 'w') as file:
        header = (
            "Question,Response,Contexts,Ground Truth,"
            "Token Count,Total Cost (USD),Completion Tokens,Number of Retrieved documents,"
            "Response time,answer_relevancy,context_precision,"
            "context_recall,faithfulness,BleuScore,RougeScore\n"
        )
        file.write(header)
    print(f"Created output file at: {output_path}")

# Function to get response with error handling
def get_response(user_input):
    try:
        # Dosyaları listele
        all_txt_files = [file for file in glob.glob(os.path.join(test_directory, "*.txt")) if not file.endswith("_summary.txt")]
        question_history = []
        # Seçilen dosyaların içeriklerini oku ve birleştir
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())
 
        #==========================   Chunking method   =============================
        # Character splitting without separator
        text_splitter = CharacterTextSplitter(
            separator='',
            chunk_size=500,
            chunk_overlap=50,
        )
        
        # Semantic Splitting
        text_splitter_semantic = SemanticChunker(initials.embedding)
        
        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")

        # Vector store
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)

        #DenseX vectors store
        densex_vectorstore = indexing.generate_vector_store_with_chunking(user_input, initials.model, test_directory, initials.embedding, )

        print("==========   VECTORSTORE CREATED  ==========")
        
        densex_retriever = densex_vectorstore.as_retriever()
        #retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.8, 'k': 5}) #add cos similarity filter, and k documents
        
        #==========================   HyDE   =============================
        '''
        retriever = vectorstore.as_retriever()

        hyde_docs = (prompts.prompt_hyde | initials.model | StrOutputParser())
        retrieval_chain_hyde = hyde_docs | retriever 
        retrieved_docs = retrieval_chain_hyde.invoke({"question": user_input, "question_history": question_history})
    
        hyde_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = hyde_rag_chain.invoke({
                "context": retrieved_docs, 
                "question": user_input,
                "chat_history": []
            }) if retrieved_docs else "No relevant documents found."
        
        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, retrieved_docs, total_cost, total_tokens, completion_tokens

        '''

        #==========================   HyDE - Hybrid  =============================
        '''
        retriever = vectorstore.as_retriever()

        keyword_retriever = BM25Retriever.from_documents(chunks)
        hybrid_retriever = EnsembleRetriever(retrievers=[keyword_retriever, retriever], weights=[1-VECTORSTORE_WEIGHT, VECTORSTORE_WEIGHT])
        print("==================CONTROL=================")
        hyde_docs = (prompts.prompt_hyde | initials.model | StrOutputParser())
        retrieval_chain_hyde = hyde_docs | hybrid_retriever 
        retrieved_docs = retrieval_chain_hyde.invoke({"question": user_input, "question_history": question_history})
    
        hyde_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = hyde_rag_chain.invoke({
                "context": retrieved_docs, 
                "question": user_input,
                "chat_history": []
            }) if retrieved_docs else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens
 
        return response, retrieved_docs, total_cost, total_tokens, completion_tokens
        '''
        
        #==========================   RAG Fusion   =============================
        #'''
        # Generate multiple queries using the multi_query_prompt and model
        #retriever = vectorstore.as_retriever()

        generate_multi_queries = (
            prompts.multi_query_prompt 
            | initials.model 
            | StrOutputParser() 
            | (lambda x: x.split("\n"))
        )

        retrieval_chain_rag_fusion = generate_multi_queries | densex_retriever.map() | initials.reciprocal_rank_fusion

        fusion_docs = retrieval_chain_rag_fusion.invoke({"question": user_input, "question_history": question_history})
        document_list = [doc[0] for doc in fusion_docs]

        fusion_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())
        
        with get_openai_callback() as cb:
            response = fusion_rag_chain.invoke({
                "context": document_list, 
                "question": user_input,
                "chat_history": []
            }) if document_list else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, document_list, total_cost, total_tokens, completion_tokens
        #'''
        #==========================   Multi-Query   =============================
        '''
        retriever = vectorstore.as_retriever()

        # Generate multiple queries using the multi_query_prompt and model
        generate_multi_queries = (
            prompts.multi_query_prompt 
            | initials.model 
            | StrOutputParser() 
            | (lambda x: x.split("\n"))  # Split the generated output into individual queries
        )

        # Generate the multiple queries based on user input
        multiple_queries = generate_multi_queries.invoke({"question": user_input, "question_history": question_history})

        # Now, use the generated queries to retrieve documents
        # Now, use the generated queries to retrieve documents
        if multiple_queries:
            # Use retriever to fetch documents for each query
            documents_text = []
            for query in multiple_queries:
                retrieved_docs = retriever.get_relevant_documents(query)
                # Join all retrieved documents into a single text string for each query result
                docs_texts = " ".join([doc.page_content for doc in retrieved_docs])
                documents_text.append(docs_texts)

        # Create prompt for final response generation
        multi_query_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = multi_query_rag_chain.invoke({
                "context": documents_text, 
                "question": user_input,
                "chat_history": []
            }) if documents_text else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, documents_text, total_cost, total_tokens, completion_tokens
        '''

        #==========================   Stepback   =============================
        '''
        retriever = vectorstore.as_retriever()

        # Generate step-back queries
        generate_stepback_question = prompts.step_back_prompt | initials.model | StrOutputParser()
        step_back_question = generate_stepback_question.invoke({"question": user_input, "question_history": question_history })
        normal_context = retriever.invoke(user_input)
        step_back_chain = (
        {
            "chat_history": lambda x: x["chat_history"],
            "normal_context": lambda x: retriever.invoke(x["question"]),
            "question": lambda x: x["question"],
            "step_back_context": lambda x: retriever.invoke(x["step_back_question"]),
            "question_history": lambda x: x["question_history"],
        }
            | prompts.stepback_response_prompt
            | initials.model
            | StrOutputParser()
        )
        # OpenAI callback ile maliyet ve token takibi
        with get_openai_callback() as cb:
            response = step_back_chain.invoke({ 
                "chat_history": [],
                "question": user_input,
                "step_back_question": step_back_question,
                "question_history": []
            })

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, normal_context, total_cost, total_tokens, completion_tokens
        '''

        #==========================   Stepback - Hybrid   =============================
        '''
        retriever = vectorstore.as_retriever()

        keyword_retriever = BM25Retriever.from_documents(chunks)
        hybrid_retriever = EnsembleRetriever(retrievers=[keyword_retriever, retriever], weights=[1-VECTORSTORE_WEIGHT, VECTORSTORE_WEIGHT])
        # Generate step-back queries
        generate_stepback_question = prompts.step_back_prompt | initials.model | StrOutputParser()
        step_back_question = generate_stepback_question.invoke({"question": user_input, "question_history": question_history })
        normal_context = retriever.invoke(user_input)
        step_back_chain = (
        {
            "chat_history": lambda x: x["chat_history"],
            "normal_context": lambda x: hybrid_retriever.invoke(x["question"]),
            "question": lambda x: x["question"],
            "step_back_context": lambda x: hybrid_retriever.invoke(x["step_back_question"]),
            "question_history": lambda x: x["question_history"],
        }
            | prompts.stepback_response_prompt
            | initials.model
            | StrOutputParser()
        )
        # OpenAI callback ile maliyet ve token takibi
        with get_openai_callback() as cb:
            response = step_back_chain.invoke({ 
                "chat_history": [],
                "question": user_input,
                "step_back_question": step_back_question,
                "question_history": []
            })

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, normal_context, total_cost, total_tokens, completion_tokens
        '''
        
        #======================================================================
        
    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None


# Function to save evaluation data to CSV
def save_evaluation_to_csv(evaluation_data, filename):
    df = pd.DataFrame([evaluation_data])
    df.to_csv(filename, mode='a', index=False, header=False)

# Main execution
def run_evaluations_from_csv(input_csv, output_csv):
    # Directly create the output CSV file with headers at the beginning
    initialize_output_csv(output_csv)

    # Load questions from the CSV file
    questions_df = pd.read_csv(input_csv)
    
    for index, row in questions_df.iterrows():
        user_query = row['question']
        start_time = time.time()  # Start timing
        print(f"Processing question {index + 1}/{len(questions_df)}: {user_query}")

        try:
            # Get the response, generated queries, and retrieved documents
            response, context, total_cost, total_tokens, completion_tokens = get_response(user_query)
            print("==========   ANSWER GENERATED  ==========")

            # Initialize metrics_results
            metrics_results = None

            print("==========   EVALUATION  ==========")
            # Evaluate metrics and retrieve dataset
            metrics_results, dataset = evaluation.evaluate_result(user_query, response, context, input_csv)
            print(f"Metrics for question '{user_query}': {metrics_results}")

            if response:
                # Calculate response time
                response_time = time.time() - start_time
                # Clear the system cache after processing the response
                chromadb.api.client.SharedSystemClient.clear_system_cache()

                # Prepare data for CSV
                if metrics_results is not None:
                    # Extract contexts and ground_truth from the dataset
                    contexts = dataset["contexts"][0]  # Access first row's 'contexts'
                    ground_truth = dataset["ground_truth"][0]  # Access first row's 'ground_truth'
                    
                    evaluation_data = {
                        'Question': user_query,
                        'Response': response,
                        'Contexts': contexts,
                        'Ground Truth': ground_truth,
                        'Token Count': total_tokens,
                        'Total Cost (USD)': total_cost,
                        'Completion Tokens': completion_tokens,
                        'Number of Retrieved documents': len(context),
                        'Response time': response_time,
                        'answer_relevancy': metrics_results.get('answer_relevancy'),
                        'context_precision': metrics_results.get('context_precision'),
                        'context_recall': metrics_results.get('context_recall'),
                        'faithfulness': metrics_results.get('faithfulness'),
                        'BleuScore': metrics_results.get('bleu_score'),
                        'RougeScore': metrics_results.get('rouge_score'),

                    }

                    # Save the evaluation data to CSV
                    save_evaluation_to_csv(evaluation_data, output_csv)
                    print(f"Evaluation metrics saved for question '{user_query}'.")

            print("==========   PROCESS ENDED  ==========\n")

        except ValueError as ve:
            print(f"ValueError for question {index + 1}: {ve}")
            print("Skipping to the next question...\n")

        except Exception as e:
            print(f"Unexpected error for question {index + 1}: {e}")
            print("Skipping to the next question...\n")


# Run evaluations
run_evaluations_from_csv(input_csv_path, output_csv_path)

Created output file at: test_data_routing/Mobilfunk/_evaluation_advanced_routing_densex_semantic_fusion.csv
Processing question 1/10: What ist die Bedeutung der eSIM für die Verbindung der Samsung Galaxy Watch mit dem Mobilfunknetz?
An error occurred: Expected IDs to be a non-empty list, got 0 IDs
Unexpected error for question 1: 'NoneType' object is not iterable
Skipping to the next question...

Processing question 2/10: Kann ich eine PlusKarte ohne Mindestvertragslaufzeit buchen?


KeyboardInterrupt: 

### A-C-S-RAG

In [1]:
# Import necessary libraries
import time
import pandas as pd
import evaluation
import graph
import chromadb


test_directory = '/Users/taha/Desktop/rag/test_data_naive'
# Define input CSV path
input_csv_path = '/Users/taha/Desktop/rag/test_data_naive/_testset_semantic.csv'  # Input CSV file path

# Define output CSV path including the filename
output_csv_path = 'test_data_naive/_evaluation_advanced_semantic_fusion_acsRAG.csv'  # Output file will be created here

# Function to create the output CSV file at the beginning
def initialize_output_csv(output_path):
    # Directly create the file with the correct header
    with open(output_path, 'w') as file:
        header = (
            "Question,Response,Contexts,Ground Truth,"
            "Number of Retrieved documents,"
            "Response time,answer_relevancy,context_precision,"
            "context_recall,faithfulness,BleuScore,RougeScore\n"
        )
        file.write(header)
    print(f"Created output file at: {output_path}")

# Function to save evaluation data to CSV
def save_evaluation_to_csv(evaluation_data, filename):
    df = pd.DataFrame([evaluation_data])
    df.to_csv(filename, mode='a', index=False, header=False)

# Main execution
def run_evaluations_from_csv(input_csv, output_csv):
    # Directly create the output CSV file with headers at the beginning
    initialize_output_csv(output_csv)

    # Load questions from the CSV file
    questions_df = pd.read_csv(input_csv)
    
    for index, row in questions_df.iterrows():
        user_query = row['question']
        start_time = time.time()  # Start timing
        print(f"Processing question {index + 1}/{len(questions_df)}: {user_query}")

        try:
            question_history = []
            chat_history = []
            documents = []
            response, documents = graph.run_fusion_graph(user_query, chat_history, question_history, documents)            
            
            print("==========   ANSWER GENERATED  ==========")

            # Initialize metrics_results
            metrics_results = None

            print("==========   EVALUATION  ==========")
            # Evaluate metrics and retrieve dataset
            metrics_results, dataset = evaluation.evaluate_result(user_query, response, documents, input_csv)
            print(f"Metrics for question '{user_query}': {metrics_results}")

            if response:
                # Calculate response time
                response_time = time.time() - start_time
                # Clear the system cache after processing the response
                chromadb.api.client.SharedSystemClient.clear_system_cache()

                # Prepare data for CSV
                if metrics_results is not None:
                    # Extract contexts and ground_truth from the dataset
                    contexts = dataset["contexts"][0]  # Access first row's 'contexts'
                    ground_truth = dataset["ground_truth"][0]  # Access first row's 'ground_truth'
                    
                    evaluation_data = {
                        'Question': user_query,
                        'Response': response,
                        'Contexts': contexts,
                        'Ground Truth': ground_truth,
                        'Number of Retrieved documents': len(documents),
                        'Response time': response_time,
                        'answer_relevancy': metrics_results.get('answer_relevancy'),
                        'context_precision': metrics_results.get('context_precision'),
                        'context_recall': metrics_results.get('context_recall'),
                        'faithfulness': metrics_results.get('faithfulness'),
                        'BleuScore': metrics_results.get('bleu_score'),
                        'RougeScore': metrics_results.get('rouge_score'),

                    }

                    # Save the evaluation data to CSV
                    save_evaluation_to_csv(evaluation_data, output_csv)
                    print(f"Evaluation metrics saved for question '{user_query}'.")

            print("==========   PROCESS ENDED  ==========\n")

        except ValueError as ve:
            print(f"ValueError for question {index + 1}: {ve}")
            print("Skipping to the next question...\n")

        except Exception as e:
            print(f"Unexpected error for question {index + 1}: {e}")
            print("Skipping to the next question...\n")


# Run evaluations
run_evaluations_from_csv(input_csv_path, output_csv_path)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/taha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/taha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  import graph


Created output file at: test_data_naive/_evaluation_advanced_semantic_fusion_acsRAG.csv
Processing question 1/10: What steps should be taken to remove the Telekom Login from the Puls-Tablet?
---TRANSFORM QUERY---
	Transformed question:  What are the steps to remove the Telekom Login from my Puls-Tablet?
---ROUTE QUESTION---
---ROUTE QUESTION TO RAG---
"CURRENT GRAPH NODE: 'transform_query':"
---RETRIEVE---
Unexpected error for question 1: Error code: 429 - {'error': {'message': 'Request too large for text-embedding-ada-002 in organization org-3NbVEU8RpPtgSfHpicadwn8E on tokens per min (TPM): Limit 1000000, Requested 1010029. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Skipping to the next question...

Processing question 2/10: Wie können Sie Ihre Bestellung stornieren?
---TRANSFORM QUERY---
	Transformed question:  Wie kann 

KeyboardInterrupt: 

### results to csv

In [24]:
import pandas as pd
import os

# Load the CSV file into a DataFrame
file_path = 'test_data_routing/Mobilfunk/_evaluation_advanced_routing_semantic_multiquery.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# List of numeric columns to calculate averages
numeric_columns = [
    'Token Count', 'Total Cost (USD)', 'Completion Tokens',
    'Number of Retrieved documents', 'Response time',
    'answer_relevancy', 'context_precision', 'context_recall',
    'faithfulness', 'BleuScore', 'RougeScore'
]

# Calculate the mean for each numeric column
averages = df[numeric_columns].mean()

# Formatting the averages according to your requirements
formatted_averages = {
    'Token Count': f"{averages['Token Count']:.0f}",  # No decimal places
    'Total Cost (USD)': f"{averages['Total Cost (USD)']:.5f}",  # Keep as is
    'Completion Tokens': f"{averages['Completion Tokens']:.0f}",  # No decimal places
    'Number of Retrieved documents': f"{averages['Number of Retrieved documents']}",  # Keep as is
    'Response time': f"{averages['Response time']:.2f}",  # One decimal place
    'answer_relevancy': f"{averages['answer_relevancy']:.4f}",  # Four decimal places
    'context_precision': f"{averages['context_precision']:.4f}",  # Four decimal places
    'context_recall': f"{averages['context_recall']:.4f}",  # Four decimal places
    'faithfulness': f"{averages['faithfulness']:.4f}",  # Four decimal places
    'BleuScore': f"{averages['BleuScore']:.4f}",  # Four decimal places
    'RougeScore': f"{averages['RougeScore']:.4f}"  # Four decimal places
}

# Convert formatted averages to a DataFrame for saving
formatted_averages_df = pd.DataFrame([formatted_averages])

# Define the output file path by adding "results_" prefix
output_file_path = os.path.join(
    os.path.dirname(file_path), 
    f"_results_{os.path.basename(file_path)}"
)

# Save the formatted averages to CSV
formatted_averages_df.to_csv(output_file_path, index=False)
print(f"[INFO] Formatted averages saved to {output_file_path}")

[INFO] Formatted averages saved to test_data_routing/Mobilfunk/_results__evaluation_advanced_routing_semantic_multiquery.csv
