# Evaluation - Generation

In [15]:
# Import necessary libraries
import os
import time
import pandas as pd
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
import prompts as prompts
import initials as initials
import indexing
import evaluation
import chromadb

# Weight given to the vector-store retriever inside the hybrid EnsembleRetriever;
# the BM25 keyword retriever receives the complement (1 - VECTORSTORE_WEIGHT).
VECTORSTORE_WEIGHT = 0.5

# Directory whose *.txt files are loaded by the routing variants of the
# get_response functions (files ending in "_summary.txt" are skipped there).
test_routing_directory = '/Users/taha/Desktop/rag/test_data_routing/Mobilfunk'
# Directory whose *.txt files are loaded by get_response_fusion.
test_directory = '/Users/taha/Desktop/rag/test_data'

# Define testset path
input_csv_path = '/Users/taha/Desktop/rag/test_data_routing/Mobilfunk/_testset_advanced_routing_semantic.csv'  # Input CSV file path

# Define output CSV path including the filename
output_csv_path = 'test_data_routing/_evaluation_advanced_fusion_densex_routing.csv'  # Output file will be created here

# Function to create the output CSV file at the beginning
def initialize_output_csv(output_path):
    """Create (or overwrite) the evaluation CSV and write its header row.

    Args:
        output_path: Path of the CSV file to create. Missing parent
            directories are created first.

    The file is truncated if it already exists, so only the header line is
    present after this call.
    """
    # A relative path such as "some_dir/file.csv" would otherwise fail with
    # FileNotFoundError when "some_dir" has not been created yet.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    header = (
        "Question,Response,Contexts,Ground Truth,"
        "Token Count,Total Cost (USD),Completion Tokens,Number of Retrieved documents,"
        "Response time,answer_relevancy,context_precision,"
        "context_recall,faithfulness,BleuScore,RougeScore\n"
    )

    # Explicit UTF-8: pandas' to_csv (used by save_evaluation_to_csv to append
    # rows) writes UTF-8 by default, whereas open() without an encoding uses
    # the platform locale and could leave the file with mixed encodings.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(header)
    print(f"Created output file at: {output_path}")

## Advanced RAG get_response functions

In [6]:
# Function to get response with error handling
def get_response_hyde(user_input):
    """Answer ``user_input`` using a generated passage as the retrieval query.

    Every ``*.txt`` file in ``test_routing_directory`` (except those ending in
    ``_summary.txt``) is read, split with ``SemanticChunker`` and indexed in a
    fresh Chroma store. The text produced by ``prompts.prompt_hyde |
    initials.model`` is fed to the retriever, and the retrieved documents are
    passed as context to ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``, so
    the returned usage figures cover that call alone.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, retrieved_docs, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    failure = (None, None, None, None, None)
    try:
        # List the source files, leaving out the summary files
        source_paths = [
            path
            for path in glob.glob(os.path.join(test_routing_directory, "*.txt"))
            if not path.endswith("_summary.txt")
        ]

        # Read the content of every selected file
        corpus = []
        for path in source_paths:
            with open(path, 'r', encoding='utf-8') as handle:
                corpus.append(handle.read())

        # Semantic splitting
        semantic_splitter = SemanticChunker(initials.embedding)
        chunks = semantic_splitter.create_documents(corpus)
        print("==========   CHUNKS CREATED  ==========")

        store = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        base_retriever = store.as_retriever()

        # HyDE: the model output for prompt_hyde becomes the retrieval query
        passage_chain = prompts.prompt_hyde | initials.model | StrOutputParser()
        hyde_retrieval = passage_chain | base_retriever
        retrieved_docs = hyde_retrieval.invoke(
            {"question": user_input, "question_history": []}
        )

        answer_chain = prompts.prompt_telekom | initials.model | StrOutputParser()

        with get_openai_callback() as usage:
            if retrieved_docs:
                response = answer_chain.invoke({
                    "context": retrieved_docs,
                    "question": user_input,
                    "chat_history": [],
                })
            else:
                response = "No relevant documents found."

        return (
            response,
            retrieved_docs,
            usage.total_cost,
            usage.total_tokens,
            usage.completion_tokens,
        )

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return failure

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return failure

In [7]:

# Function to get response with error handling
def get_response_hyde_hybrid(user_input):
    """Answer ``user_input`` with a generated-passage query and hybrid retrieval.

    The documents in ``test_routing_directory`` (``*.txt`` without
    ``*_summary.txt``) are semantically chunked and indexed twice: in a Chroma
    vector store and in a BM25 keyword retriever. Both are combined in an
    ``EnsembleRetriever`` weighted ``[1 - VECTORSTORE_WEIGHT,
    VECTORSTORE_WEIGHT]`` (keyword first). The text produced by
    ``prompts.prompt_hyde | initials.model`` is the retrieval query.

    Only the final answer generation runs inside ``get_openai_callback``.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, retrieved_docs, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    failure = (None, None, None, None, None)
    try:
        # List the source files, leaving out the summary files
        source_paths = [
            path
            for path in glob.glob(os.path.join(test_routing_directory, "*.txt"))
            if not path.endswith("_summary.txt")
        ]

        # Read the content of every selected file
        corpus = []
        for path in source_paths:
            with open(path, 'r', encoding='utf-8') as handle:
                corpus.append(handle.read())

        # Semantic splitting
        semantic_splitter = SemanticChunker(initials.embedding)
        chunks = semantic_splitter.create_documents(corpus)
        print("==========   CHUNKS CREATED  ==========")

        store = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        vector_retriever = store.as_retriever()

        # Hybrid retrieval: BM25 keyword search combined with the vector store
        bm25_retriever = BM25Retriever.from_documents(chunks)
        ensemble = EnsembleRetriever(
            retrievers=[bm25_retriever, vector_retriever],
            weights=[1 - VECTORSTORE_WEIGHT, VECTORSTORE_WEIGHT],
        )
        print("==================CONTROL=================")

        passage_chain = prompts.prompt_hyde | initials.model | StrOutputParser()
        hyde_retrieval = passage_chain | ensemble
        retrieved_docs = hyde_retrieval.invoke(
            {"question": user_input, "question_history": []}
        )

        answer_chain = prompts.prompt_telekom | initials.model | StrOutputParser()

        with get_openai_callback() as usage:
            if retrieved_docs:
                response = answer_chain.invoke({
                    "context": retrieved_docs,
                    "question": user_input,
                    "chat_history": [],
                })
            else:
                response = "No relevant documents found."

        return (
            response,
            retrieved_docs,
            usage.total_cost,
            usage.total_tokens,
            usage.completion_tokens,
        )

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return failure

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return failure

In [11]:
def get_response_hyde_densex_routing(user_input):
    """Answer ``user_input`` with a generated-passage query over the DenseX store.

    The vector store is built by
    ``indexing.generate_final_vectorstore_with_chunks`` from ``user_input``,
    ``test_routing_directory`` and ``initials.embedding``. This function does
    not load or chunk the documents itself, since the retriever comes entirely
    from that store. The text produced by ``prompts.prompt_hyde |
    initials.model`` is the retrieval query, and the retrieved documents are
    passed as context to ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``, so
    the returned usage figures cover that call alone.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, retrieved_docs, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    try:
        question_history = []

        #==========================   HyDE - DenseX   =============================

        # DenseX vector store
        densex_vectorstore = indexing.generate_final_vectorstore_with_chunks(user_input, test_routing_directory, initials.embedding)
        densex_retriever = densex_vectorstore.as_retriever()

        hyde_docs = (prompts.prompt_hyde | initials.model | StrOutputParser())
        retrieval_chain_hyde = hyde_docs | densex_retriever
        retrieved_docs = retrieval_chain_hyde.invoke({"question": user_input, "question_history": question_history})

        hyde_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = hyde_rag_chain.invoke({
                "context": retrieved_docs,
                "question": user_input,
                "chat_history": []
            }) if retrieved_docs else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, retrieved_docs, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [2]:
def get_response_fusion(user_input):
    """Answer ``user_input`` with RAG-Fusion over the documents in ``test_directory``.

    The ``*.txt`` files in ``test_directory`` (except ``*_summary.txt``) are
    semantically chunked and indexed in a fresh Chroma store. The model output
    for ``prompts.multi_query_prompt`` is split into one query per line, each
    query is run against the retriever, and the result lists are merged by
    ``initials.reciprocal_rank_fusion``. The first element of every fused
    entry is used as a context document for ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, document_list, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    try:
        # List the source files, leaving out the summary files
        all_txt_files = [file for file in glob.glob(os.path.join(test_directory, "*.txt")) if not file.endswith("_summary.txt")]
        question_history = []
        # Read the content of every selected file
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())

        # Semantic splitting
        text_splitter_semantic = SemanticChunker(initials.embedding)

        #==========================   RAG Fusion   =============================

        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        retriever = vectorstore.as_retriever()

        # Generate multiple queries, one per line of model output. Blank lines
        # (e.g. separators between numbered queries) are dropped so that no
        # empty string is sent to the retriever as a query.
        generate_multi_queries = (
            prompts.multi_query_prompt
            | initials.model
            | StrOutputParser()
            | (lambda x: [line for line in x.split("\n") if line.strip()])
        )

        retrieval_chain_rag_fusion = generate_multi_queries | retriever.map() | initials.reciprocal_rank_fusion

        fusion_docs = retrieval_chain_rag_fusion.invoke({"question": user_input, "question_history": question_history})
        # Each fused entry is indexable; its first element is the document.
        document_list = [doc[0] for doc in fusion_docs]

        fusion_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = fusion_rag_chain.invoke({
                "context": document_list,
                "question": user_input,
                "chat_history": []
            }) if document_list else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, document_list, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [8]:
def get_response_fusion_routing(user_input):
    """Answer ``user_input`` with RAG-Fusion over ``test_routing_directory``.

    The ``*.txt`` files in ``test_routing_directory`` (except
    ``*_summary.txt``) are semantically chunked and indexed in a fresh Chroma
    store. The model output for ``prompts.multi_query_prompt`` is split into
    one query per line, each query is run against the retriever, and the
    result lists are merged by ``initials.reciprocal_rank_fusion``. The first
    element of every fused entry is used as a context document for
    ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, document_list, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    try:
        # List the source files, leaving out the summary files
        all_txt_files = [file for file in glob.glob(os.path.join(test_routing_directory, "*.txt")) if not file.endswith("_summary.txt")]
        question_history = []
        # Read the content of every selected file
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())

        # Semantic splitting
        text_splitter_semantic = SemanticChunker(initials.embedding)

        #==========================   RAG Fusion   =============================

        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        retriever = vectorstore.as_retriever()

        # Generate multiple queries, one per line of model output. Blank lines
        # (e.g. separators between numbered queries) are dropped so that no
        # empty string is sent to the retriever as a query.
        generate_multi_queries = (
            prompts.multi_query_prompt
            | initials.model
            | StrOutputParser()
            | (lambda x: [line for line in x.split("\n") if line.strip()])
        )

        retrieval_chain_rag_fusion = generate_multi_queries | retriever.map() | initials.reciprocal_rank_fusion

        fusion_docs = retrieval_chain_rag_fusion.invoke({"question": user_input, "question_history": question_history})
        # Each fused entry is indexable; its first element is the document.
        document_list = [doc[0] for doc in fusion_docs]

        fusion_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = fusion_rag_chain.invoke({
                "context": document_list,
                "question": user_input,
                "chat_history": []
            }) if document_list else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, document_list, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [17]:
def get_response_fusion_densex_routing(user_input):
    """Answer ``user_input`` with RAG-Fusion over the DenseX vector store.

    The vector store is built by
    ``indexing.generate_final_vectorstore_with_chunks`` from ``user_input``,
    ``test_routing_directory`` and ``initials.embedding``. The model output
    for ``prompts.multi_query_prompt`` is split into one query per line, each
    query is run against the retriever, and the result lists are merged by
    ``initials.reciprocal_rank_fusion``. The first element of every fused
    entry is used as a context document for ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, document_list, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    try:

        #==========================   RAG Fusion - DenseX   =============================

        # DenseX vector store
        densex_vectorstore = indexing.generate_final_vectorstore_with_chunks(user_input, test_routing_directory, initials.embedding)
        densex_retriever = densex_vectorstore.as_retriever()

        # Generate multiple queries, one per line of model output. Blank lines
        # (e.g. separators between numbered queries) are dropped so that no
        # empty string is sent to the retriever as a query.
        generate_multi_queries = (
            prompts.multi_query_prompt
            | initials.model
            | StrOutputParser()
            | (lambda x: [line for line in x.split("\n") if line.strip()])
        )

        retrieval_chain_rag_fusion = generate_multi_queries | densex_retriever.map() | initials.reciprocal_rank_fusion
        question_history = []
        fusion_docs = retrieval_chain_rag_fusion.invoke({"question": user_input, "question_history": question_history})
        # Each fused entry is indexable; its first element is the document.
        document_list = [doc[0] for doc in fusion_docs]

        fusion_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = fusion_rag_chain.invoke({
                "context": document_list,
                "question": user_input,
                "chat_history": []
            }) if document_list else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, document_list, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [None]:

# Function to get response with error handling
def get_response_multiquery(user_input):
    """Answer ``user_input`` using multi-query retrieval over semantic chunks.

    The ``*.txt`` files in ``test_routing_directory`` (except
    ``*_summary.txt``) are semantically chunked and indexed in a fresh Chroma
    store. The model output for ``prompts.multi_query_prompt`` is split into
    one query per line. For every non-blank query the retrieved documents'
    ``page_content`` values are joined with a space into a single string, and
    the list of those strings is the context for ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, documents_text, total_cost, total_tokens,
        completion_tokens)`` where ``documents_text`` holds one string per
        query; all five entries are ``None`` when an ``Exception`` is raised
        along the way.
    """
    try:
        # List the source files, leaving out the summary files
        all_txt_files = [file for file in glob.glob(os.path.join(test_routing_directory, "*.txt")) if not file.endswith("_summary.txt")]
        question_history = []
        # Read the content of every selected file
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())

        # Semantic splitting
        text_splitter_semantic = SemanticChunker(initials.embedding)

        #==========================   Multi-Query   =============================

        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        retriever = vectorstore.as_retriever()

        # Generate multiple queries using the multi_query_prompt and model
        generate_multi_queries = (
            prompts.multi_query_prompt
            | initials.model
            | StrOutputParser()
            | (lambda x: x.split("\n"))  # Split the generated output into individual queries
        )

        # Generate the multiple queries based on user input
        multiple_queries = generate_multi_queries.invoke({"question": user_input, "question_history": question_history})

        # Always bound, so the "no documents" fallback below is reachable
        # instead of failing with a NameError when nothing was retrieved.
        documents_text = []
        for query in multiple_queries:
            # Blank lines in the model output are not queries; skip them
            # rather than sending an empty string to the retriever.
            if not query.strip():
                continue
            retrieved_docs = retriever.invoke(query)
            # One joined text string per query result
            documents_text.append(" ".join(doc.page_content for doc in retrieved_docs))

        # Create the chain for final response generation
        multi_query_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = multi_query_rag_chain.invoke({
                "context": documents_text,
                "question": user_input,
                "chat_history": []
            }) if documents_text else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, documents_text, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [None]:
# Function to get response with error handling
def get_response_multiquery_densex_routing(user_input):
    """Answer ``user_input`` using multi-query retrieval over the DenseX store.

    The vector store is built by
    ``indexing.generate_final_vectorstore_with_chunks`` from ``user_input``,
    ``test_routing_directory`` and ``initials.embedding``. The model output
    for ``prompts.multi_query_prompt`` is split into one query per line. For
    every non-blank query the retrieved documents' ``page_content`` values are
    joined with a space into a single string, and the list of those strings
    is the context for ``prompts.prompt_telekom``.

    Only the final answer generation runs inside ``get_openai_callback``.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, documents_text, total_cost, total_tokens,
        completion_tokens)`` where ``documents_text`` holds one string per
        query; all five entries are ``None`` when an ``Exception`` is raised
        along the way.
    """
    try:
        #==========================   Multi-Query - DenseX   =============================
        # The query-generation prompt is invoked with a "question_history"
        # key; it must be defined here, otherwise the invoke below raises
        # NameError and the function always returns the all-None tuple.
        question_history = []

        # DenseX vector store
        densex_vectorstore = indexing.generate_final_vectorstore_with_chunks(user_input, test_routing_directory, initials.embedding)
        densex_retriever = densex_vectorstore.as_retriever()

        # Generate multiple queries using the multi_query_prompt and model
        generate_multi_queries = (
            prompts.multi_query_prompt
            | initials.model
            | StrOutputParser()
            | (lambda x: x.split("\n"))  # Split the generated output into individual queries
        )

        # Generate the multiple queries based on user input
        multiple_queries = generate_multi_queries.invoke({"question": user_input, "question_history": question_history})

        # Always bound, so the "no documents" fallback below is reachable
        # instead of failing with a NameError when nothing was retrieved.
        documents_text = []
        for query in multiple_queries:
            # Blank lines in the model output are not queries; skip them
            # rather than sending an empty string to the retriever.
            if not query.strip():
                continue
            retrieved_docs = densex_retriever.invoke(query)
            # One joined text string per query result
            documents_text.append(" ".join(doc.page_content for doc in retrieved_docs))

        # Create the chain for final response generation
        multi_query_rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        with get_openai_callback() as cb:
            response = multi_query_rag_chain.invoke({
                "context": documents_text,
                "question": user_input,
                "chat_history": []
            }) if documents_text else "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, documents_text, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [None]:
# Function to get response with error handling
def get_response_stepback(user_input):
    """Answer ``user_input`` with step-back prompting over semantic chunks.

    The ``*.txt`` files in ``test_routing_directory`` (except
    ``*_summary.txt``) are semantically chunked and indexed in a fresh Chroma
    store. A step-back question is generated with ``prompts.step_back_prompt``;
    the answer chain then retrieves documents for both the original and the
    step-back question and feeds them to ``prompts.stepback_response_prompt``.

    Only the final answer chain runs inside ``get_openai_callback``. The
    returned context is the retrieval result for the original question only;
    the step-back documents are not returned.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, normal_context, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    failure = (None, None, None, None, None)
    try:
        # List the source files, leaving out the summary files
        source_paths = [
            path
            for path in glob.glob(os.path.join(test_routing_directory, "*.txt"))
            if not path.endswith("_summary.txt")
        ]

        # Read the content of every selected file
        corpus = []
        for path in source_paths:
            with open(path, 'r', encoding='utf-8') as handle:
                corpus.append(handle.read())

        # Semantic splitting
        semantic_splitter = SemanticChunker(initials.embedding)
        chunks = semantic_splitter.create_documents(corpus)
        print("==========   CHUNKS CREATED  ==========")

        store = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        base_retriever = store.as_retriever()

        # Generate the step-back question
        stepback_question_chain = prompts.step_back_prompt | initials.model | StrOutputParser()
        step_back_question = stepback_question_chain.invoke(
            {"question": user_input, "question_history": []}
        )

        # Context reported to the caller: retrieval for the original question
        normal_context = base_retriever.invoke(user_input)

        chain_inputs = {
            "chat_history": lambda inputs: inputs["chat_history"],
            "normal_context": lambda inputs: base_retriever.invoke(inputs["question"]),
            "question": lambda inputs: inputs["question"],
            "step_back_context": lambda inputs: base_retriever.invoke(inputs["step_back_question"]),
            "question_history": lambda inputs: inputs["question_history"],
        }
        answer_chain = (
            chain_inputs
            | prompts.stepback_response_prompt
            | initials.model
            | StrOutputParser()
        )

        # Track cost and tokens with the OpenAI callback
        with get_openai_callback() as usage:
            response = answer_chain.invoke({
                "chat_history": [],
                "question": user_input,
                "step_back_question": step_back_question,
                "question_history": [],
            })

        return (
            response,
            normal_context,
            usage.total_cost,
            usage.total_tokens,
            usage.completion_tokens,
        )

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return failure

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return failure

In [None]:
# Function to get response with error handling
def get_response_stepback_hybrid(user_input):
    """Answer ``user_input`` with step-back prompting and hybrid retrieval.

    The ``*.txt`` files in ``test_routing_directory`` (except
    ``*_summary.txt``) are semantically chunked and indexed in a Chroma vector
    store and in a BM25 keyword retriever. Both are combined in an
    ``EnsembleRetriever`` weighted ``[1 - VECTORSTORE_WEIGHT,
    VECTORSTORE_WEIGHT]`` (keyword first). A step-back question is generated
    with ``prompts.step_back_prompt``; the answer chain retrieves documents
    for both the original and the step-back question through the hybrid
    retriever and feeds them to ``prompts.stepback_response_prompt``.

    Only the final answer chain runs inside ``get_openai_callback``. The
    returned context is the hybrid retrieval result for the original question
    only; the step-back documents are not returned.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, normal_context, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    try:
        # List the source files, leaving out the summary files
        all_txt_files = [file for file in glob.glob(os.path.join(test_routing_directory, "*.txt")) if not file.endswith("_summary.txt")]
        question_history = []
        # Read the content of every selected file
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())

        # Semantic splitting
        text_splitter_semantic = SemanticChunker(initials.embedding)

        #==========================   Stepback - Hybrid   =============================

        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        retriever = vectorstore.as_retriever()

        keyword_retriever = BM25Retriever.from_documents(chunks)
        hybrid_retriever = EnsembleRetriever(retrievers=[keyword_retriever, retriever], weights=[1-VECTORSTORE_WEIGHT, VECTORSTORE_WEIGHT])
        # Generate the step-back question
        generate_stepback_question = prompts.step_back_prompt | initials.model | StrOutputParser()
        step_back_question = generate_stepback_question.invoke({"question": user_input, "question_history": question_history })
        # Use the same hybrid retriever as the answer chain below, so the
        # context returned for evaluation matches what the model was given
        # for the original question.
        normal_context = hybrid_retriever.invoke(user_input)
        step_back_chain = (
        {
            "chat_history": lambda x: x["chat_history"],
            "normal_context": lambda x: hybrid_retriever.invoke(x["question"]),
            "question": lambda x: x["question"],
            "step_back_context": lambda x: hybrid_retriever.invoke(x["step_back_question"]),
            "question_history": lambda x: x["question_history"],
        }
            | prompts.stepback_response_prompt
            | initials.model
            | StrOutputParser()
        )
        # Track cost and tokens with the OpenAI callback
        with get_openai_callback() as cb:
            response = step_back_chain.invoke({
                "chat_history": [],
                "question": user_input,
                "step_back_question": step_back_question,
                "question_history": []
            })

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, normal_context, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

In [None]:
# Function to get response with error handling
def get_response_stepback_densex_routing(user_input):
    """Answer ``user_input`` with step-back prompting over the DenseX store.

    The vector store is built by
    ``indexing.generate_final_vectorstore_with_chunks`` from ``user_input``,
    ``test_routing_directory`` and ``initials.embedding``. This function does
    not load or chunk the documents itself, since the retriever comes entirely
    from that store. A step-back question is generated with
    ``prompts.step_back_prompt``; the answer chain retrieves documents for
    both the original and the step-back question and feeds them to
    ``prompts.stepback_response_prompt``.

    Only the final answer chain runs inside ``get_openai_callback``. The
    returned context is the retrieval result for the original question only;
    the step-back documents are not returned.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, normal_context, total_cost, total_tokens,
        completion_tokens)``; all five entries are ``None`` when an
        ``Exception`` is raised along the way.
    """
    try:
        question_history = []

        #==========================   Stepback - DenseX  =============================
        # DenseX vector store
        densex_vectorstore = indexing.generate_final_vectorstore_with_chunks(user_input, test_routing_directory, initials.embedding)
        densex_retriever = densex_vectorstore.as_retriever()

        # Generate the step-back question
        generate_stepback_question = prompts.step_back_prompt | initials.model | StrOutputParser()
        step_back_question = generate_stepback_question.invoke({"question": user_input, "question_history": question_history })
        # Context reported to the caller: retrieval for the original question
        normal_context = densex_retriever.invoke(user_input)
        step_back_chain = (
        {
            "chat_history": lambda x: x["chat_history"],
            "normal_context": lambda x: densex_retriever.invoke(x["question"]),
            "question": lambda x: x["question"],
            "step_back_context": lambda x: densex_retriever.invoke(x["step_back_question"]),
            "question_history": lambda x: x["question_history"],
        }
            | prompts.stepback_response_prompt
            | initials.model
            | StrOutputParser()
        )
        # Track cost and tokens with the OpenAI callback
        with get_openai_callback() as cb:
            response = step_back_chain.invoke({
                "chat_history": [],
                "question": user_input,
                "step_back_question": step_back_question,
                "question_history": []
            })

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, normal_context, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None

# CHANGE get_response functions here

In [18]:
# Function to save evaluation data to CSV
def save_evaluation_to_csv(evaluation_data, filename):
    """Append one evaluation record to ``filename`` as a single CSV row.

    Args:
        evaluation_data: Mapping of column name to value for one row.
        filename: CSV file to append to; no header and no index are written.
    """
    single_row_frame = pd.DataFrame(data=[evaluation_data])
    single_row_frame.to_csv(
        filename,
        mode='a',
        header=False,
        index=False,
    )

# Main execution
def run_evaluations_from_csv(input_csv, output_csv):
    """Generate and evaluate an answer for every question in ``input_csv``.

    The output CSV is (re)created with its header via
    ``initialize_output_csv``. Then, for each row of the input CSV, the
    ``question`` column is passed to ``get_response_fusion_densex_routing``,
    the answer is scored with ``evaluation.evaluate_result`` and one result
    row is appended to ``output_csv``. An error on one question is printed
    and the loop continues with the next question.

    Args:
        input_csv: Path of the test-set CSV; it must contain a ``question``
            column. The same path is forwarded to
            ``evaluation.evaluate_result``.
        output_csv: Path of the CSV file the per-question results are
            appended to.
    """
    # Directly create the output CSV file with headers at the beginning
    initialize_output_csv(output_csv)

    # Load questions from the CSV file
    questions_df = pd.read_csv(input_csv)
    total_questions = len(questions_df)

    # Count rows explicitly: the DataFrame index label is not guaranteed to be
    # a 0-based integer, so it is not a reliable progress counter.
    for position, (_, row) in enumerate(questions_df.iterrows(), start=1):
        user_query = row['question']
        print(f"Processing question {position}/{total_questions}: {user_query}")

        try:
            # Time only the generation step so that 'Response time' reflects
            # the latency of the RAG pipeline and not the metric evaluation.
            start_time = time.perf_counter()
            response, context, total_cost, total_tokens, completion_tokens = (
                get_response_fusion_densex_routing(user_query)
            )
            response_time = time.perf_counter() - start_time
            print("==========  GENERATION   ==========")

            if not response:
                # NOTE(review): the get_response function visible in this file
                # returns an all-None tuple on failure; presumably the one
                # called here does too. There is nothing to evaluate then.
                print(f"No response generated for question {position}.")
                print("Skipping to the next question...\n")
                continue

            print("==========   EVALUATION  ==========")
            # Evaluate metrics and retrieve dataset
            metrics_results, dataset = evaluation.evaluate_result(
                user_query, response, context, input_csv
            )
            print(f"Metrics for question '{user_query}': {metrics_results}")

            # Prepare data for CSV
            if metrics_results is not None:
                # Extract contexts and ground_truth from the dataset
                contexts = dataset["contexts"][0]  # Access first row's 'contexts'
                ground_truth = dataset["ground_truth"][0]  # Access first row's 'ground_truth'

                evaluation_data = {
                    'Question': user_query,
                    'Response': response,
                    'Contexts': contexts,
                    'Ground Truth': ground_truth,
                    'Token Count': total_tokens,
                    'Total Cost (USD)': total_cost,
                    'Completion Tokens': completion_tokens,
                    'Number of Retrieved documents': len(context),
                    'Response time': response_time,
                    'answer_relevancy': metrics_results.get('answer_relevancy'),
                    'context_precision': metrics_results.get('context_precision'),
                    'context_recall': metrics_results.get('context_recall'),
                    'faithfulness': metrics_results.get('faithfulness'),
                    'BleuScore': metrics_results.get('bleu_score'),
                    'RougeScore': metrics_results.get('rouge_score'),
                }

                # Save the evaluation data to CSV
                save_evaluation_to_csv(evaluation_data, output_csv)
                print(f"Evaluation metrics saved for question '{user_query}'.")

            print("==========   PROCESS ENDED  ==========\n")

        except ValueError as ve:
            print(f"ValueError for question {position}: {ve}")
            print("Skipping to the next question...\n")

        except Exception as e:
            print(f"Unexpected error for question {position}: {e}")
            print("Skipping to the next question...\n")

        finally:
            # Clear the Chroma system cache after every question, including
            # failed ones, so state from one question never leaks into the next.
            chromadb.api.client.SharedSystemClient.clear_system_cache()


# RUN

In [19]:
# Evaluate every question of the configured test set and write the results CSV
run_evaluations_from_csv(input_csv=input_csv_path, output_csv=output_csv_path)

Created output file at: test_data_routing/_evaluation_advanced_fusion_densex_routing.csv
Processing question 1/10: What ist die Bedeutung der eSIM für die Verbindung der Samsung Galaxy Watch mit dem Mobilfunknetz?






Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.87s/it]


Metrics for question 'What ist die Bedeutung der eSIM für die Verbindung der Samsung Galaxy Watch mit dem Mobilfunknetz?': {'answer_relevancy': 0.9348684240306048, 'context_precision': 0.9999999999, 'context_recall': 1.0, 'faithfulness': 1.0, 'bleu_score': 0.24660313247404905, 'rouge_score': 0.4297520661157025}
Evaluation metrics saved for question 'What ist die Bedeutung der eSIM für die Verbindung der Samsung Galaxy Watch mit dem Mobilfunknetz?'.

Processing question 2/10: Kann ich eine PlusKarte ohne Mindestvertragslaufzeit buchen?


Evaluating: 100%|██████████| 4/4 [00:14<00:00,  3.64s/it]


Metrics for question 'Kann ich eine PlusKarte ohne Mindestvertragslaufzeit buchen?': {'answer_relevancy': 0.9999999999999994, 'context_precision': 0.8261904761739525, 'context_recall': 1.0, 'faithfulness': 1.0, 'bleu_score': 0.1782921965574844, 'rouge_score': 0.4406779661016949}
Evaluation metrics saved for question 'Kann ich eine PlusKarte ohne Mindestvertragslaufzeit buchen?'.

Processing question 3/10: What advantages does the A15 Bionic Prozessor offer in the iPhone 13 series compared to its predecessor?


Evaluating: 100%|██████████| 4/4 [00:28<00:00,  7.18s/it]


Metrics for question 'What advantages does the A15 Bionic Prozessor offer in the iPhone 13 series compared to its predecessor?': {'answer_relevancy': 0.9657319139749733, 'context_precision': 0.49999999995, 'context_recall': 1.0, 'faithfulness': 0.0, 'bleu_score': 0.009192577995194304, 'rouge_score': 0.09090909090909091}
Evaluation metrics saved for question 'What advantages does the A15 Bionic Prozessor offer in the iPhone 13 series compared to its predecessor?'.

Processing question 4/10: What steps are involved in updating bank connections in the Kundencenter and MeinMagenta App?


Evaluating: 100%|██████████| 4/4 [00:23<00:00,  5.84s/it]


Metrics for question 'What steps are involved in updating bank connections in the Kundencenter and MeinMagenta App?': {'answer_relevancy': 0.9430972584807084, 'context_precision': 0.5888888888692593, 'context_recall': 1.0, 'faithfulness': 1.0, 'bleu_score': 0.0022866565934025434, 'rouge_score': 0.24242424242424243}
Evaluation metrics saved for question 'What steps are involved in updating bank connections in the Kundencenter and MeinMagenta App?'.

Processing question 5/10: What sind die Kosten für die Mitnahme meiner Mobilfunk-Rufnummer zu einem anderen Anbieter?


Evaluating: 100%|██████████| 4/4 [00:14<00:00,  3.68s/it]


Metrics for question 'What sind die Kosten für die Mitnahme meiner Mobilfunk-Rufnummer zu einem anderen Anbieter?': {'answer_relevancy': 0.9607084945792637, 'context_precision': 0.4499999999775, 'context_recall': 1.0, 'faithfulness': 1.0, 'bleu_score': 0.22950595360157608, 'rouge_score': 0.4444444444444445}
Evaluation metrics saved for question 'What sind die Kosten für die Mitnahme meiner Mobilfunk-Rufnummer zu einem anderen Anbieter?'.

Processing question 6/10: Wie aktiviere ich VoLTE, wenn LTE fehlt?


Evaluating: 100%|██████████| 4/4 [00:21<00:00,  5.25s/it]


Metrics for question 'Wie aktiviere ich VoLTE, wenn LTE fehlt?': {'answer_relevancy': 0.8833387918075685, 'context_precision': 0.99999999995, 'context_recall': 1.0, 'faithfulness': 0.9285714285714286, 'bleu_score': 0.0015654934906083598, 'rouge_score': 0.054945054945054944}
Evaluation metrics saved for question 'Wie aktiviere ich VoLTE, wenn LTE fehlt?'.

Processing question 7/10: Wie kann die eSIM der Galaxy Watch online ohne Smartphone sein?


Evaluating: 100%|██████████| 4/4 [00:17<00:00,  4.44s/it]


Metrics for question 'Wie kann die eSIM der Galaxy Watch online ohne Smartphone sein?': {'answer_relevancy': 0.8738311653523044, 'context_precision': 0.6666666666444444, 'context_recall': 1.0, 'faithfulness': 1.0, 'bleu_score': 0.06527622361665518, 'rouge_score': 0.20408163265306123}
Evaluation metrics saved for question 'Wie kann die eSIM der Galaxy Watch online ohne Smartphone sein?'.

Processing question 8/10: Why was the 10th Sept meeting on the mobile plan canceled?


Evaluating: 100%|██████████| 4/4 [00:19<00:00,  4.88s/it]


Metrics for question 'Why was the 10th Sept meeting on the mobile plan canceled?': {'answer_relevancy': 0.0, 'context_precision': 0.49999999995, 'context_recall': 1.0, 'faithfulness': 0.3333333333333333, 'bleu_score': 0.004867213525910928, 'rouge_score': 0.05194805194805195}
Evaluation metrics saved for question 'Why was the 10th Sept meeting on the mobile plan canceled?'.

Processing question 9/10: Wie wird die eSIM zugestellt und was ist die Rolle des QR-Codes bei der Installation?


Evaluating: 100%|██████████| 4/4 [00:12<00:00,  3.08s/it]


Metrics for question 'Wie wird die eSIM zugestellt und was ist die Rolle des QR-Codes bei der Installation?': {'answer_relevancy': 0.9620994766805503, 'context_precision': 0.8333333332916666, 'context_recall': 1.0, 'faithfulness': 0.875, 'bleu_score': 0.1394434502652766, 'rouge_score': 0.2975206611570248}
Evaluation metrics saved for question 'Wie wird die eSIM zugestellt und was ist die Rolle des QR-Codes bei der Installation?'.

Processing question 10/10: What MagentaMobil plans are available for no-commitment users?


Evaluating: 100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


Metrics for question 'What MagentaMobil plans are available for no-commitment users?': {'answer_relevancy': 0.8964403151650057, 'context_precision': 0.0, 'context_recall': 0.5, 'faithfulness': 0.8888888888888888, 'bleu_score': 0.0020632861907783676, 'rouge_score': 0.09876543209876544}
Evaluation metrics saved for question 'What MagentaMobil plans are available for no-commitment users?'.



### Results to CSV

In [20]:
import pandas as pd
import os

# Read the per-question evaluation results
file_path = 'test_data_routing/_evaluation_advanced_fusion_densex_routing.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Format spec applied to the mean of each numeric column.
# An empty spec leaves the value in its default string form.
column_formats = {
    'Token Count': '.0f',                   # no decimal places
    'Total Cost (USD)': '.5f',              # five decimal places
    'Completion Tokens': '.0f',             # no decimal places
    'Number of Retrieved documents': '',    # unformatted
    'Response time': '.2f',                 # two decimal places
    'answer_relevancy': '.4f',              # four decimal places
    'context_precision': '.4f',
    'context_recall': '.4f',
    'faithfulness': '.4f',
    'BleuScore': '.4f',
    'RougeScore': '.4f',
}

# Columns to average, in the order they appear in the summary file
numeric_columns = list(column_formats)

# Column-wise mean over all evaluated questions
averages = df[numeric_columns].mean()

# Render every mean as a string using its column's format spec
formatted_averages = {
    column: format(averages[column], spec)
    for column, spec in column_formats.items()
}

# One-row DataFrame holding the formatted means
formatted_averages_df = pd.DataFrame([formatted_averages])

# The summary goes next to the input file, with "_results_" prepended to its name
directory, base_name = os.path.split(file_path)
output_file_path = os.path.join(directory, f"_results_{base_name}")

# Write the summary and report where it went
formatted_averages_df.to_csv(output_file_path, index=False)
print(f"[INFO] Formatted averages saved to {output_file_path}")

[INFO] Formatted averages saved to test_data_routing/_results__evaluation_advanced_fusion_densex_routing.csv
