# Evaluation - Retrieval

### Naive RAG - Semantic Search - Character Splitting - Recursive Character Text Splitting

In [None]:
# Import necessary libraries
import os
import time
import pandas as pd
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma
import prompts as prompts
import initials as initials
import evaluation
import chromadb

 
# Base directory holding the source documents and the evaluation CSV files
test_directory = '/Users/taha/Desktop/rag/test_data'

# Test-set CSV that supplies the questions to evaluate
input_csv_path = f'{test_directory}/_testset_semantic.csv'

# Results CSV (including the filename); it is created when the evaluation runs
output_csv_path = f'{test_directory}/_evaluation_naive_semantic.csv'

# Function to create the output CSV file at the beginning
def initialize_output_csv(output_path):
    """Create (or overwrite) the results CSV and write its header row.

    Args:
        output_path: Path of the CSV file to create. An existing file at
            this path is truncated.
    """
    header = (
        "Question,Response,Contexts,Ground Truth,"
        "Token Count,Total Cost (USD),Completion Tokens,Number of Retrieved documents,"
        "Response time,answer_relevancy,context_precision,"
        "context_recall,faithfulness,BleuScore,RougeScore\n"
    )
    # Pin the encoding: without it open() falls back to the platform locale,
    # while the rows appended later through pandas are written as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(header)
    print(f"Created output file at: {output_path}")

# Function to get response with error handling
def get_response(user_input):
    """Answer a question with the naive RAG pipeline over the test corpus.

    Reads every ``*.txt`` file in ``test_directory`` (except files ending in
    ``_summary.txt``), splits the texts with ``SemanticChunker``, indexes the
    chunks in a Chroma vector store, retrieves documents for the question and
    runs the ``prompts.prompt_telekom | initials.model`` chain on them.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, retrieved_docs, total_cost, total_tokens,
        completion_tokens)``. ``response`` is the fixed string
        ``"No relevant documents found."`` when retrieval returns nothing.
        All five items are ``None`` when any step fails; the error is printed
        rather than raised.
    """
    try:
        # List the source documents, leaving out the summary files
        all_txt_files = [
            file
            for file in glob.glob(os.path.join(test_directory, "*.txt"))
            if not file.endswith("_summary.txt")
        ]

        # glob returns an empty list for a missing or empty directory instead
        # of raising, so fail explicitly here to reach the dedicated handler
        # below rather than a less informative error further down the pipeline.
        if not all_txt_files:
            raise FileNotFoundError(f"No .txt documents found in {test_directory}")

        # Read the content of each selected file
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())

        # Semantic splitting is the only chunking strategy used by this run
        text_splitter_semantic = SemanticChunker(embeddings=initials.embedding)

        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")

        # Embed the chunks and index them
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        print("==========   VECTORSTORE CREATED  ==========")
        retriever = vectorstore.as_retriever()
        retrieved_docs = retriever.invoke(user_input)

        # Build the prompt -> model -> string-parser chain
        rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        # Track cost and token usage through the OpenAI callback
        with get_openai_callback() as cb:
            if retrieved_docs:
                response = rag_chain.invoke({
                    "context": retrieved_docs,
                    "question": user_input,
                    "chat_history": []
                })
            else:
                response = "No relevant documents found."

        return response, retrieved_docs, cb.total_cost, cb.total_tokens, cb.completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None, None, None, None


# Function to save evaluation data to CSV
def save_evaluation_to_csv(evaluation_data, filename):
    """Append one evaluation record to *filename* as a single CSV row.

    Args:
        evaluation_data: Mapping of column name to value for one record.
        filename: Path of the CSV file to append to. Neither a header nor an
            index column is written.
    """
    single_row = pd.DataFrame([evaluation_data])
    single_row.to_csv(filename, header=False, index=False, mode='a')

# Main execution
def run_evaluations_from_csv(input_csv, output_csv):
    """Answer and evaluate every question in *input_csv*, writing to *output_csv*.

    The output file is (re)created with its header first. For each row of the
    input CSV, the ``question`` column is passed to ``get_response``, the
    answer is scored with ``evaluation.evaluate_result``, and one row is
    appended to the output CSV. A failure on one question is printed and the
    loop moves on to the next question.

    Args:
        input_csv: Path of the test-set CSV; it must contain a ``question``
            column. The same path is also handed to the evaluator.
        output_csv: Path of the results CSV to create and append to.
    """
    # Directly create the output CSV file with headers at the beginning
    initialize_output_csv(output_csv)

    # Load questions from the CSV file
    questions_df = pd.read_csv(input_csv)
    total_questions = len(questions_df)

    for position, (_, row) in enumerate(questions_df.iterrows(), start=1):
        user_query = row['question']
        print(f"Processing question {position}/{total_questions}: {user_query}")

        try:
            # Time only the answer generation; stopping the clock after the
            # metric evaluation would inflate the reported response time.
            start_time = time.time()
            response, context, total_cost, total_tokens, completion_tokens = get_response(user_query)
            response_time = time.time() - start_time

            # get_response returns None on failure; there is nothing to score.
            if not response:
                print("No response generated; skipping evaluation for this question.\n")
                continue
            print("==========   ANSWER GENERATED  ==========")

            print("==========   EVALUATION  ==========")
            # Evaluate metrics and retrieve dataset
            metrics_results, dataset = evaluation.evaluate_result(user_query, response, context, input_csv)
            print(f"Metrics for question '{user_query}': {metrics_results}")

            if metrics_results is not None:
                evaluation_data = {
                    'Question': user_query,
                    'Response': response,
                    # First row of the evaluator's dataset holds this question
                    'Contexts': dataset["contexts"][0],
                    'Ground Truth': dataset["ground_truth"][0],
                    'Token Count': total_tokens,
                    'Total Cost (USD)': total_cost,
                    'Completion Tokens': completion_tokens,
                    'Number of Retrieved documents': len(context),
                    'Response time': response_time,
                    'answer_relevancy': metrics_results.get('answer_relevancy'),
                    'context_precision': metrics_results.get('context_precision'),
                    'context_recall': metrics_results.get('context_recall'),
                    'faithfulness': metrics_results.get('faithfulness'),
                    'BleuScore': metrics_results.get('bleu_score'),
                    'RougeScore': metrics_results.get('rouge_score'),
                }

                # Save the evaluation data to CSV
                save_evaluation_to_csv(evaluation_data, output_csv)
                print(f"Evaluation metrics saved for question '{user_query}'.")

            print("==========   PROCESS ENDED  ==========\n")

        except ValueError as ve:
            print(f"ValueError for question {position}: {ve}")
            print("Skipping to the next question...\n")

        except Exception as e:
            print(f"Unexpected error for question {position}: {e}")
            print("Skipping to the next question...\n")

        finally:
            # Clear the Chroma system cache after every question, including
            # failed or skipped ones, not only after a successful answer.
            chromadb.api.client.SharedSystemClient.clear_system_cache()


# Start the evaluation run over the configured test set and output file
run_evaluations_from_csv(input_csv=input_csv_path, output_csv=output_csv_path)

Created output file at: /Users/taha/Desktop/rag/test_data_naive/_evaluation_naive_semantic.csv
Processing question 1/10: What steps should be taken to remove the Telekom Login from the Puls-Tablet?


Evaluating: 100%|██████████| 4/4 [00:10<00:00,  2.66s/it]


Metrics for question 'What steps should be taken to remove the Telekom Login from the Puls-Tablet?': {'answer_relevancy': 0.9701190021948135, 'context_precision': 0.8333333332916666, 'context_recall': 0.0, 'faithfulness': 0.0, 'bleu_score': 0, 'rouge_score': 0.0}
Evaluation metrics saved for question 'What steps should be taken to remove the Telekom Login from the Puls-Tablet?'.

Processing question 2/10: Wie können Sie Ihre Bestellung stornieren?


Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.87s/it]


Metrics for question 'Wie können Sie Ihre Bestellung stornieren?': {'answer_relevancy': 0.8205745212554838, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.8, 'bleu_score': 0.02900697051073635, 'rouge_score': 0.1891891891891892}
Evaluation metrics saved for question 'Wie können Sie Ihre Bestellung stornieren?'.

Processing question 3/10: What is the requirement for devices to support UHD media in relation to HDCP 2.2?


Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.84s/it]


Metrics for question 'What is the requirement for devices to support UHD media in relation to HDCP 2.2?': {'answer_relevancy': 0.8766066089412939, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.25, 'bleu_score': 0.011729176379814873, 'rouge_score': 0.18666666666666665}
Evaluation metrics saved for question 'What is the requirement for devices to support UHD media in relation to HDCP 2.2?'.

Processing question 4/10: What ist die Lösung für das Einstellen von Untertiteln auf Magenta TV?


Evaluating: 100%|██████████| 4/4 [00:08<00:00,  2.10s/it]


Metrics for question 'What ist die Lösung für das Einstellen von Untertiteln auf Magenta TV?': {'answer_relevancy': 0.8935203316684572, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.6666666666666666, 'bleu_score': 0.023943187829134, 'rouge_score': 0.3063063063063063}
Evaluation metrics saved for question 'What ist die Lösung für das Einstellen von Untertiteln auf Magenta TV?'.

Processing question 5/10: What muss ich tun, wenn die Registrierung der Rufnummer nicht funktioniert?


Evaluating: 100%|██████████| 4/4 [00:09<00:00,  2.50s/it]


Metrics for question 'What muss ich tun, wenn die Registrierung der Rufnummer nicht funktioniert?': {'answer_relevancy': 0.9018336460205569, 'context_precision': 0.0, 'context_recall': 1.0, 'faithfulness': 0.8571428571428571, 'bleu_score': 0.002425393177607981, 'rouge_score': 0.046153846153846156}
Evaluation metrics saved for question 'What muss ich tun, wenn die Registrierung der Rufnummer nicht funktioniert?'.

Processing question 6/10: Wie aktiviere ich das Telekom Sicherheitspaket?


Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.80s/it]


Metrics for question 'Wie aktiviere ich das Telekom Sicherheitspaket?': {'answer_relevancy': 0.0, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.3333333333333333, 'bleu_score': 0.05615408504546591, 'rouge_score': 0.28148148148148144}
Evaluation metrics saved for question 'Wie aktiviere ich das Telekom Sicherheitspaket?'.

Processing question 7/10: Who to contact for order cancellation within the Widerrufsfrist?


Evaluating: 100%|██████████| 4/4 [00:08<00:00,  2.05s/it]


Metrics for question 'Who to contact for order cancellation within the Widerrufsfrist?': {'answer_relevancy': 0.912176632632328, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.3333333333333333, 'bleu_score': 0.3589485409071449, 'rouge_score': 0.5000000000000001}
Evaluation metrics saved for question 'Who to contact for order cancellation within the Widerrufsfrist?'.

Processing question 8/10: How to fix the thermostat display in the MagentaZuhause App if it's upside down?


Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.79s/it]


Metrics for question 'How to fix the thermostat display in the MagentaZuhause App if it's upside down?': {'answer_relevancy': 0.9497981553251109, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.8571428571428571, 'bleu_score': 0.005846553820644135, 'rouge_score': 0.1111111111111111}
Evaluation metrics saved for question 'How to fix the thermostat display in the MagentaZuhause App if it's upside down?'.

Processing question 9/10: How to adjust subtitle quality on Magenta TV?


Evaluating: 100%|██████████| 4/4 [00:08<00:00,  2.23s/it]


Metrics for question 'How to adjust subtitle quality on Magenta TV?': {'answer_relevancy': 0.9940142836248823, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.3333333333333333, 'bleu_score': 0.008719780366845377, 'rouge_score': 0.198019801980198}
Evaluation metrics saved for question 'How to adjust subtitle quality on Magenta TV?'.

Processing question 10/10: What's the first step for the Telekom security check after Google or Apple login in the GÖNN app?


Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.80s/it]


Metrics for question 'What's the first step for the Telekom security check after Google or Apple login in the GÖNN app?': {'answer_relevancy': 0.951597088888693, 'context_precision': 0.999999999975, 'context_recall': 0.0, 'faithfulness': 0.6666666666666666, 'bleu_score': 0.13713283832022413, 'rouge_score': 0.47826086956521735}
Evaluation metrics saved for question 'What's the first step for the Telekom security check after Google or Apple login in the GÖNN app?'.



#### Results to CSV file

In [None]:
import pandas as pd
import os

# Per-question evaluation results to summarise
file_path = '/Users/taha/Desktop/rag/test_data/_evaluation_naive_semantic.csv'  # Replace with the path to your CSV file

df = pd.read_csv(file_path)

# Format spec applied to the mean of each numeric column.
# An empty spec leaves the value in its default string form.
column_formats = {
    'Token Count': '.0f',                 # no decimal places
    'Total Cost (USD)': '.5f',            # five decimal places
    'Completion Tokens': '.0f',           # no decimal places
    'Number of Retrieved documents': '',  # unformatted
    'Response time': '.2f',               # two decimal places
    'answer_relevancy': '.4f',            # four decimal places
    'context_precision': '.4f',
    'context_recall': '.4f',
    'faithfulness': '.4f',
    'BleuScore': '.4f',
    'RougeScore': '.4f',
}

# Columns to average, in the order they appear in the summary
numeric_columns = list(column_formats)

# Mean of every numeric column across all evaluated questions
averages = df[numeric_columns].mean()

# Render each mean as a string using its column's format spec
formatted_averages = {
    column: format(averages[column], spec)
    for column, spec in column_formats.items()
}

# One-row DataFrame so the summary can be written as CSV
formatted_averages_df = pd.DataFrame([formatted_averages])

# The summary is written next to the input file, with "_results_" prepended
# to the input file's name
output_file_path = os.path.join(
    os.path.dirname(file_path),
    f"_results_{os.path.basename(file_path)}"
)

# Save the formatted averages to CSV
formatted_averages_df.to_csv(output_file_path, index=False)
print(f"[INFO] Formatted averages saved to {output_file_path}")

[INFO] Formatted averages saved to /Users/taha/Desktop/rag/test_data_naive/_results__evaluation_naive_semantic.csv
