# Evaluation - Naive RAG

### Naive RAG - Semantic Search - Character Splitting - Recursive Character Text Splitting

In [4]:
# Import necessary libraries
import os
import time
import pandas as pd
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma
import prompts as prompts
import initials as initials
import evaluation
import chromadb


# Directory that holds the source .txt documents and the evaluation CSV files
test_directory = '/Users/taha/Desktop/rag/test_data_naive'

# Input CSV with the test set (must provide a 'question' column)
input_csv_path = os.path.join(test_directory, '_testset_semantic_gradient.csv')

# Output CSV (including the filename) that receives one row per evaluated question.
# Anchored to test_directory instead of the current working directory so the
# file always lands next to the input data, regardless of where the notebook
# kernel was started.
output_csv_path = os.path.join(test_directory, '_evaluation_naive_semantic_gradient.csv')

# Function to create the output CSV file at the beginning
def initialize_output_csv(output_path):
    """Create (or overwrite) the output CSV and write its header row.

    Args:
        output_path: Path of the CSV file to create. Missing parent
            directories are created first.
    """
    columns = [
        "Question", "Response", "Contexts", "Ground Truth",
        "Token Count", "Total Cost (USD)", "Completion Tokens",
        "Number of Retrieved documents", "Response time",
        "answer_relevancy", "context_precision", "context_recall",
        "faithfulness", "BleuScore", "RougeScore",
    ]

    # open() does not create intermediate directories; do it explicitly.
    # dirname is '' for a bare filename, which makedirs would reject.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so the header matches the encoding pandas uses by default
    # when rows are appended later, independent of the platform locale.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(",".join(columns) + "\n")
    print(f"Created output file at: {output_path}")

# Function to get response with error handling
def get_response(user_input):
    """Answer ``user_input`` with a naive RAG pipeline over the test documents.

    Every ``*.txt`` file in ``test_directory`` is read, split with
    ``SemanticChunker`` (``breakpoint_threshold_type="gradient"``), embedded
    into a Chroma vector store and queried with ``user_input``. The retrieved
    documents are then passed to the chain
    ``prompts.prompt_telekom | initials.model | StrOutputParser()`` while an
    OpenAI callback records token usage and cost.

    NOTE(review): the documents are re-read, re-chunked and re-embedded on
    every call, so each question pays the full indexing cost.

    Args:
        user_input: The question to answer.

    Returns:
        A 5-tuple ``(response, retrieved_docs, total_cost, total_tokens,
        completion_tokens)``. ``response`` is the fixed string
        ``"No relevant documents found."`` when nothing was retrieved.
        All five elements are ``None`` if any step fails; the error is
        printed rather than raised.
    """
    try:
        # List the source files; sorted because glob's order is arbitrary and
        # the chunk order should be reproducible between runs.
        all_txt_files = sorted(glob.glob(os.path.join(test_directory, "*.txt")))
        if not all_txt_files:
            # glob returns an empty list for a missing or empty directory
            # instead of raising, so surface the problem explicitly.
            raise FileNotFoundError(f"No .txt files found in {test_directory}")

        # Read the content of every file
        all_texts = []
        for file_path in all_txt_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_texts.append(f.read())

        # Semantic splitting
        text_splitter_semantic = SemanticChunker(embeddings=initials.embedding, breakpoint_threshold_type="gradient")

        chunks = text_splitter_semantic.create_documents(all_texts)
        print("==========   CHUNKS CREATED  ==========")

        # Embed the chunks and build the vector store
        vectorstore = Chroma.from_documents(documents=chunks, embedding=initials.embedding)
        print("==========   VECTORSTORE CREATED  ==========")
        retriever = vectorstore.as_retriever()
        retrieved_docs = retriever.invoke(user_input)

        # Build the prompt -> model -> string-output chain
        rag_chain = (prompts.prompt_telekom | initials.model | StrOutputParser())

        # Track cost and token usage with the OpenAI callback
        with get_openai_callback() as cb:
            if retrieved_docs:
                response = rag_chain.invoke({
                    "context": retrieved_docs,
                    "question": user_input,
                    "chat_history": []
                })
            else:
                response = "No relevant documents found."

        total_tokens = cb.total_tokens
        total_cost = cb.total_cost
        completion_tokens = cb.completion_tokens

        return response, retrieved_docs, total_cost, total_tokens, completion_tokens

    except FileNotFoundError:
        print("Documents could not be loaded. Please check the data directory path.")
        return None, None, None, None, None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, None, None, None


# Append a single evaluation record to the results CSV
def save_evaluation_to_csv(evaluation_data, filename):
    """Append ``evaluation_data`` as one row to the CSV file ``filename``.

    The dict's values are written in insertion order; neither a header line
    nor an index column is emitted.
    """
    single_row = pd.DataFrame(data=[evaluation_data])
    single_row.to_csv(
        filename,
        mode='a',
        header=False,
        index=False,
    )

# Main execution
def run_evaluations_from_csv(input_csv, output_csv):
    """Answer and score every question in ``input_csv``, appending rows to ``output_csv``.

    For each row, the ``question`` column is passed to ``get_response``. If a
    response comes back, it is scored with ``evaluation.evaluate_result`` and
    one row is appended to ``output_csv``. The row holds the answer, the
    contexts, the ground truth, usage figures and the metric values.
    Questions for which no response could be generated are skipped without
    being evaluated. Errors for a single question are printed and do not stop
    the run.

    Args:
        input_csv: Path of the CSV file containing a ``question`` column. It
            is also forwarded to ``evaluation.evaluate_result``.
        output_csv: Path of the CSV file to (re)create and append results to.
    """
    # Create the output CSV file with headers before any row is appended
    initialize_output_csv(output_csv)

    # Load questions from the CSV file
    questions_df = pd.read_csv(input_csv)
    total_questions = len(questions_df)

    for index, row in questions_df.iterrows():
        user_query = row['question']
        print(f"Processing question {index + 1}/{total_questions}: {user_query}")

        try:
            # Time only the call that produces the answer, so the recorded
            # response time does not include the metric evaluation below.
            start_time = time.time()
            response, context, total_cost, total_tokens, completion_tokens = get_response(user_query)
            response_time = time.time() - start_time

            try:
                if not response:
                    # get_response reports failure with None values; there is
                    # nothing meaningful to evaluate in that case.
                    print(f"No response generated for question {index + 1}; skipping evaluation.\n")
                    continue
                print("==========   ANSWER GENERATED  ==========")

                print("==========   EVALUATION  ==========")
                # Evaluate metrics and retrieve dataset
                metrics_results, dataset = evaluation.evaluate_result(user_query, response, context, input_csv)
                print(f"Metrics for question '{user_query}': {metrics_results}")

                if metrics_results is not None:
                    # Contexts and ground truth come from the first row of the evaluated dataset
                    contexts = dataset["contexts"][0]
                    ground_truth = dataset["ground_truth"][0]

                    evaluation_data = {
                        'Question': user_query,
                        'Response': response,
                        'Contexts': contexts,
                        'Ground Truth': ground_truth,
                        'Token Count': total_tokens,
                        'Total Cost (USD)': total_cost,
                        'Completion Tokens': completion_tokens,
                        'Number of Retrieved documents': len(context),
                        'Response time': response_time,
                        'answer_relevancy': metrics_results.get('answer_relevancy'),
                        'context_precision': metrics_results.get('context_precision'),
                        'context_recall': metrics_results.get('context_recall'),
                        'faithfulness': metrics_results.get('faithfulness'),
                        'BleuScore': metrics_results.get('bleu_score'),
                        'RougeScore': metrics_results.get('rouge_score'),
                    }

                    # Save the evaluation data to CSV
                    save_evaluation_to_csv(evaluation_data, output_csv)
                    print(f"Evaluation metrics saved for question '{user_query}'.")
            finally:
                # Clear the Chroma system cache after every question, including
                # failed or skipped ones.
                # NOTE(review): presumably this keeps chunks from one question's
                # vector store from leaking into the next - confirm.
                chromadb.api.client.SharedSystemClient.clear_system_cache()

            print("==========   PROCESS ENDED  ==========\n")

        except ValueError as ve:
            print(f"ValueError for question {index + 1}: {ve}")
            print("Skipping to the next question...\n")

        except Exception as e:
            print(f"Unexpected error for question {index + 1}: {e}")
            print("Skipping to the next question...\n")

# Start the evaluation run with the configured input and output CSV paths
run_evaluations_from_csv(input_csv=input_csv_path, output_csv=output_csv_path)

Created output file at: test_data_naive/_evaluation_naive_semantic_gradient.csv
Processing question 1/10: Was sind die Vorteile der Nutzung der MeinMagenta App für GÖNN Kunden?


Evaluating: 100%|██████████| 4/4 [00:23<00:00,  5.77s/it]


Metrics for question 'Was sind die Vorteile der Nutzung der MeinMagenta App für GÖNN Kunden?': {'answer_relevancy': 0.9553970603390374, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.7333333333333333, 'bleu_score': 0.03364007721566668, 'rouge_score': 0.2634146341463415}
Evaluation metrics saved for question 'Was sind die Vorteile der Nutzung der MeinMagenta App für GÖNN Kunden?'.

Processing question 2/10: Wie kann ich einen AV-Receiver mit meinem TV-Receiver und Fernseher verbinden?


Evaluating: 100%|██████████| 4/4 [00:12<00:00,  3.18s/it]


Metrics for question 'Wie kann ich einen AV-Receiver mit meinem TV-Receiver und Fernseher verbinden?': {'answer_relevancy': 0.9817467081303711, 'context_precision': 0.0, 'context_recall': 1.0, 'faithfulness': 0.5714285714285714, 'bleu_score': 0.002939806889647791, 'rouge_score': 0.045454545454545456}
Evaluation metrics saved for question 'Wie kann ich einen AV-Receiver mit meinem TV-Receiver und Fernseher verbinden?'.

Processing question 3/10: Welche Funktionen bietet das Kundencenter für die Verwaltung von TV-Paketen?


Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.77s/it]


Metrics for question 'Welche Funktionen bietet das Kundencenter für die Verwaltung von TV-Paketen?': {'answer_relevancy': 0.982257999685125, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.8, 'bleu_score': 0.005488673667067168, 'rouge_score': 0.12}
Evaluation metrics saved for question 'Welche Funktionen bietet das Kundencenter für die Verwaltung von TV-Paketen?'.

Processing question 4/10: What can you do in the MeinMagenta App by clicking on 'Moments'?


Evaluating: 100%|██████████| 4/4 [00:05<00:00,  1.40s/it]


Metrics for question 'What can you do in the MeinMagenta App by clicking on 'Moments'?': {'answer_relevancy': 0.8852915843309234, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.6666666666666666, 'bleu_score': 0.34850182406276775, 'rouge_score': 0.5833333333333334}
Evaluation metrics saved for question 'What can you do in the MeinMagenta App by clicking on 'Moments'?'.

Processing question 5/10: What services can you access through the Telekom Kundencenter?


Evaluating: 100%|██████████| 4/4 [00:09<00:00,  2.31s/it]


Metrics for question 'What services can you access through the Telekom Kundencenter?': {'answer_relevancy': 0.9816277796586625, 'context_precision': 0.0, 'context_recall': 1.0, 'faithfulness': 0.14285714285714285, 'bleu_score': 0.0039094870602888725, 'rouge_score': 0.030769230769230767}
Evaluation metrics saved for question 'What services can you access through the Telekom Kundencenter?'.

Processing question 6/10: Was beeinflusst die Wahl Ihres DSL-/VDSL-Tarifs bzgl. Internetgeschwindigkeit?


Evaluating: 100%|██████████| 4/4 [00:10<00:00,  2.64s/it]


Metrics for question 'Was beeinflusst die Wahl Ihres DSL-/VDSL-Tarifs bzgl. Internetgeschwindigkeit?': {'answer_relevancy': 0.9873434713442647, 'context_precision': 0.999999999975, 'context_recall': 1.0, 'faithfulness': 0.6666666666666666, 'bleu_score': 0.08070632004040003, 'rouge_score': 0.38383838383838387}
Evaluation metrics saved for question 'Was beeinflusst die Wahl Ihres DSL-/VDSL-Tarifs bzgl. Internetgeschwindigkeit?'.

Processing question 7/10: What does the Verifizierungs-Link ask Android users for app install?


Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Metrics for question 'What does the Verifizierungs-Link ask Android users for app install?': {'answer_relevancy': 0.0, 'context_precision': 0.0, 'context_recall': 0.0, 'faithfulness': 0.0, 'bleu_score': 0, 'rouge_score': 0.0}
Evaluation metrics saved for question 'What does the Verifizierungs-Link ask Android users for app install?'.

Processing question 8/10: Wie wird WLAN Call in Dtl. abgerechnet und welches Symbol zeigt die Nutzung?


Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.97s/it]


Metrics for question 'Wie wird WLAN Call in Dtl. abgerechnet und welches Symbol zeigt die Nutzung?': {'answer_relevancy': 0.9383452616899589, 'context_precision': 0.999999999975, 'context_recall': 0.5, 'faithfulness': 0.16666666666666666, 'bleu_score': 0.05712161481567817, 'rouge_score': 0.2777777777777778}
Evaluation metrics saved for question 'Wie wird WLAN Call in Dtl. abgerechnet und welches Symbol zeigt die Nutzung?'.

Processing question 9/10: How important is a unique password for Disney Plus registration?


Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.87s/it]


Metrics for question 'How important is a unique password for Disney Plus registration?': {'answer_relevancy': 0.9657617080555628, 'context_precision': 0.0, 'context_recall': 0.0, 'faithfulness': 0.0, 'bleu_score': 0, 'rouge_score': 0.0}
Evaluation metrics saved for question 'How important is a unique password for Disney Plus registration?'.

Processing question 10/10: Welche Einschränkungen gibt's bei Speedport-Kaskaden?


Evaluating: 100%|██████████| 4/4 [00:11<00:00,  2.83s/it]


Metrics for question 'Welche Einschränkungen gibt's bei Speedport-Kaskaden?': {'answer_relevancy': 0.9626551041933255, 'context_precision': 0.6388888888675925, 'context_recall': 1.0, 'faithfulness': 0.0, 'bleu_score': 0.0038690034505641548, 'rouge_score': 0.09259259259259259}
Evaluation metrics saved for question 'Welche Einschränkungen gibt's bei Speedport-Kaskaden?'.



#### Results to CSV file

In [5]:
import pandas as pd
import os

# Read the per-question evaluation CSV
file_path = '/Users/taha/Desktop/rag/test_data_naive/_evaluation_naive_semantic_gradient.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Column name -> format spec applied to that column's mean.
# An empty spec keeps the value's default string form.
column_formats = {
    'Token Count': '.0f',                 # whole number
    'Total Cost (USD)': '.5f',            # five decimal places
    'Completion Tokens': '.0f',           # whole number
    'Number of Retrieved documents': '',  # unformatted
    'Response time': '.2f',               # two decimal places
    'answer_relevancy': '.4f',            # four decimal places
    'context_precision': '.4f',
    'context_recall': '.4f',
    'faithfulness': '.4f',
    'BleuScore': '.4f',
    'RougeScore': '.4f',
}

# Columns whose mean is reported, in output order
numeric_columns = list(column_formats)

# Mean of each numeric column across all evaluated questions
averages = df[numeric_columns].mean()

# Render every mean as a string using its column's format spec
formatted_averages = {
    column: format(averages[column], spec)
    for column, spec in column_formats.items()
}

# Single-row DataFrame holding the formatted averages
formatted_averages_df = pd.DataFrame([formatted_averages])

# The summary is written next to the input file, with "_results_" prepended to its name
folder, base_name = os.path.split(file_path)
output_file_path = os.path.join(folder, f"_results_{base_name}")

# Save the formatted averages to CSV
formatted_averages_df.to_csv(output_file_path, index=False)
print(f"[INFO] Formatted averages saved to {output_file_path}")

[INFO] Formatted averages saved to /Users/taha/Desktop/rag/test_data_naive/_results__evaluation_naive_semantic_gradient.csv
