In [None]:
# Mount Google Drive so the files stored under /content/drive are reachable from this runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import sys

# Make the project's source folder on Drive importable.
SRC_DIR = '/content/drive/MyDrive/3. MscIS 22-24/Thesis/final code files/src'
# Guard the append so that re-running this cell does not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
# Install necessary packages from requirements.txt
req_path = '/content/drive/MyDrive/3. MscIS 22-24/Thesis/final code files/requirements.txt'
with open(req_path, 'r') as file:
    requirements = file.read().splitlines()

for package in requirements:
    !pip install {package}

In [None]:
from ragpipelines import *

# Load configuration from config.yaml
import yaml
config_path = '/content/drive/MyDrive/3. MscIS 22-24/Thesis/final code files/config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

!pip install trulens_eval==0.33.0

In [None]:
# Import libraries for evaluation
import pandas as pd
import numpy as np
from trulens_eval.feedback import GroundTruthAgreement
from trulens_eval import Tru, OpenAI, Feedback, TruLlama

In [None]:
pip install matplotlib

In [None]:
import os

# Publish the key from config.yaml as the OPENAI_API_KEY environment variable
openai_api_key = config['openai_api_key']
os.environ['OPENAI_API_KEY'] = openai_api_key

In [None]:
# Create the TruLens session and the OpenAI provider whose methods back the feedback functions
JUDGE_MODEL = "gpt-4o-mini"

tru = Tru()
openai_provider = OpenAI(model_engine=JUDGE_MODEL)

In [None]:
# Reset the TruLens database before recording the runs below.
# NOTE(review): presumably this discards records from earlier sessions — confirm
# before running it against results you still need.
tru.reset_database()

In [None]:
# Read the evaluation spreadsheet and turn it into query/response pairs
eval_path = '/content/drive/MyDrive/3. MscIS 22-24/Thesis/final code files/eval_questions.xlsx'
qa_df = pd.read_excel(eval_path, sheet_name="Sheet1")
# One {"query", "response"} dict per spreadsheet row, in sheet order
qa_set = [
    {"query": question, "response": answer}
    for question, answer in zip(qa_df["Question"], qa_df["Answer"])
]

In [None]:
# Define the evaluation metrics
# Answer relevance: judged on the recorded input together with the recorded output
f_qa_relevance = (
    Feedback(openai_provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

In [None]:
# Context relevance: score each retrieved chunk against the question, then average.
# This uses the provider's dedicated context-relevance judge. `relevance_with_cot_reasons`
# is the prompt/response judge (already used for "Answer Relevance"), so it does not
# measure retrieval quality. Scores recorded with the previous judge are not
# comparable with scores produced by this one.
f_qs_relevance = (
    Feedback(openai_provider.context_relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

In [None]:
# Groundedness (sometimes called faithfulness)
# The answer is checked against ALL retrieved chunks at once: `.collect()` gathers the
# selected node texts into a single list argument. Without it the answer is judged
# against each chunk in isolation and the per-chunk scores are combined, which
# penalises answers that legitimately draw on several chunks.
f_groundedness = (
    Feedback(openai_provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(TruLlama.select_source_nodes().node.text.collect())
    .on_output()
)

In [None]:
# Ground truth agreement (sometimes called answer correctness):
# compares against the reference answers held in qa_set
ground_truth = GroundTruthAgreement(qa_set)
f_groundtruth = (
    Feedback(ground_truth.agreement_measure, name="Answer Correctness")
    .on_input_output()
)

In [None]:
# Feedback functions handed to every recorder created below
metrics = [
    f_qa_relevance,
    f_qs_relevance,
    f_groundedness,
    f_groundtruth,
]

In [None]:
# Factory for TruLens recorders
def get_trulens_recorder(query_engine, app_id):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder registered under ``app_id``.

    The recorder is given the module-level ``metrics`` list as its feedbacks.
    """
    recorder = TruLlama(query_engine, feedbacks=metrics, app_id=app_id)
    return recorder

In [None]:
# Base class for RAG pipelines evaluation
class RAGEvaluator:
    """Builds a pipeline, wraps its query engine in a recorder and replays questions through it."""

    def __init__(self, pipeline_class, app_id, api_key, document_path, llm_model, embed_model):
        """Instantiate ``pipeline_class`` and attach a recorder to its ``query_engine``.

        The four pipeline arguments are forwarded positionally, in the order
        ``api_key, document_path, llm_model, embed_model``.
        """
        pipeline = pipeline_class(api_key, document_path, llm_model, embed_model)
        engine = pipeline.query_engine
        self.pipeline = pipeline
        self.query_engine = engine
        self.recorder = get_trulens_recorder(engine, app_id=app_id)

    def evaluate(self, qa_set):
        """Send the ``"query"`` entry of every item in ``qa_set`` to the query engine while recording."""
        questions = (entry["query"] for entry in qa_set)
        with self.recorder:
            for question in questions:
                self.query_engine.query(question)

In [None]:
# Arguments shared by the pipeline constructors
document_path = '/content/drive/MyDrive/3. MscIS 22-24/Thesis/final code files/privacy-pmi.pdf'
api_key, llm_model, embed_model = (
    config[key] for key in ('openai_api_key', 'llm_model', 'embed_model')
)

In [None]:
# This function evaluates a given RAG pipeline by using the TruLens framework
def evaluate_pipeline(pipeline, qa_set, app_id, tru, metrics):
    """Run every question in ``qa_set`` through ``pipeline`` under a TruLens recorder.

    Args:
        pipeline: object exposing ``query_engine`` (the app that gets instrumented)
            and ``query(text)`` (used to ask each question).
        qa_set: iterable of dicts; only the ``"query"`` entry is read here.
        app_id: identifier the records are stored under and read back with.
        tru: TruLens session used to fetch the results.
        metrics: feedback functions attached to the recorder.

    Returns:
        The ``(records, feedback)`` pair from ``tru.get_records_and_feedback``,
        restricted to ``app_id``.
    """
    # Initialize TruLens recorder for the pipeline
    recorder = TruLlama(
        app=pipeline.query_engine,  # Assuming the query_engine is the app here
        feedbacks=metrics,
        app_id=app_id
    )

    # NOTE(review): this only records if pipeline.query() goes through
    # pipeline.query_engine — confirm in ragpipelines.
    with recorder as recording:
        for q in qa_set:
            pipeline.query(q["query"])

    # Feedback functions are evaluated after each call returns; wait for them so the
    # table fetched below is not missing scores that are still being computed.
    for record in recording.records:
        record.wait_for_feedback_results()

    # Retrieve and return the records and feedback
    records, feedback = tru.get_records_and_feedback(app_ids=[app_id])
    return records, feedback

In [None]:
# Build the baseline pipeline (each pipeline is constructed in one cell and evaluated in the next)

basic_pipeline = BasicRAGPipeline(
    api_key=config['openai_api_key'],
    document_path=document_path,
    llm_model=config['llm_model'],
    embed_model=config['embed_model'],
)

In [None]:
# Record and score the baseline pipeline; the returned (records, feedback) pair is the cell output
evaluate_pipeline(
    pipeline=basic_pipeline,
    qa_set=qa_set,
    app_id="0. Basic Query Engine",
    tru=tru,
    metrics=metrics,
)

In [None]:
# Sentence-window pipeline with re-ranking switched off
sentence_window_rag_without_rerank = SentenceWindowRAGPipeline(
    api_key=config['openai_api_key'],
    document_path=document_path,
    llm_model=config['llm_model'],
    embed_model=config['embed_model'],
    rerank=False,
)

In [None]:
# Record and score the sentence-window pipeline without re-ranking
evaluate_pipeline(
    pipeline=sentence_window_rag_without_rerank,
    qa_set=qa_set,
    app_id="1. Sentence Window Query Engine - W/o Reranking",
    tru=tru,
    metrics=metrics,
)

In [None]:
# Sentence-window pipeline with re-ranking switched on
sentence_window_rag_with_rerank = SentenceWindowRAGPipeline(
    api_key=config['openai_api_key'],
    document_path=document_path,
    llm_model=config['llm_model'],
    embed_model=config['embed_model'],
    rerank=True,
)

In [None]:
# Record and score the sentence-window pipeline with re-ranking
evaluate_pipeline(
    pipeline=sentence_window_rag_with_rerank,
    qa_set=qa_set,
    app_id="2. Sentence Window Query Engine - w/ re-ranking",
    tru=tru,
    metrics=metrics,
)

In [None]:
# Auto-merging pipeline
automerging_pipeline = AutoMergingRAGPipeline(
    api_key=config['openai_api_key'],
    document_path=document_path,
    llm_model=config['llm_model'],
    embed_model=config['embed_model'],
)

In [None]:
# Record and score the auto-merging pipeline
evaluate_pipeline(
    pipeline=automerging_pipeline,
    qa_set=qa_set,
    app_id="3. Automerging Query Engine - w/ re-ranking",
    tru=tru,
    metrics=metrics,
)

In [None]:
# Auto-merging pipeline, prompt-engineered variant
automerging_prompting_kwargs = {
    "api_key": config['openai_api_key'],
    "document_path": document_path,
    "llm_model": config['llm_model'],
    "embed_model": config['embed_model'],
}
automerging_pipeline_with_prompting = AutoMergingRAGPipelineWithPrompting(**automerging_prompting_kwargs)

In [None]:
# Record and score the prompt-engineered auto-merging pipeline
evaluate_pipeline(
    pipeline=automerging_pipeline_with_prompting,
    qa_set=qa_set,
    app_id="4. Auto-Merging Query Engine - w/ re-ranking and prompt engineering",
    tru=tru,
    metrics=metrics,
)

In [None]:
# launches on http://localhost:8501/
# NOTE(review): on a hosted runtime such as Colab, "localhost" refers to the remote VM,
# so the dashboard presumably has to be opened via whatever URL this call prints — confirm.
tru.run_dashboard()

In [None]:
#tru.stop_dashboard()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# RADAR CHART

# Pipelines being compared: one polygon and one legend entry each
pipelines = [
    "Basic Query Engine",
    "Sentence Window Query Engine - W/o Reranking",
    "Sentence Window Query Engine - w/ Re-ranking",
    "Auto-merging Query Engine - w/ Re-ranking",
    "Auto-merging Query Engine - w/ Re-ranking and Prompt Engineering"
]

# Metric axes, and one row of scores per pipeline in the same column order
categories = ['Answer Correctness', 'Context Relevance', 'Groundedness', 'Answer Relevance']
values = [
    [0.78, 0.54, 0.51, 0.89],
    [0.83, 0.46, 0.28, 0.83],
    [0.43, 0.31, 0.55, 0.66],
    [0.98, 0.56, 0.48, 0.81],
    [0.98, 0.57, 0.40, 0.82]
]

# One evenly spaced spoke per metric
N = len(categories)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
# Repeat the first point at the end so every outline closes on itself
angles.append(angles[0])
values = [[*scores, scores[0]] for scores in values]

fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={"polar": True})

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

# Translucent fill plus a solid outline for each pipeline
for pipeline, scores, color in zip(pipelines, values, colors):
    ax.fill(angles, scores, color=color, alpha=0.3)
    ax.plot(angles, scores, color=color, linewidth=2, label=pipeline)

# Label the spokes (the duplicated closing angle is skipped) and hide the radial tick labels
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=12, fontweight='bold', color='navy')
ax.grid(color='gray', linestyle='--', linewidth=0.5)
ax.set_yticklabels([])

ax.set_title("Pipeline Performance Comparison", size=18, color='darkblue', fontweight='bold', pad=20)

# Legend placed outside the plot area, to the right
ax.legend(loc='upper left', bbox_to_anchor=(1.1, 1.05), fontsize=10, frameon=False)

ax.spines['polar'].set_visible(False)
ax.set_facecolor('#f9f9f9')

fig.tight_layout()
plt.show()

In [None]:
# HEATMAP

import matplotlib.pyplot as plt
import numpy as np  # used for `values` below; imported here so the cell does not rely on an earlier one
import seaborn as sns

# Data for the pipelines
pipelines = [
    "Basic Query Engine",
    "Sentence Window Query Engine - W/o Reranking",
    "Sentence Window Query Engine - w/ Re-ranking",
    "Auto-merging Query Engine - w/ Re-ranking",
    "Auto-merging Query Engine - w/ Re-ranking and Prompt Engineering"
]

# Column labels. Deliberately NOT named `metrics`: that global holds the TruLens
# feedback functions, and rebinding it to strings here would make any evaluation
# cell re-run after this one pass plain strings as feedbacks.
metric_names = ['Answer Correctness', 'Context Relevance', 'Groundedness', 'Answer Relevance']
values = np.array([
    [0.78, 0.54, 0.51, 0.89],
    [0.83, 0.46, 0.28, 0.83],
    [0.43, 0.31, 0.55, 0.66],
    [0.98, 0.56, 0.48, 0.81],
    [0.98, 0.57, 0.40, 0.82]
])

# Explicit figure/axes so seaborn draws onto a known Axes instead of pyplot's current one
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    values,
    annot=True,
    fmt=".2f",  # show every score with two decimals (0.40 rather than 0.4)
    cmap="coolwarm",
    xticklabels=metric_names,
    yticklabels=pipelines,
    ax=ax,
)
ax.set_title("Heatmap: Pipeline Performance Across Metrics", size=20)
plt.show()