In [None]:
import os
from typing import List, Dict
from dotenv import load_dotenv

from langfuse import Langfuse
import pandas as pd
import tiktoken

from config.base_config import rag_config

In [None]:
# Load variables from a .env file (if any) before the lookups below read the
# process environment.
load_dotenv()

# Langfuse credentials; each is None when the variable is not set.
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
# Address of the Langfuse server this notebook talks to.
LANGFUSE_HOST = "http://localhost:3000"

In [None]:
# Client used by the cells below to fetch traces and observations.
langfuse = Langfuse(
    host=LANGFUSE_HOST,
    public_key=LANGFUSE_PUBLIC_KEY,
    secret_key=LANGFUSE_SECRET_KEY,
)

In [None]:
# Default tokenizer. A later cell rebinds `tokenizer` to the encoding chosen
# for the configured model, so this value is only a placeholder.
tokenizer = tiktoken.get_encoding("o200k_base")

In [None]:
# Per-model token prices, keyed by model name, with separate "input" and
# "output" rates. get_cost divides these values by 1_000_000, so every number
# is a price per one million tokens.
# NOTE(review): the currency (presumably USD) and the date of this price list
# are not recorded here — confirm against the provider's current pricing.
# NOTE(review): "gpt-4-turbo-preview", "gpt-4-0613", "gpt-4-0314" and
# "gpt-3.5-turbo" are accepted by the encoding selection further down but have
# no row here, so get_cost raises KeyError for them.
pricing = {
    "gpt-4o": {
        "input": 5,
        "output": 15
    },
    "gpt-4o-2024-08-06": {
        "input": 2.5,
        "output": 10
    },
    "gpt-4o-2024-05-13": {
        "input": 5,
        "output": 15
    },
    "gpt-4o-mini": {
        "input": 0.15,
        "output": 0.6
    },
    "gpt-4o-mini-2024-07-18": {
        "input": 0.15,
        "output": 0.6
    },
    "chatgpt-4o-latest": {
        "input": 5.00,
        "output": 15.00
    },
    "gpt-4-turbo": {
        "input": 10.00,
        "output": 30.00
    },
    "gpt-4-turbo-2024-04-09": {
        "input": 10.00,
        "output": 30.00
    },
    "gpt-4": {
        "input": 30.00,
        "output": 60.00
    },
    "gpt-4-32k": {
        "input": 60.00,
        "output": 120.00
    },
    "gpt-4-0125-preview": {
        "input": 10.00,
        "output": 30.00
    },
    "gpt-4-1106-preview": {
        "input": 10.00,
        "output": 30.00
    },
    "gpt-4-vision-preview": {
        "input": 10.00,
        "output": 30.00
    },
    "gpt-3.5-turbo-0125": {
        "input": 0.50,
        "output": 1.50
    },
    "gpt-3.5-turbo-instruct": {
        "input": 1.50,
        "output": 2.00
    },
    "gpt-3.5-turbo-1106": {
        "input": 1.00,
        "output": 2.00
    },
    "gpt-3.5-turbo-0613": {
        "input": 1.50,
        "output": 2.00
    },
    "gpt-3.5-turbo-16k-0613": {
        "input": 3.00,
        "output": 4.00
    },
    "gpt-3.5-turbo-0301": {
        "input": 1.50,
        "output": 2.00
    }
 }

In [None]:
# Model name configured for the RAG pipeline. It selects the tokenizer
# encoding here and is also passed to get_cost to pick the price row.
model = rag_config["llm"]["model"]

if model in ["gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-2024-08-06", "chatgpt-4o-latest", "gpt-4o-mini", "gpt-4o-mini-2024-07-18"]:
    encoding = "o200k_base"
elif model in ["gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4-turbo-preview", "gpt-4-0125-preview", "gpt-4-1106-preview", "gpt-4",
               "gpt-4-0613", "gpt-4-0314", "gpt-3.5-turbo-0125", "gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-instruct"]:
    encoding = "cl100k_base"
else:
    # A model in neither list used to leave `encoding` undefined, which made
    # the next statement fail with a NameError. Ask tiktoken's own model
    # registry instead; it raises KeyError for names it does not know.
    encoding = tiktoken.encoding_for_model(model).name

tokenizer = tiktoken.get_encoding(encoding)

def get_cost(tokenizer, input: str, output: str, pricing: Dict, model: str) -> float:
    """
    Estimate the cost of one request from its prompt and completion text.

    :param tokenizer: Object with an ``encode(text)`` method returning a sequence of tokens
    :param input: Prompt text; its token count is billed at the model's "input" rate
    :param output: Completion text; its token count is billed at the model's "output" rate
    :param pricing: Mapping ``model -> {"input": rate, "output": rate}`` with rates per one million tokens
    :param model: Key into ``pricing``
    :return: Input cost plus output cost
    :raises KeyError: If ``model`` has no entry in ``pricing``
    """
    n_input_toks = len(tokenizer.encode(input))
    n_output_toks = len(tokenizer.encode(output))

    input_cost = n_input_toks * pricing[model]["input"] / 1_000_000
    # Bill the completion on its own token count (the input count was used here before).
    output_cost = n_output_toks * pricing[model]["output"] / 1_000_000

    return input_cost + output_cost

### Get traces

In [None]:
# Traces returned by a single fetch_traces() call with default arguments.
# NOTE(review): if the Langfuse API paginates, this holds only the first page —
# confirm whether every trace is needed for the cost totals computed below.
traces = langfuse.fetch_traces().data

In [None]:
# One record per trace: id, timestamp, latency, estimated cost, query, answer.
trace_data = []

for trace in traces:
    # Named `query` rather than `input` so the builtin input() is not shadowed
    # at notebook scope.
    query = trace.input["args"][1]["query"]

    # Start from an empty answer on every iteration. Before, a truthy output
    # that was not all strings left `output` unbound on the first trace, or
    # silently reused the previous trace's text on later ones.
    output = ""
    if trace.output and all(isinstance(item, str) for item in trace.output):
        output = "".join(trace.output)

    trace_data.append(
        {
            "id": trace.id,
            "timestamp": trace.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            "latency": trace.latency,
            "cost": get_cost(tokenizer=tokenizer,
                       input=query,
                       output=output,
                       pricing=pricing,
                       model=model),
            "input": query,
            "output": output
        }
    )

trace_data_df = pd.DataFrame(trace_data)
trace_data_df

In [None]:
# Sum of the per-trace cost estimates.
trace_data_df["cost"].sum()

In [None]:
trace_data_df.describe()

### Observations

In [None]:
# Observations whose name is "retrieve", from a single fetch_observations() call.
# NOTE(review): if the Langfuse API paginates, this holds only the first page — confirm.
observations = langfuse.fetch_observations(name="retrieve")

In [None]:
# trace id -> output of that trace's "retrieve" observation. If one trace has
# several such observations, the one listed last wins.
obs = dict((o.trace_id, o.output) for o in observations.data)
# Rows whose trace id is not in the mapping get NaN.
trace_data_df["retrieval"] = trace_data_df["id"].map(obs)

In [None]:
trace_data_df

# Retrieval EVAL

In [10]:
import os
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from sqlalchemy import create_engine, delete
from sqlalchemy.orm import sessionmaker

from rag.rag_processor import processor
from config.base_config import rag_config
from database.models import Question, Document

import pandas as pd

In [None]:
rag_config

In [17]:
# Postgres connection settings read from the environment; each is None when
# the variable is not set.
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")
POSTGRES_DB = os.getenv("POSTGRES_DB")

def get_db():
    """
    Open a new SQLAlchemy session on the Postgres server at localhost.

    Connection settings come from the module-level POSTGRES_* values. A new
    engine is created on every call.

    :return: A new session bound to that engine; the caller owns it
    :raises RuntimeError: If any of the POSTGRES_* settings is unset
    """
    from urllib.parse import quote_plus

    settings = {
        "POSTGRES_USER": POSTGRES_USER,
        "POSTGRES_PASSWORD": POSTGRES_PASSWORD,
        "POSTGRES_PORT": POSTGRES_PORT,
        "POSTGRES_DB": POSTGRES_DB,
    }
    # An unset variable would otherwise be formatted into the URL as the
    # literal text "None" and fail later with an unrelated-looking error.
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")

    # Characters such as "@", ":" or "/" in the credentials would otherwise be
    # parsed as URL delimiters.
    user = quote_plus(POSTGRES_USER)
    password = quote_plus(POSTGRES_PASSWORD)
    DATABASE_URL = f"postgresql://{user}:{password}@localhost:{POSTGRES_PORT}/{POSTGRES_DB}"

    engine = create_engine(DATABASE_URL)
    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

    return SessionLocal()

def purge_db(db):
    """
    Delete every row from the Question and Document tables.

    Both deletes run in one transaction, so a failure leaves both tables
    untouched instead of committing the Question purge on its own.

    :param db: SQLAlchemy session; it is closed before returning, whether or not the purge succeeded
    :return: True once the purge has been committed
    :raises Exception: Whatever the delete or commit raised, re-raised after a rollback
    """
    try:
        # Same order as before: Question rows first, then Document rows.
        db.execute(delete(Question))
        db.execute(delete(Document))
        db.commit()
    except Exception:
        db.rollback()
        # Bare raise keeps the original traceback.
        raise
    finally:
        db.close()

    return True

In [18]:
db = get_db()

### Purge DB data (destructive: deletes every row from the Question and Document tables)

In [None]:
purge_db(db)

### Choose dataset (run exactly one of the cells below; with "Run All" the last one wins)

In [None]:
# Dataset option: eak_eval_fz.csv. Every dataset cell rebinds `eval_data`, so
# the one executed last is the one evaluated. Later cells read its `question`
# and `url` columns.
eval_data = pd.read_csv("indexing/data/eak_eval_fz.csv")
eval_data.tail()

In [None]:
# Dataset option: memento_eval_qa_fz.csv. Rebinds `eval_data`, replacing any
# dataset loaded by another dataset cell.
eval_data = pd.read_csv("indexing/data/memento_eval_qa_fz.csv")
eval_data.tail()

In [None]:
# Dataset option: memento_eval_qa_allgemeines.csv. Rebinds `eval_data`,
# replacing any dataset loaded by another dataset cell.
eval_data = pd.read_csv("indexing/data/memento_eval_qa_allgemeines.csv")
eval_data.tail()

### Evaluation params

In [21]:
# Values passed as `language=` and `tag=` to get_documents in the retrieval cell.
# NOTE(review): presumably None means "no filter" — confirm against the retriever.
language = None
tag = None
# Passed as `k=` to get_documents.
# NOTE(review): presumably the number of documents to retrieve per question — confirm.
k = 100
# NOTE(review): presumably uncommenting the next line evaluates without a reranker — confirm.
#processor.retriever_client.reranker = None
# Mutates the shared `processor` object in place, so the setting persists for
# every later call made in this kernel.
processor.retriever_client.reranker.top_k = 10

### Evaluate

In [24]:
# One retriever call per evaluation question, in row order, so docs[n] holds
# the results for the n-th row of eval_data.
docs = [
    processor.retriever_client.get_documents(db, row.question, language=language, tag=tag, k=k)
    for _, row in eval_data.iterrows()
]

In [None]:
# Reduce every result list to its urls, keeping the rank order, and attach the
# lists to the evaluation frame as the "retrieval" column.
retrieved_docs = [[d["url"] for d in doc] for doc in docs]

eval_data["retrieval"] = retrieved_docs

In [None]:
# recall@k: share of questions whose gold url is among the top `recall_k`
# retrieved urls ("www." is removed on both sides before comparing).
# The cutoff has its own name so it does not overwrite the retrieval depth `k`;
# otherwise re-running the retrieval cell after this one would fetch only
# `recall_k` documents.
recall_k = 3
#for recall_k in [100, 10, 5, 3, 2, 1]:
recall = eval_data.apply(
    lambda row: row["url"].replace("www.", "")
    in [url.replace("www.", "") for url in row["retrieval"][:recall_k]],
    axis=1,
)
print(recall_k, ": ", recall.sum() / len(recall))

In [None]:
# Print every question whose gold url was not found within the recall cutoff:
# the question, the expected url, the retrieved urls, then a separator line.
missed = eval_data.loc[~recall, ["question", "url", "retrieval"]]
for _, row in missed.iterrows():
    print(row.question, row.url, row.retrieval, "--------------------_", sep="\n")

In [None]:
# Retrieval results of the questions that missed the recall cutoff.
bad_retrieval = [doc_list for doc_list, hit in zip(docs, recall) if not hit]

# Document ids per missed question, in rank order. Built with comprehensions
# so the notebook-level names `retrieved_docs` (url lists) and `doc` are not
# rebound to unrelated values.
bad_docs = [[retrieved["id"] for retrieved in doc_list] for doc_list in bad_retrieval]

bad_docs

In [None]:
# Look up one document by id and print its url and text.
# NOTE(review): 428 is presumably an id picked by hand from the `bad_docs`
# listing — update it for the dataset being evaluated.
bad_doc_id = 428
bad_doc = db.query(Document).filter(Document.id == bad_doc_id).first()
if bad_doc is None:
    # .first() returns None when no row matches; report that instead of
    # failing with an AttributeError on the prints below.
    print(f"No Document with id {bad_doc_id}")
else:
    print(bad_doc.url)
    print(bad_doc.text)

In [None]:
import numpy as np

def dcg(relevance_scores):
    """
    Discounted Cumulative Gain of a ranked list of relevance scores.

    The score at 0-based position ``idx`` is divided by ``log2(idx + 2)``, so
    the first item is not discounted; the discounted scores are then summed.
    """
    total = 0
    for discount_arg, relevance in enumerate(relevance_scores, start=2):
        total += relevance / np.log2(discount_arg)
    return total

def ndcg(predicted_ranking, gold_standard_string):
    """
    Compute normalized Discounted Cumulative Gain (nDCG) for a single gold item

    Only the first occurrence of the gold standard string is scored. Scoring
    every occurrence, as was done before, gave values above 1 whenever the
    ranking contained the gold string more than once.

    With exactly one relevant item the ideal DCG is 1 (gold at position 1), so
    the score reduces to ``1 / log2(rank + 1)`` for a 1-based rank.

    :param predicted_ranking: List of strings in predicted order
    :param gold_standard_string: The gold standard string that should ideally be at position 1
    :return: nDCG score in [0, 1]; 0 when the gold string is not in the ranking
    """
    # 0-based position of the first hit, or None when the gold string is absent
    first_hit = next(
        (idx for idx, s in enumerate(predicted_ranking) if s == gold_standard_string),
        None,
    )
    if first_hit is None:
        return 0.0

    return 1.0 / np.log2(first_hit + 2)

# Example usage
predicted_ranking = ["doc2", "doc3", "doc1", "doc4"]
gold_standard_string = "doc1"

ndcg_score = ndcg(predicted_ranking, gold_standard_string)
print(f"nDCG score: {ndcg_score:.4f}")


In [None]:
# nDCG for one question: the urls retrieved for docs[0], in rank order, scored
# against the url of the eval_data row labelled 0 ("www." removed on both sides).
# NOTE(review): assumes docs[0] belongs to the row with index label 0 — true
# for the default index produced by read_csv; confirm if the frame is re-indexed.
ranks = [d["url"].replace("www.", "") for d in docs[0]]
gold = eval_data.loc[0].url.replace("www.", "")

ndcg(ranks, gold)

In [None]:
ranks

In [None]:
gold

In [None]:
# Manual experiment: overwrite position 1 with a dummy value and put a fixed
# url at position 4, so the score can be recomputed in a following cell.
# This mutates `ranks` in place (re-run the cell that builds `ranks` to restore
# the real ranking) and needs at least four retrieved urls.
# NOTE(review): the hard-coded url is presumably the gold url of the question — confirm.
ranks[0] = "hhh"
ranks[3] = 'https://ahv-iv.ch/p/1.01.f'

In [None]:
ranks

In [None]:
ndcg(ranks, gold)