In [None]:
import os
from typing import List, Dict
from dotenv import load_dotenv

from langfuse import Langfuse
import pandas as pd
import tiktoken

from config.base_config import rag_config

In [None]:
# Load variables from a .env file (python-dotenv) before reading the environment.
load_dotenv()

# Langfuse credentials are read from the environment; each is None when unset.
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")

# The Langfuse host is fixed to a locally running instance.
LANGFUSE_HOST = "http://localhost:3000"

In [None]:
# Client used by the fetch_* calls further down; configured from the
# settings defined in the previous cell.
langfuse = Langfuse(
    host=LANGFUSE_HOST,
    public_key=LANGFUSE_PUBLIC_KEY,
    secret_key=LANGFUSE_SECRET_KEY,
)

In [None]:
# Default tokenizer using the "o200k_base" encoding. Note that `tokenizer` is
# reassigned further down, once the encoding for the configured model is chosen.
tokenizer = tiktoken.get_encoding("o200k_base")

In [None]:
# Per-model token prices as (model, input price, output price).
# get_cost() multiplies a token count by the price and divides by 1_000_000,
# so the figures are per million tokens. Currency is not stated here --
# presumably USD; TODO confirm.
_PRICE_TABLE = [
    ("gpt-4o", 5, 15),
    ("gpt-4o-2024-08-06", 2.5, 10),
    ("gpt-4o-2024-05-13", 5, 15),
    ("gpt-4o-mini", 0.15, 0.6),
    ("gpt-4o-mini-2024-07-18", 0.15, 0.6),
    ("chatgpt-4o-latest", 5.00, 15.00),
    ("gpt-4-turbo", 10.00, 30.00),
    ("gpt-4-turbo-2024-04-09", 10.00, 30.00),
    ("gpt-4", 30.00, 60.00),
    ("gpt-4-32k", 60.00, 120.00),
    ("gpt-4-0125-preview", 10.00, 30.00),
    ("gpt-4-1106-preview", 10.00, 30.00),
    ("gpt-4-vision-preview", 10.00, 30.00),
    ("gpt-3.5-turbo-0125", 0.50, 1.50),
    ("gpt-3.5-turbo-instruct", 1.50, 2.00),
    ("gpt-3.5-turbo-1106", 1.00, 2.00),
    ("gpt-3.5-turbo-0613", 1.50, 2.00),
    ("gpt-3.5-turbo-16k-0613", 3.00, 4.00),
    ("gpt-3.5-turbo-0301", 1.50, 2.00),
]

# Mapping of model name -> {"input": price, "output": price}.
pricing = {
    model_name: {"input": input_price, "output": output_price}
    for model_name, input_price, output_price in _PRICE_TABLE
}

In [None]:
# Model name is taken from the project's RAG configuration.
model = rag_config["llm"]["model"]

# Models grouped by the tiktoken encoding they are tokenized with.
O200K_MODELS = {
    "gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-2024-08-06", "chatgpt-4o-latest",
    "gpt-4o-mini", "gpt-4o-mini-2024-07-18",
}
CL100K_MODELS = {
    "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4-turbo-preview",
    "gpt-4-0125-preview", "gpt-4-1106-preview", "gpt-4", "gpt-4-0613",
    "gpt-4-0314", "gpt-3.5-turbo-0125", "gpt-3.5-turbo", "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-instruct",
}

if model in O200K_MODELS:
    encoding = "o200k_base"
elif model in CL100K_MODELS:
    encoding = "cl100k_base"
else:
    # Without this branch `encoding` stayed undefined for unlisted models, so
    # the next line raised NameError (or silently reused a stale value left in
    # the kernel). Fall back to tiktoken's own model table, which raises
    # KeyError for a model it does not know.
    encoding = tiktoken.encoding_for_model(model).name

tokenizer = tiktoken.get_encoding(encoding)

def get_cost(tokenizer, input: str, output: str, pricing: Dict, model: str) -> float:
    """Estimate the cost of one request from its prompt and completion text.

    Args:
        tokenizer: Object with an ``encode(text)`` method returning a sequence
            of tokens (e.g. a tiktoken encoding).
        input: Prompt text sent to the model.
        output: Completion text returned by the model.
        pricing: Mapping ``model -> {"input": price, "output": price}`` with
            prices expressed per 1,000,000 tokens.
        model: Key into ``pricing``.

    Returns:
        Input cost plus output cost.

    Raises:
        KeyError: If ``model`` is not present in ``pricing``.
    """
    n_input_toks = len(tokenizer.encode(input))
    n_output_toks = len(tokenizer.encode(output))

    rates = pricing[model]
    input_cost = n_input_toks * rates["input"] / 1_000_000
    # Output is billed on the *output* token count (this previously reused
    # n_input_toks, so completions were priced by prompt length).
    output_cost = n_output_toks * rates["output"] / 1_000_000

    return input_cost + output_cost

### Get traces

In [None]:
# Fetch traces from Langfuse and keep the `.data` payload of the response.
# NOTE(review): presumably this returns a single page of results rather than
# every stored trace -- confirm against the Langfuse SDK before relying on the
# cost totals computed below.
traces = langfuse.fetch_traces().data

In [None]:
# Build one record per trace: id, timestamp, latency, estimated cost and the
# query/answer text.
trace_data = []

for trace in traces:
    # Use dedicated names instead of `input`/`output`: at notebook top level
    # those assignments would replace Python's built-in input() for the rest
    # of the kernel session.
    # NOTE(review): assumes trace.input has the shape
    # {"args": [_, {"query": ...}, ...]} -- confirm against the traced function.
    query_text = trace.input["args"][1]["query"]
    # An empty/missing trace output is treated as an empty answer.
    answer_text = "".join(trace.output) if trace.output else ""

    trace_data.append(
        {
            "id": trace.id,
            "timestamp": trace.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            "latency": trace.latency,
            "cost": get_cost(
                tokenizer=tokenizer,
                input=query_text,
                output=answer_text,
                pricing=pricing,
                model=model,
            ),
            "input": query_text,
            "output": answer_text,
        }
    )

trace_data_df = pd.DataFrame(trace_data)
trace_data_df

In [None]:
trace_data_df.cost.sum()

In [None]:
trace_data_df.describe()

### Observations

In [None]:
observations = langfuse.fetch_observations(name="retrieve")

In [None]:
# Map trace id -> output of its fetched observation; if several observations
# share a trace id, the last one wins.
obs = dict(
    (observation.trace_id, observation.output)
    for observation in observations.data
)

# Attach that output to the trace table; traces without a match get NaN.
trace_data_df["retrieval"] = trace_data_df["id"].map(obs)

In [None]:
trace_data_df

# Embedding visualizations

In [3]:
import os

from utils.embedding import get_embedding
from database.service import document_service

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

In [4]:
# Postgres connection settings, read from the environment (None when unset).
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")
POSTGRES_DB = os.getenv("POSTGRES_DB")

def get_db():
    """Open a new SQLAlchemy session against the Postgres database on localhost.

    The connection URL is built from the module-level POSTGRES_* settings.
    Every call creates a fresh engine and session; the caller owns the
    returned session.

    Returns:
        A session produced by ``sessionmaker(autocommit=False, autoflush=False)``
        bound to the newly created engine.
    """
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as "@", ":" or "/" in
    # a user name or password cannot corrupt the URL. str() keeps an unset
    # (None) setting rendering as "None", exactly as the plain f-string did.
    user = quote(str(POSTGRES_USER), safe="")
    password = quote(str(POSTGRES_PASSWORD), safe="")

    DATABASE_URL = f"postgresql://{user}:{password}@localhost:{POSTGRES_PORT}/{POSTGRES_DB}"

    engine = create_engine(DATABASE_URL)

    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

    db = SessionLocal()

    return db

db = get_db()

In [None]:
query = "hello"
language = None
tag = None
k = 10

In [None]:
document_service.get_semantic_match(db, query, language=language, tag=tag, k=k)

In [5]:
eval_data = pd.read_csv("indexing/data/memento_eval_qa_allgemeines.csv")

In [None]:
# Number of evaluation rows to embed and plot.
n = 10

# Embed the first n questions in one call and stack the vectors into an array
# with one row per question.
questions = eval_data.question[:n].to_list()
query_embeddings = np.array([item.embedding for item in get_embedding(questions)])
query_embeddings

In [None]:
query_embeddings.shape

In [18]:
# One row per query embedding, plus a "label" column naming the query by
# position.
df_embeddings = pd.DataFrame(query_embeddings).assign(
    label=[f"query_{i}" for i in range(len(query_embeddings))]
)

# Persist without the index column.
df_embeddings.to_csv("indexing/data/query_embeddings_tsne.csv", index=None)

In [None]:
df_embeddings

In [None]:
eval_data.head()

In [None]:
# Embed the first n reference answers in one call; one row per answer.
answers = eval_data.answer[:n].to_list()
answer_embeddings = np.array([item.embedding for item in get_embedding(answers)])
answer_embeddings

In [36]:
# One row per answer embedding, plus a "label" column naming the answer by
# position.
df_answer = pd.DataFrame(answer_embeddings).assign(
    label=[f"answer_{i}" for i in range(len(answer_embeddings))]
)

# Persist without the index column.
df_answer.to_csv("indexing/data/answer_embeddings_tsne.csv", index=None)

In [None]:
df_answer

In [41]:
# Stack the query rows on top of the answer rows and persist the combined
# table without the index column.
frames = (df_embeddings, df_answer)
df = pd.concat(frames, axis=0)
df.to_csv("indexing/data/embeddings_tsne.csv", index=None)

In [None]:
# Stack query embeddings on top of answer embeddings (rows 0..n-1 are queries,
# rows n.. are answers). This previously referenced `doc_embeddings`, which is
# never defined in this notebook and raised NameError; the answer embeddings
# are what the later cells slice out with x[n:].
embeddings = np.concatenate([query_embeddings, answer_embeddings], axis=0)
embeddings.shape

In [None]:
# Create a t-SNE model and project the embeddings to 2D.
tsne = TSNE(n_components=2, perplexity=5, random_state=42, init='pca', max_iter=3000, metric='cosine', learning_rate=50)

# Fit on queries followed by answers so both live in the same 2D space:
# the cells below read queries from x[:n] and answers from x[n:]. Fitting on
# the query embeddings alone left the answer slice empty.
vis_dims = tsne.fit_transform(
    np.concatenate([query_embeddings, answer_embeddings], axis=0)
)
vis_dims.shape

In [14]:
# Palette for the scatter plot.
colors = ["red", "darkorange", "gold", "turquoise", "darkgreen"]

# Split the 2D projection into separate coordinate lists
# (first column -> x, second column -> y).
x = list(vis_dims[:, 0])
y = list(vis_dims[:, 1])
#color_indices = df.Score.values - 1

In [None]:
colormap = matplotlib.colors.ListedColormap(colors)
# No per-point `c=` values exist for this data, so a `cmap` argument would be
# ignored (matplotlib warns about exactly that). Plot with the default color.
plt.scatter(x, y, alpha=0.3)

In [None]:
# 2D coordinates of the first n projected points, labelled with the question
# text they belong to.
df_query = pd.DataFrame(
    {
        "x": x[:n],
        "y": y[:n],
        "label": eval_data.question[:n].to_list(),
    }
)
df_query

In [None]:
df_query.plot.scatter(x="x", y="y")

In [None]:
# 2D coordinates of the projected points after the first n, labelled with the
# first 30 characters of each answer. Note: this rebinds `df_answer`, which
# earlier held the raw answer embeddings.
df_answer = pd.DataFrame(
    {
        "x": x[n:],
        "y": y[n:],
        "label": eval_data.answer[:n].apply(lambda text: text[:30]).to_list(),
    }
)
df_answer

In [None]:
df_answer.plot.scatter(x="x", y="y")

In [None]:
import matplotlib.colors as mcolors
from random import Random

# One named CSS4 color per query/answer pair. A dedicated, seeded generator
# keeps the picks identical across kernel restarts without touching the
# global random state (an unseeded sample changed the plot on every run).
colors = Random(42).sample(list(mcolors.CSS4_COLORS.keys()), n)

# Marker styles, indexed by pair position in the plotting cell.
markers = [
    ".", ",", "o", "v", "^", "<", ">", "1", "2", "3", "4", "s", "p", "*",
    "h", "H", "+", "x", "D", "d", "|", "_", "8", "P", "X", "$\u2665$",
    "$\u2663$", "$\u2660$", "$\u2666$"
]

In [None]:
fig, ax = plt.subplots()

# Query i and answer i share color and marker so each pair can be matched by
# eye; the text label tells the two apart. One loop handles both frames
# instead of two copy-pasted ones.
for prefix, frame in (("query", df_query), ("answer", df_answer)):
    for i, row in frame.iterrows():
        # Wrap around the marker list so more points than marker styles does
        # not raise IndexError.
        marker = markers[i % len(markers)]
        ax.scatter(x=row.x, y=row.y, color=colors[i], marker=marker)
        ax.text(x=row.x, y=row.y, s=f"{prefix}_{i}")

plt.show()

In [None]:
df_query.loc[:1]

In [None]:
df_query.loc[1:2]

In [None]:
query

- Is the query close to its answer in 2D space?
    - Also compute the cosine similarity between query and answer in the original 1536-d space.
- Is the query close in 2D space to the document that is expected to be retrieved?
    - Also compute the cosine similarity between query and document in the 1536-d space.
- Is the correctly retrieved document close to the answer in 2D space?
    - Also compute the cosine similarity between retrieved document and answer in the 1536-d space.

In [None]:
# NOTE(review): this cell does not run on a top-to-bottom execution.
# `color_indices` is first assigned in a later cell, and at this point `df` is
# the query/answer embedding table, which has no `Score` column. It appears to
# be an earlier copy of the final plotting cell of this notebook -- confirm
# and remove or move it.
colormap = matplotlib.colors.ListedColormap(colors)
plt.scatter(x, y, c=color_indices, cmap=colormap, alpha=0.3)

# Mark the mean position of each score group (0..4) with an "x".
for score in [0,1,2,3,4]:
    avg_x = np.array(x)[df.Score-1==score].mean()
    avg_y = np.array(y)[df.Score-1==score].mean()
    color = colors[score]
    plt.scatter(avg_x, avg_y, marker='x', color=color, s=100)

plt.title("Amazon ratings visualized in language using t-SNE")

In [None]:
import tiktoken

In [None]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000

In [None]:
# To save space, a pre-filtered dataset is provided.
input_datapath = "indexing/data/Reviews.csv"

# Load the reviews (first CSV column is the index), keep only the columns used
# below, and drop rows with any missing value.
df = (
    pd.read_csv(input_datapath, index_col=0)
    .loc[:, ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
    .dropna()
)

# Single text field that gets embedded: stripped summary and stripped body.
df["combined"] = "Title: " + df["Summary"].str.strip() + "; Content: " + df["Text"].str.strip()
df.head(2)

In [None]:
top_n = 10

# Keep the 2 * top_n most recent reviews (on the assumption that fewer than
# half are dropped by the length filter below), then discard the Time column.
df = df.sort_values("Time").tail(top_n * 2).drop(columns="Time")

encoding = tiktoken.get_encoding(embedding_encoding)

# Omit reviews that are too long to embed, then keep the last top_n.
df["n_tokens"] = df.combined.apply(lambda text: len(encoding.encode(text)))
df = df.loc[df["n_tokens"] <= max_tokens].tail(top_n)
len(df)

In [None]:
# Embed each combined review text, one call per row.
# NOTE(review): earlier cells read `.embedding` off the items returned by
# get_embedding(list_of_texts); confirm what a single-string call returns
# before treating this column as raw vectors.
df["embedding"] = df.combined.apply(get_embedding)

In [None]:
matrix = np.array([x for x in df.embedding.values])

In [None]:
matrix

In [None]:
# Create a t-SNE model and transform the data.
# t-SNE requires perplexity < number of samples; clamp it so the cell still
# runs when the token-length filter leaves fewer than 10 reviews.
tsne = TSNE(
    n_components=2,
    perplexity=min(9, len(matrix) - 1),
    random_state=42,
    init='random',
    learning_rate=200,
)
vis_dims = tsne.fit_transform(matrix)
vis_dims.shape

In [None]:
# One color per star rating, index 0 = score 1 ... index 4 = score 5.
colors = ["red", "darkorange", "gold", "turquoise", "darkgreen"]

# Split the 2D projection into separate coordinate lists
# (first column -> x, second column -> y).
x = list(vis_dims[:, 0])
y = list(vis_dims[:, 1])

# Zero-based palette index for every review.
color_indices = df.Score.values - 1

In [None]:
colormap = matplotlib.colors.ListedColormap(colors)

# Pin the color scale to the full 0..len(colors)-1 range. With the default
# auto-scaling, a sample that lacks some scores maps the scores it does have
# onto different palette entries than the centroid markers use.
plt.scatter(
    x, y,
    c=color_indices, cmap=colormap,
    vmin=0, vmax=len(colors) - 1,
    alpha=0.3,
)

# Mark the mean position of each score group with an "x" in the group's color.
x_arr = np.array(x)
y_arr = np.array(y)
for score in [0, 1, 2, 3, 4]:
    mask = (df.Score - 1 == score).to_numpy()
    if not mask.any():
        # No review with this score in the sample: the mean of an empty slice
        # would be NaN (with a RuntimeWarning) and nothing could be drawn.
        continue
    avg_x = x_arr[mask].mean()
    avg_y = y_arr[mask].mean()
    color = colors[score]
    plt.scatter(avg_x, avg_y, marker='x', color=color, s=100)

plt.title("Amazon ratings visualized in language using t-SNE")