In [None]:
!pip install -qU semantic-router

In [73]:
import os
import csv
from dotenv import load_dotenv
from semantic_router import Route
from semantic_router.encoders import CohereEncoder, OpenAIEncoder
from semantic_router.layer import RouteLayer
import pandas as pd

In [63]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Read the OpenAI key eagerly so a missing key fails here with a KeyError.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [None]:
# Load the FAQ question table; later cells read its `text`, `url` and `language` columns.
question = pd.read_csv("../../src/copilot/app/indexing/data/question.csv")
# Preview the first rows.
question.head()

In [65]:
# Bucket the FAQ questions into the two routing topics as (text, url) pairs.
topics = {"allgemeines": [], "familienzulagen": []}

for text, url, language in zip(question["text"], question["url"], question["language"]):
    # A question belongs to the family-allowance topic when its URL has a
    # "familienzulagen" path segment (case-insensitive).
    is_familienzulagen = "familienzulagen" in url.lower().split("/")

    if not is_familienzulagen:
        # Everything else is treated as general, regardless of language.
        topics["allgemeines"].append((text, url))
    elif language == "de":
        # Only German family-allowance questions are kept; other languages are dropped.
        # NOTE(review): the language filter applies to this topic only — confirm this is intended.
        topics["familienzulagen"].append((text, url))

In [None]:
# Number of (German) family-allowance FAQ questions collected above.
len(topics["familienzulagen"])

In [None]:
# Number of general FAQ questions collected above (all languages).
len(topics["allgemeines"])

In [68]:
# Load the evaluation Q&A sets, one per route; later cells use their `question` column.
fz_utterances = pd.read_csv("../../src/copilot/app/indexing/data/memento_eval_qa_FZ.csv")
allgemeines_utterances = pd.read_csv("../../src/copilot/app/indexing/data/memento_eval_qa_allgemeines.csv")

In [69]:
# Route utterances = evaluation questions + FAQ questions.
# The last 5 evaluation questions of each set and the last 30 general FAQ questions
# are left out here; later cells use them as held-out test queries.
fz_eval_questions = fz_utterances["question"].to_list()
fz_faq_questions = [text for text, _url in topics["familienzulagen"]]
fz_utterances_q = fz_eval_questions[:-5] + fz_faq_questions

allgemeines_eval_questions = allgemeines_utterances["question"].to_list()
allgemeines_faq_questions = [text for text, _url in topics["allgemeines"][:-30]]
allgemeines_utterances_q = allgemeines_eval_questions[:-5] + allgemeines_faq_questions

In [78]:
# Persist each route's utterances as a one-column CSV (one utterance per row, no header).
utterance_exports = (
    ('../../src/copilot/app/indexing/data/fz_utterances_q.csv', fz_utterances_q),
    ('../../src/copilot/app/indexing/data/allgemeines_utterances_q.csv', allgemeines_utterances_q),
)

for csv_path, utterances in utterance_exports:
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([utterance] for utterance in utterances)

In [None]:
# One Route per topic, each described by its example utterances.
# Insertion order of the mapping fixes the order of `routes`.
route_utterances = {
    "familienzulage": fz_utterances_q,
    "allgemeines": allgemeines_utterances_q,
}

routes = [
    Route(name=route_name, utterances=utterances)
    for route_name, utterances in route_utterances.items()
]

# Keep the individual routes available under their own names as well.
familienzulage, allgemeines = routes

In [None]:
# Alternative: Cohere embeddings (needs a Cohere API key in the environment)
# os.environ["COHERE_API_KEY"] = "<YOUR_API_KEY>"
# encoder = CohereEncoder()

# Encoder used in this notebook: OpenAI embeddings.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment — confirm.
encoder = OpenAIEncoder()

In [None]:
# Build the route layer from the chosen encoder and the two topic routes.
rl = RouteLayer(encoder=encoder, routes=routes)

In [None]:
# Smoke test: route a basic German question about family allowances.
rl("Was sind Familienzulagen?")

In [None]:
# Spot-check: longer German question about self-employment / an employer not liable for contributions.
rl("Was ist, wenn ich selbständig erwerbend bin oder meine Arbeitgeberin oder mein Arbeitgeber nicht beitragspflichtig ist?")

In [None]:
# Spot-check: German question about where to register when not working.
rl("Wo muss ich mich melden, wenn ich nicht arbeite?")

In [None]:
# Spot-check with a French query; display only the name of the matched route.
rl("Où sont inscrites les bonifications pour tâches d’assistance ?").name

In [None]:
# Second French spot-check; display only the name of the matched route.
rl("Quelles informations l’extrait de compte contient-il ?").name

In [None]:
# Route the text of the first general FAQ entry and display the matched route name.
rl(topics["allgemeines"][0][0]).name

In [None]:
# Predict route names for the held-out queries:
# the last 5 evaluation questions per topic, plus the last 30 general FAQ questions.
preds_fz = [rl(query).name for query in fz_utterances["question"].iloc[-5:]]

held_out_allgemeines = list(allgemeines_utterances["question"].iloc[-5:])
held_out_allgemeines += [text for text, _url in topics["allgemeines"][-30:]]
preds_allgemeines = [rl(query).name for query in held_out_allgemeines]

In [None]:
# Share of held-out queries routed to the expected route, shown as (allgemeines, familienzulage).
acc_allgemeines = preds_allgemeines.count("allgemeines") / len(preds_allgemeines)
acc_fz = preds_fz.count("familienzulage") / len(preds_fz)
acc_allgemeines, acc_fz

In [None]:
# Train data: (question, route name) pairs built from the same questions used as route utterances.
fz_train_questions = fz_utterances["question"].to_list()[:-5]
fz_train_questions.extend(text for text, _url in topics["familienzulagen"])
train_fz_utterances = [(query, "familienzulage") for query in fz_train_questions]

allgemeines_train_questions = allgemeines_utterances["question"].to_list()[:-5]
allgemeines_train_questions.extend(text for text, _url in topics["allgemeines"][:-30])
train_allgemeines_utterances = [(query, "allgemeines") for query in allgemeines_train_questions]

train_data = [*train_fz_utterances, *train_allgemeines_utterances]

train_data

In [None]:
# Test data: (question, route name) pairs for the held-out questions
# (last 5 evaluation questions per topic + last 30 general FAQ questions).
fz_test_questions = fz_utterances["question"].to_list()[-5:]
test_fz_utterances = [(query, "familienzulage") for query in fz_test_questions]

allgemeines_test_questions = allgemeines_utterances["question"].to_list()[-5:]
allgemeines_test_questions.extend(text for text, _url in topics["allgemeines"][-30:])
test_allgemeines_utterances = [(query, "allgemeines") for query in allgemeines_test_questions]

test_data = [*test_fz_utterances, *test_allgemeines_utterances]

test_data

In [None]:
# Split the (question, label) pairs into parallel tuples of inputs and expected route names.
X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

# evaluate using the default thresholds
accuracy = rl.evaluate(X=X_test, y=y_test)
# `accuracy` is treated as a fraction and shown as a percentage.
print(f"Accuracy: {accuracy*100:.2f}%")

In [None]:
# Record the route thresholds before fitting, for comparison with the values after `rl.fit`.
route_thresholds = rl.get_thresholds()
print("Default route thresholds:", route_thresholds)

In [None]:
# Fit the route layer on the training pairs.
# NOTE(review): presumably this tunes the per-route score thresholds — confirm against the library docs.
rl.fit(X=X_train, y=y_train)

In [None]:
# Read the thresholds again after fitting (overwrites the earlier `route_thresholds` value).
route_thresholds = rl.get_thresholds()
print("Updated route thresholds:", route_thresholds)