# Notebook description

- Retrieve glossary tags from AHV-IV and BSV
- Translate tags to english with deepl/LLM
- Align tags across AHV-IV translation methods, across BSV translation methods, and across AHV-IV and BSV
- Scrape data from AHV-IV mementos and eak.admin.ch
- Augment RAG documents with top-level tags and subtopics
    - Top-level `tags`: from AHV-IV mementos and eak.admin.ch web sections
    - `subtopics`: from glossaries
- Augment RAG documents with identified top-level tags, subtopics and summary (manual + llm extracted)
- Augment RAG documents with doctype
    - `context_doc`
    - `formular`
- Augment RAG documents with retrieval method
    - `semantic_search`
    - `raptor`
    - `contextual_retrieval`
    - for `graph_rag` and `light_rag`, see `kg_docs_preprocessing.ipynb`
- ECL

# TO DO


- [x] translate to eng if not correctly extracted (can get eng terms from https://www.ahv-iv.ch/en/Social-insurances/Glossary but not bsv)
- [x] align terms across glossaries (ahv-iv, bsv)
- [ ] align tags for eak.admin.ch > ahv-iv general tags + from eak.admin.ch urls/categories
- [ ] Extract tags/topics (terms) for each document for:
    - ahv-iv.ch
    - eak.admin.ch
    - praxisleitfaden
    - ahv-lernbaustein

- [ ] doctype definition (context_doc, formular)

- [ ] integrate to intent classification module
- [ ] affirmative query reformulation module/retriever + dynamic tags/topics (terms) filtering for agentic RAG
- [ ] inject glossary terms + related links to RAG context with semantic search/mahalanobis dist based on dynamically identified tags/topics (terms)
- [ ] ECL
    - embed tags/descriptions

In [None]:
# Maximum number of characters kept per `text` value when frames are loaded (2**17).
CSV_CHAR_LIMIT = 2 ** 17
# Token budget per text; longer texts are split into chunks of this size.
MAX_EMBEDDING_TOKENS = 8_191

In [None]:
import tiktoken
import pandas as pd

In [None]:
# tiktoken encoder for the "cl100k_base" encoding; used below to count tokens
# and to split over-long texts.
tokenizer = tiktoken.get_encoding("cl100k_base")

# EAK

In [None]:
# Load the EAK documents and attach the metadata columns, filled with
# constants or left empty (None).
df = pd.read_csv("indexing/data/to_upsert/eak_admin_ch/eak_admin_ch_de_fr_tags_NEW.csv").assign(
    subtopics=None,
    organizations="EAK",
    doctype="context_doc",
    summary=None,
    hyq=None,
    hyq_declarative=None,
    tags=None,
)

In [None]:
# Writes back to the same path the frame was read from (the input file is overwritten).
df.to_csv("indexing/data/to_upsert/eak_admin_ch/eak_admin_ch_de_fr_tags_NEW.csv", index=None)

# AHV-IV MEMENTO

In [None]:
# Load the AHV-IV memento documents, harmonise the organisation column name
# and cap every text at CSV_CHAR_LIMIT characters.
df = pd.read_csv(
    "indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm.csv"
).rename(columns={"organization": "organizations"})
df["text"] = df["text"].map(lambda content: content[:CSV_CHAR_LIMIT])
df.head()

In [None]:
# Split every document whose text exceeds the embedding token budget into
# consecutive chunks of at most MAX_EMBEDDING_TOKENS tokens. The chunks are
# collected in `new_docs` (one dict per chunk, metadata copied from the source
# row) and the over-long source rows are removed from `df`.

# Tokenize each text once and reuse the ids for both the check and the split.
token_ids = df.text.apply(tokenizer.encode)
df["embedding_limit"] = token_ids.apply(len) > MAX_EMBEDDING_TOKENS

# Metadata copied unchanged onto every chunk of a split document.
chunk_metadata_columns = [
    "language",
    "url",
    "tags",
    "subtopics",
    "organizations",
    "summary",
    "doctype",
    "hyq",
    "hyq_declarative",
]

new_docs = []
for idx, row in df[df.embedding_limit].iterrows():
    ids = token_ids.loc[idx]
    # NOTE(review): chunks are cut on token boundaries; a boundary may fall
    # inside a multi-byte character — confirm this is acceptable.
    for start in range(0, len(ids), MAX_EMBEDDING_TOKENS):
        doc = {"text": tokenizer.decode(ids[start:start + MAX_EMBEDDING_TOKENS])}
        doc.update({column: row[column] for column in chunk_metadata_columns})
        new_docs.append(doc)

# Filter instead of dropping rows while iterating, and discard the old row
# labels so that no stray "index" column is added to the data.
df = df[~df.embedding_limit].reset_index(drop=True)

In [None]:
len(df)

In [None]:
len(new_docs)

In [None]:
# Append the chunked documents and renumber the rows. `ignore_index=True`
# replaces the row labels without inserting them as an extra data column
# (a plain `reset_index()` would add "index"/"level_0" columns to the output).
df = pd.concat([df, pd.DataFrame(new_docs)], ignore_index=True)

In [None]:
# Remove the helper flag column that was only needed to find over-long texts.
df.drop(columns=["embedding_limit"], inplace=True)

In [None]:
df

In [None]:
# Sanity check: number of texts still above the token budget (expected 0).
# Uses the shared constant so the check cannot drift from the splitting logic.
df.text.apply(lambda x: len(tokenizer.encode(x)) > MAX_EMBEDDING_TOKENS).sum()

In [None]:
len(tokenizer.encode(df.loc[5].text))

# AHV LERNBAUSTEIN 2024

In [None]:
# Load the AHV Lernbaustein documents, add the constant metadata columns and
# normalise tags to lower-case with underscores instead of spaces.
df = pd.read_csv(
    "indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm.csv"
).assign(
    doctype="context_doc",
    hyq=None,
    hyq_declarative=None,
    organizations="EAK",
)
df["tags"] = df["tags"].map(lambda tag: tag.lower().replace(" ", "_"))
df.head()

In [None]:
# Writes back to the same path the frame was read from (the input file is overwritten).
df.to_csv("indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm.csv", index=None)

# PRAXISLEITFADEN

In [None]:
# Load the Praxisleitfaden (family allowances guide) documents and attach the
# constant metadata columns; every document gets the same top-level tag.
# (disabled) df.drop(columns=["context", "parent_node", "childen_nodes"], inplace=True)
df = pd.read_csv(
    "indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de_tags_llm.csv"
).assign(
    doctype="context_doc",
    hyq=None,
    hyq_declarative=None,
    organizations="EAK",
    tags="family_allowances",
    subtopics=None,
)
df.head()

In [None]:
# Writes back to the same path the frame was read from (the input file is overwritten).
df.to_csv("indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de_tags_llm.csv", index=None)

# FEDLEX

In [None]:
# Load the German, French and Italian versions of the federal AHV/AVS act.
df_de = pd.read_csv("indexing/data/to_upsert/fedlex/Bundesgesetz vom 20. Dezember 1946 über die Alters- und Hinterlassenenversicherung (AHVG)_de.csv")
df_fr = pd.read_csv("indexing/data/to_upsert/fedlex/Loi fédérale du 20 décembre 1946 sur l'assurance-vieillesse et survivants (LAVS)_fr.csv")
df_it = pd.read_csv("indexing/data/to_upsert/fedlex/Legge federale del 20 dicembre 1946 sull'assicurazione per la vecchiaia e per i superstiti (LAVS)_it.csv")

# Tag every language version, not only the German one, so that no row of the
# combined frame is left without a tag.
for fedlex_frame in (df_de, df_fr, df_it):
    fedlex_frame["tags"] = "lavs"

In [None]:
# Stack the German, French and Italian frames row-wise (original row labels are kept).
df = pd.concat([df_de, df_fr, df_it])
df

In [None]:
# Persist the combined three-language frame without the row labels.
df.to_csv("indexing/data/to_upsert/fedlex/lavs.csv", index=None)

# AKIS

In [None]:
# Load the AKIS documents and cap every text at CSV_CHAR_LIMIT characters.
# (disabled) df.rename(columns={"organization": "organizations"}, inplace=True)
# (disabled) df["subtopics"] = None
df = pd.read_csv("indexing/data/zas_eak_copilot/akis/akis_hyq.csv")
df["text"] = df["text"].map(lambda content: content[:CSV_CHAR_LIMIT])
df.head()

# LLM AUGMENTATION

In [None]:
# Prompt templates (one per document language) asking the model to
#   1. write `n_alt_queries` questions that the given text answers, and
#   2. rewrite those questions as declarative/affirmative statements.
# Placeholders: {n_alt_queries} and {text}. The field names shown in the
# examples and in the format section match HYQReformulationSchema
# (`hyq`, `hyq_declarative`).
QUERY_STATEMENT_REWRITING_PROMPT_DE = """<anweisungen>
    <anweisung>Formulieren Sie ausgehend vom untenstehenden <text> {n_alt_queries} Fragen, die der Text genau beantworten kann</anweisung>
    <anweisung>Formulieren Sie die generierten Fragen in einem deklarativen/affirmativen Tonfall in mehrere alternative Aussagen um</anweisung>
    <anweisung>Jede umformulierte Aussage sollte die Bedeutung der ursprünglichen Anfrage beibehalten, sie aber auf eine etwas andere Weise ausdrücken</anweisung>
    <anweisung>Schreiben Sie Fragen/Reformulierungen immer in derselben Sprache wie der <text></anweisung>
</anweisungen>

<Beispiele>
hyq: [„Wie ist das Wetter?“, „Was ändert sich mit AHV21?“, „Was bedeutet das flexible Rentenalter?“]
hyq_declarative: [„Ich möchte wissen, wie das Wetter ist“, „Erklär mir, was sich mit der AHV21 ändert“, „Flexibles Rentenalter erklärt“]
</Beispiele>

<format_der_antwort>
HYQReformulationSchema(BaseModel)
    hyq: List[str] # eine Liste von Fragen, die der <text> genau beantworten kann.
    hyq_declarative: List[str] # die affirmative/deklarative Umformulierung der hyq-Fragen.
</format_der_antwort>

<text>
{text}
</text>"""

QUERY_STATEMENT_REWRITING_PROMPT_FR = """<instructions>
    <instruction>Étant donné le <texte> ci-dessous, formulez {n_alt_queries} questions auxquelles le texte peut exactement répondre</instruction>
    <instruction>Étant donné les questions générées, reformulez les en plusieurs énoncés alternatifs sur un ton déclaratif/affirmatif</instruction>
    <instruction>Chaque déclaration reformulée doit conserver le sens de la requête originale mais l'exprimer d'une manière légèrement différente</instruction>
    <instruction>Toujours écrire les questions/reformulations dans la même langue que le <texte></instruction>
</instructions>

<exemples>
hyq: ["Quel temps fait-il?", "Que change avec AVS21 ?", "Que signifie l'âge de la retraite flexible ?"]
hyq_declarative: ["J'aimerais connaître le temps qu'il fait", "Explique moi ce qui change avec AVS21", "L'âge de la retraite flexible expliqué"]
</exemples>

<format_de_réponse>
HYQReformulationSchema(BaseModel)
    hyq: List[str] # une liste de questions auxquelles le <texte> peut répondre exactement
    hyq_declarative: List[str] # la reformulation de manière affirmative/déclarative des questions hyq
</format_de_réponse>

<texte>
{text}
</texte>"""

QUERY_STATEMENT_REWRITING_PROMPT_IT = """<istruzioni>
    <istruzione>Dato il <testo> sottostante, formulare {n_alt_queries} domande a cui il testo può rispondere esattamente</istruzione>
    <istruzione>Date le domande generate, riformularle in diverse affermazioni alternative con un tono dichiarativo/affermativo</istruzione>
    <istruzione>Ogni affermazione riformulata deve mantenere il significato della domanda originale, ma esprimerlo in modo leggermente diverso</istruzione>
    <istruzione>Scrivere sempre le domande/riformulazioni nella stessa lingua del <testo></istruzione>
</istruzioni>

<esempi>
hyq: [“Com'è il tempo?”, “Cosa sta cambiando con AVS21?”, “Cosa significa l'età pensionabile flessibile?”]
hyq_declarative: [“Vorrei sapere com'è il tempo”, “Spiegami cosa sta cambiando con AVS21”, “L'età pensionabile flessibile spiegata”]
</esempi>

<formato_di_risposta>
HYQReformulationSchema(BaseModel)
    hyq: List[str] # un elenco di domande a cui il <testo> può rispondere esattamente
    hyq_declarative: List[str] # la riformulazione affermativa/declarativa delle domande hyq.
</formato_di_risposta>

<testo>
{text}
</testo>"""

# Lookup from document language code to its prompt template.
prompts = {
    "de": QUERY_STATEMENT_REWRITING_PROMPT_DE,
    "fr": QUERY_STATEMENT_REWRITING_PROMPT_FR,
    "it": QUERY_STATEMENT_REWRITING_PROMPT_IT,
}

In [None]:
# Default organisation label. NOTE(review): the only visible use is in
# commented-out code of the generation loop below — confirm it is still needed.
organizations = "EAK"

In [None]:
# Print the token count of every document that is above the token budget.
for _, document in tqdm.tqdm(df.iterrows()):
    token_count = len(tokenizer.encode(document.text))
    if token_count <= MAX_EMBEDDING_TOKENS:
        continue
    print(token_count)

In [None]:
class HYQReformulationSchema(BaseModel):
    hyq: List[str]
    hyq_declarative: List[str]

for i, row in tqdm.tqdm(df.iterrows()):


    prompt = prompts.get(row.language)
    messages = [
        {"role": "user", "content": prompt.format(n_alt_queries=3, text=row.text)}
    ]
    res = await llm_client.beta.chat.completions.parse(
        model="gpt-4o",
            temperature=0,
            top_p=0.95,
            max_tokens=2048,
            messages=messages,
            response_format=HYQReformulationSchema,
        )
    
    hyq = res.choices[0].message.parsed.hyq
    hyq_declarative = res.choices[0].message.parsed.hyq_declarative
    #if row.organizations:
    #organizations = ",".join(ast.literal_eval(row.organizations))
    #else:

    #organizations = organizations

    #df.loc[i, "organizations"] = row.organizations
    df.loc[i, "hyq"] = "{SEP}".join(hyq)
    df.loc[i, "hyq_declarative"] = "{SEP}".join(hyq_declarative)

    """
    text_embedding = await get_embedding(row.text)
    df.loc[i, "text_embedding"] = str(text_embedding)
    
    tags_embedding = await get_embedding(row.tags)
    df.loc[i, "tags_embedding"] = str(tags_embedding)

    subtopics_embedding = await get_embedding(row.subtopics)
    df.loc[i, "subtopics_embedding"] = str(subtopics_embedding)

    summary_embedding = await get_embedding(row.summary)
    df.loc[i, "text_summary"] = str(summary_embedding)

    hyq_embedding = await get_embedding(",".join(hyq))
    df.loc[i, "hyq_embedding"] = str(hyq_embedding)
    
    hyq_declarative_embedding = await get_embedding(",".join(hyq_declarative))
    df.loc[i, "hyq_declarative_embedding"] = str(hyq_declarative_embedding)
    """

In [None]:
# NOTE(review): this and the next two cells write the same `df` variable to
# different source-specific files — presumably only the cell matching the
# currently loaded source is meant to be run; confirm.
df.to_csv("indexing/data/to_upsert/akis/akis_EMBED.csv", index=None)

In [None]:
# Output file for the AHV-IV memento source (run only when `df` holds that source — TODO confirm).
df.to_csv("indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm_EMBED.csv", index=None)

In [None]:
# Output file for the AHV Lernbaustein source (run only when `df` holds that source — TODO confirm).
df.to_csv("indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm_EMBED.csv", index=None)

In [None]:
# Reload the Lernbaustein file written by the cell above.
df = pd.read_csv("indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm_EMBED.csv")

# Imports

In [None]:
import os
from dotenv import load_dotenv
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
import pandas as pd
import ast
from typing import List
from pydantic import BaseModel
from openai import AsyncOpenAI
import tqdm
from dataclasses import dataclass
import deepl
import asyncio

In [None]:
# Load variables from a .env file, then read the API keys from the
# environment (None when a key is not set).
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")

In [None]:
# Async OpenAI client used by every LLM call in this notebook.
llm_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

In [None]:
# DeepL client used for the machine translations below.
translator = deepl.Translator(DEEPL_API_KEY)

# Scrape tags from AHV-IV glossary

In [None]:
# Scrape the AHV-IV glossary in every available language.
# Result: glossary_ahv_iv[lang][term] = {"description": <text>}; one CSV per
# language is written to indexing/data/glossary/.
glossary_ahv_iv = {}

urls = {
    "de": "https://www.ahv-iv.ch/de/Sozialversicherungen/Glossar",
    "fr": "https://www.ahv-iv.ch/fr/Assurances-sociales/Glossaire",
    "it": "https://www.ahv-iv.ch/it/Assicurazioni-sociali/Glossario",
    "en": "https://www.ahv-iv.ch/en/Social-insurances/Glossary",
}

for lang, url in urls.items():
    res = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page into an empty glossary.
    res.raise_for_status()
    # An explicit parser keeps the result independent of which parsers are installed.
    soup = BeautifulSoup(res.content, "html.parser")
    terms = {}
    for element in soup.find_all("div", {"class": "app-glossary-single-term"}):
        term = element.find("strong").text.strip()
        terms[term] = {"description": element.find("p").text.strip()}
    glossary_ahv_iv[lang] = terms

for lang, terms in glossary_ahv_iv.items():
    pd.DataFrame.from_dict(terms).T.to_csv(f"indexing/data/glossary/glossary_ahv_iv_{lang}.csv")
#glossary_ahv_iv

# Scrape tags from BSV glossary

In [None]:
# Scrape the BSV glossary (de/fr/it).
# Result: glossary_bsv[lang][term] = {"description": <text>, and when present
# "synonyms": [markdown links], "links": [markdown links]}; one CSV per
# language is written to indexing/data/glossary/.
glossary_bsv = {}

urls = {
    "de": "https://www.bsv.admin.ch/bsv/de/home/glossar.html",
    "fr": "https://www.bsv.admin.ch/bsv/fr/home/glossar.html#glossary-caisse_de_pension",
    "it": "https://www.bsv.admin.ch/bsv/it/home/glossar.html"
}

for lang, url in urls.items():
    res = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page into an empty glossary.
    res.raise_for_status()
    # An explicit parser keeps the result independent of which parsers are installed.
    soup = BeautifulSoup(res.content, "html.parser")
    # In-page "#glossary-<slug>" anchors are rewritten to "<base><slug>.html".
    glossary_base_url = f"https://www.bsv.admin.ch/bsv/{lang}/home/glossar/"
    terms = {}

    for element in soup.find_all("dl"):
        term = element.find("dt").text.strip()
        definition = element.find("dd")
        entry = {"description": definition.find("p").text.strip()}

        related = definition.find("div", {"class": "relatedlinks"})
        if related:
            entry["synonyms"] = [
                f'[{a.text}]({a["href"].replace("#glossary-", glossary_base_url)}.html)'
                for a in related.find_all("a")
            ]

        referenced = definition.find("div", {"class": "referencedlinks"})
        if referenced:
            entry["links"] = [f'[{a.text}]({a["href"]})' for a in referenced.find_all("a")]

        terms[term] = entry

    glossary_bsv[lang] = terms

for lang, terms in glossary_bsv.items():
    pd.DataFrame.from_dict(terms).T.to_csv(f"indexing/data/glossary/glossary_bsv_{lang}.csv")
#glossary_bsv

# Translate tags to english

In [None]:
class TranslationSchema(BaseModel):
    """Structured LLM answer holding one translated tag and its description."""

    key: str  # English translation of the tag
    value: str  # English translation of the description

    def to_dict(self):
        """Return the translation as a single-entry ``{key: value}`` dict."""
        return {self.key: self.value}


# Translation prompt; placeholders: {tag} and {desc}. The schema shown to the
# model matches TranslationSchema above (flat `key`/`value` fields).
prompt = """<purpose>
You must translate to English the following <tag> and <description>.
Keep in mind the context is social insurances in Switzerland.
</purpose>

<response_format>
class TranslationSchema(BaseModel):
    key: str # key (tag) is the english translation of <tag>
    value: str # value (description) is the english translation of <description>
</response_format>

<tag>
{tag}
</tag>

<description>
{desc}
</description>
"""

### AHV-IV

In [None]:
glossary_ahv_iv_trans = {}

for lang in glossary_ahv_iv.keys():
    glossary_ahv_iv_trans[f"{lang}_en"] = {}
    glossary_ahv_iv_trans[f"{lang}_en"]["deepl_tag"] = {}
    glossary_ahv_iv_trans[f"{lang}_en"]["deepl_desc"] = {}
    glossary_ahv_iv_trans[f"{lang}_en"]["llm_tag"] = {}
    glossary_ahv_iv_trans[f"{lang}_en"]["llm_desc"] = {}
    for tag, desc in tqdm.tqdm(glossary_ahv_iv[lang].items()):
    
        # deepl translation
        trans_deepl = translator.translate_text([tag, desc["description"]], target_lang="EN-GB", model_type="prefer_quality_optimized")
        key = trans_deepl[0].text.lower()
        value = trans_deepl[1].text
        glossary_ahv_iv_trans[f"{lang}_en"]["deepl_tag"][tag.lower()] = key
        glossary_ahv_iv_trans[f"{lang}_en"]["deepl_desc"][tag.lower()] = value

        # llm translation
        messages = [{"role": "developer", "content": prompt.format(tag=tag, desc=desc["description"])}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=TranslationSchema,
            )

        trans_llm = res.choices[0].message.parsed.to_dict()
        [key] = trans_llm.keys()
        [value] = trans_llm.values()
        glossary_ahv_iv_trans[f"{lang}_en"]["llm_tag"][tag.lower()] = key.lower()
        glossary_ahv_iv_trans[f"{lang}_en"]["llm_desc"][tag.lower()] = value


In [None]:
# Preview the German->English AHV-IV translations (this section builds
# `glossary_ahv_iv_trans`; the BSV dict does not exist yet at this point).
pd.DataFrame.from_dict(glossary_ahv_iv_trans["de_en"])

In [None]:
# Write one CSV per language pair (a plain loop avoids building and echoing a
# throwaway list of None values).
for lang, translations in glossary_ahv_iv_trans.items():
    pd.DataFrame.from_dict(translations).to_csv(f"indexing/data/glossary/glossary_ahv_iv_trans_{lang}.csv")

### BSV

In [None]:
glossary_bsv_trans = {}

for lang in glossary_bsv.keys():
    glossary_bsv_trans[f"{lang}_en"] = {}
    glossary_bsv_trans[f"{lang}_en"]["deepl_tag"] = {}
    glossary_bsv_trans[f"{lang}_en"]["deepl_desc"] = {}
    glossary_bsv_trans[f"{lang}_en"]["llm_tag"] = {}
    glossary_bsv_trans[f"{lang}_en"]["llm_desc"] = {}
    for tag, desc in tqdm.tqdm(glossary_bsv[lang].items()):
    
        # deepl translation
        trans_deepl = translator.translate_text([tag, desc["description"]], target_lang="EN-GB", model_type="prefer_quality_optimized")
        key = trans_deepl[0].text.lower()
        value = trans_deepl[1].text
        glossary_bsv_trans[f"{lang}_en"]["deepl_tag"][tag.lower()] = key
        glossary_bsv_trans[f"{lang}_en"]["deepl_desc"][tag.lower()] = value

        # llm translation
        messages = [{"role": "developer", "content": prompt.format(tag=tag, desc=desc["description"])}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=TranslationSchema,
            )

        trans_llm = res.choices[0].message.parsed.to_dict()
        [key] = trans_llm.keys()
        [value] = trans_llm.values()
        glossary_bsv_trans[f"{lang}_en"]["llm_tag"][tag.lower()] = key.lower()
        glossary_bsv_trans[f"{lang}_en"]["llm_desc"][tag.lower()] = value


In [None]:
pd.DataFrame.from_dict(glossary_bsv_trans["it_en"])

In [None]:
# Write one CSV per language pair (a plain loop avoids building and echoing a
# throwaway list of None values).
for lang, translations in glossary_bsv_trans.items():
    pd.DataFrame.from_dict(translations).to_csv(f"indexing/data/glossary/glossary_bsv_trans_{lang}.csv")

# Align glossary tags

In [None]:
class SelectTranslationSchema(BaseModel):
    """Structured LLM answer holding the chosen translation."""

    selection: str


# Prompt asking the model to choose between two candidate translations.
# Placeholders: {translation_1}, {translation_2} and {desc} (the context).
prompt = """<purpose>
Your task is to select the best (most accurate) of the two candidate translations below, based on the provided <context>.
</purpose>

<response_format>
SelectTranslationSchema(BaseModel):
    selection: str # the best (most accurate) translation based on context
</response_format>

<translation_1>
{translation_1}
</translation_1>

<translation_2>
{translation_2}
</translation_2>

<context>
{desc}
</context>
"""

### Align across AHV-IV (deepl-llm)

In [None]:
# Reload the AHV-IV translation tables; the first CSV column becomes the row index.
df_de = pd.read_csv("indexing/data/glossary/glossary_ahv_iv_trans_de_en.csv", index_col=0)
df_fr = pd.read_csv("indexing/data/glossary/glossary_ahv_iv_trans_fr_en.csv", index_col=0)
df_it = pd.read_csv("indexing/data/glossary/glossary_ahv_iv_trans_it_en.csv", index_col=0)

#### 1. Merge across translation methods (DeepL vs. LLM) within each language

In [None]:
# Flag the rows where DeepL and the LLM produced the same English tag.
for frame in (df_de, df_fr, df_it):
    frame["aligned"] = frame["deepl_tag"] == frame["llm_tag"]

In [None]:
# de
for i, row in tqdm.tqdm(df_de.iterrows()):

    if row["aligned"]:
        df_de.loc[i, "tag"] = row["deepl_tag"]
        
    else:
        messages = [{"role": "developer",
                     "content": prompt.format(
                         translation_1=row["deepl_tag"],
                         translation_2=row["llm_tag"],
                         desc="\n\n".join([row["deepl_desc"], row["llm_desc"]]))}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=SelectTranslationSchema,
            )
        
        df_de.loc[i, "tag"] = res.choices[0].message.parsed.selection

# fr
for i, row in tqdm.tqdm(df_fr.iterrows()):

    if row["aligned"]:
        df_fr.loc[i, "tag"] = row["deepl_tag"]
        
    else:
        messages = [{"role": "developer",
                     "content": prompt.format(
                         translation_1=row["deepl_tag"],
                         translation_2=row["llm_tag"],
                         desc="\n\n".join([row["deepl_desc"], row["llm_desc"]]))}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=SelectTranslationSchema,
            )
        
        df_fr.loc[i, "tag"] = res.choices[0].message.parsed.selection

# it
for i, row in tqdm.tqdm(df_it.iterrows()):

    if row["aligned"]:
        df_it.loc[i, "tag"] = row["deepl_tag"]
        
    else:
        messages = [{"role": "developer",
                     "content": prompt.format(
                         translation_1=row["deepl_tag"],
                         translation_2=row["llm_tag"],
                         desc="\n\n".join([row["deepl_desc"], row["llm_desc"]]))}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=SelectTranslationSchema,
            )
        
        df_it.loc[i, "tag"] = res.choices[0].message.parsed.selection

In [None]:
# Persist the AHV-IV tables including the new "aligned" and "tag" columns (row index is written too).
df_de.to_csv("indexing/data/glossary/glossary_ahv_iv_trans_de_en_aligned.csv")
df_fr.to_csv("indexing/data/glossary/glossary_ahv_iv_trans_fr_en_aligned.csv")
df_it.to_csv("indexing/data/glossary/glossary_ahv_iv_trans_it_en_aligned.csv")

#### 2. Merge between languages

In [None]:
# Distinct English tags over the three language tables, in first-seen order.
tags_ahv_iv = list(
    pd.concat([frame["tag"] for frame in (df_de, df_fr, df_it)])
    .drop_duplicates()
    .values
)

### Align across BSV (deepl-llm)

In [None]:
# Reload the BSV translation tables; the first CSV column becomes the row index.
df_de = pd.read_csv("indexing/data/glossary/glossary_bsv_trans_de_en.csv", index_col=0)
df_fr = pd.read_csv("indexing/data/glossary/glossary_bsv_trans_fr_en.csv", index_col=0)
df_it = pd.read_csv("indexing/data/glossary/glossary_bsv_trans_it_en.csv", index_col=0)

#### 1. Merge across translation methods (DeepL vs. LLM) within each language

In [None]:
# Flag the rows where DeepL and the LLM produced the same English tag.
for frame in (df_de, df_fr, df_it):
    frame["aligned"] = frame["deepl_tag"] == frame["llm_tag"]

In [None]:
# de
for i, row in tqdm.tqdm(df_de.iterrows()):

    if row["aligned"]:
        df_de.loc[i, "tag"] = row["deepl_tag"]
        
    else:
        messages = [{"role": "developer",
                     "content": prompt.format(
                         translation_1=row["deepl_tag"],
                         translation_2=row["llm_tag"],
                         desc="\n\n".join([row["deepl_desc"], row["llm_desc"]]))}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=SelectTranslationSchema,
            )
        
        df_de.loc[i, "tag"] = res.choices[0].message.parsed.selection

# fr
for i, row in tqdm.tqdm(df_fr.iterrows()):

    if row["aligned"]:
        df_fr.loc[i, "tag"] = row["deepl_tag"]
        
    else:
        messages = [{"role": "developer",
                     "content": prompt.format(
                         translation_1=row["deepl_tag"],
                         translation_2=row["llm_tag"],
                         desc="\n\n".join([row["deepl_desc"], row["llm_desc"]]))}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=SelectTranslationSchema,
            )
        
        df_fr.loc[i, "tag"] = res.choices[0].message.parsed.selection

# it
for i, row in tqdm.tqdm(df_it.iterrows()):

    if row["aligned"]:
        df_it.loc[i, "tag"] = row["deepl_tag"]
        
    else:
        messages = [{"role": "developer",
                     "content": prompt.format(
                         translation_1=row["deepl_tag"],
                         translation_2=row["llm_tag"],
                         desc="\n\n".join([row["deepl_desc"], row["llm_desc"]]))}]
        res = await llm_client.beta.chat.completions.parse(
                model="gpt-4o",
                temperature=0,
                top_p=0.95,
                max_tokens=512,
                messages=messages,
                response_format=SelectTranslationSchema,
            )
        
        df_it.loc[i, "tag"] = res.choices[0].message.parsed.selection

In [None]:
# Persist the BSV tables including the new "aligned" and "tag" columns (row index is written too).
df_de.to_csv("indexing/data/glossary/glossary_bsv_trans_de_en_aligned.csv")
df_fr.to_csv("indexing/data/glossary/glossary_bsv_trans_fr_en_aligned.csv")
df_it.to_csv("indexing/data/glossary/glossary_bsv_trans_it_en_aligned.csv")

#### 2. Merge between languages

In [None]:
# Distinct English tags over the three language tables, in first-seen order.
tags_bsv = list(
    pd.concat([frame["tag"] for frame in (df_de, df_fr, df_it)])
    .drop_duplicates()
    .values
)

# Merge all tags between AHV-IV and BSV

In [None]:
# Union of the AHV-IV and BSV tags without duplicates (order is not defined).
tags = list({*tags_ahv_iv, *tags_bsv})

In [None]:
len(tags)

# Augment with description

In [None]:
# Reload the aligned translation tables of both glossaries; the first CSV
# column (written as the row index above) becomes the index again.
df_ahv_iv_de = pd.read_csv("indexing/data/glossary/glossary_ahv_iv_trans_de_en_aligned.csv", index_col=0)
df_ahv_iv_fr = pd.read_csv("indexing/data/glossary/glossary_ahv_iv_trans_fr_en_aligned.csv", index_col=0)
df_ahv_iv_it = pd.read_csv("indexing/data/glossary/glossary_ahv_iv_trans_it_en_aligned.csv", index_col=0)
df_bsv_de = pd.read_csv("indexing/data/glossary/glossary_bsv_trans_de_en_aligned.csv", index_col=0)
df_bsv_fr = pd.read_csv("indexing/data/glossary/glossary_bsv_trans_fr_en_aligned.csv", index_col=0)
df_bsv_it = pd.read_csv("indexing/data/glossary/glossary_bsv_trans_it_en_aligned.csv", index_col=0)

In [None]:
# de/fr/it: for each language, combine the AHV-IV and BSV aligned tables, keep
# one row per English tag and attach the original-language description.
# Output: indexing/data/glossary/tags_with_desc_<lang>.csv

for df_ahv_iv, df_bsv, lang in [
    (df_ahv_iv_de, df_bsv_de, "de"),
    (df_ahv_iv_fr, df_bsv_fr, "fr"),
    (df_ahv_iv_it, df_bsv_it, "it"),
]:
    df = pd.concat([df_ahv_iv, df_bsv], axis=0)

    # Chained (non in-place) calls return a new frame, so nothing is mutated
    # on a slice of `df` (no SettingWithCopyWarning).
    df_tag_desc = (
        df.drop_duplicates(subset="tag")[["tag", "llm_desc"]]
        .reset_index()
        .rename(columns={"tag": "tag_en", "index": f"tag_{lang}", "llm_desc": "description_en"})
    )

    # NOTE(review): original-language descriptions are read from the AHV-IV
    # glossary only, so BSV-only tags get no description here — confirm intended.
    df_lang = pd.read_csv(
        f"indexing/data/glossary/glossary_ahv_iv_{lang}.csv",
        names=[f"tag_{lang}", f"description_{lang}"],
        header=0,
    )
    df_lang[f"tag_{lang}"] = df_lang[f"tag_{lang}"].str.lower()

    df_tag_desc.merge(df_lang, how="left", on=f"tag_{lang}").to_csv(
        f"indexing/data/glossary/tags_with_desc_{lang}.csv", index=None
    )

In [None]:
# Reload the per-language tag tables and mark each row with its language code.
df_de = pd.read_csv("indexing/data/glossary/tags_with_desc_de.csv").assign(language="de")
df_fr = pd.read_csv("indexing/data/glossary/tags_with_desc_fr.csv").assign(language="fr")
df_it = pd.read_csv("indexing/data/glossary/tags_with_desc_it.csv").assign(language="it")
df_it

In [None]:
# Stack the three language tables and persist the full result.
df = pd.concat([df_de, df_fr, df_it], axis=0)
df.to_csv("indexing/data/glossary/tags_aligned.csv", index=None)

# Keep one row per English tag with all description columns, sorted by tag.
df = (
    df[["tag_en", "description_en", "description_de", "description_fr", "description_it", "language"]]
    .drop_duplicates("tag_en")
    .sort_values(by="tag_en")
    .reset_index(drop=True)
)
df

In [None]:
# Fill every missing de/fr/it description by translating the English
# description with DeepL.
for i, row in df.iterrows():
    if pd.isna(row["description_en"]):
        # Nothing to translate from.
        continue

    for lang_code in ("de", "fr", "it"):
        column = f"description_{lang_code}"
        # pd.isna covers NaN as well as None, unlike a float type check.
        if pd.isna(row[column]):
            translation = translator.translate_text(
                row["description_en"],
                target_lang=lang_code.upper(),
                model_type="prefer_quality_optimized",
            )
            df.loc[i, column] = translation.text
        

In [None]:
df

In [None]:
# Persist the wide table (one row per English tag, one description column per language).
df.to_csv("indexing/data/glossary/tags_aligned_postgres.csv", index=None)

In [None]:
# Reshape from wide to long: one row per (tag, language) with a single
# "description" column.
melted = df.melt(
    id_vars=["tag_en", "description_en", "language"],
    value_vars=["description_de", "description_fr", "description_it"],
    var_name="temp_language",
    value_name="description",
)

# The language code is the suffix of the melted column name
# ("description_de" -> "de"); it replaces the original language value.
language_codes = melted["temp_language"].str.split('_').str[-1]
melted["temp_language"] = language_codes
melted["language"] = language_codes

refactored_df = melted.drop(columns=["temp_language"]).reset_index(drop=True)

refactored_df

In [None]:
# Persist the long table sorted by English tag.
refactored_df.sort_values("tag_en").reset_index(drop=True).to_csv("indexing/data/glossary/tags_aligned_postgres_temp.csv", index=None)

In [None]:
refactored_df.sort_values("tag_en").reset_index(drop=True)

In [None]:
refactored_df[refactored_df["tag_en"] == "professional measure"]

# TO DO:
- remove duplicate tag entries
- eg. ahv/avs number (3x)
- ability/inability to work, capacity/incapacity for work, incapacity to earn, work capacity/incapacity

# Remove synonymous entries

In [None]:
# Reload the long (one row per tag and language) table written above.
df = pd.read_csv("indexing/data/glossary/tags_aligned_postgres_temp.csv")

In [None]:
class Duplicate(BaseModel):
    values: List[str]
    ids: List[int]

class DuplicateIdentificationSchema(BaseModel):
    duplicates: List[Duplicate]
    
prompt = """<purpose>
Your task is to identify duplicate or synonymous tag entries in <available tags> based on their name and description.
Synonymous entries might have a different name but the description will be very similar.
Identify ALL possible tag duplicates.
</purpose>

<response_format>
Duplicate(BaseModel):
    values: List[str] # the tag str values of the duplicates
    ids: List[int] # the indices of the duplicate values

DuplicateIdentificationSchema(BaseModel):
    duplicates: List[Duplicate]
</response_format>

<available_tags>
{tags}
<available_tags>
"""

messages = [{"role": "developer", "content": prompt.format(tags=df[["tag_en", "description_en"]][::3])}]

res = await llm_client.beta.chat.completions.parse(
        model="gpt-4o",
        temperature=0,
        top_p=0.95,
        max_tokens=4096,
        messages=messages,
        response_format=DuplicateIdentificationSchema,
    )

In [None]:
res.choices[0].message.parsed.duplicates

In [None]:
df[2:10]

In [None]:
# NOTE(review): hard-coded row labels, presumably picked by hand from the
# duplicate groups reported by the LLM above — confirm they still match after
# the pipeline is re-run.
df.drop([891, 892, 893], inplace=True)
df.drop([6, 7, 8], inplace=True)
df.drop([885, 886, 887], inplace=True)

In [None]:
df[877:887]

# -----> CURRENT APPROACH

- get subtopics in de/fr/it from ahv-iv/bsv glossaries
- augment docs with subtopics in each language (no english normalization)

In [None]:
import re

In [None]:
def _load_glossary(path, **read_csv_options):
    """Read one glossary CSV into a frame with columns `subtopic` and `description`."""
    # NOTE(review): with `names` given, `header=1` makes pandas treat the file's second line
    # as the header row to replace, so the first two lines never reach the frame. If the CSVs
    # carry a single header line, the first glossary entry is lost (header=0 would keep it) —
    # confirm against the files.
    return pd.read_csv(path, names=["subtopic", "description"], header=1, **read_csv_options)


# AHV-IV glossaries (de/fr/it).
df_ahv_iv_de = _load_glossary("indexing/data/glossary/glossary_ahv_iv_de.csv")
df_ahv_iv_fr = _load_glossary("indexing/data/glossary/glossary_ahv_iv_fr.csv")
df_ahv_iv_it = _load_glossary("indexing/data/glossary/glossary_ahv_iv_it.csv")

# BSV glossaries (de/fr/it): only the first two columns are read.
df_bsv_de = _load_glossary("indexing/data/glossary/glossary_bsv_de.csv", usecols=[0, 1])
df_bsv_fr = _load_glossary("indexing/data/glossary/glossary_bsv_fr.csv", usecols=[0, 1])
df_bsv_it = _load_glossary("indexing/data/glossary/glossary_bsv_it.csv", usecols=[0, 1])

In [None]:
dfs = [df_ahv_iv_de, df_ahv_iv_fr, df_ahv_iv_it, df_bsv_de, df_bsv_fr, df_bsv_it]

# Replacements are applied in insertion order: "- " must be handled before "-",
# and " " -> "_" comes last so that spaces produced by earlier steps are converted too.
replace_mapping = {
    ":": "",
    "'": "_",
    "’": "_",
    '"': "",
    "- ": " ",
    "-": "_",
    "(": "",
    ")": "",
    "/": " ",
    " ": "_",
}

# Snapshot of the replacements, so the helper keeps working even if `replace_mapping`
# is later rebound by another cell.
_SUBTOPIC_REPLACEMENTS = tuple(replace_mapping.items())
_UNDERSCORE_RUN = re.compile(r"_+")


def normalize_subtopic(label: str) -> str:
    """Return `label` lower-cased, with punctuation/space replacements applied and runs of "_" collapsed.

    Collapsing once at the end yields the same string as collapsing after every
    replacement, because no replacement pattern contains "_".
    """
    normalized = label.lower()
    for old, new in _SUBTOPIC_REPLACEMENTS:
        normalized = normalized.replace(old, new)
    return _UNDERSCORE_RUN.sub("_", normalized)


# Normalise each glossary in place, one column-wise pass per frame instead of one `.loc`
# write per cell and replacement. The loop variable is deliberately not named `df`, so the
# notebook-level `df` is left untouched.
for glossary_df in dfs:
    glossary_df["subtopic"] = glossary_df["subtopic"].apply(normalize_subtopic)
    

In [None]:
# Persist every normalised glossary next to its source file (index column omitted).
for glossary_frame, output_path in (
    (df_ahv_iv_de, "indexing/data/glossary/glossary_ahv_iv_de_normalized.csv"),
    (df_ahv_iv_fr, "indexing/data/glossary/glossary_ahv_iv_fr_normalized.csv"),
    (df_ahv_iv_it, "indexing/data/glossary/glossary_ahv_iv_it_normalized.csv"),
    (df_bsv_de, "indexing/data/glossary/glossary_bsv_de_normalized.csv"),
    (df_bsv_fr, "indexing/data/glossary/glossary_bsv_fr_normalized.csv"),
    (df_bsv_it, "indexing/data/glossary/glossary_bsv_it_normalized.csv"),
):
    glossary_frame.to_csv(output_path, index=None)

# Scrape up-to-date data

In [None]:
import requests
from bs4 import BeautifulSoup
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import PyPDFToDocument

In [None]:
@dataclass
class Document:
    """Record for one augmented RAG document, exported to CSV via `__dict__`.

    NOTE(review): `tags` and `subtopics` are annotated as lists, but the tagging
    loops in this notebook pass comma-joined strings — confirm which form the
    downstream consumers expect.
    """
    url: str  # source URL of the document
    text: str  # document text
    language: str  # language code such as "de", "fr" or "it"
    tags: List[str]  # top-level tags
    subtopics: List[str] = None  # glossary subtopics assigned to the document
    #source: str = None
    summary: str = None  # short LLM-written summary
    doctype: str = None  # e.g. "context_doc"
    organization: str = None  # organization scope string, e.g. "EAK"
    #context: str = None
    #parent_node: int = None
    #childen_nodes: List[int] = None

### AHV-IV mementos

In [None]:
# Language-specific labels that precede the publication date on the memento pages.
replace_mapping = {
    "Stand: ": "",
    "Etat: ": "",
    "Stato: ": ""
}

# Snapshot taken when this cell runs: `replace_mapping` is also the name of the
# subtopic-normalisation table in another cell, so reading the global at call time
# would apply the wrong replacements if that cell were re-run later.
_PUBLICATION_DATE_REPLACEMENTS = tuple(replace_mapping.items())

# Two-character URL suffix of the AHV-IV short links -> document language.
_LANGUAGE_BY_URL_SUFFIX = {".d": "de", ".f": "fr", ".i": "it"}


def extract_pdf_metadata(pdf: BeautifulSoup):
    """Extract URL, language, title and publication date from one PDF entry of a memento page.

    Args:
        pdf: parsed HTML element of a single PDF entry; it must contain a link (`a`),
            a `div.co-document-main` holding the title in a `<b>` tag, and a
            `div.co-document-state` holding the publication date.

    Returns:
        dict with the keys `url`, `language`, `title` and `publication_date`.
    """
    url = "https://www.ahv-iv.ch" + pdf.a["href"]

    # URLs without a recognised language suffix (e.g. direct ".pdf" links) default to German.
    language = _LANGUAGE_BY_URL_SUFFIX.get(url[-2:], "de")

    pdf_title = pdf.find("div", {"class": "co-document-main"}).b.text.strip()
    publication_date = pdf.find("div", {"class": "co-document-state"}).text.strip()
    # Strip the "Stand: " / "Etat: " / "Stato: " label so only the date remains.
    for old, new in _PUBLICATION_DATE_REPLACEMENTS:
        publication_date = publication_date.replace(old, new)

    return {
        "url": url,
        "language": language,
        "title": pdf_title,
        "publication_date": publication_date,
    }

In [None]:
# Memento codes grouped by top-level tag. A code is the part of the short link
# https://www.ahv-iv.ch/p/<code>.<language suffix>; every code listed here exists
# in the mapping for all three language suffixes.
_MEMENTO_CODES_BY_TAG = {
    "general": ["1.01", "1.02", "1.03", "1.04", "1.05", "1.07"],
    "contributions": [
        "2.01", "2.02", "2.03", "2.04", "2.05", "2.06",
        "2.07", "2.08", "2.09", "2.10", "2.11", "2.12",
    ],
    "bankruptcy": ["2.14"],
    "ahv_stabilisation_21": ["31"],
    "ahv_services": ["3.01", "3.02", "3.03", "3.04", "3.05", "3.06", "3.07", "3.08"],
    "iv_services": [
        "4.01", "4.02", "4.03", "4.04", "4.05", "4.06", "4.07", "4.08", "4.09",
        "4.11", "4.12", "4.13", "4.14", "4.15", "4.16",
    ],
    "complementary_services": ["5.01", "5.02", "52"],
    "transitory_services": ["5.03"],
    "loss_of_earnings_allowance": ["6.01"],
    "maternity_allowance": ["6.02"],
    "allowance_for_the_other_parent": ["6.04"],
    "family_allowances": ["6.08", "6.09"],
    "support_allowance": ["6.10"],
    "adoption_allowance": ["6.11"],
    "international": ["10.01", "10.02", "10.03", "11.01", "880", "890"],
    "accident_insurance": ["6.05"],
    "occupational_benefits": ["6.06"],
    "health_insurance": ["6.07"],
    "annual_modifications": ["1.2020", "1.2024", "1.2025"],
    "hearing_aids": ["007.003", "300.001"],
}

# URL suffixes of the German, French and Italian editions.
_MEMENTO_LANGUAGE_SUFFIXES = ("d", "f", "i")

# URLs that do not follow the <code>.<suffix> scheme for all three languages.
# NOTE(review): code 51 is only mapped for French; if /p/51.d or /p/51.i are listed
# on the site, looking them up in the mapping raises KeyError — confirm.
_MEMENTO_EXTRA_URLS = {
    "https://www.ahv-iv.ch/p/51.f": "complementary_services",
    "https://www.ahv-iv.ch/Portals/0/adam/AHV-IV/7qdWS5iD0UCLkZDTcDwZDA/Document/Liste%20Paedakustiker%20August%202024.pdf": "hearing_aids",
    "https://www.ahv-iv.ch/Portals/0/adam/AHV-IV/VycMj-qmQEOCFBBfpGqx_w/Document/Liste%20ORL-Experten%20November%202024.pdf": "hearing_aids",
    "https://www.ahv-iv.ch/Portals/0/adam/AHV-IV/fewE3XQRcUWJxYaAJMWd8Q/Document/Liste%20implantierte%20H%C3%B6rhilfen%20Oktober%202024.pdf": "hearing_aids",
}

# Memento URL -> single-element list holding its top-level tag.
# Generated instead of spelled out per language, so each URL appears exactly once
# (the PDF links used to be repeated as duplicate dict keys) and every value is its own list.
ahv_iv_memento_mapping = {
    f"https://www.ahv-iv.ch/p/{code}.{suffix}": [tag]
    for suffix in _MEMENTO_LANGUAGE_SUFFIXES
    for tag, codes in _MEMENTO_CODES_BY_TAG.items()
    for code in codes
}
ahv_iv_memento_mapping.update({url: [tag] for url, tag in _MEMENTO_EXTRA_URLS.items()})

In [None]:
# 1. get topics
urls = ["https://www.ahv-iv.ch/de/Merkbl%C3%A4tter", "https://www.ahv-iv.ch/fr/M%C3%A9mentos", "https://www.ahv-iv.ch/it/Opuscoli"]

# Fail instead of hanging forever when the site does not answer.
REQUEST_TIMEOUT_S = 30

pdf_meta = []
for listing_url in urls:
    res = requests.get(listing_url, timeout=REQUEST_TIMEOUT_S)
    res.raise_for_status()  # do not parse an error page as if it were the listing
    soup = BeautifulSoup(res.content, "html.parser")
    topics = soup.find("ul", {"class": "ly-nav ly-nav-sub"})

    # topic page URL -> topic label, taken from the sub-navigation
    topics_urls = {}
    for topic in topics.find_all("li"):
        topics_urls[topic.a["href"]] = topic.text.strip()

    print("TOPIC URLS: ", topics_urls)

    # 2. get PDFs by topic
    for topic_url in topics_urls:
        res = requests.get(topic_url, timeout=REQUEST_TIMEOUT_S)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")

        pdf_div = soup.find("div", {"class": "co-documents-content"})

        for pdf_html in pdf_div.find_all("div", {"class": "sc-element co-fileType-PDF published"}):
            meta = extract_pdf_metadata(pdf_html)
            try:
                meta["tags"] = ahv_iv_memento_mapping[meta["url"]]
            except KeyError:
                # Same exception type as before, but naming the URL that needs a mapping entry.
                raise KeyError(
                    f"No tag mapping for memento URL {meta['url']!r}; add it to ahv_iv_memento_mapping"
                ) from None
            pdf_meta.append(meta)

    print("LEN PDF META: ", len(pdf_meta))

pd.DataFrame(pdf_meta)

In [None]:
# 3. download pdf bytes
pdf_urls = [pdf["url"] for pdf in pdf_meta]
fetcher = LinkContentFetcher()
pdf_bytes = fetcher.run(urls=pdf_urls)

# Number of fetched streams.
# NOTE(review): a later cell pairs the parsed documents with `pdf_meta` by position,
# so this count should equal len(pdf_meta) — confirm before continuing.
len(pdf_bytes["streams"])

In [None]:
# 4. parse pdfs
parser = PyPDFToDocument()
documents = parser.run(sources=pdf_bytes["streams"])

# Number of parsed documents (compare with the number of fetched streams above).
len(documents["documents"])

In [None]:
# 5. augment with topic + llm tags
# Prompt for assigning glossary subtopics and writing a short summary per document.
# Placeholders: {subtopics} (glossary entries), {url}, {tags} (top-level tags), {doc} (document text).
# The field names in <response_format> match SubtopicTaggingSchema (`subtopics`, `summary`).
prompt = """<instructions>
    <instruction>Your purpose is to assign subtopics from the list of available <subtopics> and create a short summary (3-5 sentences) for the following <doc>.</instruction>
    <instruction>The selected subtopics should be clearly recognizable in the <doc>.</instruction>
    <instruction>Look at the description of each subtopic to ensure selected subtopics are relevant to the <doc>.</instruction>
    <instruction>The selected subtopics must be related to the general <tags> and provide lower-level (more precise) information about the document content.</instruction>
    <instruction>You can also look at the provided <url> to help guide your tagging decision.</instruction>
</instructions>

<response_format>
class SubtopicTaggingSchema(BaseModel):
    subtopics: List[str] # one or more subtopics.
    summary: str # in same language as <doc>.
</response_format>

<subtopics>
{subtopics}
</subtopics>

<url>
{url}
</url>

<tags>
{tags}
</tags>

<doc>
{doc}
</doc>
"""

In [None]:
# Structured response of the subtopic-tagging prompt.
class SubtopicTaggingSchema(BaseModel):
    subtopics: List[str]  # one or more subtopics chosen from the provided glossary
    summary: str  # short summary, written in the document's language

# Important Note: only use ahv-iv subtopics for the moment

In [None]:
# Normalised AHV-IV glossary per language code, used as the pool of candidate subtopics.
subtopics = {
    lang_code: pd.read_csv(f"indexing/data/glossary/glossary_ahv_iv_{lang_code}_normalized.csv")
    for lang_code in ("de", "fr", "it")
}

In [None]:
tags = ahv_iv_memento_mapping.values()
tags = list(set([x[0] for x in list(tags)]))

organizations = "ZAS:ALL,EAK:ALL"

docs = []
for doc, meta in tqdm.tqdm(zip(documents["documents"], pdf_meta)):
    
    lang = meta["language"]
    subtopics_lang = "\n\n".join([f'**{row["subtopic"]}**: {row["description"]}' for i, row in subtopics[lang].iterrows()])
    messages = [{"role": "developer",
                 "content": prompt.format(
                     url=doc.meta["url"],
                     doc=doc.content,
                     subtopics=subtopics_lang,
                     tag=meta["tags"][0])
                }]
    
    res = await llm_client.beta.chat.completions.parse(
                    model="gpt-4o",
                    temperature=0,
                    top_p=0.95,
                    max_tokens=512,
                    messages=messages,
                    response_format=SubtopicTaggingSchema,
                )

    tags = meta["tags"]
    tags = ",".join(list(set(tags)))

    inferred_subtopics = []
    inferred_subtopics.extend(res.choices[0].message.parsed.subtopics)
    inferred_subtopics = ",".join(list(set(inferred_subtopics)))

    summary = res.choices[0].message.parsed.summary # for ECL
    
    docs.append(
        Document(
            url=doc.meta["url"],
            text=doc.content.strip(),
            tags=tags,
            subtopics=inferred_subtopics,
            language=lang,
            summary=summary,
            doctype="context_doc",
            organization=organizations
        )
    )


In [None]:
pd.DataFrame([doc.__dict__ for doc in docs])

In [None]:
# Export the tagged memento documents (one row per Document) for upserting.
pd.DataFrame([doc.__dict__ for doc in docs]).to_csv("indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm.csv", index=None)

In [None]:
# Reload the exported memento documents; `df` is the input of the splitting experiment below.
df = pd.read_csv("indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm.csv") 

In [None]:
# Structured response of the document-splitting prompt.
class DocumentSplits(BaseModel):
    split_str: List[str]  # exact strings from the document at which to split it
    previous_context: str  # summary contextualizing the preceding document content
    following_context: str  # summary contextualizing the following document content
    
# Prompt asking the model where to split an over-long document.
# Placeholders: {n_chars} (document length in characters) and {doc} (document text).
# The escape-sequence examples are written with doubled backslashes so the model sees the
# literal text "\n" and "\x0c" rather than an actual newline and form-feed character.
prompt = """<purpose>
Your task is to split the <doc> into parts based on the <rules>.
</purpose>

<rules>
Number of characters are defined in <n_chars>.
1. If the <doc> is ≤ 50,000 characters, do not split it. Return it as is.
2. If the <doc> is > 50,000 and ≤ 100,000 characters, split it into 2 logical, well-structured parts. 
3. If the <doc> is > 100,000 characters, split it into 3 logical, well-structured parts.

For each split_str in list, return the exact string sentence upon which to split (no reformulation to ensure str.split() works in python).
IMPORTANT: preserve original string formatting (eg. "\\n", "\\x0c", etc.).
Divide the text logically where it makes sense (e.g., avoid splitting a topic, paragraph, or section mid-way). Ensure each part maintains readability and continuity.
Write a short contextual summary for each part (content following part 1, content before/after part 2 contextualizing it, content before part 3 contextualizing it).
</rules>

<response_format>
DocumentSplits(BaseModel):
    split_str: List[str] # list of exact string matches to perform split upon. PRESERVE ORIGINAL STRING FORMATTING. Select 2 sentences from original text per list item.
    previous_context: str # to contextualize previous document content. Written in same language as <doc>. 3-5 detailed sentences capturing context.
    following_context: str # to contextualize following document content. Written in same language as <doc>. 3-5 detailed sentences capturing context.
</response_format>

<n_chars>
{n_chars}
</n_chars>

<doc>
{doc}
</doc>
"""

MAX_N_CHARS = 131072

for i, row in df.iterrows():

    if len(row.text) > MAX_N_CHARS:
        messages = [{"role": "developer",
             "content": prompt.format(
                 n_chars=len(row.text),
                 doc=row.text)
            }]

        res = await llm_client.beta.chat.completions.parse(
                        model="gpt-4o",
                        temperature=0,
                        top_p=0.95,
                        max_tokens=512,
                        messages=messages,
                        response_format=DocumentSplits,
                    )
        break

In [None]:
res.choices[0].message.parsed.split_str

In [None]:
res.choices[0].message.parsed.split_str[0]

In [None]:
row.text.split(res.choices[0].message.parsed.split_str[0])[1]

In [None]:
# Re-export `docs` to the memento CSV.
# NOTE(review): the splitting cells above only read `df` and do not modify `docs`, so this
# appears to rewrite the same content to the same path as the earlier export — confirm it is still needed.
pd.DataFrame([doc.__dict__ for doc in docs]).to_csv("indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm.csv", index=None)

### eak.admin.ch

### Praxisleitfaden EAK

In [None]:
# Load the German "Guide Pratique CAF/CFC" documents and preview the first rows.
df = pd.read_csv("indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de.csv")
df.head()

In [None]:
# Rename the singular `tag` column to `tags`, the name used by the tagging cells.
df.rename(columns={"tag": "tags"}, inplace=True)

In [None]:
tags = ahv_iv_memento_mapping.values()
tags = list(set([x[0] for x in list(tags)]))

docs = []
for i, row in tqdm.tqdm(df.iterrows()):

    messages = [{"role": "developer", "content": prompt.format(url=row["url"], doc=row["text"], tags=tags)}]

    res = await llm_client.beta.chat.completions.parse(
                    model="gpt-4o",
                    temperature=0,
                    top_p=0.95,
                    max_tokens=512,
                    messages=messages,
                    response_format=TagExtraction,
                )
    
    #tags = [row["tags"]]
    tags = []
    tags.extend(res.choices[0].message.parsed.tags)
    tags = ",".join(list(set(tags)))

    topics = []
    topics.extend(res.choices[0].message.parsed.topics)
    topics = ",".join(list(set(topics)))
    
    docs.append(
        Document(
            url = row["url"],
            text=row["text"],
            language=row["language"],
            tags=tags,
            topics=topics,
            summary=res.choices[0].message.parsed.summary,
            source=row["source"]
            )
        )



In [None]:
pd.DataFrame([doc.__dict__ for doc in docs])

In [None]:
# Export the tagged Guide Pratique documents (one row per Document).
pd.DataFrame([doc.__dict__ for doc in docs]).to_csv("indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de_tags_llm.csv", index=None)

# AKIS

In [None]:
df.head()

In [None]:
# Rebind `tokenizer` to the encoding tiktoken associates with "gpt-4o"; it is used below
# to cap the length of the document text sent to that model.
tokenizer = tiktoken.encoding_for_model("gpt-4o")

In [None]:
tags = ahv_iv_memento_mapping.values()
tags = list(set([x[0] for x in list(tags)]))

organizations = "EAK"

#docs = []
for i, row in tqdm.tqdm(df[983:].iterrows()):
    
    lang = "de"
    subtopics_lang = "\n\n".join([f'**{row["subtopic"]}**: {row["description"]}' for i, row in subtopics[lang].iterrows()])

    tokens = tokenizer.encode(row.text)
    if len(tokens) > 16_384 - 512: 
        text = tokenizer.decode(tokens[:15_872])
    else:
        text = row.text
        
    messages = [{"role": "developer",
                 "content": prompt.format(
                     url=row.url,
                     doc=row.text,
                     subtopics=subtopics_lang,
                     tags=row.tags)
                }]
    
    res = await llm_client.beta.chat.completions.parse(
                    model="gpt-4o",
                    temperature=0,
                    top_p=0.95,
                    max_tokens=512,
                    messages=messages,
                    response_format=SubtopicTaggingSchema,
                )

    inferred_subtopics = []
    inferred_subtopics.extend(res.choices[0].message.parsed.subtopics)
    inferred_subtopics = ",".join(list(set(inferred_subtopics)))

    summary = res.choices[0].message.parsed.summary # for ECL
    
    docs.append(
        Document(
            url=row.url,
            text=row.text.strip(),
            tags=row.tags,
            subtopics=inferred_subtopics,
            language=lang,
            summary=summary,
            doctype="context_doc",
            organization=organizations
        )
    )


In [None]:
len(messages[0]["content"])

In [None]:
i

In [None]:
pd.DataFrame([doc.__dict__ for doc in docs])

In [None]:
# Count documents whose `subtopics` value has length 0 or 1.
docs_df = pd.DataFrame([doc.__dict__ for doc in docs])
docs_df.subtopics.apply(lambda value: len(value) <= 1).sum()

In [None]:
# Export the augmented AKIS documents (one row per Document).
pd.DataFrame([doc.__dict__ for doc in docs]).to_csv("indexing/data/zas_eak_copilot/akis/akis_augmented.csv", index=None)

### AHV Lernbaustein 2024

In [None]:
# Load the AHV Lernbaustein 2024 documents and rename the `tag` column to `tags`.
lernbaustein_csv = "indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024.csv"
df = pd.read_csv(lernbaustein_csv).rename(columns={"tag": "tags"})

# Every row of this file gets the same `source` label.
df["source"] = "AHV Lernbaustein 2024"

In [None]:
# Structured-output schema for the topic-translation call: it is passed as
# `response_format`, and the parsed reply is read through `.translations`.
# Documented with comments rather than a docstring so that nothing extra can
# end up in the schema derived from this class.
class TranslationSchema(BaseModel):
    # Translated topics, expected to correspond one-to-one with the input list.
    translations: List[str]

# Prompt template for the translation call; `{topics}` is filled via
# `str.format` with the list of topics extracted for the current document.
translation_prompt = """<purpose>
Translate any element not in English in the list of <topics> to English.
IMPORTANT: If a topic is already in English, keep it as is.
Keep in mind the topic is Social Insurances.
</purpose>

<response_format>
TranslationSchema(BaseModel):
    translations: List[str] # exact translated correspondance with <topics>

Be careful to maintain acronyms, capital letters, etc.
All acronyms must remain unchanged (eg. LAMal -> LAMal, IV -> IV, EO -> EO, MSE -> MSE, etc.)
</response_format>

<examples>
EU-Abkommen -> EU agreement
EFTA-Abkommen -> EFTA Agreement
IV-Taggeld -> IV Daily Allowance
Erwerbsersatzordnung (EO) -> Income Compensation Ordinance (EO)
Mutterschaftsentschädigung (MSE) -> Maternity allowance (MSE)
</examples>

<topics>
{topics}
</topics>
"""

In [None]:
tags = ahv_iv_memento_mapping.values()
tags = list(set([x[0] for x in list(tags)]))

docs = []
for i, row in tqdm.tqdm(df.iterrows()):

    messages = [{"role": "developer", "content": prompt.format(url=row["url"], doc=row["text"], tags=tags)}]

    res = await llm_client.beta.chat.completions.parse(
                    model="gpt-4o",
                    temperature=0,
                    top_p=0.95,
                    max_tokens=512,
                    messages=messages,
                    response_format=TagExtraction,
                )
    
    #tags = [row["tags"]]
    tags = []
    tags.extend(res.choices[0].message.parsed.tags)
    tags = ",".join(list(set(tags)))

    topics = []
    topics.extend(res.choices[0].message.parsed.topics)
    

    # translate topics into EN with deepl
    #for i, topic in enumerate(topics):
    #    trans = translator.translate_text(topic, target_lang="EN-GB", model_type="prefer_quality_optimized")
    #    if trans.detected_source_lang != "EN":
    #        print(trans.detected_source_lang, topic, trans.text)
    #        topics[i] = trans.text

    for i, topic in enumerate(topics):
        trans = await llm_client.beta.chat.completions.parse(
                    model="gpt-4o",
                    temperature=0,
                    top_p=0.95,
                    max_tokens=512,
                    messages=[{"role": "developer", "content": translation_prompt.format(topics=topics)}],
                    response_format=TranslationSchema,
                )

    trans_topics = trans.choices[0].message.parsed.translations
    trans_topics = ",".join(list(set(trans_topics))).title()
    
    docs.append(
        Document(
            url = row["url"],
            text=row["text"],
            language=row["language"],
            tags=tags,
            topics=trans_topics,
            summary=res.choices[0].message.parsed.summary,
            source=row["source"]
            )
        )
    break

In [None]:
# Translated topics string of the first collected document (row label 0).
pd.DataFrame(list(map(vars, docs)))["topics"].loc[0]

In [None]:
# Raw (untranslated) topics extracted for the last processed document.
topics

In [None]:
# Persist the LLM-tagged documents (one row per Document) without the index.
pd.DataFrame(list(map(vars, docs))).to_csv(
    "indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm.csv",
    index=None,
)

# OR load base data

In [None]:
# Load the base DE/FR document table; the cells below read its `tags`,
# `embedding` and `url` columns.
df = pd.read_csv("./indexing/data/document_embed_DE_FR_language_tags.csv")

In [None]:
def _first_tag(raw):
    """Return the first element of a stringified tag list, or None.

    Non-string cells (e.g. NaN) and empty lists both yield None; the previous
    inline lambda raised IndexError on an empty list such as "[]".
    """
    if not isinstance(raw, str):
        return None
    parsed = ast.literal_eval(raw)
    return parsed[0] if parsed else None


# replace NaN with None and keep only the first tag of each list
tags = df.tags.apply(_first_tag)
df["tags"] = tags
# remove embedding column
df.drop(columns=["embedding"], inplace=True)
df.to_csv("./indexing/data/document_DE_FR_language_tags.csv", index=None)

### avs-ai

In [None]:
# Preview the first 30 rows of the base table.
df.head(30)

In [None]:
# Look up a tag for every URL through `ahv_iv_mapping`, then preview the
# first 30 results.
new_tags = df.url.map(ahv_iv_mapping)
new_tags.head(30)

In [None]:
# Number of rows whose URL contains the literal text "ahv-iv.ch".
# regex=False: with the default regex matching the "." would match any
# character. Series.sum() also skips missing URLs instead of propagating NaN.
df.url.str.contains("ahv-iv.ch", regex=False).sum()

In [None]:
# The first 106 rows are treated as the ahv-iv.ch documents.
# NOTE(review): 106 is hard-coded; presumably it equals the number of
# ahv-iv.ch URLs in `df` and assumes those rows come first -- confirm.
# .copy() makes this an independent frame, so assigning the tags below neither
# triggers SettingWithCopyWarning nor writes through into `df`.
ahv_iv_data = df[:106].copy()
ahv_iv_data.loc[:, "tags"] = new_tags[:106]
ahv_iv_data

In [None]:
# Write the tagged ahv-iv.ch subset to disk without the index column.
ahv_iv_data.to_csv("./indexing/data/ahv_iv_de_fr_tags.csv", index=None)

### eak.admin.ch

In [None]:
# Everything from row position 106 onwards is treated as eak.admin.ch content.
eak_admin_ch = df.iloc[106:]
eak_admin_ch

In [None]:
# URL fragment -> tag label. Each key is searched as a substring of a
# document URL; insertion order fixes the order of the tags collected per URL.
eak_mapping = {
    "anschluss":            "Anschluss",
    "firmen":               "Firmen",
    "beitraege_und_loehne": "Beiträge und Löhne",
    "kinder":               "Kinder",
    "erwerbsersatz":        "Erwerbsersatz",
    "ahv-konto":            "AHV-Konto",
    "zivilstand":           "Zivilstand",
    "ausbildung":           "Ausbildung",
    "arbeit":               "Arbeit",
    "arbeitsunterbruch":    "Arbeitsunterbruch",
    "pensionierung":        "Pensionierung",
    "im_ausland":           "Im Ausland",
    "organisation":         "Organisation",
    "publikationen":        "Publikationen",
    "kurse-und-beratung":   "Kurse und Beratung",
    "dokumentation":        "Private",
    "familienzulagen":      "Familienzulagen",
    "leistungen":           "Leistungen",
}

In [None]:
# One independent, initially empty tag list per distinct URL.
matches = {url: [] for url in eak_admin_ch["url"]}

In [None]:
# Collect, for every URL, the labels of all mapping keys found in it
# (case-insensitive substring match).
# The list is assigned rather than appended to, so re-running this cell or
# meeting the same URL in several rows no longer duplicates tags.
# NOTE(review): substring matching means "arbeit" also matches URLs that
# contain "arbeitsunterbruch" -- confirm whether that overlap is intended.
for url in eak_admin_ch.url:
    url_lower = url.lower()
    matches[url] = [label for key, label in eak_mapping.items() if key.lower() in url_lower]

In [None]:
eak_admin_ch = eak_admin_ch.reset_index(drop=True)
# Look each row's tags up by its URL instead of lining `matches.values()` up
# by position: `matches` has one entry per distinct URL, so positional
# alignment mis-tags rows as soon as a URL occurs more than once.
# URLs without any match get None (as before).
eak_admin_ch.loc[:, "tags"] = eak_admin_ch["url"].map(lambda url: matches.get(url) or None)

In [None]:
# Spot-check a slice of the assigned tags.
eak_admin_ch.tags[30:60]

In [None]:
# Flatten each tag list into a comma-separated string; anything that is not a
# list (e.g. None for unmatched URLs) becomes the empty string.
eak_admin_ch.loc[:, 'tags'] = eak_admin_ch['tags'].map(
    lambda tag_list: '' if not isinstance(tag_list, list) else ','.join(tag_list)
)

In [None]:
# Spot-check a slice of rows after the tags were flattened.
eak_admin_ch[60:100]

In [None]:
# Write the tagged eak.admin.ch subset to disk without the index column.
eak_admin_ch.to_csv("./indexing/data/eak_admin_ch_de_fr_tags.csv", index=None)

# REF platform (tag) - LEGACY

### Praxisleitfaden EAK

In [None]:
# Load the German practice-guide export and preview its first rows.
df = pd.read_csv("indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de.csv")
df.head()

In [None]:
# Expose the "tag" column under the name "tags".
df = df.rename(columns={"tag": "tags"})

In [None]:
# Display the frame after the column rename.
df

In [None]:
# Write the renamed practice-guide table to disk without the index column.
df.to_csv("indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de_tags.csv", index=None)

### AHV Lernbaustein 2024

In [None]:
# Load the AHV Lernbaustein 2024 export and preview its first rows.
df = pd.read_csv("indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024.csv")
df.head()

In [None]:
# Expose the "tag" column under the name "tags" and label every row with its
# source.
df = df.rename(columns={"tag": "tags"}).assign(source="AHV Lernbaustein 2024")

In [None]:
# Display the frame after the rename and source update.
df

In [None]:
# Write the renamed and source-labelled table to disk without the index column.
df.to_csv("indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags.csv", index=None)