In [1]:
import pandas as pd
import tqdm
import math

# AHV-IV MEMENTO

In [None]:
# Load the AHV-IV memento export and expose its "text_summary" column under
# the name "summary_embedding"; the row count is shown as the cell result.
df = pd.read_csv(
    "indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm_EMBED.csv"
).rename(columns={"text_summary": "summary_embedding"})
len(df)

In [None]:
# Preview the first rows of the AHV-IV memento frame loaded above.
df.head()

In [None]:
# Write the frame back to the same CSV it was read from (the input file is
# overwritten), without the DataFrame index.
df.to_csv(
    path_or_buf="indexing/data/to_upsert/ahv_iv_memento/ahv_iv_de_fr_it_tags_subtopics_llm_EMBED.csv",
    index=False,
)

# EAK - AHV LERNBAUSTEIN 2024

In [None]:
# Load the AHV Lernbaustein 2024 export and expose its "text_summary" column
# under the name "summary_embedding"; the row count is shown as the cell result.
df = pd.read_csv(
    "indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm_EMBED.csv"
).rename(columns={"text_summary": "summary_embedding"})
len(df)

In [None]:
# Preview the first rows of the Lernbaustein frame loaded above.
df.head()

In [None]:
# Write the frame back to the same CSV it was read from (the input file is
# overwritten), without the DataFrame index.
df.to_csv(
    path_or_buf="indexing/data/to_upsert/AHV_Lernbaustein_2024/AHV_Lernbaustein_2024_tags_llm_EMBED.csv",
    index=False,
)

# EAK PRAXISLEITFADEN 2024

In [None]:
# Load the Praxisleitfaden (guide pratique CAF/CFC, German) export and show its row count.
df = pd.read_csv("indexing/data/to_upsert/Guide_Pratique_CAF_CFC/guide_pratique_caf_cfc_de_tags_llm.csv")
len(df)

In [None]:
# Preview the first rows of the Praxisleitfaden frame loaded above.
df.head()

# AKIS

In [None]:
# Load the AKIS file (the name suggests it already carries embeddings — TODO confirm)
# and show its row count.
df = pd.read_csv("indexing/data/zas_eak_copilot/akis/akis_EMBED_2.csv")
len(df)

In [None]:
# Preview the first rows of the AKIS frame loaded above.
df.head()

# EAK ADMIN CH

In [None]:
# Load the eak.admin.ch (de/fr) export and show its row count.
df = pd.read_csv("indexing/data/to_upsert/eak_admin_ch/eak_admin_ch_de_fr_tags_NEW.csv")
len(df)

In [None]:
# Preview the first rows of the eak.admin.ch frame loaded above.
df.head()

# FEDLEX

In [3]:
# Load the preprocessed Fedlex output and show its row count.
# NOTE(review): this path is relative to a different base directory than the
# "indexing/data/..." paths used in earlier sections — confirm the expected
# working directory before running the notebook top to bottom.
df = pd.read_csv("../preprocessing/data/output/fedlex.csv")
len(df)

13

In [None]:
# Preview the first rows of the Fedlex frame loaded above.
df.head()

# OFAS

In [2]:
# Load the preprocessed OFAS output and show its row count.
# NOTE(review): this rebinds `df`; whichever load cell ran last decides what the
# EMBEDDING section further down processes.
df = pd.read_csv("../preprocessing/data/output/ofas.csv")
len(df)

64

In [None]:
# Preview the first rows of the OFAS frame loaded above.
df.head()

# AUTOCOMPLETE

In [None]:
# Load the autocomplete questions.
df = pd.read_csv("indexing/data/to_upsert/autocomplete/question.csv")

# Placeholder column; the embedding loop in the next cells fills it row by row.
df["text_embedding"] = None

len(df)

In [None]:
# Preview the first rows of the autocomplete frame loaded above.
df.head()

In [None]:
for i, row in tqdm.tqdm(df.iterrows()):

    embeddings = await get_embedding([row.text])

    df.loc[i, "text_embedding"] = str(embeddings[0].embedding)


In [None]:
# Inspect the last rows to check that the embedding loop reached the end of the frame.
df.tail()

In [None]:
# Persist the embedded autocomplete questions, without the DataFrame index.
df.to_csv(
    path_or_buf="indexing/data/to_upsert/autocomplete/question_EMBED.csv",
    index=False,
)

In [None]:
# Persist the current frame, without the DataFrame index.
# NOTE(review): the file name refers to eak_admin_ch but the target directory is
# autocomplete/ — this looks like a copy-paste leftover; confirm the intended path.
df.to_csv(
    path_or_buf="indexing/data/to_upsert/autocomplete/eak_admin_ch_de_fr_tags_EMBED.csv",
    index=False,
)

# EMBEDDING

In [4]:
# Upper bound on the number of tokens sent in one embedding request; longer
# texts are truncated or chunked by the cells below.
# NOTE(review): presumably the input limit of the embedding model in use — confirm.
MAX_INPUT_TOKENS = 8191

In [5]:
import tiktoken
# Tokenizer used to count and slice tokens before texts are sent for embedding.
# NOTE(review): cl100k_base is assumed to match the embedding model's tokenizer — confirm.
tokenizer = tiktoken.get_encoding("cl100k_base")

In [6]:
import sys
import os

# Make the application package importable from this notebook by appending
# ../../src/copilot/app (resolved against the current working directory) to sys.path.
sys.path.append(os.path.abspath(os.path.join('..', '..', 'src', 'copilot', 'app')))

# Import the project's embedding helper; every embedding cell in this notebook calls it.
from utils.embedding import get_embedding

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [7]:
# Create (or reset) every embedding column with None so the embedding cells
# below can fill them in.
for embedding_column in (
    "text_embedding",
    "tags_embedding",
    "subtopics_embedding",
    "summary_embedding",
    "hyq_embedding",
    "hyq_declarative_embedding",
):
    df[embedding_column] = None

## WITHOUT CHUNKING

In [None]:
for i, row in tqdm.tqdm(df.iterrows()):

    try:
        tokens = tokenizer.encode(row.text)
        if len(tokens) > MAX_INPUT_TOKENS:
            truncated_text = tokenizer.decode(tokens[:MAX_INPUT_TOKENS])
            embeddings = await get_embedding(truncated_text)
        else:
            embeddings = await get_embedding(row.text)            
        tags_embedding = await get_embedding(row.tags)
        summary_embedding = await get_embedding(row.summary)
        hyq_embedding = await get_embedding(row.hyq)
        hyq_declarative_embedding = await get_embedding(row.hyq_declarative)
        subtopics_embedding = await get_embedding(row.subtopics)
    except Exception as e:
        embeddings = None

    df.loc[i, "text_embedding"] = str(embeddings) if embeddings else None
    df.loc[i, "summary_embedding"] = str(summary_embedding)
    df.loc[i, "hyq_embedding"] = str(hyq_embedding)
    df.loc[i, "hyq_declarative_embedding"] = str(hyq_declarative_embedding)
    df.loc[i, "subtopics_embedding"] = str(subtopics_embedding)
    df.loc[i, "tags_embedding"] = str(tags_embedding)


## WITH CHUNKING

In [8]:
import math

def chunk_text_uniform(text: str, tokenizer, max_tokens: int = MAX_INPUT_TOKENS, overlap: int = 128):
    """
    Split text into evenly sized, overlapping chunks of at most max_tokens tokens.

    The chunks together cover every token of the input: the chunk size is
    computed so that the planned number of windows, each advanced by
    (chunk size - overlap), reaches the last token.

    :param text: The input text to split.
    :param tokenizer: A tokenizer with encode/decode methods (e.g., from tiktoken or HuggingFace).
    :param max_tokens: Maximum tokens allowed in each chunk.
    :param overlap: Number of tokens shared by two consecutive chunks.
    :return: A generator yielding each chunk as a string.
    :raises ValueError: If the text needs chunking and overlap is negative or
        not smaller than max_tokens (raised when the generator is first advanced).
    """
    tokens = tokenizer.encode(text)
    n_tokens = len(tokens)

    # If the text is within the limit, just yield it as one chunk
    if n_tokens <= max_tokens:
        yield tokenizer.decode(tokens)
        return

    # A stride of (chunk_size - overlap) must be positive, otherwise the
    # chunk-count division below is by zero or a negative number.
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens (got overlap={overlap}, max_tokens={max_tokens})"
        )

    # 1) Number of chunks needed when each chunk contributes at most
    #    (max_tokens - overlap) new tokens.
    chunk_count = math.ceil(n_tokens / (max_tokens - overlap))

    # 2) Uniform chunk size. chunk_count windows of this size, advanced by
    #    (chunk_size - overlap), span chunk_count * chunk_size
    #    - (chunk_count - 1) * overlap tokens, so the overlapped tokens are added
    #    to the numerator; dividing n_tokens alone would leave the tail of the
    #    text uncovered. The result never exceeds max_tokens.
    chunk_size = math.ceil((n_tokens + (chunk_count - 1) * overlap) / chunk_count)
    step = chunk_size - overlap  # > 0 because n_tokens > max_tokens > overlap

    # 3) Slide the window until the final token has been emitted
    start = 0
    while start < n_tokens:
        end = min(start + chunk_size, n_tokens)
        yield tokenizer.decode(tokens[start:end])
        if end == n_tokens:
            break
        start += step


In [9]:
async def build_chunked_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a new DataFrame with one row per text chunk and its embeddings.

    For every input row the tags, summary, hyq, hyq_declarative and subtopics
    fields are embedded once. The text is embedded as a whole when it fits into
    MAX_INPUT_TOKENS tokens, otherwise it is split with chunk_text_uniform and
    each chunk is embedded separately. Each chunk becomes one output row that
    copies the original row's values, replaces "text" with the chunk and carries
    the six embedding columns.

    Embedding failures are reported through tqdm.write and the affected
    embeddings are stored as None; processing continues with the next item.

    :param df: Frame with text, tags, summary, hyq, hyq_declarative and
        subtopics columns.
    :return: A new DataFrame with one row per chunk.
    """
    new_rows = []

    for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        # 1) Embed fields that are NOT chunked, only once per input row
        try:
            tags_embedding = await get_embedding(row.tags)
            summary_embedding = await get_embedding(row.summary)
            hyq_embedding = await get_embedding(row.hyq)
            hyq_declarative_embedding = await get_embedding(row.hyq_declarative)
            subtopics_embedding = await get_embedding(row.subtopics)
        except Exception as e:
            # One failing field discards all five, so a row never carries a
            # partial set of metadata embeddings.
            tqdm.tqdm.write(f"Row {i}: metadata embedding failed ({e!r}); storing None for all metadata embeddings")
            tags_embedding = summary_embedding = hyq_embedding = None
            hyq_declarative_embedding = subtopics_embedding = None

        # 2) Handle chunking of 'text'
        n_tokens = len(tokenizer.encode(row.text))
        if n_tokens <= MAX_INPUT_TOKENS:
            # No need to chunk; keep the original string untouched
            chunked_texts = [row.text]
        else:
            # Chunk into uniform segments
            chunked_texts = list(chunk_text_uniform(row.text, tokenizer))

        for chunk_index, chunk in enumerate(chunked_texts):
            try:
                chunk_embedding = await get_embedding(chunk)
            except Exception as e:
                # `except Exception` (not a bare except) lets kernel interrupts
                # and task cancellation stop the run instead of being swallowed.
                tqdm.tqdm.write(f"Row {i}, chunk {chunk_index}: text embedding failed ({e!r}); storing None")
                chunk_embedding = None

            # Create a new row for each chunk
            new_rows.append({
                **row.to_dict(),
                "text": chunk,
                "text_embedding": chunk_embedding,
                "tags_embedding": tags_embedding,
                "summary_embedding": summary_embedding,
                "hyq_embedding": hyq_embedding,
                "hyq_declarative_embedding": hyq_declarative_embedding,
                "subtopics_embedding": subtopics_embedding
            })

    # Convert the collected rows into a new DataFrame
    return pd.DataFrame(new_rows)

In [10]:
# Run the chunk-and-embed pipeline over the currently loaded frame; the
# top-level await relies on the notebook running cells inside an event loop.
df_chunked = await build_chunked_df(df)

  0%|          | 0/13 [00:00<?, ?it/s]

2025-03-18 17:53:22,522 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-18 17:53:22,526 - utils.embedding - INFO - Embedding successfull with model: text-embedding-3-small
2025-03-18 17:53:23,412 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-18 17:53:23,414 - utils.embedding - INFO - Embedding successfull with model: text-embedding-3-small
2025-03-18 17:53:24,142 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-18 17:53:24,144 - utils.embedding - INFO - Embedding successfull with model: text-embedding-3-small
2025-03-18 17:53:25,158 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-18 17:53:25,168 - utils.embedding - INFO - Embedding successfull with model: text-embedding-3-small
2025-03-18 17:53:25,709 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 2

In [None]:
# Column names, dtypes and non-null counts of the source frame (before chunking).
df.info()

In [None]:
# Column names, dtypes and non-null counts of the chunked frame; None embeddings
# show up here as missing values.
df_chunked.info()

In [None]:
# Inspect the last rows of the chunked frame.
df_chunked.tail()

In [None]:
# Preview the source frame again; build_chunked_df returns a new frame and does
# not write into `df`.
df.head()

In [11]:
# Persist the chunked, embedded frame without the DataFrame index.
# NOTE(review): the file name is hard-coded to Fedlex although `df` may hold any
# of the datasets loaded above, and the path is relative to the current working
# directory — confirm both before running.
df_chunked.to_csv(
    path_or_buf="data/output/fedlex_EMBED.csv",
    index=False,
)

In [None]:
# Reload the AKIS file (same path as in the AKIS section above); this rebinds
# `df`, replacing the frame used by the preceding cells.
df = pd.read_csv("indexing/data/zas_eak_copilot/akis/akis_EMBED_2.csv")

In [None]:
# Inspect the last rows of the reloaded AKIS frame.
df.tail()