In [None]:
# %pip install tiktoken langchain langchain_experimental langchain_openai

In [None]:
# Run the blobfuse shell script for "raadsinformatie" from an absolute path under /home/azureuser.
# NOTE(review): presumably this mounts the blob storage and only works on the Azure machine — confirm.
# It is executed regardless of `my_run`, which is only set in a later cell.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [None]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what makes `my_secrets`, `settings`, `config*` and `src` importable — confirm.
sys.path.append("..")

In [None]:
# Select where to run notebook: "azure" or "local"
# The value decides which config module gets imported as `cf` and whether the
# Hugging Face cache folder is set up in the following cells.
# my_run = "azure"
my_run = "local"

In [None]:
import my_secrets as sc
import settings as st

# Load the configuration module that matches the selected run environment.
# Both modules are bound to the same alias `cf`, which the rest of the
# notebook reads its paths from.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail fast: without this branch `cf` would stay undefined and the first
    # later cell that touches it would raise a confusing NameError.
    raise ValueError(f"Unknown my_run value {my_run!r}; expected 'azure' or 'local'")

In [None]:
import os

# On Azure, point the Hugging Face transformers cache at the configured
# folder, creating that folder first when it does not exist yet.
if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent folders and does not
    # fail when the folder already exists (no check-then-create race).
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)

    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

In [None]:
import pandas as pd
from pathlib import Path

# Folder that receives the comparison outputs; created if it is missing.
comparison_folder = f"{cf.raadsinformatie_out_folder}/comparison/"
Path(comparison_folder).mkdir(parents=True, exist_ok=True)

# The comparison table is stored twice: as CSV and, under the same stem, as an Excel file.
chunking_comparison_file = Path(comparison_folder) / "chunking_comparison.csv"
chunking_comparison_file_xlsx = chunking_comparison_file.with_suffix(".xlsx")

# Kept as a plain string (with trailing slash) because it is concatenated
# with a relative document path further down.
chunking_folder = f"{comparison_folder}chunking/"

In [None]:
import os
import glob

# One entry per item found directly under each of the three Woo source roots,
# in the same order as before: openamsterdam, raadsinformatie, amsterdam.nl.
woo_dirs = [
    f"{cf.woo_sources[source]}/{folder}"
    for source in ("openamsterdam", "raadsinformatie", "amsterdam.nl")
    for folder in os.listdir(cf.woo_sources[source])
]

# All *.ocr files inside those entries, flattened in a single pass
# (sum(list_of_lists, []) re-copies the growing result for every folder).
woo_files = [
    ocr_file
    for folder in woo_dirs
    for ocr_file in glob.glob(f"{folder}/*.ocr")
]

In [None]:
# Number of .ocr files collected from the source folders.
len(woo_files)

In [None]:
import pandas as pd
from ast import literal_eval


def _read_document(path):
    """Return the full text of the file at ``path``; the handle is closed afterwards."""
    with open(path, "r") as handle:
        return handle.read()


def _parse_chunk_cell(cell):
    """Turn the CSV text form of a chunk list back into a list.

    Cells that are not strings (already parsed lists, NaN) are returned
    unchanged, because ``literal_eval`` raises on them.
    """
    return literal_eval(cell) if isinstance(cell, str) else cell


# Reuse the cached comparison table when it exists; replace the condition by
# `False` to force a rebuild from the raw .ocr files.
if chunking_comparison_file.exists():
    df = pd.read_csv(chunking_comparison_file)
    # read_csv turns empty cells into NaN; restore empty documents to "" so the
    # string-based length and chunking steps keep working after a reload.
    df["doc"] = df["doc"].fillna("")
else:
    df = pd.DataFrame.from_dict({
        "path": woo_files,
        "short_path": [path.removeprefix(cf.raadsinformatie_in_folder) for path in woo_files],
        "file_name": [path.split("/")[-1] for path in woo_files],
        "doc": [_read_document(path) for path in woo_files],
    })

# Chunk columns are written to CSV as the text of a Python list, so they are
# parsed back into lists here. A freshly built frame has no such columns yet.
for column in df.columns:
    if column.startswith("chunks"):
        df[column] = df[column].map(_parse_chunk_cell)

# # Temp fix of old bug
# df["short_path"] = df["path"].map(lambda x: x.removeprefix(cf.raadsinformatie_in_folder))

In [None]:
from nltk import word_tokenize
from sentence_transformers import SentenceTransformer
import tiktoken

tiktoken_encoding = tiktoken.get_encoding("cl100k_base")

# Character count of every document, stored under two column names.
df["len"] = df["len_char"] = df["doc"].str.len()

# Number of NLTK word tokens per document.
df["len_word_token"] = df["doc"].map(word_tokenize).map(len)

# Rule-of-thumb token estimate: one token per four characters
# (seems to be majorly underestimating).
df["len_token_appr"] = df["len_char"].floordiv(4)

# Token count according to the tiktoken "cl100k_base" encoding, see
# https://stackoverflow.com/questions/75804599/openai-api-how-do-i-count-tokens-before-i-send-an-api-request
df["len_token_tiktoken"] = df["doc"].map(tiktoken_encoding.encode).map(len)

In [None]:
# Preview the first three rows of the table.
df.head(3)

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_overlap = 25

# Split every document with a RecursiveCharacterTextSplitter (default
# separators) at three chunk sizes; for each size store the chunk list and
# the number of chunks per document.
for chunk_size in (256, 512, 8191):
    chunks_column = f"chunks_basic-{chunk_size}"
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    df[chunks_column] = df["doc"].map(text_splitter.split_text)
    df[f"len_{chunks_column}"] = df[chunks_column].map(len)

In [None]:
# Preview the first three rows of the table.
df.head(3)

In [None]:
from langchain.text_splitter import SpacyTextSplitter

# Split every document with LangChain's SpacyTextSplitter, using the
# "nl_core_news_sm" spaCy pipeline, and record the number of chunks.
text_splitter_spacy = SpacyTextSplitter(pipeline="nl_core_news_sm")
df["chunks_spacy"] = df["doc"].map(text_splitter_spacy.split_text)
df["len_chunks_spacy"] = df["chunks_spacy"].map(len)

In [None]:
from langchain.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer

from src.llms.model_config import get_embed_model

# Short model key -> Hugging Face model id whose tokenizer is used for splitting.
hf_models = {
    "bert": "jegormeister/bert-base-dutch-cased-snli",
    "robbert": "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers",
    # "cohere": "Cohere/Cohere-embed-multilingual-v3.0",
    "me5": "intfloat/multilingual-e5-large-instruct",
}

# Token-based splitting with each model's own tokenizer and the splitter's
# default chunk size / overlap.
# NOTE(review): the next cell writes to the same `chunks_<model>` columns for
# "bert", "robbert" and "me5", so these results get replaced there.
for model, model_id in hf_models.items():
    print(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    text_splitter_hugging_face = TokenTextSplitter.from_huggingface_tokenizer(tokenizer)
    chunks_column = f"chunks_{model}"
    df[chunks_column] = df["doc"].map(text_splitter_hugging_face.split_text)
    df[f"len_{chunks_column}"] = df[chunks_column].map(len)

In [None]:
from langchain.text_splitter import TokenTextSplitter
from src.llms.model_config import get_embed_tokenizer

# Token-based splitting without overlap, using the tokenizer and chunk size
# that `get_embed_tokenizer` returns for each embedding model.
for model in ("bert", "robbert", "me5", "me5-instruct", "cohere"):
    tokenizer, chunk_size = get_embed_tokenizer(model)
    text_splitter_hugging_face = TokenTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=0,
    )
    chunks_column = f"chunks_{model}"
    df[chunks_column] = df["doc"].map(text_splitter_hugging_face.split_text)
    df[f"len_{chunks_column}"] = df[chunks_column].map(len)

In [None]:
# text_splitter_hugging_face._chunk_overlap
# text_splitter_hugging_face._chunk_size

In [None]:
# Preview the first three rows of the table.
df.head(3)

### Dump all so far

In [None]:
# Write the whole table to the cache CSV (without the index); the loading cell
# reads this file back when it exists.
df.to_csv(chunking_comparison_file, index=False)

In [None]:
# %pip install xlsxwriter

In [None]:
# Write the same table to an Excel file through the xlsxwriter engine.
# Unlike the CSV export, no index=False is passed here.
df.to_excel(chunking_comparison_file_xlsx, engine='xlsxwriter')

In [None]:
# Preview the first five rows of the table.
df.head(5)

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
from tqdm import tqdm

from src.llms.model_config import get_embed_model_info, KNOWN_EMBED_MODELS

# Enable the pandas `progress_*` methods used by the semantic chunking loop.
tqdm.pandas()

# Expose the Azure OpenAI credentials through the environment as well.
os.environ.update(
    AZURE_OPENAI_API_KEY=sc.AZURE_OPENAI_API_KEY,
    AZURE_OPENAI_ENDPOINT=st.AZURE_OPENAI_ENDPOINT,
)

def get_embedding_model(model_name):
    """Build the LangChain embedding object for ``model_name``.

    Args:
        model_name: ``"ada"`` for the Azure OpenAI ``text-embedding-ada-002``
            model, or any name contained in ``KNOWN_EMBED_MODELS``.

    Returns:
        An ``AzureOpenAIEmbeddings`` instance for ``"ada"``; otherwise a
        ``HuggingFaceEmbeddings`` instance for the model id that
        ``get_embed_model_info`` reports for ``model_name``.

    Raises:
        ValueError: if ``model_name`` is neither ``"ada"`` nor a known model.
    """
    if model_name == "ada":
        azure_settings = dict(
            model="text-embedding-ada-002",
            api_key=sc.AZURE_OPENAI_API_KEY,
            azure_endpoint=st.AZURE_OPENAI_ENDPOINT,
            api_version="2023-05-15",
        )
        return AzureOpenAIEmbeddings(**azure_settings)

    if model_name not in KNOWN_EMBED_MODELS:
        raise ValueError(f"Unknown model {model_name}. Known models: {KNOWN_EMBED_MODELS}")

    # Only the model id is needed here; the second element of the pair is ignored.
    model_id, _ = get_embed_model_info(model_name)
    return HuggingFaceEmbeddings(model_name=model_id)

import traceback

# True recomputes the semantic chunks for every model (the behaviour so far);
# set to False to skip models whose chunk column is already present in `df`,
# e.g. after loading the cached CSV.
force_recompute = True

# Semantic chunking per embedding model. Each model is handled on a
# best-effort basis: a failure is reported and the loop moves on.
# for model in list(hf_models.keys()) + ["ada"]:
# for model in ["bert", "robbert", "me5", "me5-instruct", "cohere", "ada"]:
for model in ["bert", "robbert", "me5"]:
# for model in ["me5-instruct", "cohere"]:
    print(model)
    chunks_column = f"chunks_semantic_splitter_{model}"
    if not force_recompute and chunks_column in df:
        continue

    try:
        embed_model = get_embedding_model(model)
        semantic_splitter = SemanticChunker(embed_model)
        df[chunks_column] = df.doc.progress_map(semantic_splitter.split_text)
        df[f"len_{chunks_column}"] = df[chunks_column].progress_map(len)
        # Checkpoint after every model so a later failure does not lose this result.
        df.to_csv(chunking_comparison_file, index=False)
    except Exception:
        # Keep going with the next model, but say which model failed and show
        # the full traceback instead of only the bare exception message.
        print(f"Semantic chunking failed for model {model!r}:")
        traceback.print_exc()

Runtime note: `me5-instruct` took about 1h48m.

In [None]:
from tqdm import tqdm

# Line of 50 "=" characters that separates consecutive chunks in the output files.
chunk_separator = f"\n{50*'='}\n"

# For every document, create a folder under `chunking_folder` named after the
# document's path relative to the input folder, and write one text file per
# chunk column into it. `total` lets tqdm show real progress, which it cannot
# derive from the iterrows() generator on its own.
for ind, row in tqdm(df.iterrows(), total=len(df)):
    short_path = row["path"].removeprefix(cf.raadsinformatie_in_folder)
    output_folder = Path(chunking_folder + short_path)
    output_folder.mkdir(parents=True, exist_ok=True)
    for field, val in row.items():
        if field.startswith("chunks"):
            # Explicit encoding so the output does not depend on the platform default.
            with open(output_folder / field, "w", encoding="utf-8") as f:
                f.write(chunk_separator.join(val))
        

In [None]:
# df[["short_path", "file_name", "doc", "chunks_basic-256", "chunks_bert"]][df["short_path"].str.contains("manual")]

In [None]:
pd.set_option("display.max_colwidth", 50)

# Summary statistics of the table, with every value rendered with one decimal.
# Formatting column by column avoids DataFrame.applymap, which is deprecated
# in recent pandas versions, and gives the same result on older versions.
df.describe().apply(lambda column: column.map(lambda x: f"{x:0.1f}"))

In [None]:
# df[df["len_chunks_me5"] > 1] 
# df["len_word_token"] / df["len_chunks_semantic_splitter_me5"]