In [1]:
import os
import re

import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

In [None]:
def list_txt_files(directory):
    """Return the names of the ``.txt`` files located directly in ``directory``.

    The extension is matched case-insensitively. Only regular files are
    returned, so a sub-directory whose name happens to end in ``.txt`` is
    skipped instead of failing later when it is opened. The result is sorted
    so that repeated runs process the files in the same order.

    Args:
        directory: Path of the directory to scan (not searched recursively).

    Returns:
        Sorted list of file names (not full paths).

    Raises:
        OSError: If ``directory`` does not exist or cannot be read.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith('.txt')
        and os.path.isfile(os.path.join(directory, name))
    )

# Configuration (input paths)

In [None]:
# Directory holding the extracted .txt files; each file name is parsed as "<identifier>.txt".
source_path = ""    # TODO: Path to the extracted .txt files
# Metadata CSV, read with ';' as separator and latin1 encoding; must contain an `identifier` column.
csv_path = ""       # TODO: Path to the study_programms .csv

# RecursiveCharacterTextSplitter
For background on why 512 is used as the basis for the chunk size, see the model configuration: https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/blob/main/config.json

In [None]:
# Splitter used for all documents. Separators are regular expressions and are
# tried in the listed order.
recursive_splitter = RecursiveCharacterTextSplitter(
    # Chunk size = 512 * 2.69 (rounded down). The factor 2.69 was calculated in
    # section "Eval Chunk size with tokenizer".
    # Alternative for a base of 256: chunk_size=688 (256 * 2.69 = 688.64).
    chunk_size=1377,
    separators=[
        r"\n\n+",                       # one or more blank lines
        r"(?<=[.?!])\s+",               # whitespace directly after '.', '?' or '!'
        # Bullet characters: the hyphen is escaped so that it is matched
        # literally. Unescaped, "•-◦" is a character *range* (U+2022..U+25E6)
        # that matches thousands of unrelated symbols and never a plain '-'.
        r"[•\-◦→]\s+[^\n]*\n[^•\-◦→]",  # bullet line followed by a line not starting with a bullet
        r"[•\-◦→]",                     # any single bullet character
        r"\.",                          # any period
        r"\n",                          # any line break
    ],
    is_separator_regex=True,
    # NOTE(review): the origin of the overlap value 146 is not documented here.
    chunk_overlap=146
)

# Chunk .txt files

In [None]:
# Optional cap for quick test runs: an int N processes only the first N files,
# None processes every file returned by list_txt_files().
max_files = None

# Metadata columns that are rendered into the header chunk (chunk 0) of each file.
wanted_information = ["study_program", "university", "study_form", "degree", "type", "identifier", "location", "website", "fields"]


def _clean_text(content):
    """Replace whitespace runs of 4+ characters by a blank line and dot runs of 3+ by '...'."""
    content = re.sub(r'\s{4,}', '\n\n', content)
    return re.sub(r'\.{3,}', '...', content)


def _format_metadata(row):
    """Render one metadata row (values already converted to str) as the header-chunk text."""
    return (f"Modulhandbuch Nr.: {row['identifier']}\n"
            f"Studiengang:       {row['study_program']}\n"
            f"Grad:              {row['degree']}\n"
            f"Studiengang-Form:  {row['study_form']}\n"
            f"Hochschule:        {row['university']} (Typ: {row['type']})\n"
            f"Standort:          {row['location']}\n"
            f"Themenfelder:      {row['fields']}\n"
            f"Link:              {row['website']}\n")


# Re-created on every run so that re-executing the cell does not duplicate entries.
recursive_data_list = []

df_meta_data = pd.read_csv(csv_path, sep=";", encoding="latin1")

txt_files = list_txt_files(source_path)
if max_files is not None:
    txt_files = txt_files[:max_files]

for file in tqdm(txt_files):
    # File names are parsed as "<identifier>.txt". splitext() also strips
    # upper-case extensions, which list_txt_files() accepts as well.
    fnr = int(os.path.splitext(file)[0])

    # Look up the metadata first so that a missing entry fails with a clear
    # message before the text is read and split.
    matches = df_meta_data.loc[df_meta_data['identifier'] == fnr, wanted_information]
    if matches.empty:
        raise KeyError(f"No metadata row with identifier {fnr} in {csv_path!r}")
    row = matches.iloc[0].astype(str)

    with open(os.path.join(source_path, file), 'r', encoding="utf-8") as txt:
        content = _clean_text(txt.read())
    chunks = recursive_splitter.split_text(text=content)

    # Chunk 0 carries the metadata; the text chunks follow as 1..n.
    recursive_data_list.append({"file": fnr, "chunk": 0, "text": _format_metadata(row)})
    recursive_data_list.extend(
        {"file": fnr, "chunk": chunk_nr, "text": chunk}
        for chunk_nr, chunk in enumerate(chunks, start=1)
    )

# Convert the recursive chunk list to a DataFrame and save it as CSV

In [None]:
# Destination of the exported chunk table.
out_file = ""   # TODO: determine output file (.csv)

# Export settings: no index column, ';' as delimiter, UTF-8 encoding,
# backslash as escape character.
csv_options = {"index": False, "sep": ";", "encoding": 'utf-8', "escapechar": "\\"}

# One row per collected entry, with the columns "file", "chunk" and "text".
df = pd.DataFrame(recursive_data_list)
df.to_csv(out_file, **csv_options)