Notebook for preprocessing the data before inserting it into the vector database.

In [None]:
# 0. Helper methods for preprocessing PDF, web and template files

import json
from typing import Iterable
from langchain.docstore.document import Document
from typing import List

# Helper methods for storing and loading already generated documents (as the IMG -> TXT process is time-consuming)
def store_documents(documents, file_path: str) -> None:
    """
    Writes the given documents to file_path in JSON Lines format, i.e. the JSON serialization of one document per line.
    The file is opened in write mode, so an already existing file is overwritten.
    """
    with open(file_path, "w") as output_file:
        output_file.writelines(document.json() + "\n" for document in documents)


def load_documents(file_path: str) -> List[Document]:
    """
    Loads documents from a JSON Lines file (one JSON object per line) and returns them as a list.

    Every non-blank line is parsed with json.loads and the resulting dictionary is passed as keyword arguments to Document.
    Blank lines (e.g. an empty line at the end of a manually edited file) are skipped instead of raising a JSON decoding error.
    """
    documents = []
    with open(file_path, "r") as jsonl_file:
        for line in jsonl_file:
            # An empty line is not valid JSON, so it must not reach json.loads
            if not line.strip():
                continue
            documents.append(Document(**json.loads(line)))
    return documents

In [None]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import DirectoryLoader
import pypdfium2 as pdfium
import cv2
import os
import shutil
import pytesseract

# Point pytesseract to the Tesseract executable configured in the TESSERACT_PATH environment variable.
# If the variable is not set, the command configured by pytesseract itself is kept instead of being overwritten with None.
pytesseract_path = os.environ.get("TESSERACT_PATH")
if pytesseract_path:
    pytesseract.pytesseract.tesseract_cmd = pytesseract_path


def _ocr_pdf_pages(pdf, image_dir: str) -> str:
    """
    Helper for update_pdf_documents: renders every page of the opened PDF to a PNG inside image_dir, runs Tesseract on the binarized image and returns the text of all pages.
    The text of each page is followed by a newline. The temporary page images are removed again after they were processed.
    """
    page_texts = []
    for page_number in range(len(pdf)):
        page = pdf.get_page(page_number)
        # NOTE(review): scale=300/72 presumably renders at 300 DPI (72 PDF units per inch) - confirm against the pypdfium2 docs
        pil_image = page.render(
            scale=300 / 72,
            rotation=0,
            crop=(0, 0, 0, 0),
        ).to_pil()
        pil_image_path = f"{image_dir}/image_{page_number + 1}.png"
        pil_image.save(pil_image_path)
        img = cv2.imread(pil_image_path)
        # Convert image to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Apply threshold (Otsu) to convert to binary image
        threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        # Pass the image through pytesseract and keep the text of the page
        page_texts.append(pytesseract.image_to_string(threshold_img) + "\n")
        # Remove the image as it is already processed
        os.remove(pil_image_path)
    return "".join(page_texts)


def update_pdf_documents() -> None:
    """
    Method for processing and updating documents based on the PDFs. For that the PDFs that were not processed yet are converted to images and then transformed to text.
    For each PDF one document is created with the text of all pages. Afterwards the file is renamed (suffix "_Tesseract_processed"), so that it is not processed again in future calls.
    If new documents were created, they are stored in the "new_documents" file and, together with the previously stored ones, in the "all_documents" file. Nothing is returned.
    This approach is used because different methods were tested (DirectoryLoader, PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader, PDFMinerLoader, PyMuPDFLoader, PDFPlumberLoader) but the PDF -> IMG + IMG -> TXT approach was the best performing one, creating nicely structured documents.
    """
    all_documents_path = "./../../inputData/PDF/documents/all_documents"
    new_documents_path = "./../../inputData/PDF/documents/new_documents"
    PDF_images_path = "./../../inputData/PDF/PDF_Images"
    directory_path = "./../../inputData/PDF/cleaned"

    # List for the documents that are newly created in this call
    documents_PDF = []
    # List for all documents
    already_processed_documents = load_documents(all_documents_path)

    try:
        # Go through each PDF file in the directory
        for file in os.listdir(directory_path):
            file_path = os.path.join(directory_path, file)
            # Skip files that were already processed and entries that are not files (e.g. sub folders)
            if "Tesseract_processed" in file or not os.path.isfile(file_path):
                continue

            pdf = pdfium.PdfDocument(file_path)
            try:
                # Create directory to store the images; it may still exist from an aborted earlier run
                image_dir = PDF_images_path + f"/{file}"
                os.makedirs(image_dir, exist_ok=True)
                complete_text = _ocr_pdf_pages(pdf, image_dir)
            finally:
                # Always release the file handle, it has to be closed before the file can be renamed
                pdf.close()

            # Determined outside of the page loop, so that it is also defined for PDFs without pages
            file_name_without_pdf = file[:-4] if file.endswith(".pdf") else file
            document_PDF = Document(page_content=complete_text, metadata={"source": file, "title": file_name_without_pdf})

            # Change the filename, so that in future calls the PDF is not processed again.
            # The marker is inserted in front of the extension, whatever its spelling is (e.g. ".PDF").
            stem, extension = os.path.splitext(file)
            new_pdf_path = os.path.join(directory_path, f"{stem}_Tesseract_processed{extension}")
            print(new_pdf_path)
            os.rename(file_path, new_pdf_path)

            # Only register the document after the rename succeeded, so that a file is never stored and processed again later
            documents_PDF.append(document_PDF)
            already_processed_documents.append(document_PDF)
    finally:
        # Store docs if new documents were processed. This also happens if a later PDF failed,
        # as the files of the finished documents are already marked as processed.
        if documents_PDF:
            # Store all documents, including the new ones
            store_documents(already_processed_documents, all_documents_path)
            # Store the new documents
            store_documents(documents_PDF, new_documents_path)

    # Delete the (now empty) per-PDF folders inside the images folder
    if os.path.exists(PDF_images_path):
        for item in os.listdir(PDF_images_path):
            item_path = os.path.join(PDF_images_path, item)
            if os.path.isdir(item_path):
                # Use shutil.rmtree to delete the directory and all its contents
                shutil.rmtree(item_path)

def get_web_documents_for_cleaning() -> List[Document]:
    """
    Method for returning documents based on the URLs listed (one per line) in the uncleaned URLs file. Blank lines in that file are ignored.
    The pages are loaded with AsyncHtmlLoader and converted with Html2TextTransformer. Different methods were tested, but this combination produced the best results (very structured text).
    """
    directory_path_web = "./../../inputData/Web/URLs/uncleaned_urls.txt"

    with open(directory_path_web, "r") as file:
        stripped_lines = (line.strip() for line in file)
        # Skip blank lines, so that no empty URL is handed to the loader
        imported_urls = [url for url in stripped_lines if url]

    documents_web = AsyncHtmlLoader(imported_urls).load()
    documents_web_transformed = Html2TextTransformer().transform_documents(documents_web)
    print("Number of documents: " + str(len(documents_web_transformed)) + "\n")

    return documents_web_transformed


def get_pdf_documents(all_docs: bool):
    """
    Loads the stored PDF documents. The documents themselves are created and updated in update_pdf_documents.
    If all_docs is set, all stored documents are loaded, otherwise only the new ones. The latter can be used if the index is already built and only new documents should be added.
    """
    documents_path = (
        "./../../inputData/PDF/documents/all_documents"
        if all_docs
        else "./../../inputData/PDF/documents/new_documents"
    )
    return load_documents(documents_path)


def get_web_documents(all_docs: bool) -> List[Document]:
    """
    Loads the already cleaned web documents: all of them if all_docs is set, otherwise only the newly cleaned ones.
    The web pages FIRST have to be fetched with get_web_documents_for_cleaning and cleaned manually. As the cleaning is a manual process, the two methods have to be called separately from each other.
    """
    documents_path = (
        "./../../inputData/Web/documents/all_cleaned_documents"
        if all_docs
        else "./../../inputData/Web/documents/newly_cleaned_documents"
    )
    return load_documents(documents_path)


def get_template_documents(all_docs: bool) -> List[Document]:
    """
    Loads the stored template documents: all of them if all_docs is set, otherwise only the new ones.
    """
    documents_path = (
        "./../../inputData/Templates/documents/all_documents"
        if all_docs
        else "./../../inputData/Templates/documents/new_documents"
    )
    return load_documents(documents_path)

In [None]:
# Uncomment and execute if PDF documents should be updated.
#update_pdf_documents()

In [None]:
# 1. Import/read input data (PDF, Web, Template, PDF_Web or All)
def get_documents_from_files(file_type: str, all_docs: bool):
    """
    Returns the stored documents for the given file type ("PDF", "Web", "Template", "PDF_Web" or "All").
    all_docs is passed on to the loading methods and decides whether all or only the new documents are loaded.
    For the combined types the web documents come first, followed by the PDF documents and (for "All") the template documents.
    Raises an Exception if an unknown file type is provided.
    """
    if file_type not in ("PDF", "Web", "Template", "PDF_Web", "All"):
        raise Exception("Error, raised exception: Wrong fileType provided.")

    # Single sources are returned directly
    if file_type == "PDF":
        return get_pdf_documents(all_docs)
    if file_type == "Web":
        return get_web_documents(all_docs)
    if file_type == "Template":
        return get_template_documents(all_docs)

    # Combined sources ("PDF_Web" and "All"): the other documents are appended to the list of web documents
    pdf_docs = get_pdf_documents(all_docs)
    combined_docs = get_web_documents(all_docs)
    combined_docs.extend(pdf_docs)
    if file_type == "All":
        combined_docs.extend(get_template_documents(all_docs))

    print("Number of documents: " + str(len(combined_docs)) + "\n")
    return combined_docs

In [None]:
# 2. Chunk data
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents: List[Document], chunk_size: int, chunk_overlap: int):
    """
    Chunks the documents with a RecursiveCharacterTextSplitter and returns the chunked documents.
    A single space is configured as the only separator of the splitter.
    """
    splitter_settings = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": [" "],
    }
    return RecursiveCharacterTextSplitter(**splitter_settings).split_documents(documents)

In [None]:
# 3. Clean chunked data
import re

def clean_text(text: str) -> str:
    """
    Normalizes the whitespace of a text and returns the result.
    Runs of whitespace other than newlines are replaced by a single space, runs of consecutive newlines by a single newline, and leading and trailing whitespace is removed.
    """
    # Replace multiple whitespaces (except newlines) with a single space.
    # [^\S\n] matches every whitespace character but the newline. The former pattern (?!\n)\s+ only
    # prevented a match from STARTING at a newline and therefore swallowed newlines following a space or tab.
    text = re.sub(r"[^\S\n]+", " ", text)
    # Replace multiple newlines with a single newline
    text = re.sub(r"\n+", "\n", text)
    # Remove leading and trailing whitespace
    return text.strip()

def clean_and_process_chunked_documents(chunkedDocuments: List[Document], append_summaries_to_each_doc: bool) -> List[Document]:
    """
    Cleans and lowercases the chunked documents and prepends summaries if wanted. The documents are modified in place and the same list is returned.

    For every chunk the whitespace is cleaned, the cleaned (not lowercased) text is stored in the metadata as "original_text" and a running "doc_ID" (starting at 1) is assigned.
    If append_summaries_to_each_doc is set, the summaries are loaded from the summaries JSON file. A chunk then gets the summary of the PREVIOUS chunk prepended, provided that both chunks have the same "source".
    The summaries are only used if there is exactly one summary per chunk; otherwise a warning is printed and the chunks are processed without summaries.
    """
    summaries = []
    if append_summaries_to_each_doc:
        # Change path if needed!
        # JSON files are UTF-8 encoded, so do not rely on the default encoding of the platform
        with open("./../../evaluationInput/retrieval_eval/summaries_1536_264.json", 'r', encoding="utf-8") as json_file:
            summaries = json.load(json_file)

    use_summaries = append_summaries_to_each_doc and len(summaries) == len(chunkedDocuments)
    if append_summaries_to_each_doc and not use_summaries:
        # Make the fallback visible instead of silently dropping the requested summaries
        print(
            "Warning: Number of summaries (" + str(len(summaries)) + ") does not match number of chunks ("
            + str(len(chunkedDocuments)) + "). Documents are processed without summaries."
        )

    if use_summaries:
        documents_with_summaries = zip(chunkedDocuments, summaries)
    else:
        documents_with_summaries = ((document, None) for document in chunkedDocuments)

    previous_summary = None  # Summary of the previous chunk
    previous_source = None  # Source of the previous chunk

    for doc_id, (document, summary) in enumerate(documents_with_summaries, start=1):
        # Clean whitespaces and keep the cleaned text with its original capitalization as metadata
        document.page_content = clean_text(document.page_content)
        document.metadata["original_text"] = document.page_content
        document.metadata["doc_ID"] = doc_id

        if use_summaries:
            current_source = document.metadata["source"]
            # Only prepend the previous summary if the previous chunk stems from the same source
            same_source = previous_source is not None and current_source == previous_source
            if same_source and previous_summary is not None:
                document.page_content = previous_summary + "\n" + document.page_content

            # Remember summary and source for the next iteration
            previous_summary = summary
            previous_source = current_source

        document.page_content = document.page_content.lower()

    return chunkedDocuments

In [None]:
from langchain.embeddings import CohereEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import VoyageEmbeddings

def create_embedding_model(model_provider: str, model_name: str):
    """
    Creates the embedding model and returns it. Raises an Exception if the combination of model_provider and model_name is not supported.
    Supported combinations are (model_provider: model_name (embedding size, max input length) as printed on creation)
    - Cohere: v2 (4096, 512) | v3 (1024, 512) [https://docs.cohere.com/reference/embed]
    - OpenAI: text-embedding-ada (1536, 8191), loads text-embedding-ada-002 [https://platform.openai.com/docs/guides/embeddings]
    - Voyage: voyage-2 (1024, 4096) [https://docs.voyageai.com/embeddings/]
    - HuggingFace:
        - all-mpnet-base-v2 (768, 384) [https://huggingface.co/sentence-transformers/all-mpnet-base-v2]
        - all-MiniLM-L6-v2 (384, 256) [https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2]
        - bge-large-en-v1.5 (1024, 512) [https://huggingface.co/BAAI/bge-large-en-v1.5]
        - Contriever (768, 512) [https://huggingface.co/facebook/contriever-msmarco]
        - SecRoBERTa (768, 512) [https://huggingface.co/jackaduma/SecRoBERTa]
    - Fine-tuned:
        - finetuned-ISO-27001_1024 (1024, 512) [https://huggingface.co/Basti8499/bge-large-en-v1.5-ISO-27001]
    """
    # Maps (model_provider, model_name) to a factory for the model and the info lines printed after its creation.
    # The factories are lambdas, so a model is only instantiated when it is actually requested.
    supported_models = {
        ("Cohere", "v2"): (
            lambda: CohereEmbeddings(model="embed-english-v2.0"),
            ["Cohere v2 embedding: Vector embedding size - 4096, input length: 512"],
        ),
        ("Cohere", "v3"): (
            lambda: CohereEmbeddings(model="embed-english-v3.0"),
            ["Cohere v3 embedding: Vector embedding size - 1024, input length: 512"],
        ),
        ("HuggingFace", "all-mpnet-base-v2"): (
            lambda: HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
            ["HuggingFace all-mpnet-base-v2 embedding - Vector embedding size: 768, input length: 384"],
        ),
        ("HuggingFace", "all-MiniLM-L6-v2"): (
            lambda: HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
            ["HuggingFace all-MiniLM-L6-v2 embedding - Vector embedding size: 384, input length: 256"],
        ),
        ("HuggingFace", "bge-large-en-v1.5"): (
            lambda: HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5"),
            ["HuggingFace BAAI/bge-large-en-v1.5 embedding - Vector embedding size: 1024, input length: 512"],
        ),
        ("HuggingFace", "Contriever"): (
            lambda: HuggingFaceEmbeddings(model_name="facebook/contriever-msmarco"),
            ["HuggingFace facebook/contriever-msmarco embedding - Vector embedding size: 768, input length: 512"],
        ),
        ("HuggingFace", "SecRoBERTa"): (
            lambda: HuggingFaceEmbeddings(model_name="jackaduma/SecRoBERTa"),
            ["HuggingFace jackaduma/SecRoBERTa embedding - Vector embedding size: 768, input length: 512"],
        ),
        ("Voyage", "voyage-2"): (
            lambda: VoyageEmbeddings(model="voyage-2", show_progress_bar=True, batch_size=200),
            ["Voyage embedding - Vector embedding size: 1024, input length: 4096"],
        ),
        ("OpenAI", "text-embedding-ada"): (
            lambda: OpenAIEmbeddings(model="text-embedding-ada-002"),
            [
                "OpenAI embedding - Vector embedding size: 1536, input length: 8191",
                "Tokenizer used: cl100k_base",
            ],
        ),
        ("Fine-tuned", "finetuned-ISO-27001_1024"): (
            lambda: HuggingFaceEmbeddings(model_name="Basti8499/bge-large-en-v1.5-ISO-27001"),
            ["Fine-tuned bge-large-en-v1.5 with ISO 27001 - Vector embedding size: 1024, input length: 512"],
        ),
    }

    selected_model = supported_models.get((model_provider, model_name))
    if selected_model is None:
        # Also covers a known provider with an unknown model name, which formerly returned None silently
        raise Exception(
            "Error, raised exception: Wrong modelProvider or modelName provided.")

    build_model, info_lines = selected_model
    embeddings = build_model()
    for info_line in info_lines:
        print(info_line)
    return embeddings

In [None]:
def create_embedding_vectors(embedding_model, documents: List[Document]):
    """
    Embeds the page content of each document with the given embedding model and returns the result of embed_documents.
    """
    page_contents = [document.page_content for document in documents]
    return embedding_model.embed_documents(page_contents)

In [None]:
def preprocess_data(
    chunk_size: int,
    chunk_overlap: int,
    model_provider: str,
    model_name: str,
    file_type: str,
    append_title_to_each_doc: bool,
    all_docs: bool,
    with_Kersten: bool = False,
    is_recursive: bool = False,
):
    """
    Runs the complete preprocessing pipeline for later indexing: load the documents, chunk them, clean them, create the embedding model and embed the chunks.
    append_title_to_each_doc is passed on to clean_and_process_chunked_documents as its append_summaries_to_each_doc flag.
    with_Kersten and is_recursive are accepted, but not used by this method.
    Returns a tuple of the cleaned chunks, the embedding model and the embeddings of the chunks.
    """
    print(f"Starting to preprocess data for: Chunk Size - {chunk_size}, Chunk Overlap - {chunk_overlap}, Model Name: {model_name}")

    # Load -> chunk -> clean
    chunked_cleaned_documents = clean_and_process_chunked_documents(
        split_docs(
            get_documents_from_files(file_type, all_docs),
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        ),
        append_title_to_each_doc,
    )

    # Embed the cleaned chunks
    embedding_model = create_embedding_model(model_provider, model_name)
    embeddings = create_embedding_vectors(embedding_model, chunked_cleaned_documents)

    return chunked_cleaned_documents, embedding_model, embeddings

In [None]:
import uuid

def preprocess_data_for_parent_child_retriever(
    chunk_size_parent: int,
    chunk_overlap_parent: int,
    chunk_size_child: int,
    chunk_overlap_child: int,
    model_provider: str,
    model_name: str,
    file_type: str,
    append_title_to_each_doc: bool,
    all_docs: bool,
):
    """
    Preprocesses the data for hierarchical (parent/child) retrieval.
    The documents are chunked and cleaned into parent documents, every parent document is chunked again into child documents, and only the child documents are embedded.
    Returns a tuple of the (parent_id, parent document) pairs, the child documents (holding the "parent_id" in their metadata) and the embeddings of the child documents.
    """
    # Create the parent documents the same way as in the standard case
    raw_documents = get_documents_from_files(file_type, all_docs)
    parent_chunks = split_docs(raw_documents, chunk_size=chunk_size_parent, chunk_overlap=chunk_overlap_parent)
    cleaned_parents = clean_and_process_chunked_documents(parent_chunks, append_title_to_each_doc)

    # One random ID per parent document, so that the parent can later be looked up via its children.
    # The (ID, document) tuple is the format the fs store needs.
    parent_ids = [str(uuid.uuid4()) for _ in cleaned_parents]
    parent_full_docs = list(zip(parent_ids, cleaned_parents))

    child_doc_list = []
    for parent_id, parent in parent_full_docs:
        # split_docs expects a list of documents
        children = split_docs([parent], chunk_size=chunk_size_child, chunk_overlap=chunk_overlap_child)
        for child in children:
            # Link the child to its parent
            child.metadata["parent_id"] = parent_id
        child_doc_list.extend(children)

    embedding_model = create_embedding_model(model_provider, model_name)
    child_embeddings = create_embedding_vectors(embedding_model, child_doc_list)

    return parent_full_docs, child_doc_list, child_embeddings

In [None]:
def store_documents_for_sparse_retrieval(chunk_size: int, chunk_overlap: int, file_type: str, append_title_to_each_doc: bool):
    """
    Chunks and cleans all documents of the given file type and stores them locally for sparse retrieval.
    The name of the stored file is built from the chunk size, the chunk overlap, the file type and the append_title_to_each_doc flag.
    """
    document_file_name = f"{chunk_size}_{chunk_overlap}_{file_type}_{append_title_to_each_doc}"

    # Always work on all documents (all_docs=True)
    raw_documents = get_documents_from_files(file_type, True)
    chunks = split_docs(raw_documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    cleaned_chunks = clean_and_process_chunked_documents(chunks, append_title_to_each_doc)

    store_documents(cleaned_chunks, f"./../../retrievalInput/Documents_For_Sparse/{document_file_name}")

In [None]:
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage.file_system import LocalFileStore

def store_documents_for_sparse_retrieval_parent_child(
    chunk_size_parent: int,
    chunk_overlap_parent: int,
    chunk_size_child: int,
    chunk_overlap_child: int,
    file_type: str,
    append_title_to_each_doc: bool):

    """
    Method used for storing the documents for sparse retrieval in the case of hierarchical retrieval locally.

    All documents of the file type are chunked and cleaned into parent documents, which are chunked again into child documents holding the "parent_id" in their metadata.
    The child documents are stored as a file for sparse retrieval, the parent documents in a local file store below the directory given by the PARENT_DOC_PATH environment variable.
    Raises a RuntimeError before any processing takes place if PARENT_DOC_PATH is not set.
    """
    # Fail early with a clear message instead of a TypeError (None + str) after all the work was already done
    parent_store_root = os.environ.get("PARENT_DOC_PATH")
    if not parent_store_root:
        raise RuntimeError("The environment variable PARENT_DOC_PATH has to be set to the directory for the parent documents.")

    documents = get_documents_from_files(file_type, True)
    parent_documents = split_docs(documents, chunk_size=chunk_size_parent, chunk_overlap=chunk_overlap_parent)
    parent_cleaned_documents = clean_and_process_chunked_documents(parent_documents, append_title_to_each_doc)

    child_doc_list = []
    parent_full_docs = []
    for parent_doc in parent_cleaned_documents:
        # Generate an ID for each parent document, so in later retrieval the parent doc can be retrieved over the child doc
        parent_id = str(uuid.uuid4())
        # Append a tuple of an ID and the according document (fs store needs this format)
        parent_full_docs.append((parent_id, parent_doc))

        # split_docs can only process lists
        child_documents = split_docs([parent_doc], chunk_size=chunk_size_child, chunk_overlap=chunk_overlap_child)
        for child_doc in child_documents:
            # Set the parent_id for the parent document in the metadata of the child
            child_doc.metadata["parent_id"] = parent_id
        child_doc_list.extend(child_documents)

    document_file_name = (
        f"{chunk_size_parent}_{chunk_overlap_parent}_PC_{chunk_size_child}_{chunk_overlap_child}"
        f"_{file_type}_{append_title_to_each_doc}"
    )
    store_documents(child_doc_list, f"./../../retrievalInput/Documents_For_Sparse/{document_file_name}")

    # os.path.join uses the separator of the current platform instead of a hard-coded backslash
    fs = LocalFileStore(os.path.join(parent_store_root, document_file_name))
    store = create_kv_docstore(fs)
    store.mset(parent_full_docs)
    print("Successfully created local file store for parent docs. There are", len(parent_documents), "parent documents in the file store.")