In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt

# Do not truncate long cell contents when displaying DataFrames; this is
# helpful when visualizing retriever outputs (full chunk texts).
pd.set_option("display.max_colwidth", None)

In [None]:
import os
import json
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
import import_ipynb
import importlib

# Hierarchical list of separators specifically tailored for splitting Markdown
# documents. The entries are ordered from the coarsest boundary (headings,
# code fences, horizontal rules) down to the finest (single characters).
# This list is taken from LangChain's MarkdownTextSplitter class.
# NOTE(review): several entries ("#{1,6}", "\\*\\*\\*+", "---+", "___+") are
# written as regular-expression patterns -- confirm that every splitter that
# consumes this list is configured to interpret its separators as regexes.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

# Character-based splitter, same as the HuggingFace approach.
# With the default length function, `chunk_size` and `chunk_overlap` are
# measured in characters (not tokens).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    add_start_index=True,  # record each chunk's start offset in its metadata
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
    # The Markdown separators are regex patterns (e.g. "\n#{1,6} ", "\n---+\n").
    # Without this flag LangChain escapes them and matches them literally, so
    # headings and horizontal rules would never be used as split points.
    is_separator_regex=True,
)

# Location of the serialized knowledge base (a JSON list of document records).
filename = "Documents/combined.json"

# Read the serialized records from disk.
with open(filename, mode='r', encoding='utf-8') as json_file:
    docs_data = json.load(json_file)

# Rebuild one LangChain Document per JSON record.
RAW_KNOWLEDGE_BASE = []
for record in docs_data:
    RAW_KNOWLEDGE_BASE.append(
        LangchainDocument(
            page_content=record["page_content"],
            metadata=record["metadata"],
        )
    )

# Split every raw document and flatten the resulting chunks into one list.
docs_processed = [
    chunk
    for raw_doc in RAW_KNOWLEDGE_BASE
    for chunk in text_splitter.split_documents([raw_doc])
]

print(f"Processed {len(docs_processed)} document chunks.")


In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Query the embedding model (loaded as a `SentenceTransformer`) for the maximum
# sequence length it accepts, so it can be compared with the chunk lengths below.

# GTE embedding model:
#print(f"Model's maximum sequence length: {SentenceTransformer('thenlper/gte-small').max_seq_length}")

# multi e5 embedding model:
e5_max_seq_length = SentenceTransformer('intfloat/multilingual-e5-base').max_seq_length
print(f"Model's maximum sequence length: {e5_max_seq_length}")

# Tokenizer matching the embedding model, used to count tokens per chunk.
# GTE embedding model:
#tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")

# multi e5 embedding model:
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")

# Token count of every chunk produced by the character-based splitter.
lengths = []
for chunk in tqdm(docs_processed):
    token_ids = tokenizer.encode(chunk.page_content)
    lengths.append(len(token_ids))

# Plot the distribution of document lengths, counted as the number of tokens
length_series = pd.Series(lengths)
fig = length_series.hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain.docstore.document import Document as LangchainDocument

# Name of the embedding model; it is also used below as the default tokenizer
# name for token-based chunking. Exactly one of the assignments should be active.

# GTE embedding model (alternative, currently disabled):
#EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# e5 embedding model (active):
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """Split documents into token-bounded chunks and drop duplicate chunks.

    Args:
        chunk_size: Maximum chunk length, measured in tokens of the tokenizer
            named by `tokenizer_name`. The overlap between consecutive chunks
            is one tenth of this value.
        knowledge_base: Documents to split.
        tokenizer_name: Hugging Face tokenizer identifier used to measure chunk
            length. `None` falls back to `EMBEDDING_MODEL_NAME`.

    Returns:
        The chunks in their original order, keeping only the first chunk for
        each distinct `page_content`.
    """
    # The signature advertises Optional[str]; resolve None here instead of
    # letting it fail inside `AutoTokenizer.from_pretrained`.
    if tokenizer_name is None:
        tokenizer_name = EMBEDDING_MODEL_NAME

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # The Markdown separators are regex patterns; without this flag they
        # are escaped and matched literally, so headings never split chunks.
        is_separator_regex=True,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates: keep the first chunk seen for each distinct text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique


from transformers import AutoTokenizer

# Re-chunk the knowledge base with a token-based limit.
# We choose a chunk size adapted to our model (can vary from model to model, but does not in this case)
docs_processed = split_documents(
    chunk_size=512,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

# Count tokens per chunk with the same tokenizer that bounded the chunks.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = []
for chunk in tqdm(docs_processed):
    token_ids = tokenizer.encode(chunk.page_content)
    lengths.append(len(token_ids))

# Histogram of chunk lengths after token-based splitting.
length_series = pd.Series(lengths)
fig = length_series.hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Creating the vector database:

# Folder the FAISS index is persisted to. It must be the folder that the
# loading step reads ("Documents/faiss_index_e5"); the index was previously
# saved to "faiss_index_e5" in the working directory, so a fresh run either
# failed to load it or silently picked up a stale index.
# Naming convention: "faiss_index" is the index built with the GTE model,
# "faiss_index_e5" is the index built with the e5 embedding model.
FAISS_INDEX_DIR = "Documents/faiss_index_e5"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    #model_kwargs={"device": "cuda"}, 
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Embed every processed chunk and index the resulting vectors.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

# Save the vector database so later sessions can load it without re-embedding.
KNOWLEDGE_VECTOR_DATABASE.save_local(FAISS_INDEX_DIR)

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embedding model used to embed queries against the loaded index; it should be
# the same model that produced the vectors stored in that index.
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"

# Normalized embeddings: set `True` for cosine similarity.
e5_encode_kwargs = {"normalize_embeddings": True}

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    #model_kwargs={"device": "cuda"}, # Can we use cuda??
    encode_kwargs=e5_encode_kwargs,
)

# SECURITY NOTE: `allow_dangerous_deserialization=True` lets LangChain unpickle
# the stored docstore, which can execute arbitrary code. Only load index folders
# that were produced locally / come from a trusted source.
vector_db_e5 = FAISS.load_local(
    "Documents/faiss_index_e5",
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [None]:
# Example question, embedded into the same vector space as the indexed chunks.
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " /
# "passage: "; no prefix is applied here or at indexing time -- confirm whether
# that is intended.
user_query = "Hvem er Harald Blåtand?"
query_vector = embedding_model.embed_query(text=user_query)

In [None]:
import pacmap
import numpy as np
import plotly.express as px

# The following is just visualisation:

# PaCMAP projector reducing the embeddings to 2 dimensions (fixed seed for
# reproducible layouts).
embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

# Row i of the projection is later paired with docs_processed[i], so the index
# must hold at least one stored vector per processed chunk. Fail early with a
# readable message instead of a low-level FAISS error.
n_chunks = len(docs_processed)
if vector_db_e5.index.ntotal < n_chunks:
    raise ValueError(
        f"The loaded FAISS index holds {vector_db_e5.index.ntotal} vectors but "
        f"docs_processed has {n_chunks} chunks; rebuild the index from the current chunks."
    )

# Fetch the first `n_chunks` stored vectors in one batched call (instead of one
# call per vector) and append the query vector as the last row.
chunk_vectors = vector_db_e5.index.reconstruct_n(0, n_chunks)
embeddings_2d = np.vstack(
    [chunk_vectors, np.asarray(query_vector, dtype=chunk_vectors.dtype)]
)

# Fit the data (the index of transformed data corresponds to the index of the original data)
documents_projected = embedding_projector.fit_transform(embeddings_2d, init="pca")

In [None]:
import pandas as pd
import plotly.express as px

# One plot record per chunk: row i of the projection belongs to docs_processed[i].
chunk_rows = []
for i, chunk in enumerate(docs_processed):
    chunk_rows.append(
        {
            "x": documents_projected[i, 0],
            "y": documents_projected[i, 1],
            "category": chunk.metadata["categories"][0],
            "title": chunk.metadata["title"],
            "extract": chunk.page_content[:100] + "...",
            "symbol": "circle",
            "size_col": 4,
        }
    )

# The user query is the last projected row; draw it as a large star.
query_row = {
    "x": documents_projected[-1, 0],
    "y": documents_projected[-1, 1],
    "category": "Query",
    "title": "User query",
    "extract": user_query,
    "symbol": "star",
    "size_col": 100,
}

df = pd.DataFrame(chunk_rows + [query_row])

# Scatter plot coloured by category, with the query forced to black.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="category",
    symbol="symbol",
    size="size_col",
    hover_data=["title", "extract"],
    color_discrete_map={"Query": "black"},
    width=1000,
    height=700,
)

# Fully opaque markers without an outline.
marker_style = dict(opacity=1, line=dict(width=0, color="DarkSlateGrey"))
fig.update_traces(marker=marker_style)

fig.update_layout(
    title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
    legend_title_text="<b>Category</b>",
)
fig.show()


In [None]:
# Example retrieval output:

print(f"\nStarting retrieval for {user_query=}...")
retrieved_docs = vector_db_e5.similarity_search(query=user_query, k=5)

# Iterate over the documents actually returned rather than a fixed range(5):
# the store returns fewer than k results when it holds fewer than k chunks,
# which previously raised IndexError.
print("\n==================================Top document==================================")
for retrieved_doc in retrieved_docs:
    print(retrieved_doc.page_content)
print("==================================Metadata======================================")
for retrieved_doc in retrieved_docs:
    print(retrieved_doc.metadata)