In [None]:
import glob

import operator
from uuid import uuid4

import numpy as np
import hdbscan
from copy import deepcopy

import umap.umap_ as umap
import pandas as pd
import plotly.express as px
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import LlamaCppEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Modell laden

In [None]:
# Embedding model used for both indexing and querying; it is loaded from a
# local GGUF file through the llama.cpp bindings.
embeddings_model = LlamaCppEmbeddings(
    n_gpu_layers=-1,  # Set to 0 for only cpu
    verbose=False,
    model_path="models/mxbai-embed-large-v1.Q8_0.gguf",
)

## Dateien einlesen

Wir lesen die Dateien so ein, dass jede Datei ein logisches Dokument ist und nicht wie sonst ein Dokument pro Seite

In [None]:
from langchain_community.document_loaders import Docx2txtLoader

pdf_paths = glob.glob("test-data/**/*.pdf", recursive=True)
docs_paths = glob.glob("test-data/**/*.docx", recursive=True)

documents = []

for path in pdf_paths:
    text = ""
    last_meta = {}
    loader = PyPDFLoader(path)
    async for page in loader.alazy_load():
        text += "\n\n"
        text += page.page_content.lower()
        last_meta = page.metadata

    documents.append(Document(text, metadata={
        "source": last_meta["source"],
    }))


for path in docs_paths:
    text = ""
    last_meta = {}
    loader = Docx2txtLoader(path)
    async for page in loader.alazy_load():
        text += "\n\n"
        text += page.page_content.lower()
        last_meta = page.metadata

    documents.append(Document(text, metadata={
        "source": last_meta["source"],
    }))

## Text splitten
Overlap, damit die Embeddings der Chunks näher beieinander sind

In [None]:
# Cut every document into chunks of 500 characters that overlap by 150.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=500,
)
splits = text_splitter.split_documents(documents)

## Embeddings erstellen und in DB speichern

In [None]:
# Chroma store that embeds with `embeddings_model`.
vectorstore = Chroma(embedding_function=embeddings_model)

# One random UUID per chunk. The same value is used as the store ID and is
# written into the chunk metadata so a search hit can be mapped back to it.
emb_ids = [str(uuid4()) for _chunk in splits]

for chunk, chunk_id in zip(splits, emb_ids):
    chunk.metadata["id"] = chunk_id

_ = vectorstore.add_documents(documents=splits, ids=emb_ids)

## Alle Embeddings aus der DB lesen

In [None]:
# Fetch every stored entry together with its metadata and embedding vector.
embedding_entries = vectorstore.get(include=["metadatas", "embeddings"])

# Parallel sequences: position i in each refers to the same stored chunk.
ids = embedding_entries.get("ids")
embeddings = embedding_entries.get("embeddings")
sources = [meta["source"] for meta in embedding_entries.get("metadatas")]

## Embeddings clustern

Die Embeddings bekommen ein Label von -1 bis ..., welches die Cluster ID darstellt <br>
Die ID -1 sagt aus, dass es sich um "Noisy" Punkte handelt

In [None]:
# Cluster the embedding vectors; the fitted estimator exposes one label per
# vector through `labels_`.
hdb = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=4,
    cluster_selection_epsilon=0.2,
    metric='euclidean',
).fit(embeddings)

## Datensatz mit Clustern aufbauen

In [None]:
# One row per stored chunk: its ID, the file it came from and its cluster label.
df = pd.DataFrame(
    {
        "id": ids,
        "source": sources,
        "cluster": hdb.labels_.astype(int),
    }
)

# Keep an untouched copy of the raw clustering for later comparison.
df_raw = df.copy(deep=True)

## Rohes Cluster darstellen

In [None]:
# Project the embeddings to 2D (fixed random_state) and colour the points by
# the file they came from.
u_file = umap.UMAP(min_dist=0.1, n_neighbors=80, random_state=42, n_components=2)

projection_file = u_file.fit_transform(np.array(embeddings))
df_umap_file = pd.DataFrame(projection_file, columns=['x', 'y'])
df_umap_file["source"] = df_raw["source"].astype(str)

fig = px.scatter(df_umap_file, x='x', y='y', color="source", title='Dateien')
fig = fig.update_traces(dict(marker_line_width=0.5, marker_line_color="black"))
fig.show()

In [None]:
# Project the embeddings to 2D (fixed random_state) and colour the points by
# their raw cluster label.
u_raw = umap.UMAP(min_dist=0.1, n_neighbors=80, random_state=42, n_components=2)

projection_raw = u_raw.fit_transform(np.array(embeddings))
df_umap_raw = pd.DataFrame(projection_raw, columns=['x', 'y'])
# Labels become strings so that plotly treats them as discrete categories.
df_umap_raw["cluster"] = df_raw["cluster"].astype(str)
# To hide noisy points, filter here with: .query('cluster != "-1"')
df_umap_raw = df_umap_raw.sort_values(by='cluster')

fig = px.scatter(df_umap_raw, x='x', y='y', color='cluster', title='Rohe Cluster')
fig = fig.update_traces(dict(marker_line_width=0.5, marker_line_color="black"))
fig.show()

In [None]:
# Show the raw cluster assignment (the copy taken before post-processing).
df_raw

## Cluster nachbearbeiten

Alle Embeddings einer Datei sollen im gleichen Cluster sein. Dafür setzen wir die Cluster ID auf die ID, welche die Mehrheit der Datenpunkte bekommen hat. <br>
Bei der Mehrheitssuche wird die ID -1 ignoriert, das heißt am Ende gibt es keine Noisy Punkte mehr

<br>

#### TODO
Den Fall berücksichtigen, wenn eine Datei nur aus Noisy Punkten besteht


In [None]:
# Pass 1: give every chunk of a file the cluster label that most of the
# file's chunks received.
for file_source in df["source"].unique():
    in_file = df["source"] == file_source

    votes: dict[int, int] = {}
    for label in df.loc[in_file, "cluster"].tolist():
        votes[label] = votes.get(label, 0) + 1

    # Noise (-1) must not win the vote; it stays the result only when the
    # file has no other label at all.
    if -1 in votes:
        votes[-1] = 0

    majority_label = max(votes.items(), key=operator.itemgetter(1))[0]
    df.loc[in_file, "cluster"] = majority_label

# Pass 2: files that are still labelled -1 consisted of noise only; each of
# them gets its own fresh cluster ID after the highest existing one.
last_cluster_id = max(df["cluster"].unique())
for file_source in df["source"].unique():
    in_file = df["source"] == file_source
    if df.loc[in_file, "cluster"].iloc[0] == -1:
        last_cluster_id += 1
        df.loc[in_file, "cluster"] = last_cluster_id

## Fertiges Cluster

In [None]:
# Project the embeddings to 2D (fixed random_state) and colour the points by
# their post-processed cluster label.
u = umap.UMAP(min_dist=0.1, n_neighbors=80, random_state=42, n_components=2)

projection = u.fit_transform(np.array(embeddings))
df_umap = pd.DataFrame(projection, columns=['x', 'y'])
# Labels become strings so that plotly treats them as discrete categories.
df_umap["cluster"] = df["cluster"].astype(str)
# To hide points labelled -1, filter here with: .query('cluster != "-1"')
df_umap = df_umap.sort_values(by='cluster')

fig = px.scatter(df_umap, x='x', y='y', color='cluster', title='Fertiges clustering')
fig = fig.update_traces(dict(marker_line_width=0.5, marker_line_color="black"))
fig.show()

In [None]:
# Show the cluster assignment after post-processing.
df

## Finde Cluster auf Basis von Text

Bei der Textsuche bekommt man bessere Ergebnisse, wenn man einen Prompt zur Suche benutzt, als wenn man nur ein Keyword verwendet

In [None]:
search_term = "3D Druck"

# The indexed text was lowercased when the files were read, so the query is
# normalised the same way before it is embedded.
result = vectorstore.similarity_search(f"Search for: {search_term.lower()}", k=5)

# Map every hit to the cluster of its chunk. dict.fromkeys removes duplicates
# while keeping the order of the hits, so the best match stays first and the
# output is the same on every run (a set would give an arbitrary order).
clusters = list(dict.fromkeys(
    df.loc[df['id'] == r.metadata["id"], "cluster"].iloc[0]
    for r in result
))

print(f"Zu dem Suchbegriff '{search_term}' passt Cluster {','.join([str(x) for x in clusters])} am besten")

# Collect the files of the matching clusters, again unique and in rank order.
paths = list(dict.fromkeys(
    file_path
    for c in clusters
    for file_path in df.loc[df['cluster'] == c, "source"].unique().tolist()
))

print()
print("\n".join(paths))
