In [1]:
import umap
import re
import numpy as np
from langchain_community.document_loaders import PDFMinerLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from unidecode import unidecode
from sklearn.mixture import GaussianMixture
from typing import List, Optional
from langchain.schema import Document
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from pydantic import BaseModel, Field
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain.output_parsers import RetryOutputParser
from langchain_core.runnables import RunnableLambda, RunnableParallel
from langchain_chroma import Chroma

  from .autonotebook import tqdm as notebook_tqdm
2025-01-08 20:00:05.037026: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-08 20:00:05.045717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-08 20:00:05.054988: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-08 20:00:05.057679: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-08 20:00:05.0

In [2]:
docs = PDFMinerLoader('../documents/Alwyn Scott — Neuroscience: A Mathematical Primer.pdf', concatenate_pages=False).load()
len(docs)

373

In [3]:
for doc in docs:
  doc.page_content = unidecode(doc.page_content)

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=100,
  chunk_overlap=50,
  length_function=len,
  is_separator_regex=False,
  separators=[
    ".",
    "\uff0e", # Fullwidth full stop
    "\u3002", # Ideographic full stop
  ],
)
splitted_docs = text_splitter.create_documents([doc.page_content for doc in docs])
len(splitted_docs)

6553

In [5]:
embedding = OllamaEmbeddings(model='llama3.1')

  embedding = OllamaEmbeddings(model='llama3.1')


In [6]:
llm = Ollama(model='llama3.1', temperature=0)

  llm = Ollama(model='llama3.1', temperature=0)


In [7]:
def extract_json(response):
  json_pattern = r'\{.*?\}'
  match = re.search(json_pattern, response, re.DOTALL)

  if match:
    return match.group().strip().replace('\\\\', '\\')

  return response

In [8]:

class SummarizeAnswer(BaseModel):
  summarized_text: str = Field(
    description="Given the context, write a summary of the following, including as many key details as possible.")
summarize_parser = PydanticOutputParser(pydantic_object=SummarizeAnswer)
summarize_retry_parser = RetryOutputParser.from_llm(
  parser=summarize_parser,
  llm=llm,
  max_retries=3,
)
summarize_template = """
Write a summary of the following, including as many key details as possible.

{format_instructions}

Context:
{context}
"""
summarize_prompt = PromptTemplate(
  template=summarize_template,
  input_variables=['question'],
  partial_variables={'format_instructions': summarize_parser.get_format_instructions()},
)
summarize_chain = RunnableParallel(
  completion=summarize_prompt | llm | extract_json, prompt_value=summarize_prompt
) | RunnableLambda(lambda x: summarize_retry_parser.parse_with_prompt(**x))

summarize_backup_template = """
Write a summary of the following, including as many key details as possible:
{context}
"""
summarize_backup_prompt = ChatPromptTemplate.from_template(summarize_backup_template)
summarize_backup_chain = summarize_backup_prompt | llm | StrOutputParser()

def summarize(context: str) -> Document:
  try:
    response = summarize_chain.invoke({'context': context})
    return Document(response.summarized_text)
  except:
    response = summarize_backup_chain.invoke({'context': context})
    return Document(response)


In [9]:
def get_text(docs: List[Document]) -> str:
    text = ""
    for doc in docs:
        text += f"{' '.join(doc.page_content.splitlines())}"
        text += "\n\n"
    return text

In [10]:
def global_cluster_embeddings(
  embeddings: np.ndarray,
  dim: int,
  n_neighbors: Optional[int] = None,
  metric: str = "cosine",
) -> np.ndarray:
  if n_neighbors is None:
    n_neighbors = int((len(embeddings) - 1) ** 0.5)
  reduced_embeddings = umap.UMAP(
    n_neighbors=n_neighbors, n_components=dim, metric=metric
  ).fit_transform(embeddings)
  return reduced_embeddings

def local_cluster_embeddings(
    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
) -> np.ndarray:
  reduced_embeddings = umap.UMAP(
    n_neighbors=num_neighbors, n_components=dim, metric=metric
  ).fit_transform(embeddings)
  return reduced_embeddings

def get_optimal_clusters_n(embeddings: np.ndarray, max_clusters: int = 50, random_state: int = 0) -> int:
  max_clusters = min(max_clusters, len(embeddings))
  n_clusters = np.arange(1, max_clusters)
  bics = []
  for n in n_clusters:
    gm = GaussianMixture(n_components=n, random_state=random_state)
    gm.fit(embeddings)
    bics.append(gm.bic(embeddings))
  optimal_clusters = n_clusters[np.argmin(bics)]
  return optimal_clusters

def GMM_cluster(embeddings: np.ndarray, threshold: float = 0.5, random_state: int = 0):
  n_clusters = get_optimal_clusters_n(embeddings)
  gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
  gm.fit(embeddings)
  probs = gm.predict_proba(embeddings)
  labels = [np.where(prob > threshold)[0] for prob in probs]
  return labels, n_clusters

def perform_clustering(
    embeddings: np.ndarray, dim: int, threshold: float, verbose: bool = False
) -> List[np.ndarray]:
  reduced_embeddings_global = global_cluster_embeddings(embeddings, min(dim, len(embeddings) - 2))
  global_clusters, n_global_clusters = GMM_cluster(
    reduced_embeddings_global, threshold
  )

  all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
  total_clusters = 0

  for i in range(n_global_clusters):
    global_cluster_embeddings_ = embeddings[
        np.array([i in gc for gc in global_clusters])
    ]
    if len(global_cluster_embeddings_) == 0:
      continue
    if len(global_cluster_embeddings_) <= dim + 1:
      local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
      n_local_clusters = 1
    else:
      reduced_embeddings_local = local_cluster_embeddings(
        global_cluster_embeddings_, dim
      )
      local_clusters, n_local_clusters = GMM_cluster(
        reduced_embeddings_local, threshold
      )

    for j in range(n_local_clusters):
      local_cluster_embeddings_ = global_cluster_embeddings_[
        np.array([j in lc for lc in local_clusters])
      ]
      indices = np.where(
        (embeddings == local_cluster_embeddings_[:, None]).all(-1)
      )[1]
      for idx in indices:
        all_local_clusters[idx] = np.append(
          all_local_clusters[idx], j + total_clusters
        )

    total_clusters += n_local_clusters

  return all_local_clusters

def perform_raptor_clustering(
  embedding,
  docs: List[Document],
  max_length_in_cluster: int = 3500,
  reduction_dimension: int = 10,
  threshold: float = 0.1
) -> List[List[Document]]:
  # Get the embeddings from the nodes
  embeddings = np.array([embedding.embed_query(doc.page_content) for doc in docs])

  # Perform the clustering
  clusters = perform_clustering(
    embeddings, dim=reduction_dimension, threshold=threshold
  )

  # Initialize an empty list to store the clusters of nodes
  doc_clusters = []

  # Iterate over each unique label in the clusters
  for label in np.unique(np.concatenate(clusters)):
    # Get the indices of the nodes that belong to this cluster
    indices = [i for i, cluster in enumerate(clusters) if label in cluster]

    # Add the corresponding nodes to the doc_clusters list
    cluster_docs = [docs[i] for i in indices]

    # Base case: if the cluster only has one node, do not attempt to recluster it
    if len(cluster_docs) == 1:
      doc_clusters.append(cluster_docs)
      continue

    # Calculate the total length of the text in the nodes
    total_length = sum(
      [len(doc.page_content) for doc in cluster_docs]
    )

    # If the total length exceeds the maximum allowed length, recluster this cluster
    # cluster_docs of length 4 gives default n_neighbors value of 1 which raises an error in umap
    if total_length > max_length_in_cluster and len(cluster_docs) > 4:
      print(f"reclustering cluster with {len(cluster_docs)} nodes")
      doc_clusters.extend(
        perform_raptor_clustering(
          embedding, cluster_docs, max_length_in_cluster
        )
      )
    else:
      doc_clusters.append(cluster_docs)

  return doc_clusters

def get_raptor_docs(embedding, docs):
  levels = [docs]

  while True:
    prev_level = levels[-1]

    if len(prev_level) <= 4:
      break

    print(f'cluster level {len(levels)}')
    clusters = perform_raptor_clustering(embedding, prev_level)

    print(f'{len(clusters)} clusters')

    texts = [get_text(docs) for docs in clusters]
    level = [summarize(text) for text in texts]
    levels.append(level)

  return levels

In [11]:
levels = get_raptor_docs(embedding, splitted_docs)
len(levels)

cluster level 1
reclustering cluster with 23 nodes
reclustering cluster with 55 nodes
reclustering cluster with 21 nodes
reclustering cluster with 54 nodes
reclustering cluster with 31 nodes
reclustering cluster with 30 nodes
reclustering cluster with 41 nodes
reclustering cluster with 42 nodes
reclustering cluster with 23 nodes
reclustering cluster with 43 nodes
reclustering cluster with 35 nodes
reclustering cluster with 40 nodes
reclustering cluster with 45 nodes
reclustering cluster with 50 nodes
reclustering cluster with 47 nodes
reclustering cluster with 40 nodes
reclustering cluster with 34 nodes
reclustering cluster with 32 nodes
reclustering cluster with 53 nodes
reclustering cluster with 38 nodes
reclustering cluster with 19 nodes
reclustering cluster with 28 nodes
reclustering cluster with 29 nodes
reclustering cluster with 33 nodes
reclustering cluster with 38 nodes
reclustering cluster with 32 nodes
reclustering cluster with 31 nodes
reclustering cluster with 29 nodes
recl

10

In [12]:
raptor_docuemnts = [item for sublist in levels for item in sublist]

In [13]:
vector_store = Chroma(
  collection_name='neurorag',
  embedding_function=embedding,
  persist_directory='./chroma_db'
)
vector_store.add_documents(raptor_docuemnts)

['ed7c6009-b068-44ee-bc41-faf2f9572f9a',
 '462846cf-91b7-40f2-bb2f-d1c655eb3163',
 '87a2bf76-0adb-48b8-a4ba-e8a7561bdd2c',
 'c66813bb-a728-4d76-8474-da338c469032',
 '811d2831-c0cf-437d-a4a8-eccb25fca776',
 '250b0fd0-d1f1-491a-97f6-32ceda1cc88b',
 'f15bde23-27f5-4958-8da6-89a75e765834',
 '3abb040e-0bd4-4418-a4e6-09aee8b5a5c5',
 'c97c62a0-e2cd-4f13-aefd-67326c76d128',
 '35f80c38-d793-43f7-90bc-5ef303d141b1',
 'c76a97d1-2e62-46e3-b121-769a7d8fb778',
 '2d5ab2a1-4298-494f-b5a1-529ab572d659',
 '18820e87-3111-415d-a2fe-7679ded854a6',
 '0697e910-1c0c-4b6c-a445-9989f420a799',
 '8b67bf8b-4692-4ab6-83c8-2cc63b46be18',
 '08ffa2c2-ad49-4e53-bcad-dbf8380bad1f',
 '1ccb8eb3-1262-49e4-ade9-e017d4f9f9b2',
 'd3d9a81d-2735-4107-9e6f-d9bbdf20470f',
 'aa3a6c8f-a1c8-4abe-bc64-b08733208ece',
 '64d0fb38-fe59-4665-a0e6-8cc9b89064d5',
 '0ecde8f6-fdd5-4510-a3a7-c0b7613d3a1f',
 'da39bf40-5595-4218-8f5e-2421ab19dc3e',
 'a3c46f0f-0e0e-4485-8605-e65ad12c9ab3',
 'b6deeb92-9a7a-487f-b524-cc85a58f4320',
 'fd576618-4ac1-