In [1]:
!pip install -q langchain langchain_core langchain_community langchain_text_splitters tiktoken langchain-google-genai umap

In [2]:
!pip install -q unstructured umap-learn

In [3]:
import numpy as np
import umap
import pandas as pd
from sklearn.mixture import GaussianMixture
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
import tiktoken

import os

In [4]:
from google.colab import userdata

os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

In [5]:
class TextClusterSummarizer:
    def __init__(
        self,
        token_limit,
        data_directory,
        glob_pattern="**/*.txt",
    ):
        print("Initializing TextClusterSummarizer...")
        self.token_limit = token_limit
        self.loader = DirectoryLoader(data_directory, glob=glob_pattern)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False,
        )
        self.embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.chat_model = ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-pro")
        self.iteration_summaries = []

    def load_and_split_documents(self):
        print("Loading and splitting documents...")
        docs = self.loader.load()
        return self.text_splitter.split_documents(docs)

    def embed_texts(self, texts):
        print("Embedding texts...")
        return [self.embedding_model.embed_query(txt) for txt in texts]

    def reduce_dimensions(self, embeddings, dim, n_neighbors=None, metric="cosine"):
        print(f"Reducing dimensions to {dim}...")
        if n_neighbors is None:
            n_neighbors = int((len(embeddings) - 1) ** 0.5)
        return umap.UMAP(
            n_neighbors=n_neighbors, n_components=dim, metric=metric
        ).fit_transform(embeddings)

    def num_tokens_from_string(self, string: str) -> int:
        """Returns the number of tokens in a text string."""
        encoding = tiktoken.get_encoding("cl100k_base")
        num_tokens = len(encoding.encode(string))
        return num_tokens

    def cluster_embeddings(self, embeddings, threshold, random_state=0):
        print("Clustering embeddings...")
        n_clusters = self.get_optimal_clusters(embeddings)
        gm = GaussianMixture(n_components=n_clusters, random_state=random_state).fit(
            embeddings
        )
        probs = gm.predict_proba(embeddings)
        return [np.where(prob > threshold)[0] for prob in probs], n_clusters

    def get_optimal_clusters(self, embeddings, max_clusters=50, random_state=1234):
        print("Calculating optimal number of clusters...")
        max_clusters = min(max_clusters, len(embeddings))
        bics = [
            GaussianMixture(n_components=n, random_state=random_state)
            .fit(embeddings)
            .bic(embeddings)
            for n in range(1, max_clusters)
        ]
        print(f"Optimal number of clusters: {np.argmin(bics) + 1}")
        return np.argmin(bics) + 1

    def format_cluster_texts(self, df):
        print("Formatting cluster texts...")
        clustered_texts = {}
        for cluster in df["Cluster"].unique():
            cluster_texts = df[df["Cluster"] == cluster]["Text"].tolist()
            clustered_texts[cluster] = " --- ".join(cluster_texts)
        return clustered_texts

    def generate_summaries(self, texts):
        print("Generating summaries...")
        template = """You are an assistant to create a detailed summary of the text input provided.
Text:
{text}
"""
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.chat_model | StrOutputParser()

        summaries = {}
        for cluster, text in texts.items():
            token_count = self.num_tokens_from_string(text)

            if token_count > self.token_limit:
                raise ValueError(
                    f"Token limit exceeded for cluster {cluster} with {token_count} tokens. Unable to generate summary."
                )

            summary = chain.invoke({"text": text})
            summaries[cluster] = summary
        return summaries

    def run(self):
        print("Running TextClusterSummarizer...")
        docs = self.load_and_split_documents()
        texts = [doc.page_content for doc in docs]
        all_summaries = texts

        iteration = 1

        self.iteration_summaries.append(
            {"iteration": 0, "texts": texts, "summaries": []}
        )

        while True:
            print(f"Iteration {iteration}")
            embeddings = self.embed_texts(all_summaries)

            # Need enough neighbours for UMAP
            n_neighbors = min(int((len(embeddings) - 1) ** 0.5), len(embeddings) - 1)
            if n_neighbors < 2:
                print("Not enough data points for UMAP reduction. Stopping iterations.")
                break

            embeddings_reduced = self.reduce_dimensions(
                embeddings, dim=2, n_neighbors=n_neighbors
            )
            labels, num_clusters = self.cluster_embeddings(
                embeddings_reduced, threshold=0.5
            )

            if num_clusters == 1:
                print("Reduced to a single cluster. Stopping iterations.")
                break

            simple_labels = [label[0] if len(label) > 0 else -1 for label in labels]
            df = pd.DataFrame(
                {
                    "Text": all_summaries,
                    "Embedding": list(embeddings_reduced),
                    "Cluster": simple_labels,
                }
            )

            clustered_texts = self.format_cluster_texts(df)
            summaries = self.generate_summaries(clustered_texts)

            all_summaries = list(summaries.values())
            self.iteration_summaries.append(
                {
                    "iteration": iteration,
                    "texts": all_summaries,
                    "summaries": list(summaries.values()),
                }
            )
            iteration += 1

        final_summary = all_summaries[0] if all_summaries else ""
        return {
            "initial_texts": texts,
            "iteration_summaries": self.iteration_summaries,
            "final_summary": final_summary,
        }

In [6]:
summarizer = TextClusterSummarizer(token_limit=16000, data_directory="/content/data")
final_output = summarizer.run()

Initializing TextClusterSummarizer...
Running TextClusterSummarizer...
Loading and splitting documents...
Iteration 1
Embedding texts...
Reducing dimensions to 2...
Clustering embeddings...
Calculating optimal number of clusters...
Optimal number of clusters: 3
Formatting cluster texts...
Generating summaries...
Iteration 2
Embedding texts...
Not enough data points for UMAP reduction. Stopping iterations.


In [7]:
final_output

{'initial_texts': ['Introduction: Welcome to Bella Vista, where every dish is a love letter to the vibrant flavors, fresh ingredients, and time-honored traditions of Italian cuisine. At the heart of our culinary',
  'of our culinary philosophy lies a deep reverence for the simple yet sublime beauty of the Mediterranean diet, which celebrates the natural bounty of the land and sea, and the art of transforming',
  'art of transforming humble ingredients into extraordinary culinary creations. Join us as we embark on a gastronomic journey through the diverse regions of Italy, from the sun-drenched shores of',
  'shores of Sicily to the verdant hills of Tuscany, to discover the soul-stirring flavors that have captivated palates for centuries.',
  'Chapter 1: The Essence of Italian Cuisine Italian cuisine is a celebration of simplicity, freshness, and flavor, where every dish is a reflection of the land, the culture, and the people who create',
  'people who create it. At Bella Vista, we emb