## Load the textbook embeddings and instantiate the model

In [52]:
import pickle
import pandas as pd

In [47]:
def load_chunks_to_pickle(file_path):
    """Read a pickle file and return the object stored in it.

    Despite its name, this helper only *loads* from a pickle file; nothing
    is written.

    Args:
        file_path: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point this at files you produced yourself or otherwise trust.
    with open(file_path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)





In [66]:
# Load the combined textbook chunk records from disk. Later cells turn this
# object into a DataFrame and read its 'text_chunk', 'book_name' and
# 'page_number' columns.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on the author's machine; consider a configurable data directory.
combined_textbook = load_chunks_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/combined_textbook_chunk_metadata.pkl')

## Embeddings using SBERT for vector representations with helper functions 

In [69]:
from sentence_transformers import SentenceTransformer

# Module-level SBERT encoder; get_embeddings() below uses it to turn text
# chunks into vectors.
embedding_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')



# Helper that attaches an embedding to every chunk record.
def embed_text_chunks(textbook_chunk_metadata,e_model):
    """Encode each record's 'text_chunk' and store the result under 'embedding'.

    The records are updated in place, one ``encode`` call per record, and the
    same container that was passed in is returned.

    Args:
        textbook_chunk_metadata: Iterable of dict-like records, each with a
            'text_chunk' entry.
        e_model: Object exposing ``encode(text)``.

    Returns:
        ``textbook_chunk_metadata`` itself, with 'embedding' set on each record.
    """
    for record in textbook_chunk_metadata:
        record['embedding'] = e_model.encode(record['text_chunk'])
    return textbook_chunk_metadata


def get_embeddings(textbook_chunk,embed_model=None):
    """Encode text with an embedding model and return the result as an array.

    Args:
        textbook_chunk: Text (or list of texts) handed to ``encode``.
        embed_model: Object exposing ``encode(...)``. When omitted (``None``),
            the notebook-level ``embedding_model`` is used.

    Returns:
        ``np.ndarray`` built from the encoder output.
    """
    # The previous version accepted ``embed_model`` but always encoded with the
    # global ``embedding_model``; the argument is now honored. The default is
    # resolved at call time so default callers keep using the global model.
    if embed_model is None:
        embed_model = embedding_model
    text_embed = embed_model.encode(textbook_chunk)
    return np.array(text_embed)


    

In [8]:
# textbook1_embed = load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook1_embedding_metadata.pkl')
# textbook2_embed =load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook2_embedding_metadata.pkl')
# textbook3_embed =load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook3_embedding_metadata.pkl')

# RAPTOR indexing

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains import StuffDocumentsChain
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate
import umap
from config import SUM_API_KEY 




## Clustering helper functions

In [78]:
def gmm_cluster_embeddings_prob_threshold(textbook_embeddings,prob_threshold):
    """Soft-cluster embeddings with a Gaussian mixture.

    The number of components is chosen by
    ``get_bayesian_optimal_num_clusters``. Each point is assigned to every
    component whose posterior probability exceeds ``prob_threshold``, so a
    point may end up in several clusters, or in none.

    Args:
        textbook_embeddings: Embeddings passed to ``GaussianMixture.fit`` and
            ``predict_proba``.
        prob_threshold: Strict lower bound on the membership probability.

    Returns:
        Tuple ``(cluster_labels, num_clusters)`` where ``cluster_labels`` is a
        list with one index array per point.
    """
    num_clusters = get_bayesian_optimal_num_clusters(textbook_embeddings)

    mixture = GaussianMixture(n_components=num_clusters,random_state=42)
    mixture.fit(textbook_embeddings)
    membership = mixture.predict_proba(textbook_embeddings)

    cluster_labels = []
    for point_probs in membership:
        # indices of the components this point belongs to
        cluster_labels.append(np.nonzero(point_probs > prob_threshold)[0])
    return cluster_labels, num_clusters

# Pick the number of GMM components using the Bayesian Information Criterion.
def get_bayesian_optimal_num_clusters(textbook_embeddings,max_clusters=50):
    """Return the component count whose Gaussian mixture has the lowest BIC.

    Candidate counts are ``1 .. min(max_clusters, len(textbook_embeddings)) - 1``
    (the upper bound itself is excluded, as in the original code).

    Args:
        textbook_embeddings: Embeddings to fit; must support ``len``.
        max_clusters: Upper bound (exclusive) on the number of components tried.

    Returns:
        int: The candidate with the minimal BIC, or ``1`` when there is no
        candidate to evaluate (fewer than two samples, or ``max_clusters <= 1``).
    """
    max_clusters = min(max_clusters,len(textbook_embeddings))
    candidate_counts = np.arange(1,max_clusters)

    # With an empty candidate range np.argmin([]) would raise; a single
    # cluster is the only sensible answer in that case.
    if candidate_counts.size == 0:
        return 1

    bayesian_info_criteria = []
    for num in candidate_counts:
        gmm = GaussianMixture(n_components=int(num),random_state=42)
        gmm.fit(textbook_embeddings)
        bayesian_info_criteria.append(gmm.bic(textbook_embeddings))

    return int(candidate_counts[np.argmin(bayesian_info_criteria)])
    
# UMAP dimensionality reduction over the whole embedding set ("global" view).
def umap_reduce_global_cluster_embed(textbook_embeddings,dim,num_neighbors=None,metric="cosine"):
    """Project all embeddings to ``dim`` dimensions with UMAP.

    Args:
        textbook_embeddings: Embeddings handed to ``UMAP.fit_transform``.
        dim: Number of output components.
        num_neighbors: UMAP ``n_neighbors``; when ``None`` it defaults to
            ``int(sqrt(len(textbook_embeddings) - 1))``.
        metric: Distance metric passed to UMAP.

    Returns:
        The output of ``fit_transform``.
    """
    neighbors = num_neighbors
    if neighbors is None:
        neighbors = int((len(textbook_embeddings)-1)**0.5)
    reducer = umap.UMAP(n_neighbors=neighbors,n_components=dim,metric=metric)
    return reducer.fit_transform(textbook_embeddings)

    

def umap_local_cluster_embed(textbook_embeddings,dim,num_neighbhors=10,metric="cosine"):
    """Project one cluster's embeddings to ``dim`` dimensions with UMAP.

    Args:
        textbook_embeddings: Embeddings handed to ``UMAP.fit_transform``.
        dim: Number of output components.
        num_neighbhors: UMAP ``n_neighbors`` (fixed default of 10).
        metric: Distance metric passed to UMAP.

    Returns:
        The output of ``fit_transform``.
    """
    reducer = umap.UMAP(n_neighbors=num_neighbhors,n_components=dim,metric=metric)
    return reducer.fit_transform(textbook_embeddings)

 
    
    


In [30]:
#embeddings = np.array([chunk['embedding']]for chunk in combined_textbook_embed)

In [32]:
#embeddings

array(<generator object <genexpr> at 0x12a0c5b10>, dtype=object)

## Clustering function

In [74]:
# Cluster the embeddings in two steps:
#   1) reduce their dimensionality globally, then cluster with a GMM
#   2) perform local clustering inside each global cluster
def clustering(textbook_embeddings,dim,prob_cluster_threshold):
    """Assign every embedding to zero or more hierarchical (global -> local) clusters.

    Args:
        textbook_embeddings: 2-D array-like of embeddings, one row per text.
        dim: Target dimensionality for the UMAP reductions.
        prob_cluster_threshold: GMM membership probability a point must exceed
            to be counted as part of a cluster.

    Returns:
        A list with one array per input row holding that row's cluster ids.
        Ids are unique across global clusters (local ids are offset by the
        running total). A row may have several ids, or none if it never
        exceeded the threshold.
    """
    # Too few points for UMAP/GMM to be meaningful: everything is cluster 0.
    if len(textbook_embeddings) <= dim + 1:
        return [np.array([0]) for _ in range(len(textbook_embeddings))]

    # Index arrays are used below, so make sure we hold an ndarray.
    textbook_embeddings = np.asarray(textbook_embeddings)

    # Global dimensionality reduction followed by global GMM clustering.
    global_reduct_embed = umap_reduce_global_cluster_embed(textbook_embeddings,dim)
    global_clusters, num_global_clusters = gmm_cluster_embeddings_prob_threshold(global_reduct_embed,prob_cluster_threshold)

    total_local_clusters = [np.array([]) for _ in range(len(textbook_embeddings))]
    # Running count of local clusters, used to keep ids unique across globals.
    total_clusters = 0

    for num_g in range(num_global_clusters):
        # Row positions (in the full input) of the members of this global cluster.
        global_member_idx = np.flatnonzero([num_g in gc for gc in global_clusters])
        if len(global_member_idx) == 0:
            continue
        gc_embeddings = textbook_embeddings[global_member_idx]

        if len(gc_embeddings) <= dim + 1:
            # Small global cluster: treat it as a single local cluster.
            local_clusters = [np.array([0]) for _ in gc_embeddings]
            num_local_clusters = 1
        else:
            # Reduce and cluster again, this time within the global cluster.
            local_reduct_embed = umap_local_cluster_embed(gc_embeddings,dim)
            local_clusters, num_local_clusters = gmm_cluster_embeddings_prob_threshold(local_reduct_embed,prob_cluster_threshold)

        for num_l in range(num_local_clusters):
            # Positions within gc_embeddings that belong to this local cluster.
            local_member_pos = np.flatnonzero([num_l in lc for lc in local_clusters])
            # Map back to rows of the full input by index. The previous code
            # tried to recover the rows by comparing embedding values, but a
            # misplaced parenthesis made that comparison fail; index tracking
            # is also unambiguous when two rows share the same embedding.
            for idx in global_member_idx[local_member_pos]:
                total_local_clusters[idx] = np.append(total_local_clusters[idx],num_l + total_clusters)

        total_clusters += num_local_clusters

    return total_local_clusters
            
        
def get_cluster_embed_texts(df_textbook):
    """Embed the 'text_chunk' column, cluster the embeddings, and attach both.

    The frame is modified in place: an 'embedding' column (one vector per row)
    and a 'cluster' column (the per-row labels returned by ``clustering``) are
    added, in that order.

    Args:
        df_textbook: DataFrame with a 'text_chunk' column.

    Returns:
        The same ``df_textbook`` object, with the two columns added.
    """
    chunk_texts = df_textbook['text_chunk'].tolist()
    text_embed = get_embeddings(chunk_texts)

    # UMAP target dimension 10, membership probability threshold 0.1
    labels_per_chunk = clustering(text_embed,10,0.1)

    for column_name, column_values in (('embedding', list(text_embed)), ('cluster', labels_per_chunk)):
        df_textbook[column_name] = column_values

    return df_textbook
    
         
        
        
    
     
    
    






## Summarization function

In [75]:
def summarization(df_textbooks,depth,summarize_llm):
    """Cluster the texts and produce one LLM summary per cluster.

    Args:
        df_textbooks: DataFrame with 'text_chunk', 'book_name' and
            'page_number' columns; it is passed to ``get_cluster_embed_texts``.
        depth: Tree level recorded in the 'depth' column of the summary frame.
        summarize_llm: Model piped between the prompt template and the string
            output parser.

    Returns:
        Tuple ``(df_cluster_text_metadata, df_textbook_summary)``: the frame
        returned by ``get_cluster_embed_texts`` and a frame with columns
        'textbook-summaries', 'depth' and 'cluster' (one row per unique cluster).
    """
    df_cluster_text_metadata = get_cluster_embed_texts(df_textbooks)

    # Flatten to one record per (text, cluster) pair; a text that belongs to
    # several clusters appears once for each of them.
    pair_records = [
        {
            "text": row['text_chunk'],
            "embedding": row["embedding"],
            "cluster": cluster,
            "book_name": row["book_name"],
            "page_number": row["page_number"],
        }
        for _, row in df_cluster_text_metadata.iterrows()
        for cluster in row['cluster']
    ]
    new_cluster_df = pd.DataFrame(pair_records)

    all_unique_clusters = new_cluster_df["cluster"].unique()
    print("Unique clusters: ",all_unique_clusters)

    # Summarization prompt and chain: prompt -> llm -> plain string.
    template= """Here is a text from a collection of three textbooks about robotics. Please write a concise and informative summary of the following text. Include the main points, key details, and any important terminology or concepts. Ensure that the summary is clear and captures the essence of the provided text:
                {text} 
                Summary:
                """
    output_parser = StrOutputParser()
    prompt_temp = ChatPromptTemplate.from_template(template)
    summarization_chain = prompt_temp | summarize_llm | output_parser

    def _summarize_cluster(cluster_id):
        # Join the member texts with a visible separator so chunk boundaries
        # stay recognisable in the prompt.
        member_texts = new_cluster_df.loc[new_cluster_df["cluster"]==cluster_id, 'text'].to_list()
        joined_text = "--- --- \n --- --- ".join(member_texts)
        return summarization_chain.invoke({"text": joined_text})

    textbook_summaries = [_summarize_cluster(c) for c in all_unique_clusters]

    df_textbook_summary = pd.DataFrame({
        "textbook-summaries": textbook_summaries,
        "depth": [depth]*len(textbook_summaries),
        "cluster": list(all_unique_clusters),
    })

    return df_cluster_text_metadata, df_textbook_summary
        
        
        
        
        
        
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    







## Recursion function for forming hierarchical tree

In [76]:
def embed_cluster_summarize_recursion(df_textbook,summarize_llm,depth,num_recursion_level):
    """Recursively embed, cluster and summarize texts to build a summary tree.

    Args:
        df_textbook: DataFrame with 'text_chunk', 'book_name' and
            'page_number' columns (the columns ``summarization`` reads).
        summarize_llm: Model used by ``summarization``.
        depth: Current tree level.
        num_recursion_level: Maximum level; recursion stops once ``depth``
            reaches it or only one cluster remains.

    Returns:
        dict mapping each processed depth to the tuple
        ``(df_cluster_metadata, df_textbook_summaries)`` returned by
        ``summarization`` at that depth.
    """
    # Results for each depth of the recursion.
    rec_depth_results = {}

    # Embed, cluster and summarize at the current depth.
    df_cluster_metadata, df_textbook_summaries = summarization(df_textbook,depth,summarize_llm)
    rec_depth_results[depth] = (df_cluster_metadata,df_textbook_summaries)

    # Recurse only while there is still more than one cluster to merge.
    num_uniq_clusters = df_textbook_summaries["cluster"].nunique()

    if depth < num_recursion_level and num_uniq_clusters > 1:
        # The summaries become the input texts of the next level. They are
        # wrapped in a DataFrame with the columns summarization() reads; a
        # summary spans several chunks, so it has no single book/page and
        # those fields are left as None.
        summary_texts = df_textbook_summaries["textbook-summaries"].tolist()
        df_next_level = pd.DataFrame({
            "text_chunk": summary_texts,
            "book_name": [None]*len(summary_texts),
            "page_number": [None]*len(summary_texts),
        })
        # The earlier call omitted summarize_llm, shifting every argument by
        # one position and failing with a TypeError.
        new_rec_depth_results = embed_cluster_summarize_recursion(df_next_level,summarize_llm,depth + 1,num_recursion_level)

        rec_depth_results.update(new_rec_depth_results)
    return rec_depth_results
    
    
    

In [77]:
# Convert the loaded list of chunk records into a DataFrame for easier processing.
df_combined_textbook = pd.DataFrame(combined_textbook)

# Define the LLM used for summarization (API key comes from the local config module).
sum_llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=SUM_API_KEY)

# Build the summary tree starting at depth 1 with at most 3 levels.
# NOTE(review): every cluster at every level triggers one LLM call, so this
# cell can be slow and costly to re-run; consider caching rec_results.
rec_results = embed_cluster_summarize_recursion(df_textbook=df_combined_textbook,summarize_llm=sum_llm,depth=1,num_recursion_level=3)

UnboundLocalError: cannot access local variable 'n_neighbhours' where it is not associated with a value

## Collapsed tree retrieval (best performance)

In [35]:
from langchain_milvus.vectorstores import Milvus

# Store the metadata in the metadata field for Milvus, maybe as a df_metadata