## Load the textbook embeddings and instantiate the model

In [None]:
%pip install -qU transformers accelerate sentence_transformers bitsandbytes langchain-openai tenacity


In [None]:
%pip install -qU umap-learn scikit-learn langchain_community langchain-huggingface

In [1]:
import pickle
import pandas as pd
import os
from tqdm.notebook import tqdm
#api_key = userdata.get('GOOGLE_API_KEY')

In [2]:
def load_chunks_to_pickle(file_path):
    """Read a pickled object from ``file_path`` and return it.

    Despite its name, this helper *loads from* a pickle file; it does not
    write one.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the file.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files you created yourself or otherwise trust.
    """
    with open(file_path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)





In [5]:
# Load the combined textbook chunk records (text plus metadata) from the pickle file.
# NOTE(review): absolute, machine-specific path; this cell will not run on another
# machine without editing it. Consider a configurable data directory.
combined_textbook = load_chunks_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/combined_textbook_chunk_metadata.pkl')

In [7]:
# Reshape the raw chunk records into rows of (text_chunk, metadata) so they can
# be handled as a DataFrame. Each metadata dict keeps only the source book name
# and the page number of the chunk.
transformed_data = [
    {
        'text_chunk': item['text_chunk'],
        'metadata': {'book_name': item['book_name'], 'page_number': item['page_number']},
    }
    for item in combined_textbook
]
df_textbook_data = pd.DataFrame(transformed_data)

In [8]:
df_textbook_data

Unnamed: 0,text_chunk,metadata
0,Autonomous Mobile RobotsIntroduction toRoland ...,{'book_name': 'Introduction to Autonomous Mobi...
1,"It discusses all facets of mobile robotics,inc...",{'book_name': 'Introduction to Autonomous Mobi...
2,The first two chapters explore low-level locom...,{'book_name': 'Introduction to Autonomous Mobi...
3,Bringing together all aspects of mobile roboti...,{'book_name': 'Introduction to Autonomous Mobi...
4,The idea of providing a robot functional archi...,{'book_name': 'Introduction to Autonomous Mobi...
...,...,...
4492,Universal plan A set of all possible plans for...,"{'book_name': 'mataric-primer', 'page_number':..."
4493,"Index Action Selection, 166 Actuator, 24, 29, ...","{'book_name': 'mataric-primer', 'page_number':..."
4494,"Index 303 Educational Robotics, 284 Effector, ...","{'book_name': 'mataric-primer', 'page_number':..."
4495,"304 Index Photophilic, 12 Photophobic, 12Pitch...","{'book_name': 'mataric-primer', 'page_number':..."


## Embeddings using SBERT for vector representations with helper functions

In [12]:
from sentence_transformers import SentenceTransformer

# Sentence-transformer model shared by the embedding helpers in this notebook;
# `get_embeddings` uses it as the default value of its `embed_model` parameter.
embedding_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')



def embed_text_chunks(textbook_chunk_metadata,e_model):
    """Attach an embedding to every chunk record, in place.

    Args:
        textbook_chunk_metadata: Iterable of dict-like records, each holding
            the chunk text under the ``'text_chunk'`` key.
        e_model: Object exposing ``encode(text)``; its return value is stored
            as the embedding.

    Returns:
        The same ``textbook_chunk_metadata`` object, whose records now also
        carry an ``'embedding'`` key.
    """
    for record in textbook_chunk_metadata:
        record['embedding'] = e_model.encode(record['text_chunk'])
    return textbook_chunk_metadata


def get_embeddings(textbook_chunk,embed_model=embedding_model):
    """Encode text with an embedding model and return the result as an array.

    Args:
        textbook_chunk: Text (or list of texts) handed to ``embed_model.encode``.
        embed_model: Object exposing ``encode``. Defaults to the module-level
            ``embedding_model``.

    Returns:
        ``np.ndarray`` built from the encoder output.
    """
    # Honour the `embed_model` argument instead of always reaching for the
    # module-level model, so callers can actually swap the encoder.
    text_embed = embed_model.encode(textbook_chunk)
    return np.array(text_embed)




  from tqdm.autonotebook import tqdm, trange


In [9]:
# textbook1_embed = load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook1_embedding_metadata.pkl')
# textbook2_embed =load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook2_embedding_metadata.pkl')
# textbook3_embed =load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook3_embedding_metadata.pkl')

# RAPTOR indexing

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import torch
import transformers
from transformers import AutoTokenizer
from langchain.chains import load_summarize_chain
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
#from langchain.llms import HuggingFacePipeline
import umap
from langchain_openai import OpenAIEmbeddings,ChatOpenAI

#from langchain_huggingface import HuggingFacePipeline
#from config import SUM_API_KEY
from tenacity import retry, wait_exponential, stop_after_attempt
from langchain import OpenAI


## Clustering helper functions

In [11]:
def gmm_cluster_embeddings_prob_threshold(textbook_embeddings,prob_threshold):
    """Soft-cluster embeddings with a Gaussian mixture model.

    The number of mixture components comes from
    ``get_bayesian_optimal_num_clusters``. A point is assigned to every
    component whose membership probability is strictly greater than
    ``prob_threshold``, so it may end up in several clusters or in none.

    Args:
        textbook_embeddings: Embeddings to cluster.
        prob_threshold: Minimum (exclusive) membership probability.

    Returns:
        Tuple ``(cluster_labels, num_clusters)`` where ``cluster_labels`` holds
        one array of component indices per input row.
    """
    num_clusters = get_bayesian_optimal_num_clusters(textbook_embeddings)

    mixture = GaussianMixture(n_components=num_clusters, random_state=42)
    mixture.fit(textbook_embeddings)
    membership = mixture.predict_proba(textbook_embeddings)

    cluster_labels = []
    for row_probs in membership:
        cluster_labels.append(np.where(row_probs > prob_threshold)[0])
    return cluster_labels, num_clusters

def get_bayesian_optimal_num_clusters(textbook_embeddings,max_clusters=50):
    """Pick the GMM component count with the lowest Bayesian Information Criterion.

    Candidate counts are ``1 .. min(max_clusters, n_samples) - 1`` (the upper
    bound is exclusive). A Gaussian mixture is fitted for each candidate and
    the one with the smallest BIC is returned.

    Args:
        textbook_embeddings: Embeddings to fit the mixtures on.
        max_clusters: Exclusive upper bound on the number of components tried.

    Returns:
        The selected number of components. When there are no candidates to
        compare (fewer than two samples, or ``max_clusters`` below 2) the
        function returns 1 instead of failing on an empty BIC list.
    """
    max_clusters = min(max_clusters,len(textbook_embeddings))
    if max_clusters < 2:
        # np.arange(1, max_clusters) would be empty and np.argmin would raise.
        return 1
    n_clusters = np.arange(1,max_clusters)

    bayesian_info_criteria=[]

    for num in n_clusters:
        gmm = GaussianMixture(n_components=num,random_state=42)
        gmm.fit(textbook_embeddings)
        bayesian_info_criteria.append(gmm.bic(textbook_embeddings))
    optim_clusters = n_clusters[np.argmin(bayesian_info_criteria)]

    return optim_clusters

def umap_reduce_global_cluster_embed(textbook_embeddings,dim,num_neighbours=None,metric="cosine"):
    """Reduce all embeddings to ``dim`` dimensions with UMAP (global step).

    Args:
        textbook_embeddings: Embeddings to reduce.
        dim: Target number of dimensions (``n_components``).
        num_neighbours: UMAP ``n_neighbors``. When ``None`` it defaults to
            ``int(sqrt(len(textbook_embeddings) - 1))``.
        metric: Distance metric handed to UMAP.

    Returns:
        The result of ``fit_transform`` on the embeddings.
    """
    if num_neighbours is not None:
        neighbours = num_neighbours
    else:
        neighbours = int((len(textbook_embeddings) - 1) ** 0.5)
    reducer = umap.UMAP(n_neighbors=neighbours, n_components=dim, metric=metric)
    return reducer.fit_transform(textbook_embeddings)



def umap_local_cluster_embed(textbook_embeddings,dim,num_neighbhours=10,metric="cosine"):
    """Reduce one cluster's embeddings to ``dim`` dimensions with UMAP (local step).

    Args:
        textbook_embeddings: Embeddings to reduce.
        dim: Target number of dimensions (``n_components``).
        num_neighbhours: UMAP ``n_neighbors`` (fixed default of 10).
        metric: Distance metric handed to UMAP.

    Returns:
        The result of ``fit_transform`` on the embeddings.
    """
    reducer = umap.UMAP(n_neighbors=num_neighbhours, n_components=dim, metric=metric)
    return reducer.fit_transform(textbook_embeddings)






In [12]:
#embeddings = np.array([chunk['embedding']]for chunk in combined_textbook_embed)

In [13]:
#embeddings

## Clustering function

In [14]:
def clustering(textbook_embeddings,dim,prob_cluster_threshold):
    """Cluster embeddings in two stages: global, then local within each global cluster.

    1. Reduce all embeddings with UMAP and soft-cluster them with a GMM.
    2. For every global cluster, reduce and soft-cluster its members again.

    Local cluster ids are offset by the number of local clusters found in
    previously processed global clusters, so ids are unique across the run.

    Args:
        textbook_embeddings: 2-D array-like with one embedding per row.
        dim: Target dimensionality for both UMAP reductions.
        prob_cluster_threshold: Membership-probability threshold passed to the
            GMM helper.

    Returns:
        A list with one array per input row holding the ids of the local
        clusters that row belongs to (possibly several, possibly none). When
        there are at most ``dim + 1`` rows, every row is placed in cluster 0.
    """
    textbook_embeddings = np.asarray(textbook_embeddings)
    num_embeddings = len(textbook_embeddings)

    # Too few points for a meaningful reduction/clustering: one shared cluster.
    if num_embeddings <= dim + 1:
        return [np.array([0]) for _ in range(num_embeddings)]

    # Global dimensionality reduction followed by global GMM clustering.
    global_reduct_embed = umap_reduce_global_cluster_embed(textbook_embeddings,dim)
    global_clusters, num_global_clusters = gmm_cluster_embeddings_prob_threshold(global_reduct_embed,prob_cluster_threshold)

    total_local_clusters = [np.array([]) for _ in range(num_embeddings)]
    # Running count of local clusters, used to offset ids per global cluster.
    total_clusters = 0

    for num_g in tqdm(range(num_global_clusters),desc="Processing global clusters"):
        # Row positions (in the original array) of this global cluster's members.
        global_cluster_mask = np.array([num_g in gc for gc in global_clusters], dtype=bool)
        gc_row_idx = np.where(global_cluster_mask)[0]

        if len(gc_row_idx) == 0:
            continue

        gc_embeddings = textbook_embeddings[gc_row_idx]

        if len(gc_embeddings) <= dim + 1:
            # Small cluster: skip the local reduction, keep it as one cluster.
            local_clusters = [np.array([0]) for _ in gc_embeddings]
            num_local_clusters = 1
        else:
            local_reduct_embed = umap_local_cluster_embed(gc_embeddings,dim)
            local_clusters, num_local_clusters = gmm_cluster_embeddings_prob_threshold(local_reduct_embed,prob_cluster_threshold)

        for num_l in range(num_local_clusters):
            local_cluster_mask = np.array([num_l in lc for lc in local_clusters], dtype=bool)

            # Map members back to original rows through their tracked indices.
            # Matching rows by value (np.isin on the raw vectors) is element-wise
            # against the flattened cluster and can select rows outside it.
            for idx in gc_row_idx[local_cluster_mask]:
                total_local_clusters[idx] = np.append(total_local_clusters[idx],num_l + total_clusters)

        total_clusters += num_local_clusters

    return total_local_clusters


def get_cluster_embed_texts(df_textbook):
    """Embed the ``text_chunk`` column and attach embeddings and cluster labels.

    The frame is modified in place: an ``'embedding'`` column (one vector per
    row) and a ``'cluster'`` column (the per-row output of ``clustering`` called
    with ``dim=10`` and a probability threshold of ``0.1``) are added.

    Args:
        df_textbook: DataFrame with a ``'text_chunk'`` column.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    chunk_texts = df_textbook['text_chunk'].tolist()
    text_embed = get_embeddings(chunk_texts)

    # Cluster first, then write both columns.
    cluster_labels = clustering(text_embed, 10, 0.1)

    df_textbook['embedding'] = list(text_embed)
    df_textbook['cluster'] = cluster_labels
    return df_textbook














## Summarization function

In [15]:
# handle api request
import time
def throttle_request(rate_limit, period=60):
    """Sleep long enough to stay within ``rate_limit`` requests per ``period`` seconds.

    Each call blocks for ``period / rate_limit`` seconds, so calling it once
    before every request spaces the requests evenly.

    Args:
        rate_limit: Maximum number of requests allowed per ``period``.
        period: Length of the rate-limit window in seconds.

    Raises:
        ValueError: If ``rate_limit`` is not positive.
    """
    if rate_limit <= 0:
        raise ValueError("rate_limit must be a positive number of requests per period")
    time.sleep(period / rate_limit)


def summarization(df_textbooks,depth,summarize_llm):
    """Cluster the texts in ``df_textbooks`` and summarise each cluster with an LLM.

    Steps:
        1. ``get_cluster_embed_texts`` adds embeddings and cluster labels.
        2. The frame is expanded to one row per (text, cluster) pair, because a
           text can belong to several clusters.
        3. For every unique cluster the member texts are joined and sent
           through a prompt | LLM | string-parser chain, throttled and retried.

    Args:
        df_textbooks: DataFrame with ``'text_chunk'`` and ``'metadata'`` columns.
        depth: Depth value recorded in the ``'depth'`` column of the summaries.
        summarize_llm: LLM runnable placed in the middle of the chain.

    Returns:
        Tuple ``(df_cluster_text_metadata, df_textbook_summary)``: the clustered
        input frame, and a frame with one summary per cluster (columns
        ``text_chunk``, ``depth``, ``cluster``, ``metadata``).
    """
    print("---Start-Clustering---")
    df_cluster_text_metadata = get_cluster_embed_texts(df_textbooks)
    print("---End-Clustering---")

    # One record per (text, cluster) pair.
    pair_records = [
        {"text_chunk": chunk_text, "embedding": chunk_embedding, "cluster": cluster_id, "metadata": chunk_metadata}
        for chunk_text, chunk_embedding, cluster_ids, chunk_metadata in zip(
            df_cluster_text_metadata['text_chunk'],
            df_cluster_text_metadata["embedding"],
            df_cluster_text_metadata['cluster'],
            df_cluster_text_metadata["metadata"],
        )
        for cluster_id in cluster_ids
    ]
    new_cluster_df = pd.DataFrame(pair_records)

    # Cluster ids in order of first appearance.
    all_unique_clusters = new_cluster_df["cluster"].unique()
    print("-----Unique clusters-----: ",len(all_unique_clusters))

    # Summarization prompt and chain.
    template= """Summarize the following text from robotics textbooks, focusing on the main points, key details, and important terminology:
                {text}
                SUMMARY:
                """
    output_parser = StrOutputParser()
    prompt_temp = ChatPromptTemplate.from_template(template)
    summarization_chain = prompt_temp | summarize_llm | output_parser

    # Retry with exponential back-off (at most 6 attempts) to ride out API rate limits.
    @retry(wait=wait_exponential(min=1,max=60),stop=stop_after_attempt(6))
    def invoke_summarization_chain(summ_chain,text):
        return summ_chain.invoke({"text": text})

    print("---Start-Summarization---")
    textbook_summaries = []
    agg_metadata = []
    for cluster_id in tqdm(all_unique_clusters,desc="Processing cluster texts"):
        members = new_cluster_df.loc[new_cluster_df["cluster"] == cluster_id]

        # The separator keeps the boundaries between chunks visible to the LLM.
        joined_text = "--- --- \n --- --- ".join(members['text_chunk'].to_list())

        # Keep the metadata of every member alongside the cluster summary.
        agg_metadata.append(members['metadata'].to_list())

        # Space the requests out before calling the API.
        throttle_request(rate_limit=500, period=60)

        textbook_summaries.append(invoke_summarization_chain(summarization_chain, joined_text))

    df_textbook_summary = pd.DataFrame({
        "text_chunk": textbook_summaries,
        "depth": [depth] * len(textbook_summaries),
        "cluster": list(all_unique_clusters),
        "metadata": agg_metadata,
    })
    print("---End-Summarization---")
    return df_cluster_text_metadata, df_textbook_summary































## Recursive function for building the hierarchical tree

In [16]:
def embed_cluster_summarize_recursion(df_textbook,summarize_llm,depth,num_recursion_level):
    """Recursively embed, cluster and summarise texts to build the tree levels.

    At each depth ``summarization`` is run on the current frame. While
    ``depth`` is below ``num_recursion_level`` and more than one unique cluster
    was produced, the summaries become the input of the next depth.

    Args:
        df_textbook: DataFrame of texts for the current depth.
        summarize_llm: LLM forwarded to ``summarization``.
        depth: Current depth (used as the key of the result dict).
        num_recursion_level: Maximum depth to recurse to.

    Returns:
        Dict mapping each processed depth to the tuple
        ``(clustered frame, summaries frame)`` produced at that depth.
    """
    df_cluster_metadata, df_textbook_summaries = summarization(df_textbook, depth, summarize_llm)

    # Results of this depth; deeper levels are merged in below.
    rec_depth_results = {depth: (df_cluster_metadata, df_textbook_summaries)}

    num_uniq_clusters = df_textbook_summaries["cluster"].nunique()

    # Stop at the depth limit or once everything collapsed into one cluster.
    if depth >= num_recursion_level or num_uniq_clusters <= 1:
        print("----------End-recursion----------")
        return rec_depth_results

    print("----------Start-recursion----------")
    # The summaries of this depth are the input texts of the next one.
    deeper_results = embed_cluster_summarize_recursion(
        df_textbook_summaries, summarize_llm, depth + 1, num_recursion_level
    )
    rec_depth_results.update(deeper_results)
    print("----------End-recursion----------")
    return rec_depth_results




In [17]:
# Read the OpenAI key from the Kaggle secret store rather than hard-coding it.
# NOTE(review): `kaggle_secrets` is presumably only available inside a Kaggle
# kernel, so this cell will likely fail in other environments — confirm.
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
openai_key = user_secrets.get_secret("OPEN_AI_KEY")




# Define the LLM used for summarization, then build the tree.
sum_llm_model = "gpt-4o-mini"

# temperature=0 is passed to the chat model for the summaries.
sum_llm = ChatOpenAI(temperature=0,openai_api_key = openai_key,model_name=sum_llm_model)




# Run the embed -> cluster -> summarise recursion from depth 1, up to 5 levels.
# The result maps each depth to (clustered frame, summaries frame).
rec_results = embed_cluster_summarize_recursion(df_textbook=df_textbook_data,summarize_llm=sum_llm,depth=1,num_recursion_level=5)

---Start-Clustering---


Batches:   0%|          | 0/141 [00:00<?, ?it/s]

Processing global clusters:   0%|          | 0/48 [00:00<?, ?it/s]

---End-Clustering---
-----Unique clusters-----:  484
---Start-Summarization---


Processing cluster texts:   0%|          | 0/484 [00:00<?, ?it/s]

---End-Summarization---
----------Start-recursion----------
---Start-Clustering---


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Processing global clusters:   0%|          | 0/11 [00:00<?, ?it/s]

---End-Clustering---
-----Unique clusters-----:  90
---Start-Summarization---


Processing cluster texts:   0%|          | 0/90 [00:00<?, ?it/s]

---End-Summarization---
----------Start-recursion----------
---Start-Clustering---


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Processing global clusters:   0%|          | 0/12 [00:00<?, ?it/s]

---End-Clustering---
-----Unique clusters-----:  12
---Start-Summarization---


Processing cluster texts:   0%|          | 0/12 [00:00<?, ?it/s]

---End-Summarization---
----------Start-recursion----------
---Start-Clustering---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing global clusters:   0%|          | 0/3 [00:00<?, ?it/s]

---End-Clustering---
-----Unique clusters-----:  3
---Start-Summarization---


Processing cluster texts:   0%|          | 0/3 [00:00<?, ?it/s]

---End-Summarization---
----------Start-recursion----------
---Start-Clustering---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

---End-Clustering---
-----Unique clusters-----:  1
---Start-Summarization---


Processing cluster texts:   0%|          | 0/1 [00:00<?, ?it/s]

---End-Summarization---
----------End-recursion----------
----------End-recursion----------
----------End-recursion----------
----------End-recursion----------
----------End-recursion----------


In [9]:
import pickle
def save_chunk_to_pickle(results,file_path):
    """Serialise ``results`` with pickle and write it to ``file_path``.

    Args:
        results: Any picklable object.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='wb') as pickle_file:
        pickle.dump(results, pickle_file)

In [19]:
# Persist the recursion results so the expensive summarization run need not be repeated.
# NOTE(review): the target is a Kaggle working-directory path; adjust when running elsewhere.
save_chunk_to_pickle(rec_results,"/kaggle/working/rec_results.pkl")

In [10]:
# Reload previously saved recursion results from a local pickle file.
# NOTE(review): absolute, machine-specific path, and a different file name from the
# one written by the save cell ("rec_results_full.pkl" vs "rec_results.pkl") — confirm
# this is the intended artefact.
rec_results = load_chunks_to_pickle("/Users/haridevaraj/Documents/Projects/steps_ai/RAPTOR_indexing/rec_results_full.pkl")

In [11]:
rec_results[1][1]["text_chunk"]

0      **Summary of "Introduction to Autonomous Mobil...
1      The text provides a comprehensive overview of ...
2      The text appears to be a complex excerpt from ...
3      The text provides an overview of significant c...
4      The text is a collection of acknowledgments an...
                             ...                        
479    The text discusses key concepts in reinforceme...
480    The text discusses the differences between uns...
481    **Summary of Learning by Imitation/From Demons...
482    The text discusses maze learning in robotics, ...
483    The text discusses various learning approaches...
Name: text_chunk, Length: 484, dtype: object

In [170]:
rec_results[5][1]

Unnamed: 0,text_chunk,depth,cluster,metadata
0,The text provides a comprehensive overview of ...,5,0,[{'book_name': 'Introduction to Autonomous Mob...


## Collapsed Tree retrieval

In [132]:

def flatten_metadata(metadata):
    """Flatten arbitrarily nested metadata lists into a single flat list.

    Args:
        metadata: A metadata item, a list of items, or nested lists of items.

    Returns:
        * ``[metadata]`` when ``metadata`` is not a list.
        * ``metadata`` itself (same object) when it is a list containing only
          dicts, including the empty list.
        * Otherwise a new list in which every nested list has been expanded
          recursively and non-list items are kept in order.
    """
    if not isinstance(metadata, list):
        return [metadata]

    if all(isinstance(entry, dict) for entry in metadata):
        return metadata

    flattened = []
    for entry in metadata:
        # Nested lists are expanded recursively; anything else is kept as is.
        flattened += flatten_metadata(entry) if isinstance(entry, list) else [entry]
    return flattened

In [133]:
# Flatten the aggregated metadata of every summary, depth by depth, so each
# summary row carries a flat list instead of nested lists.
# Note: this overwrites the "metadata" column of the summaries frames in place.
for depth in sorted(rec_results.keys()):
    rec_results[depth][1]["metadata"] = rec_results[depth][1]['metadata'].apply(flatten_metadata)


In [173]:
from langchain_milvus.vectorstores import Milvus


# Collapsed tree: start from the original leaf chunks and their metadata ...
all_textbook_text = df_textbook_data['text_chunk'].tolist()
all_textbook_metadata = df_textbook_data["metadata"].tolist()
#all_embeddings = rec_results[1][0]["embedding"].tolist()

# ... then append the summaries of every depth, so leaves and all summary
# levels end up in one flat pair of parallel lists (text, metadata).
for depth in sorted(rec_results.keys()):
    # extract the textbook summaries from the current depth
    text_summaries = rec_results[depth][1]["text_chunk"].tolist()
    text_summaries_metadata = rec_results[depth][1]["metadata"].tolist()
    #embeddings_summary = rec_results[depth][1]["embedding"].tolist()
    
    all_textbook_text.extend(text_summaries)
    all_textbook_metadata.extend(text_summaries_metadata)
    #all_embeddings.extend(embeddings_summary)
    
    
    




#store the metadata in the metadata field for Milvus, maybe as a df_metadata


In [175]:
# Check that the text and metadata lists have the same length (they are zipped together later).
print(len(all_textbook_metadata))
print(len(all_textbook_text))

5087
5087


## MILVUS LITE - add the data to the database

In [176]:
# Create Document objects with texts and their corresponding metadata
from langchain.docstore.document import Document


# Wrap every text with its metadata in a Document. The metadata is stored under
# a single "metadata" key whether it is one dict (leaf chunk) or a list of
# dicts (summary); anything else is reported and skipped.
documents = []

for text, metadata in zip(all_textbook_text,all_textbook_metadata):
    if not (
        isinstance(metadata, dict)
        or (isinstance(metadata, list) and all(isinstance(item, dict) for item in metadata))
    ):
        print(f"Unexpected metadata format : {metadata}")
        continue
    documents.append(Document(page_content=text, metadata={"metadata": metadata}))
          
                



In [177]:
len(documents)

5087

In [189]:
# Connect to Milvus Lite using a local database file.
# NOTE(review): `client` is not referenced again in the visible cells; the vector
# store below is given "./milvus_robotics_qa.db" through its own connection_args.
from pymilvus import MilvusClient
client = MilvusClient("milvus_robotics_qa.db")

In [191]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings

def get_embeddings(textbook_chunk,embed_model=embedding_model):
    """Encode text with an embedding model and return the result as an array.

    This cell redefines the helper of the same name declared earlier in the
    notebook.

    Args:
        textbook_chunk: Text (or list of texts) handed to ``embed_model.encode``.
        embed_model: Object exposing ``encode``. Defaults to the module-level
            ``embedding_model``.

    Returns:
        ``np.ndarray`` built from the encoder output.
    """
    # Honour the `embed_model` argument instead of always reaching for the
    # module-level model, so callers can actually swap the encoder.
    text_embed = embed_model.encode(textbook_chunk)
    return np.array(text_embed)


# LangChain embedding wrapper around the same SBERT checkpoint used for clustering;
# this is the object handed to the vector store below.
embd_model = HuggingFaceEmbeddings(model_name='multi-qa-MiniLM-L6-cos-v1')
# Embedding dimension read from the wrapped sentence-transformers client.
# NOTE(review): EMBD_DIM is not used in the visible cells.
EMBD_DIM = embd_model.dict()['client'].get_sentence_embedding_dimension()

# NOTE(review): MILVUS_HOST / MILVUS_PORT are not referenced in the visible cells
# (the store connects through a local file URI). "local_host" also looks like a
# typo for "localhost" — confirm before using these values.
MILVUS_HOST = "local_host"
MILVUS_PORT = "19530"




#embedding_model = HuggingFaceEmbeddings('multi-qa-MiniLM-L6-cos-v1')
#embedding_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')


# Embed all documents and load them into the "robotics_textbooks" collection of
# the local Milvus Lite file, using a COSINE metric index.
# NOTE(review): drop_old=True presumably replaces any existing collection with
# that name, so re-running this cell rebuilds the store — confirm.
vectorstore = Milvus.from_documents(
                         documents=documents, embedding=embd_model,
                         connection_args={"uri":"./milvus_robotics_qa.db"},
                        collection_name="robotics_textbooks",
                        drop_old =True, consistency_level="Eventually",
                        index_params={"metric_type":"COSINE"})
                     



In [196]:
# Smoke test of the vector store: run a similarity search and show the text of the top hit.
query = "What is reinforcement learning?"
docs_ans = vectorstore.similarity_search(query)
docs_ans[0].page_content

'Reinforcement learning involves trying different things and seeing what happens; if goodthings happen, we tend to do the behavior again, and if bad things happen, we tend to avoid it. This basic process turns out to be a remarkably versatile tool for learn- ing. It allows robots to learn what to do and not to do in various situations. Consider a typical reactive controller that tells the robot how to react underdifferent sensory inputs.'

In [197]:
# Second smoke test of the vector store with a different robotics question; shows the top hit.
query = "What is inverse kinematics?"
docs_ans = vectorstore.similarity_search(query)
docs_ans[0].page_content

'This con-version from a Cartesian (x,y ,z) position of the endpoint (e.g., a ﬁngertip) andthe angles of the whole manipulator (e.g., an arm) is called inverse kinematics . INVERSE KINEMATICS The name refers to the fact that this is the opposite of the simpler process of ﬁguring out where the endpoint of the manipulator is given the joint angles for all of the joints. That was kinematics , presented earlier in this chapter. The'