## Load the textbook embeddings and instantiate the model

In [1]:
import pickle

In [2]:
def load_embeddings_to_pickle(file_path):
    with open(file_path,'rb') as f:
        embedding_with_metadata = pickle.load(f)
    return embedding_with_metadata


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')



#helper function to embed text chunks with metadata
def embed_text_chunks(textbook_chunk_metadata,model):
    for chunk in textbook_chunk_metadata:
        text_chunk = chunk['text_chunk']
        embedding = model.encode(text_chunk)
        chunk['embedding'] = embedding
    return textbook_chunk_metadata

In [13]:
combined_textbook_embed = load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/combined_textbook_embedding_metadata.pkl')

In [8]:
# textbook1_embed = load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook1_embedding_metadata.pkl')
# textbook2_embed =load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook2_embedding_metadata.pkl')
# textbook3_embed =load_embeddings_to_pickle('/Users/haridevaraj/Documents/Projects/steps_ai/Content_extraction_and_chunking_embed/textbook3_embedding_metadata.pkl')

## RAPTOR indexing

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from typing import Optional
import umap




In [None]:
def gmm_cluster_embeddings_prob_threshold(textbook_embeddings,random_state=42):
    num_clusters = get_bayesian_optimal_num_clusters(textbook_embeddings)
    
    gmm = GaussianMixture(n_components=num_clusters,covariance_type='full',random_state=random_state)
    gmm.fit(textbook_embeddings)
    clusters = gmm.predict_proba(textbook_embeddings)
    return clusters, gmm

#get the optimal number using the Bayesian Information criteria with a GMM
def get_bayesian_optimal_num_clusters(textbook_embeddings,max_clusters=30,random_state=42):
    max_clusters = min(max_clusters,len(textbook_embeddings))
    n_clusters = np.arange(1,max_clusters)
    
    bayesian_info_criteria=[]
    
    for num in n_clusters:
        gmm = GaussianMixture(n_components=num,random_state=random_state)
        gmm.fit(textbook_embeddings)
        bayesian_info_criteria.append(gmm.bic())
    optim_clusters = n_clusters[np.argmin(bayesian_info_criteria)]
    
    return optim_clusters
    
    
    
    
    


In [30]:
embeddings = np.array([chunk['embedding']]for chunk in combined_textbook_embed)

In [32]:
embeddings

array(<generator object <genexpr> at 0x12a0c5b10>, dtype=object)

In [None]:
#reducing the cluster using UMAP dimensionality reduction in global and local dimensionality
def umap_red_global_cluster_embed(textbook_embeddings,dim,n_neighbours=None,metric='cosine'):
    if n_neighbhours is None:
        n_neighbhours = int((len(textbook_embeddings)-1)**0.5)
    global_embed= umap.UMAP(n_neighbours=n_neighbhours,n_components=dim,metric=metric).fit_transform(texbook_embeddings)
    return global_embed


def umap_local_cluster_embed(textbook_embeddings,dim,num_neighbhours=10,metric='cosine'):
    local_embed = umap.UMAP(n_neighbours=num_neighbhours,n_components=dim,metric=metric).fit_transform(textbook_embeddings)
    return local_embed






