In [23]:
import os
import re
import json
import loguru
import openai
import pickle
import numpy as np

from llama_index.readers.file import PyMuPDFReader 
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
from tqdm.auto import tqdm

In [2]:
DOC_DIR = '../docs'

loader = PyMuPDFReader()
# os.listdir returns entries in arbitrary order; sort the file names so the order of `documents`
# (and therefore any positional indexing or slicing applied to it) is the same on every run.
pdf_names = sorted(f for f in os.listdir(DOC_DIR) if f.endswith('.pdf'))
# Load every PDF and flatten the per-file lists returned by the reader into one list of Documents.
documents = [
    page
    for pdf_name in pdf_names
    for page in loader.load_data(os.path.join(DOC_DIR, pdf_name))
]

In [3]:
# Drop the last loaded Document.
# NOTE(review): this rebinds `documents` from itself, so every re-run of this cell removes one more
# entry; re-run the loading cell first if this cell has to be executed again.
documents = documents[:-1]

In [4]:
# Display the loaded Document objects (text plus metadata such as file_path) as a sanity check.
documents

[Document(id_='13603cc2-3d4a-444d-a575-165eeddfc01c', embedding=None, metadata={'total_pages': 6, 'file_path': '../docs\\data_description.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. \nNut position and motor current measurements were acquired during the tests. \n \nInitially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were 

In [5]:
def chunk_text(text, breakpoint_percentile_threshold=80):
    """Split `text` into chunks at the points where consecutive sentences differ the most.

    The text is split into sentences, each sentence is widened with its neighbours, the widened
    sentences are embedded, and the cosine distance between consecutive embeddings is computed.
    A chunk boundary is placed after every sentence whose distance to the next one is strictly
    above the given percentile of all distances.

    Args:
        text: The text to be chunked.
        breakpoint_percentile_threshold: Percentile (0-100) of the distances used as the
            boundary cut-off. Defaults to 80.

    Returns:
        list[str]: The chunks, each one being its sentences joined with a single space.
        If there is nothing to compare (fewer than two sentences, or no embeddings came back),
        the whole text is returned as a single chunk.
    """
    # Split the input text into individual sentences.
    single_sentences_list = split_sentences(text)

    # With fewer than two sentences there is no pair to compare, so there is nothing to embed.
    if len(single_sentences_list) < 2:
        return list(single_sentences_list)

    # Combine adjacent sentences to form a context window around each sentence.
    combined_sentences = combine_sentences(single_sentences_list)

    # Convert the combined sentences into vector representations.
    embeddings = convert_to_vector(combined_sentences)

    # Cosine distances between consecutive windowed-sentence embeddings.
    distances = calculate_cosine_distances(embeddings)

    # convert_to_vector returns an empty array on failure, which yields no distances.
    # A percentile of an empty sequence is undefined, so fall back to one chunk instead.
    if len(distances) == 0:
        return [' '.join(single_sentences_list)]

    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)

    # Index i marks a boundary between sentence i and sentence i + 1.
    indices_above_thresh = [
        i for i, distance in enumerate(distances) if distance > breakpoint_distance_threshold
    ]

    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        # The breakpoint sentence itself closes the current chunk (hence index + 1).
        chunks.append(' '.join(single_sentences_list[start_index:index + 1]))
        start_index = index + 1

    # Any sentences left after the last breakpoint form the final chunk.
    if start_index < len(single_sentences_list):
        chunks.append(' '.join(single_sentences_list[start_index:]))

    return chunks

def split_sentences(text):
    """Break `text` into sentences.

    A boundary is any run of whitespace that directly follows '.', '?' or '!'. The look-behind
    keeps the punctuation attached to the sentence it terminates; the whitespace is discarded.

    Args:
        text: The text to split.

    Returns:
        list[str]: The sentences in their original order.
    """
    sentence_boundary = re.compile(r'(?<=[.?!])\s+')
    return sentence_boundary.split(text)

def combine_sentences(sentences):
    """Widen every sentence with its immediate neighbours.

    Entry i of the result is sentence i-1, sentence i and sentence i+1 joined by single spaces;
    the first and last entries simply lack the missing neighbour.

    Args:
        sentences: The list of sentences to widen.

    Returns:
        list[str]: One windowed string per input sentence, in the same order.
    """
    return [
        ' '.join(sentences[max(position - 1, 0):position + 2])
        for position in range(len(sentences))
    ]

def convert_to_vector(texts, model="text-embedding-3-small"):
    """Embed `texts` with the OpenAI embeddings API.

    Args:
        texts: The text(s) to embed, passed through unchanged as the request `input`.
        model: Name of the embedding model to request. Defaults to "text-embedding-3-small".

    Returns:
        np.ndarray: One row per item in the API response. An empty array is returned when
        `texts` is empty or when the request fails; failures are printed rather than raised,
        so callers must check for an empty result.
    """
    try:
        # Nothing to embed: return the same empty result as the error path, without a request.
        if len(texts) == 0:
            return np.array([])

        response = openai.embeddings.create(
            input=texts,
            model=model
        )
        return np.array([item.embedding for item in response.data])
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with an empty array.
        print("An error occurred:", e)
        return np.array([])  # Return an empty array in case of an error

def calculate_cosine_distances(embeddings):
    """Cosine distance between every pair of consecutive embeddings.

    Distance is defined as 1 - cosine similarity. Entry i of the result compares embedding i
    with embedding i + 1, so the result has one element fewer than `embeddings` (and is empty
    when there are fewer than two embeddings).

    Args:
        embeddings: Sequence of embedding vectors.

    Returns:
        list: The consecutive cosine distances.
    """
    pair_count = len(embeddings) - 1
    return [
        1 - cosine_similarity([embeddings[k]], [embeddings[k + 1]])[0][0]
        for k in range(pair_count)
    ]

In [6]:
# Walkthrough, step 1: split the first loaded document's text into sentences.
split_sents = split_sentences(documents[0].text)
split_sents

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1.',
 'Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA).',
 'The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user.',
 'Nut position and motor current measurements were acquired during the tests.',
 'Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios.',
 'Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system.',
 'Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems.',
 'This document describes in detail

In [7]:
# Walkthrough, step 2: widen each sentence with its previous and next sentence.
combin_sents = combine_sentences(split_sents)
combin_sents

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA).',
 'Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user.',
 'Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defi

In [8]:
# Walkthrough, step 3: embed every windowed sentence (convert_to_vector calls the OpenAI API
# and returns an empty array if the request fails).
sents_embeds = convert_to_vector(combin_sents)

In [9]:
# Inspect the embedding array built from the API response (one row per response item).
sents_embeds

array([[-0.00309117,  0.03502347, -0.00536451, ...,  0.0165641 ,
        -0.00991769,  0.01144299],
       [-0.00811765,  0.04285707, -0.00185588, ...,  0.01412715,
        -0.00429985,  0.00906246],
       [ 0.0176975 ,  0.05201214, -0.00449511, ...,  0.00461408,
        -0.00711244,  0.01050789],
       ...,
       [ 0.00738776, -0.00441051, -0.01096438, ..., -0.01751174,
         0.00287953,  0.01560942],
       [ 0.00117725,  0.02584947, -0.02130744, ..., -0.04793172,
         0.0143341 ,  0.00660931],
       [-0.01273966,  0.01180369, -0.01088071, ..., -0.03689302,
         0.02549232, -0.00630808]])

In [10]:
# Walkthrough, step 4: cosine distance (1 - similarity) between each pair of consecutive
# embeddings; the list has one element fewer than there are embeddings.
sents_dists = calculate_cosine_distances(sents_embeds)
sents_dists

[0.026716864892927905,
 0.14821629423397853,
 0.18089307165578417,
 0.19051266927449184,
 0.1253529485052447,
 0.15652605524224739,
 0.20573903629722923,
 0.471104384471112,
 0.20877006405146425,
 0.3071947479063193,
 0.21521621789448964,
 0.0708581480183541,
 0.20921441055286127,
 0.23790664996055266,
 0.1519230554874753,
 0.08784977234786084,
 0.18593279831802711,
 0.26912269041108416,
 0.13437874095530322,
 0.23500181042796664,
 0.1346148752007006,
 0.13670941870560327,
 0.19755957825105508,
 0.16825601062916873,
 0.36442172128142036]

In [11]:
# Walkthrough, step 5: the 80th percentile of the distances is the cut-off above which a
# position counts as a chunk boundary.
breakpoint_percentile_threshold = 80
breakpoint_distance_threshold = np.percentile(sents_dists, breakpoint_percentile_threshold)
breakpoint_distance_threshold

0.23558277833448385

In [12]:
# Walkthrough, step 6: indices whose distance is strictly greater than the cut-off.
# Index i refers to the gap between sentence i and sentence i + 1.
indices_above_thresh = [i for i, distance in enumerate(sents_dists) if distance > breakpoint_distance_threshold]
indices_above_thresh

[7, 9, 13, 17, 24]

In [13]:
# Walkthrough, step 7: walk the breakpoints in order, closing a chunk after each breakpoint sentence.
chunks, start_index = [], 0
for index in indices_above_thresh:
    chunk = ' '.join(split_sents[start_index:index + 1])
    chunks += [chunk]
    start_index = index + 1

# Whatever follows the final breakpoint (if anything) becomes the trailing chunk.
if len(split_sents) > start_index:
    chunk = ' '.join(split_sents[start_index:])
    chunks += [chunk]

chunks

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. Nut position and motor current measurements were acquired during the tests. Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system. Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems. This document describes in detail the data files available in

In [14]:
CONFIG_DIR = '../config'
# Read the notebook configuration. The context manager closes the file handle deterministically
# instead of leaving it to the garbage collector.
with open(os.path.join(CONFIG_DIR, "configs.json")) as config_file:
    cfg = json.load(config_file)

In [15]:
def get_embedding(texts, model=None):
    """Embed `texts` with the OpenAI embeddings API using the configured model.

    Args:
        texts: The text(s) to embed, passed through unchanged as the request `input`.
        model: Optional model name. When None (the default), cfg['embedding_model'] is used.

    Returns:
        np.ndarray: One row per item in the API response. An empty array is returned when
        `texts` is empty or when the request fails; failures are printed rather than raised,
        so callers must check for an empty result.
    """
    try:
        # Nothing to embed: return the same empty result as the error path, without a request.
        if len(texts) == 0:
            return np.array([])

        response = openai.embeddings.create(
            input=texts,
            model=cfg['embedding_model'] if model is None else model
        )
        return np.array([item.embedding for item in response.data])
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with an empty array.
        print("An error occurred:", e)
        return np.array([])  # Return an empty array in case of an error

In [16]:
class SemanticChunker:
    """Split text into chunks at the points where consecutive sentences differ the most.

    Sentences are widened with their neighbours, embedded, and compared pairwise by cosine
    distance; a chunk boundary is placed wherever the distance exceeds a percentile cut-off.
    """

    def __init__(self, breakpoint_percentile_threshold: float = 80, embed_fn=None):
        """Configure the chunker.

        Args:
            breakpoint_percentile_threshold (float): Percentile (0-100) of the consecutive
                distances used as the boundary cut-off. Defaults to 80.
            embed_fn: Optional callable taking a list of strings and returning their embeddings.
                When None, the module-level `get_embedding` is used.
        """
        self.breakpoint_percentile_threshold = breakpoint_percentile_threshold
        self._embed_fn = embed_fn

    def split_text(self, text: str) -> List[str]:
        """Split the given text based on the semantic structure of the text

        Args:
            text (str): The text to be split

        Returns:
            list[str]: The list of chunks. When there is nothing to compare (fewer than two
            sentences, or the embedding step returned nothing), the text comes back as a
            single chunk.
        """

        split_sents = self._split_sentences(text)

        # With fewer than two sentences there is no pair to compare, so skip the embedding call.
        if len(split_sents) < 2:
            return split_sents

        combin_sents = self._combine_sentences(split_sents)

        # Look up the default lazily so the class can be defined before `get_embedding` exists.
        embed = get_embedding if self._embed_fn is None else self._embed_fn
        sents_embeds = embed(combin_sents)
        sents_dists = self._calculate_cosine_similarity(sents_embeds)

        # An empty embedding result (e.g. a failed request) yields no distances, and a
        # percentile of an empty sequence is undefined: fall back to a single chunk.
        if len(sents_dists) == 0:
            return [' '.join(split_sents)]

        breakpoint_distance_threshold = np.percentile(sents_dists, self.breakpoint_percentile_threshold)

        # Index i marks a boundary between sentence i and sentence i + 1.
        indices_above_thresh = [i for i, distance in enumerate(sents_dists) if distance > breakpoint_distance_threshold]

        chunks = []
        start_index = 0
        for index in indices_above_thresh:
            # The breakpoint sentence itself closes the current chunk (hence index + 1).
            chunks.append(' '.join(split_sents[start_index:index + 1]))
            start_index = index + 1

        # Sentences after the last breakpoint form the final chunk.
        if start_index < len(split_sents):
            chunks.append(' '.join(split_sents[start_index:]))

        return chunks

    def _split_sentences(self, text: str) -> List[str]:
        """Split the given text into sentences based on punctuation followed by whitespace.

        Args:
            text (str): The text to be split into sentences

        Returns:
            list[str]: The list of sentences
        """

        # The look-behind keeps the terminating '.', '?' or '!' attached to its sentence.
        return re.split(r'(?<=[.?!])\s+', text)

    def _combine_sentences(self, sentences: List[str]) -> List[str]:
        """Create a buffer by combining each sentence with its previous and next sentence to provide a wider context.
        For example, combine sentences 1,2,3 and 2,3,4 before computing the cosine similarity.

        Args:
            sentences (list[str]): The list of sentences to be combined

        Returns:
            list[str]: The list of combined sentences
        """

        combined_sentences = []
        for i in range(len(sentences)):
            combined_sentence = sentences[i]
            if i > 0:
                combined_sentence = sentences[i-1] + ' ' + combined_sentence
            if i < len(sentences) - 1:
                combined_sentence += ' ' + sentences[i+1]
            combined_sentences.append(combined_sentence)
        return combined_sentences

    def _calculate_cosine_similarity(self, embeddings: np.ndarray) -> List[float]:
        """Calculate the cosine distance (1 - cosine similarity) between consecutive embeddings.

        Despite its name, this method returns distances, not similarities.

        Args:
            embeddings (np.ndarray): The embeddings to be compared

        Returns:
            list[float]: The list of cosine distances; one element fewer than `embeddings`,
            and empty when there are fewer than two embeddings.
        """

        distances = []
        for i in range(len(embeddings) - 1):
            similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
            distance = 1 - similarity
            distances.append(distance)
        return distances

In [17]:
# Single chunker instance reused for every document below.
semantic_chunker = SemanticChunker()

In [18]:
text_chunks = []
# doc_idxs[i] is the index in `documents` that text_chunks[i] was produced from.
doc_idxs = []
# Wrap the list itself rather than the enumerate iterator: enumerate has no length, so
# tqdm(enumerate(...)) cannot show a total and only renders a bare iteration counter.
for doc_idx, doc in enumerate(tqdm(documents)):
    cur_text_chunk = semantic_chunker.split_text(doc.text)
    text_chunks.extend(cur_text_chunk)
    doc_idxs.extend([doc_idx] * len(cur_text_chunk))

0it [00:00, ?it/s]

In [19]:
# Preview of the chunks with zero-length strings removed (an empty str is the only falsy str).
clean_text_chunks = [chunk for chunk in text_chunks if chunk]
clean_text_chunks

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. Nut position and motor current measurements were acquired during the tests. Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system. Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems. This document describes in detail the data files available in

In [20]:
import sys
sys.path.append('..')
from rocket_rag.node import TextNode

In [22]:
doc_nodes = []
for idx, text_chunk in enumerate(text_chunks):
    # Skip empty chunks: they carry no retrievable content and cannot be embedded meaningfully.
    # Iterating the unfiltered list keeps `idx` aligned with `doc_idxs`.
    if not text_chunk:
        continue
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    # Copy the metadata so each node owns its own dict; assigning it directly would make every
    # node from the same document (and the document itself) share one mutable object.
    node.metadata = dict(src_doc.metadata)
    doc_nodes.append(node)

doc_nodes[:5]

[TextNode(id_='68cd82d9-20a4-4adb-b81b-28500b75ffe0', embedding=None, metadata={'total_pages': 6, 'file_path': '../docs\\data_description.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], hash=<property object at 0x0000013C3E89B6F0>, text='Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. Nut position and motor current measurements were acquired during the tests. Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spallin

In [24]:
STORE_DIR = '../store/doc_indexing'
NODES_PATH = os.path.join(STORE_DIR, 'nodes.pkl')

if os.path.exists(NODES_PATH):
    # Reuse the cached nodes instead of paying for the embedding requests again.
    # NOTE: pickle.load can execute arbitrary code; only load a cache this notebook wrote itself.
    with open(NODES_PATH, 'rb') as f:
        doc_nodes = pickle.load(f)
    # Log the real file name (the cache is nodes.pkl inside the doc_indexing store).
    loguru.logger.info(f'Successfully loaded {NODES_PATH}!')
else:
    for node in tqdm(doc_nodes):
        # get_embedding receives a single string here and its result is stored as returned.
        # NOTE(review): that is presumably a 2-D array with one row, or an empty array if the
        # request failed - confirm downstream consumers expect this.
        node_embeds = get_embedding(node.text)
        node.embedding = node_embeds

  0%|          | 0/166 [00:00<?, ?it/s]

In [27]:
# Persist the embedded nodes once; an existing cache is left untouched.
if not os.path.exists(os.path.join(STORE_DIR, 'nodes.pkl')):
    # The store directory may not exist yet (e.g. a fresh checkout); without this,
    # open(..., 'wb') raises FileNotFoundError.
    os.makedirs(STORE_DIR, exist_ok=True)
    with open(os.path.join(STORE_DIR, 'nodes.pkl'), 'wb') as f:
        pickle.dump(doc_nodes, f)
    # Log the real file name (the cache is nodes.pkl inside the doc_indexing store).
    loguru.logger.info(f"Successfully saved {os.path.join(STORE_DIR, 'nodes.pkl')}!")