In [1]:
import os
import re
import json
import loguru
import openai
import pickle
import numpy as np
import sys
sys.path.append('..')

from rocket_rag.node import TextNode
from llama_index.readers.file import PyMuPDFReader 
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple
from tqdm.auto import tqdm

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [4]:
# Directory that holds the source PDF documents, relative to this notebook.
DOC_DIR = '../docs'

# PDF reader used to load the document below.
loader = PyMuPDFReader()
# Disabled alternative: load every PDF found in DOC_DIR and flatten the per-file results into one list.
# documents = [loader.load_data(os.path.join(DOC_DIR, f)) for f in os.listdir(DOC_DIR) if f.endswith('.pdf')]
# documents = [page for sub_doc in documents for page in sub_doc]
# Load only the data description PDF.
documents = loader.load_data(os.path.join(DOC_DIR, 'data_description.pdf'))
# documents = documents[:-1]

In [5]:
# Display the loaded documents.
documents

[Document(id_='559d4dd8-2cf0-41f6-869e-91a57641ab94', embedding=None, metadata={'total_pages': 6, 'file_path': '../docs\\data_description.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. \nNut position and motor current measurements were acquired during the tests. \n \nInitially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were 

In [6]:
def chunk_text(text, breakpoint_percentile_threshold=80):
    """Split ``text`` into chunks at the largest semantic jumps between sentences.

    The text is split into sentences, each sentence is combined with its
    neighbours, the combined windows are embedded, and the cosine distance
    between consecutive windows is computed. A chunk boundary is placed after
    every sentence whose distance to the next window is strictly above the
    requested percentile of all distances.

    Args:
        text: The text to split.
        breakpoint_percentile_threshold: Percentile (0-100) of the distance
            distribution above which a distance counts as a breakpoint.
            Defaults to 80.

    Returns:
        list[str]: The chunks, each a space-joined run of consecutive sentences.
        If the text has fewer than two sentences, or no distances could be
        computed (convert_to_vector returns an empty array on failure), the
        whole text is returned as a single chunk.
    """
    # Split the input text into individual sentences.
    single_sentences_list = split_sentences(text)

    # A single sentence has no neighbour to compare against, so there is nothing
    # to threshold; return it as-is instead of embedding it.
    if len(single_sentences_list) < 2:
        return [' '.join(single_sentences_list)]

    # Combine adjacent sentences to form a context window around each sentence.
    combined_sentences = combine_sentences(single_sentences_list)

    # Embed every window, then measure how far apart consecutive windows are.
    embeddings = convert_to_vector(combined_sentences)
    distances = calculate_cosine_distances(embeddings)

    # No distances means the embedding step produced nothing usable; a percentile
    # of an empty sequence is undefined, so fall back to one chunk.
    if len(distances) == 0:
        return [' '.join(single_sentences_list)]

    # Distances strictly above this value mark chunk breakpoints.
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
    indices_above_thresh = [
        i for i, distance in enumerate(distances) if distance > breakpoint_distance_threshold
    ]

    # Cut the sentence list after each breakpoint index.
    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        chunks.append(' '.join(single_sentences_list[start_index:index + 1]))
        start_index = index + 1

    # Whatever follows the last breakpoint forms the final chunk.
    if start_index < len(single_sentences_list):
        chunks.append(' '.join(single_sentences_list[start_index:]))

    return chunks

def split_sentences(text):
    """Break ``text`` into sentences.

    A boundary is any run of whitespace that directly follows '.', '?' or '!'.
    The punctuation stays attached to the preceding sentence and the
    whitespace run itself is discarded.
    """
    sentence_boundary = re.compile(r'(?<=[.?!])\s+')
    return sentence_boundary.split(text)

def combine_sentences(sentences):
    """Give every sentence a context window of its immediate neighbours.

    For each position the result holds the previous sentence (if any), the
    sentence itself and the next sentence (if any), joined by single spaces.
    The output has the same length as the input.
    """
    windows = []
    for position in range(len(sentences)):
        # Slicing clamps at the list edges, so the first and last windows
        # simply contain two sentences instead of three.
        neighbourhood = sentences[max(position - 1, 0):position + 2]
        windows.append(' '.join(neighbourhood))
    return windows

def convert_to_vector(texts, model="text-embedding-3-small"):
    """Embed one text or a list of texts with the OpenAI embeddings API.

    Args:
        texts: A string or a list of strings to embed.
        model: Name of the embedding model to request. Defaults to
            "text-embedding-3-small".

    Returns:
        np.ndarray: One row per embedding returned by the API. An empty array
        is returned when ``texts`` is empty/None or when the request fails
        (the error is printed, not raised), so callers must check for an
        empty result.
    """
    # Nothing to embed: skip the network round trip, which could only end in the
    # error branch below and return the same empty array.
    if texts is None or len(texts) == 0:
        return np.array([])

    try:
        response = openai.embeddings.create(
            input=texts,
            model=model
        )
        return np.array([item.embedding for item in response.data])
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with an empty array.
        print("An error occurred:", e)
        return np.array([])

def calculate_cosine_distances(embeddings):
    """Return the cosine distance between each embedding and the one after it.

    Distance is ``1 - cosine similarity``. The result has one entry fewer than
    ``embeddings`` and is empty when fewer than two embeddings are given.
    """
    return [
        1 - cosine_similarity([left], [right])[0][0]
        for left, right in zip(embeddings, embeddings[1:])
    ]

In [7]:
# Walkthrough step 1: split the text of the first loaded document into sentences and inspect them.
split_sents = split_sentences(documents[0].text)
split_sents

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1.',
 'Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA).',
 'The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user.',
 'Nut position and motor current measurements were acquired during the tests.',
 'Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios.',
 'Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system.',
 'Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems.',
 'This document describes in detail

In [8]:
# Walkthrough step 2: join each sentence with its neighbours and inspect the combined windows.
combin_sents = combine_sentences(split_sents)
combin_sents

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA).',
 'Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user.',
 'Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defi

In [9]:
# Walkthrough step 3: embed the combined windows (convert_to_vector calls the OpenAI embeddings API).
sents_embeds = convert_to_vector(combin_sents)

In [10]:
# Inspect the resulting embeddings.
sents_embeds

array([[-0.00313756,  0.03519   , -0.00523305, ...,  0.01668603,
        -0.00978438,  0.01144   ],
       [-0.00813694,  0.04304537, -0.00172709, ...,  0.01420266,
        -0.00410063,  0.00905677],
       [ 0.01772402,  0.05201448, -0.00448245, ...,  0.00462715,
        -0.00709347,  0.01050194],
       ...,
       [ 0.00690299, -0.0045716 , -0.01149412, ..., -0.01682766,
         0.00249582,  0.01635878],
       [ 0.00117168,  0.02586879, -0.02128563, ..., -0.04799623,
         0.01429732,  0.00662754],
       [-0.01276827,  0.0117871 , -0.01090339, ..., -0.03690778,
         0.02551056, -0.00633215]])

In [11]:
# Walkthrough step 4: cosine distance between each pair of consecutive window embeddings.
sents_dists = calculate_cosine_distances(sents_embeds)
sents_dists

[0.027047005926273515,
 0.14747357939316053,
 0.18092324086678024,
 0.1905382109496141,
 0.12514677466266222,
 0.15629309687730908,
 0.20554314449771738,
 0.471104384471112,
 0.20875743998446206,
 0.30714935082095707,
 0.21520814882672812,
 0.070842776173353,
 0.20924889678457315,
 0.23785564148797533,
 0.15195594070542673,
 0.08784912396977373,
 0.18626953839898275,
 0.2692460310680381,
 0.13433083871119522,
 0.2350278209239669,
 0.1346422926347297,
 0.13666671768562366,
 0.19798996122634704,
 0.16818506415937218,
 0.3644681081722627]

In [12]:
# Walkthrough step 5: use the 80th percentile of the distances as the breakpoint threshold.
breakpoint_percentile_threshold = 80
breakpoint_distance_threshold = np.percentile(sents_dists, breakpoint_percentile_threshold)
breakpoint_distance_threshold

0.2355933850367686

In [13]:
# Walkthrough step 6: positions whose distance is strictly above the threshold become breakpoints.
indices_above_thresh = [i for i, distance in enumerate(sents_dists) if distance > breakpoint_distance_threshold]
indices_above_thresh

[7, 9, 13, 17, 24]

In [14]:
# Walkthrough step 7: cut the sentence list after every breakpoint index to form the chunks.
chunks = []
start_index = 0
# Loop through the identified breakpoints and create chunks accordingly.
for index in indices_above_thresh:
    # The breakpoint sentence itself closes the current chunk (hence index+1).
    chunk = ' '.join(split_sents[start_index:index+1])
    chunks.append(chunk)
    start_index = index + 1

# If there are any sentences left after the last breakpoint, add them as the final chunk.
if start_index < len(split_sents):
    chunk = ' '.join(split_sents[start_index:])
    chunks.append(chunk)

chunks

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. Nut position and motor current measurements were acquired during the tests. Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system. Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems. This document describes in detail the data files available in

In [15]:
# Load the notebook configuration (e.g. the embedding model name) from configs.json.
CONFIG_DIR = '../config'
# Use a context manager so the file handle is closed deterministically, and read the
# JSON as UTF-8 rather than whatever the platform's default encoding happens to be.
with open(os.path.join(CONFIG_DIR, "configs.json"), encoding="utf-8") as config_file:
    cfg = json.load(config_file)

In [16]:
def get_embedding(texts):
    """Embed one text or a list of texts with the model named in ``cfg['embedding_model']``.

    Args:
        texts: A string or a list of strings to embed.

    Returns:
        np.ndarray: One row per embedding returned by the API. An empty array
        is returned when ``texts`` is empty/None or when the request fails
        (the error is printed, not raised), so callers must check for an
        empty result.
    """
    # Nothing to embed: skip the network round trip, which could only end in the
    # error branch below and return the same empty array.
    if texts is None or len(texts) == 0:
        return np.array([])

    try:
        response = openai.embeddings.create(
            input=texts,
            model=cfg['embedding_model']
        )
        return np.array([item.embedding for item in response.data])
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with an empty array.
        print("An error occurred:", e)
        return np.array([])

In [17]:
class SemanticChunker:
    """Split text into chunks at the largest semantic jumps between neighbouring sentences.

    Args:
        breakpoint_percentile_threshold (float): Percentile (0-100) of the
            consecutive-window distance distribution above which a distance is
            treated as a chunk boundary. Defaults to 80.
    """

    def __init__(self, breakpoint_percentile_threshold: float = 80):
        self.breakpoint_percentile_threshold = breakpoint_percentile_threshold

    def split_text(self, text: str) -> List[str]:
        """Split the given text based on the semantic structure of the text

        Args:
            text (str): The text to be split

        Returns:
            list[str]: The list of chunks. If the text has fewer than two
            sentences, or no distances could be computed (get_embedding returns
            an empty array on failure), the whole text is returned as one chunk.
        """

        split_sents = self._split_sentences(text)

        # A single sentence has no neighbour to compare against, so there is
        # nothing to threshold; return it without calling the embedding API.
        if len(split_sents) < 2:
            return [' '.join(split_sents)]

        combin_sents = self._combine_sentences(split_sents)
        sents_embeds = get_embedding(combin_sents)
        sents_dists = self._calculate_cosine_similarity(sents_embeds)

        # A percentile of an empty sequence is undefined; fall back to one chunk.
        if len(sents_dists) == 0:
            return [' '.join(split_sents)]

        breakpoint_distance_threshold = np.percentile(
            sents_dists, self.breakpoint_percentile_threshold
        )

        # Distances strictly above the threshold mark the end of a chunk.
        indices_above_thresh = [
            i for i, distance in enumerate(sents_dists) if distance > breakpoint_distance_threshold
        ]

        chunks = []
        start_index = 0
        for index in indices_above_thresh:
            chunks.append(' '.join(split_sents[start_index:index + 1]))
            start_index = index + 1

        # Whatever follows the last breakpoint forms the final chunk.
        if start_index < len(split_sents):
            chunks.append(' '.join(split_sents[start_index:]))

        return chunks

    def _split_sentences(self, text: str) -> List[str]:
        """Split the given text into sentences based on punctuation followed by whitespace.

        Args:
            text (str): The text to be split into sentences

        Returns:
            list[str]: The list of sentences
        """

        return re.split(r'(?<=[.?!])\s+', text)

    def _combine_sentences(self, sentences: List[str]) -> List[str]:
        """Create a buffer by combining each sentence with its previous and next sentence to provide a wider context.
        For example, combine sentences 1,2,3 and 2,3,4 before computing the cosine similarity.

        Args:
            sentences (list[str]): The list of sentences to be combined

        Returns:
            list[str]: The list of combined sentences
        """

        combined_sentences = []
        for i in range(len(sentences)):
            combined_sentence = sentences[i]
            if i > 0:
                combined_sentence = sentences[i-1] + ' ' + combined_sentence
            if i < len(sentences) - 1:
                combined_sentence += ' ' + sentences[i+1]
            combined_sentences.append(combined_sentence)
        return combined_sentences

    def _calculate_cosine_similarity(self, embeddings: np.ndarray) -> List[float]:
        """Calculate the cosine distance (1 - cosine similarity) between consecutive embeddings.

        Despite the method name, the returned values are distances, not similarities.

        Args:
            embeddings (np.ndarray): The embeddings to be compared

        Returns:
            list[float]: The list of cosine distances; one fewer than the
            number of embeddings, empty when fewer than two are given
        """

        distances = []
        for i in range(len(embeddings) - 1):
            similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
            distance = 1 - similarity
            distances.append(distance)
        return distances

In [18]:
# Create the chunker instance used to split each document below.
semantic_chunker = SemanticChunker()

In [19]:
# Chunk every loaded document, remembering which document each chunk came from.
text_chunks = []
doc_idxs = []
# Wrap the list itself (not the enumerate iterator) so tqdm knows the total
# and can show real progress instead of a bare counter.
for doc_idx, doc in enumerate(tqdm(documents, desc="Chunking documents")):
    # Drop empty chunks here so text_chunks and doc_idxs stay index-aligned;
    # filtering text_chunks alone afterwards would shift later chunks onto
    # the wrong source document.
    cur_text_chunk = [chunk for chunk in semantic_chunker.split_text(doc.text) if len(chunk) > 0]
    text_chunks.extend(cur_text_chunk)
    doc_idxs.extend([doc_idx] * len(cur_text_chunk))

0it [00:00, ?it/s]

In [20]:
# Keep only the non-empty chunks for indexing.
# NOTE(review): doc_idxs is not filtered alongside text_chunks here, so the two lists stay
# index-aligned only if no chunk is actually dropped — confirm before using doc_idxs downstream.
clean_text_chunks = [chunk for chunk in text_chunks if len(chunk) > 0]
clean_text_chunks

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. Nut position and motor current measurements were acquired during the tests. Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system. Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems. This document describes in detail the data files available in

In [21]:
# Build one TextNode per chunk: embed the chunk text and attach the source document's metadata.
doc_nodes = []
# enumerate() has no length, so pass total explicitly for a meaningful progress bar.
for idx, text_chunk in tqdm(enumerate(clean_text_chunks), total=len(clean_text_chunks)):
    node = TextNode(
        text=text_chunk,
        embedding=get_embedding(text_chunk).squeeze().tolist(),
    )
    src_doc = documents[doc_idxs[idx]]
    # Copy the metadata so nodes from the same document do not share (and
    # accidentally mutate) one dict object.
    node.metadata = dict(src_doc.metadata)
    doc_nodes.append(node)

0it [00:00, ?it/s]

In [22]:
# Preview the first five nodes.
doc_nodes[:5]

[TextNode(id_='91521361-a16e-402c-939d-e741491f7d03', embedding=[-0.012448227033019066, 0.04810984060168266, 0.013283288106322289, -0.04575354605913162, 0.011697320267558098, 0.026618361473083496, -0.03814090043306351, 0.01828070357441902, -0.0048032160848379135, -0.002636266639456153, 0.043992798775434494, -0.0316675640642643, -0.009036778472363949, -0.05062149465084076, 0.047358933836221695, 0.0498964823782444, 0.04404458403587341, -0.022190598770976067, -0.056810006499290466, 0.0557742714881897, 0.04963754862546921, -0.023368746042251587, 0.03741588816046715, -0.02711033634841442, 0.01864321157336235, -0.04181775823235512, 0.017931142821907997, -0.00804635789245367, 0.0120533537119627, -0.05675822123885155, 0.014603848569095135, -0.017982929944992065, -0.03798554092645645, -0.010247292928397655, -0.05199384316802025, 0.046815175563097, -0.029725564643740654, 0.032418470829725266, -0.03679444640874863, 0.012156927026808262, -0.02150442637503147, -0.04181775823235512, -0.0404195152223

In [23]:
# Directory where the pickled document nodes are stored, relative to this notebook.
STORE_DIR = '../store/doc_indexing'    

In [24]:
# Persist the nodes, then reload them to confirm the round trip works.
# Create the target directory first: opening a file for writing does not
# create missing parent directories.
os.makedirs(STORE_DIR, exist_ok=True)
doc_nodes_path = os.path.join(STORE_DIR, 'doc_nodes.pkl')
with open(doc_nodes_path, 'wb') as f:
    pickle.dump(doc_nodes, f)
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote itself.
with open(doc_nodes_path, 'rb') as f:
    doc_nodes = pickle.load(f)

In [25]:
# Example query used to exercise retrieval.
# NOTE(review): "spalling7" looks like a typo for "spalling" — confirm before changing it,
# since editing the string changes the query embedding and the retrieved results.
query_str = "What is spalling7 means?"

In [26]:
# Embed the query with get_embedding (the same function used for the chunks) and convert it to a plain list.
query_emb = get_embedding(query_str).squeeze().tolist()

In [27]:
def get_top_k_embeddings(
    query_embedding: List[float],
    doc_embeddings: List[List[float]],
    doc_ids: List[str],
    similarity_top_k: int = 5,
) -> Tuple[List[float], List]:
    """Get top nodes by cosine similarity to the query.

    Args:
        query_embedding: Query vector of dimension D.
        doc_embeddings: N document vectors of dimension D.
        doc_ids: Identifier for each document vector, in the same order.
        similarity_top_k: Maximum number of results to return.

    Returns:
        A ``(similarities, ids)`` pair sorted by descending similarity; ties
        keep their original order. Both lists are empty when there are no
        documents. A zero-norm query or document vector gets similarity 0.0
        instead of NaN.
    """
    # An empty corpus has no (N, D) matrix to work with; answer directly.
    if len(doc_embeddings) == 0 or len(doc_ids) == 0:
        return [], []

    # dimensions: D
    qembed_np = np.asarray(query_embedding, dtype=float)
    # dimensions: N x D
    dembed_np = np.asarray(doc_embeddings, dtype=float)
    # dimensions: N
    dproduct_arr = np.dot(dembed_np, qembed_np)
    # dimensions: N
    norm_arr = np.linalg.norm(qembed_np) * np.linalg.norm(dembed_np, axis=1)
    # dimensions: N — divide only where the norm is non-zero so degenerate
    # vectors score 0.0 rather than producing NaN, which would corrupt the sort.
    cos_sim_arr = np.divide(
        dproduct_arr,
        norm_arr,
        out=np.zeros_like(dproduct_arr),
        where=norm_arr > 0,
    )
    # Only rank embeddings that have an id, as the original index loop did.
    cos_sim_arr = cos_sim_arr[:len(doc_ids)]

    # Stable sort on the negated scores gives descending order with ties in input order.
    ranked = np.argsort(-cos_sim_arr, kind="stable")[:similarity_top_k]

    result_similarities = [float(cos_sim_arr[i]) for i in ranked]
    result_ids = [doc_ids[i] for i in ranked]
    return result_similarities, result_ids

In [28]:
def dense_search(query_embedding: List[float], nodes: List[TextNode], similarity_top_k: int = 5):
    """Rank nodes by cosine similarity between their stored embedding and the query.

    Args:
        query_embedding: The embedded query vector.
        nodes: Candidate nodes; each must provide ``embedding`` and ``node_id``.
        similarity_top_k: Maximum number of results to return. Defaults to 5.

    Returns:
        The ``(similarities, node_ids)`` pair produced by get_top_k_embeddings.
    """
    doc_embeddings = [n.embedding for n in nodes]
    doc_ids = [n.node_id for n in nodes]
    return get_top_k_embeddings(
        query_embedding,
        doc_embeddings,
        doc_ids,
        similarity_top_k=similarity_top_k,
    )

In [29]:
# Retrieve the similarity scores and ids of the nodes closest to the query.
similarities, node_ids = dense_search(query_emb, doc_nodes)

In [30]:
# Print the retrieved chunks in ranked order (most similar first).
# Look the nodes up by id following node_ids: filtering doc_nodes by membership
# would print them in storage order and discard the ranking.
nodes_by_id = {node.node_id: node for node in doc_nodes}
node_text = [nodes_by_id[node_id].text for node_id in node_ids]
for nt in node_text:
    print(nt)

Seeded faults were introduced because there was insufficient test time to run the ball 
screws naturally to failure. The faults were introduced gradually in order to observe 
how the severity of the faults affect the measured signals. In the case of lack of lubrication, in the first stage the lubricant was removed with 
degreaser. No dramatic changes were observed in the signals, mainly due to the 
inherent low friction of the ball-screw architecture. In order to increase the severity of 
the fault, the bolts holding the plastic seal at both ends of the nut containing the balls 
were tightened, to create more friction (see Fig. 3).
Fig. 3: Bolt holding seal in ballnut 
 
The spalling defect was started as a 1 mm diameter surface defect on the rolling 
surface of the screw (see Fig. 4)).
Starr 
Trough-Life Engineering Services Institute, Cranfield University 
 
4 
 
replicated in a neighbour channel, and in stage 6 the size of both defects was 
increased affecting the sidewall between t