In [1]:
import os
import sys
import json
sys.path.append('../')

from pathlib import Path
from rocket_rag.node_indexing import *
from llama_index.readers.file import PyMuPDFReader

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [2]:
# Project configuration; handed as `cfg` to the text and time-series transforms below.
# Read inside a context manager so the file handle is closed deterministically.
with open("../config/configs.json") as cfg_file:
    cfg = json.load(cfg_file)

# Root folders of the dataset. Instance files are read from
# INSTANCES_DIR/<load>/<state>/<file> by the time-series indexing cell.
INSTANCES_DIR = '../data/instances/'
INFERENCE_DIR = '../data/inference/'

# Condition labels; each one is used as a sub-directory name under every load folder.
STATES = [
    'normal',
    'backlash1', 'backlash2',
    'lackLubrication1', 'lackLubrication2',
    'spalling1', 'spalling2', 'spalling3', 'spalling4',
    'spalling5', 'spalling6', 'spalling7', 'spalling8',
]

# Load conditions; each one is a sub-directory of INSTANCES_DIR and also ends up
# in the name of the corresponding time-series node store.
LOADS = ['20kg', '40kg', '-40kg']

# Folder holding the PDF documentation that gets indexed as text nodes.
DOC_DIR = '../docs'

### Documentation indexing

In [3]:
# Parse the dataset description PDF into llama-index documents;
# each document's `.text` is split into chunks in a later cell.
loader = PyMuPDFReader()
documents = loader.load_data(
    os.path.join(DOC_DIR, 'data_description.pdf'),
)

In [14]:
# Text splitter / embedder configured from the project config.
txt_transform = TextTransform(cfg=cfg)

# Indexer for the documentation text nodes, constructed with the pickle path of
# the node store. A plain string literal is used: the path has no placeholders,
# so an f-string prefix would be misleading.
txt_node_indexer = TextNodeIndexer('../store/doc_indexing/data_description.pkl')

In [6]:
# Split every loaded document into text chunks, remembering for each chunk the
# index of the document it came from (text_chunks and doc_idxs stay aligned).
text_chunks = []
doc_idxs = []
# tqdm wraps the list itself (not the enumerate iterator) so it can read the
# length and render real progress instead of an unbounded "0it" counter.
for doc_idx, doc in enumerate(tqdm(documents)):
    cur_text_chunk = txt_transform.split_text(doc.text)
    text_chunks.extend(cur_text_chunk)
    doc_idxs.extend([doc_idx] * len(cur_text_chunk))

0it [00:00, ?it/s]

In [7]:
# Drop empty chunks produced by the splitter before embedding them.
clean_text_chunks = [chunk for chunk in text_chunks if len(chunk) > 0]

# Display only a short preview: dumping every chunk floods the notebook output
# and bloats the saved file. Inspect `clean_text_chunks` directly if the full
# list is needed.
clean_text_chunks[:3]

['Data set for "Data-based Detection and Diagnosis of Faults \nin Linear Actuators" \n \n1. Introduction \nThe dataset presented here was acquired for the study of degradation in linear \nactuators, particularly electro-mechanical actuators (EMA). The data was acquired \nfrom an instrumented rig (described in section 2) where a ball-screw actuator moved \nleft to right following a defined motion profile with a level of load selected by the user. Nut position and motor current measurements were acquired during the tests. Initially the rig was tested in normal conditions (absence of faults) under different \nmotion and loading scenarios. Subsequently different mechanical faults, including lack \nof lubrication, spalling and backlash, were seeded in the system. Position and motor \ncurrent data were acquired under these conditions to study monitoring methods to \ndetect faults and degradation in this particular type of systems. This document describes in detail the data files available in

In [9]:
# One embedding per cleaned chunk, in the same order as clean_text_chunks.
# The returned object is squeezed and converted to a plain Python list
# (presumably a numpy-like array -- confirm against TextTransform).
embeds = [
    txt_transform.get_embedding_from_openai(chunk).squeeze().tolist()
    for chunk in clean_text_chunks
]

In [12]:
# Build text nodes from the chunk/embedding pairs ...
txt_nodes = txt_node_indexer.indexing(
    clean_text_chunks,
    embeds,
)
# ... and persist them to the documentation node store.
txt_node_indexer.save_nodes(
    txt_nodes,
    '../store/doc_indexing/data_description.pkl',
)

[32m2024-09-24 16:44:02.778[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m130[0m - [34m[1mIndexing txt nodes...[0m
[32m2024-09-24 16:44:02.779[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m139[0m - [34m[1mIndexing txt nodes DONE![0m
[32m2024-09-24 16:44:02.780[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m71[0m - [34m[1mCreating a new file at ../store/doc_indexing/data_description.pkl...[0m
[32m2024-09-24 16:44:02.781[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m74[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-24 16:44:02.803[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m78[0m - [34m[1mAll nodes are stored.[0m


### Time series indexing

In [15]:
# Time-series transform built from the project config; it is used below for
# smoothing each instance and is also handed to the time-series node indexer.
ts_transform = TimeSeriesTransform(cfg=cfg)

In [16]:
# Build one time-series node store per load condition.
#
# For every load, each instance CSV under INSTANCES_DIR/<load>/<state>/ is read,
# its 'current' field is smoothed, and the resulting series are indexed under
# their file names. The state is taken from the directory being listed rather
# than parsed back out of the file name, so this does not depend on the file
# naming scheme or on slicing the load label (load[:2] is '-4' for '-40kg').
for load in LOADS:
    loguru.logger.debug(f'{load} time series nodes indexing...')

    ids = []  # instance file names, used as node ids
    ts = []   # smoothed series, aligned one-to-one with ids
    for state in STATES:
        state_dir = os.path.join(INSTANCES_DIR, load, state)
        for filename in os.listdir(state_dir):
            instance_df = pd.read_csv(os.path.join(state_dir, filename))
            ids.append(filename)
            ts.append(ts_transform.smoothing(ts_df=instance_df, field='current'))

    ts_node_indexer = TimeSeriesNodeIndexer(
        ts_transform=ts_transform,
        nodes_filename=f'../store/ts_indexing/current_nodes_{load}.pkl',
    )
    # No explicit save call is needed here: the recorded log shows save_nodes
    # running as part of this indexing call.
    ts_node_indexer.indexing(ts=ts, ids=ids, meta_info={'load': load})

[32m2024-09-24 16:52:07.214[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1m20kg time series nodes indexing...[0m
[32m2024-09-24 16:52:10.141[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m172[0m - [34m[1mIndexing time series nodes...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-24 16:52:45.474[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m182[0m - [34m[1mIndexing time series nodes DONE.[0m
[32m2024-09-24 16:52:45.475[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m74[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-24 16:52:55.201[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m78[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-24 16:52:55.202[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1m40kg time series nodes indexing...[0m
[32m2024-09-24 16:52:58.048[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m172[0m - [34m[1mIndexing time series nodes...[0m


  0%|          | 0/452 [00:00<?, ?it/s]

[32m2024-09-24 16:53:33.641[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m182[0m - [34m[1mIndexing time series nodes DONE.[0m
[32m2024-09-24 16:53:33.641[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m71[0m - [34m[1mCreating a new file at ../store/ts_indexing/current_nodes_40kg.pkl...[0m
[32m2024-09-24 16:53:33.642[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m74[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-24 16:53:43.241[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m78[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-24 16:53:43.242[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1m-40kg time series nodes indexing...[0m
[32m2024-09-24 16:53:46.199[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m172[0m - [34m[1mInde

  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-24 16:54:21.722[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindexing[0m:[36m182[0m - [34m[1mIndexing time series nodes DONE.[0m
[32m2024-09-24 16:54:21.723[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m71[0m - [34m[1mCreating a new file at ../store/ts_indexing/current_nodes_-40kg.pkl...[0m
[32m2024-09-24 16:54:21.724[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m74[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-24 16:54:31.449[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m78[0m - [34m[1mAll nodes are stored.[0m
