In [1]:
import os
import sys
import json
sys.path.append('../')

from pathlib import Path
from rocket_rag.node_indexing import *
from llama_index.readers.file import PyMuPDFReader

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [2]:
# Load the shared configuration. The context manager closes the file handle,
# which a bare `json.load(open(...))` would leave open until garbage collection.
with open("../config/configs.json") as cfg_file:
    cfg = json.load(cfg_file)

# Directory layout, relative to this notebook.
INSTANCES_DIR = '../data/instances/'
INFERENCE_DIR = '../data/inference/'
DOC_DIR = '../docs'

# State labels; each one is also a sub-folder name under INSTANCES_DIR/<load>/.
STATES = [
    'normal',
    'backlash1', 'backlash2',
    'lackLubrication1', 'lackLubrication2',
    'spalling1', 'spalling2', 'spalling3', 'spalling4',
    'spalling5', 'spalling6', 'spalling7', 'spalling8',
]

# Load conditions; each one is a sub-folder name under INSTANCES_DIR.
LOADS = ['20kg', '40kg', '-40kg']

### Documentation indexing

In [3]:
# Read the data description PDF from DOC_DIR with the PyMuPDF-based reader.
loader = PyMuPDFReader()
description_pdf = os.path.join(DOC_DIR, 'data_description.pdf')
documents = loader.load_data(description_pdf)

In [4]:
# Text pipeline objects: a transform configured from `cfg` (used below to split
# the document text into chunks) and the indexer that builds and stores nodes.
txt_transform = TextTransform(cfg=cfg)
txt_node_indexer = TextNodeIndexer()

In [5]:
# Split every loaded document into text chunks and record, for each chunk,
# the index of the document it came from (the two lists stay aligned).
text_chunks = []
doc_idxs = []
# Wrap the documents themselves rather than the enumerate() generator, so tqdm
# can use their length (when available) and show a real total instead of an
# open-ended "0it" counter.
for doc_idx, doc in enumerate(tqdm(documents)):
    cur_text_chunk = txt_transform.split_text(doc.text)
    # Extending with an empty result is a no-op, so no emptiness guard is needed.
    text_chunks.extend(cur_text_chunk)
    doc_idxs.extend([doc_idx] * len(cur_text_chunk))

0it [00:00, ?it/s]

In [None]:
# Build the text nodes from the chunks, passing the per-chunk document indices
# and the source file name as metadata.
txt_nodes = txt_node_indexer.index(
    txt=text_chunks,
    txt_transform=txt_transform,
    doc_ids=doc_idxs,
    meta_info={'doc_name': 'data_description.pdf'},
)

In [12]:
# Persist the nodes currently held by the text indexer to the document store.
txt_node_indexer.save_nodes(
    nodes=txt_node_indexer.nodes,
    filename=f'../store/doc_indexing/data_description.pkl',
)

[32m2024-09-26 13:14:41.360[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m89[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 13:14:41.381[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m93[0m - [34m[1mAll nodes are stored.[0m


### Time series indexing

In [3]:
# Time-series pipeline objects: a transform configured from `cfg` (used below
# for smoothing) and the indexer that builds and stores the nodes.
ts_transform = TimeSeriesTransform(cfg=cfg)
ts_node_indexer = TimeSeriesNodeIndexer()

In [4]:
# Index the smoothed `current` signal of every instance; one node file per load.
for load in LOADS:
    loguru.logger.debug(f'{load} time series nodes indexing...')

    ts = []
    states = []
    # Walk the per-state folders directly: the folder name *is* the label, so it
    # does not have to be recovered from the file name with a regex built on
    # `load[:2]` (which is '-4' for '-40kg' and raised AttributeError on any
    # entry that did not match).
    for state in STATES:
        state_dir = os.path.join(INSTANCES_DIR, load, state)
        # os.listdir() returns entries in arbitrary order; sort so the node
        # order is reproducible across runs and machines.
        for filename in sorted(os.listdir(state_dir)):
            temp_ts_df = pd.read_csv(os.path.join(state_dir, filename))
            ts.append(ts_transform.smoothing(ts_df=temp_ts_df, field='current'))
            states.append(state)

    ts_node_indexer.index(ts=ts, ts_transform=ts_transform, labels=states, meta_info={'load': load})
    ts_node_indexer.save_nodes(nodes=ts_node_indexer.nodes, filename=f'../store/ts_indexing/current_nodes_{load}.pkl')

[32m2024-09-27 21:07:24.236[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m20kg time series nodes indexing...[0m
[32m2024-09-27 21:07:24.690[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-27 21:07:39.378[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-27 21:07:39.379[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-27 21:07:59.777[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-27 21:07:59.779[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m40kg time series nodes indexing...[0m
[32m2024-09-27 21:08:02.619[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/452 [00:00<?, ?it/s]

[32m2024-09-27 21:08:12.645[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-27 21:08:12.720[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-27 21:08:33.210[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-27 21:08:33.211[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m-40kg time series nodes indexing...[0m
[32m2024-09-27 21:08:36.109[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-27 21:08:45.983[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-27 21:08:46.038[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-27 21:09:06.560[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m


In [5]:
# Index the smoothed `position_error` signal of every instance; one node file per load.
for load in LOADS:
    loguru.logger.debug(f'{load} time series nodes indexing...')

    ts = []
    states = []
    # Walk the per-state folders directly: the folder name *is* the label, so it
    # does not have to be recovered from the file name with a regex built on
    # `load[:2]` (which is '-4' for '-40kg' and raised AttributeError on any
    # entry that did not match).
    for state in STATES:
        state_dir = os.path.join(INSTANCES_DIR, load, state)
        # os.listdir() returns entries in arbitrary order; sort so the node
        # order is reproducible across runs and machines.
        for filename in sorted(os.listdir(state_dir)):
            temp_ts_df = pd.read_csv(os.path.join(state_dir, filename))
            ts.append(ts_transform.smoothing(ts_df=temp_ts_df, field='position_error'))
            states.append(state)

    ts_node_indexer.index(ts=ts, ts_transform=ts_transform, labels=states, meta_info={'load': load})
    ts_node_indexer.save_nodes(nodes=ts_node_indexer.nodes, filename=f'../store/ts_indexing/position_error_nodes_{load}.pkl')

[32m2024-09-27 21:09:06.579[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m20kg time series nodes indexing...[0m
[32m2024-09-27 21:09:07.027[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-27 21:09:17.028[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-27 21:09:17.089[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-27 21:09:37.789[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-27 21:09:37.790[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m40kg time series nodes indexing...[0m
[32m2024-09-27 21:09:38.233[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/452 [00:00<?, ?it/s]

[32m2024-09-27 21:09:48.208[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-27 21:09:48.267[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-27 21:10:08.783[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-27 21:10:08.784[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m-40kg time series nodes indexing...[0m
[32m2024-09-27 21:10:09.233[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-27 21:10:19.276[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-27 21:10:19.333[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-27 21:10:40.525[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m


In [6]:
# Reload the stored `current` nodes for the 20kg load from the time-series store.
cur_20kg_nodes = ts_node_indexer.load_nodes(
    filename=f'../store/ts_indexing/current_nodes_20kg.pkl',
)

[32m2024-09-27 21:10:43.002[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mload_nodes[0m:[36m59[0m - [34m[1mLoading all nodes...[0m
[32m2024-09-27 21:10:44.959[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mload_nodes[0m:[36m63[0m - [34m[1mAll nodes are loaded.[0m


In [9]:
# Sanity check: display the first reloaded node.
cur_20kg_nodes[0]

TimeSeriesNode(id_='normal', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], text='', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n', rocket=[0.889334536174357, 0.3201754385964912, 1.5945589850468775, 0.13271604938271606, 2.599105391623455, 0.27469135802469136, 1.6720864870481988, 0.6666666666666666, 1.807026114382156, 0.9598765432098766, 2.336138354868506, 0.8617021276595744, 0.8734635793671, 0.006172839506172839, -0.3829183698305513, 0.0, 1.4126985268332577, 0.8395061728395061, 1.6319262334927491, 1.0, 1.212306276639614, 0.49074074074074076, -0.6816909908427977, 0.0, 1.9777871545265322, 0.7993827160493827, 1.450688066816199, 1.0, 2.361106734073175, 0.8703703703703703, 1.6329808079478727, 0.9506172839506173, 0.8458889864130101, 0.7816901408450704, 1.1971664103901698, 0.925531914893617, 1.2071095242276255, 1.0, 0.690357371