In [1]:
import os
import sys
import json
sys.path.append('../')

from pathlib import Path
from rocket_rag.node_indexing import *
from llama_index.readers.file import PyMuPDFReader

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [2]:
# Load the shared configuration that is handed to the transforms as `cfg`.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector, and the encoding is pinned because
# JSON text is UTF-8 rather than whatever the platform locale defaults to.
with open("../config/configs.json", encoding="utf-8") as cfg_file:
    cfg = json.load(cfg_file)

# Data locations, all relative to the directory this notebook runs from.
INSTANCES_DIR = '../data/instances/'
INFERENCE_DIR = '../data/inference/'
DOC_DIR = '../docs'

# Condition labels: 'normal' first, then the numbered variants of every other
# condition (e.g. 'spalling1' ... 'spalling8'). Each (load, state) pair names a
# sub-directory of INSTANCES_DIR in the indexing loops further down.
_VARIANTS_PER_CONDITION = {'backlash': 2, 'lackLubrication': 2, 'spalling': 8}
STATES = ['normal'] + [
    f'{condition}{variant}'
    for condition, n_variants in _VARIANTS_PER_CONDITION.items()
    for variant in range(1, n_variants + 1)
]

# Load labels; each one is also a directory name under INSTANCES_DIR.
LOADS = ['20kg', '40kg', '-40kg']

### Documentation indexing

In [3]:
# Read the dataset description PDF. The resulting `documents` are iterated
# below and each item's `.text` is what gets split into chunks.
description_pdf = os.path.join(DOC_DIR, 'data_description.pdf')
loader = PyMuPDFReader()
documents = loader.load_data(description_pdf)

In [4]:
# Text-side helpers (presumably provided by the `rocket_rag.node_indexing`
# star import -- confirm). `txt_transform` is configured from `cfg`; it is used
# below to split document text and is also passed to the indexer.
txt_transform = TextTransform(cfg=cfg)
# Indexer whose `index(...)` / `save_nodes(...)` / `.nodes` are used below.
txt_node_indexer = TextNodeIndexer()

In [5]:
# Split every loaded document into text chunks and record, for each chunk,
# the position of the document it came from. `text_chunks` and `doc_idxs`
# are parallel lists and always have the same length.
text_chunks = []
doc_idxs = []
# tqdm wraps the list itself (not the `enumerate` generator) so it can read
# the length and show real progress instead of an open-ended "0it" counter.
for doc_idx, doc in enumerate(tqdm(documents)):
    cur_text_chunk = txt_transform.split_text(doc.text)
    # No emptiness check needed: extending with zero chunks is a no-op for
    # both lists, so documents that yield nothing are skipped implicitly.
    text_chunks.extend(cur_text_chunk)
    doc_idxs.extend([doc_idx] * len(cur_text_chunk))

0it [00:00, ?it/s]

In [None]:
# Index the text chunks. `doc_ids` is the list built alongside `text_chunks`
# (one source-document position per chunk); `meta_info` tags the source PDF.
txt_nodes = txt_node_indexer.index(
    txt=text_chunks,
    txt_transform=txt_transform,
    doc_ids=doc_idxs,
    meta_info={'doc_name': 'data_description.pdf'},
)

In [12]:
# Persist the indexer's text nodes for later retrieval.
doc_nodes_path = '../store/doc_indexing/data_description.pkl'
# Create the target directory up front so the cell also works on a fresh
# checkout (harmless if it already exists or if save_nodes creates it itself).
os.makedirs(os.path.dirname(doc_nodes_path), exist_ok=True)
# NOTE(review): this saves `txt_node_indexer.nodes`, not the `txt_nodes` value
# returned by `index(...)` -- presumably the same objects; confirm.
txt_node_indexer.save_nodes(nodes=txt_node_indexer.nodes, filename=doc_nodes_path)

[32m2024-09-26 13:14:41.360[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m89[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 13:14:41.381[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m93[0m - [34m[1mAll nodes are stored.[0m


### Time series indexing

In [3]:
# Time-series counterparts of the text helpers. `ts_transform` is configured
# from `cfg`; its `smoothing(...)` is applied to every instance below and the
# transform itself is passed to the indexer.
ts_transform = TimeSeriesTransform(cfg=cfg)
# A single indexer instance is shared by all indexing loops that follow.
ts_node_indexer = TimeSeriesNodeIndexer()

In [4]:
# Index the smoothed `current` signal of every instance and write one pickle
# per load condition.

# Make sure the output directory exists so the cell runs on a fresh checkout.
os.makedirs('../store/ts_indexing', exist_ok=True)

for load in LOADS:
    loguru.logger.debug(f'{load} time series nodes indexing...')

    ids = []  # instance filenames, used as node labels
    ts = []   # smoothed series, parallel to `ids`
    for state in STATES:
        # The state is taken from the directory being listed. Re-deriving it
        # from the filename with a greedy regex built from `load[:2]` breaks
        # for names containing the load prefix twice, yields '-4' for '-40kg',
        # and crashes with AttributeError on any non-matching filename.
        state_dir = os.path.join(INSTANCES_DIR, load, state)
        # Listing order is kept as returned by the OS (arbitrary per platform).
        for filename in os.listdir(state_dir):
            temp_ts_df = pd.read_csv(os.path.join(state_dir, filename))
            ts.append(ts_transform.smoothing(ts_df=temp_ts_df, field='current'))
            ids.append(filename)

    # NOTE(review): the same indexer instance is reused for every load; confirm
    # that index() replaces `.nodes` rather than appending, otherwise later
    # pickles would also contain nodes from earlier loads.
    ts_node_indexer.index(ts=ts, ts_transform=ts_transform, labels=ids, meta_info={'load': load})
    ts_node_indexer.save_nodes(nodes=ts_node_indexer.nodes, filename=f'../store/ts_indexing/current_nodes_{load}.pkl')

[32m2024-09-26 14:30:51.902[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m20kg time series nodes indexing...[0m
[32m2024-09-26 14:30:52.354[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-26 14:31:27.605[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-26 14:31:27.606[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 14:31:37.321[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-26 14:31:37.322[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m40kg time series nodes indexing...[0m
[32m2024-09-26 14:31:37.767[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/452 [00:00<?, ?it/s]

[32m2024-09-26 14:32:12.222[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-26 14:32:12.268[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 14:32:21.994[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-26 14:32:21.995[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m-40kg time series nodes indexing...[0m
[32m2024-09-26 14:32:22.445[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-26 14:32:57.136[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-26 14:32:57.168[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 14:33:06.946[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m


In [5]:
# Index the smoothed `position_error` signal of every instance and write one
# pickle per load condition.

# Make sure the output directory exists so the cell runs on a fresh checkout.
os.makedirs('../store/ts_indexing', exist_ok=True)

for load in LOADS:
    loguru.logger.debug(f'{load} time series nodes indexing...')

    ids = []  # instance filenames, used as node labels
    ts = []   # smoothed series, parallel to `ids`
    for state in STATES:
        # The state is taken from the directory being listed. Re-deriving it
        # from the filename with a greedy regex built from `load[:2]` breaks
        # for names containing the load prefix twice, yields '-4' for '-40kg',
        # and crashes with AttributeError on any non-matching filename.
        state_dir = os.path.join(INSTANCES_DIR, load, state)
        # Listing order is kept as returned by the OS (arbitrary per platform).
        for filename in os.listdir(state_dir):
            temp_ts_df = pd.read_csv(os.path.join(state_dir, filename))
            ts.append(ts_transform.smoothing(ts_df=temp_ts_df, field='position_error'))
            ids.append(filename)

    # NOTE(review): the same indexer instance is reused for every load; confirm
    # that index() replaces `.nodes` rather than appending, otherwise later
    # pickles would also contain nodes from earlier loads.
    ts_node_indexer.index(ts=ts, ts_transform=ts_transform, labels=ids, meta_info={'load': load})
    ts_node_indexer.save_nodes(nodes=ts_node_indexer.nodes, filename=f'../store/ts_indexing/position_error_{load}.pkl')

[32m2024-09-26 14:33:06.977[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m20kg time series nodes indexing...[0m
[32m2024-09-26 14:33:07.421[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-26 14:33:42.286[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-26 14:33:42.322[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 14:33:52.905[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-26 14:33:52.906[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m40kg time series nodes indexing...[0m
[32m2024-09-26 14:33:53.340[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/452 [00:00<?, ?it/s]

[32m2024-09-26 14:34:31.225[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-26 14:34:31.255[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 14:34:41.306[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m
[32m2024-09-26 14:34:41.307[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [34m[1m-40kg time series nodes indexing...[0m
[32m2024-09-26 14:34:41.749[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m194[0m - [34m[1mTime Series Nodes Indexing...[0m


  0%|          | 0/455 [00:00<?, ?it/s]

[32m2024-09-26 14:35:17.574[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mindex[0m:[36m203[0m - [34m[1mTime Series Nodes Indexing DONE.[0m
[32m2024-09-26 14:35:17.611[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m86[0m - [34m[1mSaving all nodes...[0m
[32m2024-09-26 14:35:27.469[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36msave_nodes[0m:[36m90[0m - [34m[1mAll nodes are stored.[0m


In [6]:
# Reload the 20kg `current` nodes written by the indexing loop above.
cur_20kg_nodes = ts_node_indexer.load_nodes(
    filename='../store/ts_indexing/current_nodes_20kg.pkl',
)

[32m2024-09-26 13:45:45.259[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mload_nodes[0m:[36m59[0m - [34m[1mLoading all nodes...[0m
[32m2024-09-26 13:45:46.433[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mload_nodes[0m:[36m63[0m - [34m[1mAll nodes are loaded.[0m


In [9]:
# NOTE(review): no visible cell writes `20kg.pkl` (the loops above produce
# `current_nodes_<load>.pkl` and `position_error_<load>.pkl`), so this looks
# like a file left over from an earlier naming scheme -- confirm it still
# exists, or this cell fails on a fresh run.
cur_20kg_nodes_ = ts_node_indexer.load_nodes(
    filename='../store/ts_indexing/20kg.pkl',
)

[32m2024-09-26 13:47:20.532[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mload_nodes[0m:[36m59[0m - [34m[1mLoading all nodes...[0m
[32m2024-09-26 13:47:21.487[0m | [34m[1mDEBUG   [0m | [36mrocket_rag.node_indexing[0m:[36mload_nodes[0m:[36m63[0m - [34m[1mAll nodes are loaded.[0m


In [198]:
import random
from pyts.transformation import ROCKET, BOSS


# Stand-alone pyts transformers, used below to transform a single raw series
# for comparison against the features stored on an indexed node.
# ROCKET: random_state is fixed so repeated runs give the same result.
rocket = ROCKET(random_state=42)
# BOSS: 2-letter words over a 4-symbol alphabet (4**2 = 16 possible words, as
# the vocabulary printed further down shows); sparse=False returns a dense array.
boss = BOSS(word_size=2, n_bins=4, window_size=12, sparse=False)

In [223]:
# Draw a random position into `cur_20kg_nodes` for a spot check. The generator
# is not seeded here, so the printed index changes from run to run.
rand_idx = random.randrange(len(cur_20kg_nodes))
print(rand_idx)

61


In [224]:
# Show the `rocket` attribute stored on the sampled node.
sampled_node = cur_20kg_nodes[rand_idx]
print(sampled_node.rocket)
# Same check against the nodes loaded from `20kg.pkl` (disabled):
# print(cur_20kg_nodes_[rand_idx].rocket)

[0.38271606, 0.7623457, 0.14814815, 0.5277778, 0.9074074, 0.29320988, 0.6728395, 0.058641978, 0.4382716, 0.81790125, 0.2037037, 0.5833333, 0.962963, 0.34876543, 0.72839504, 0.11419753, 0.49382716, 0.8734568, 0.25925925, 0.6388889, 0.021604938, 0.40432099, 0.7839506, 0.38271606, 0.7623457, 0.14814815, 0.5277778, 0.9074074, 0.29320988, 0.6728395, 0.058641978, 0.4382716, 0.81790125, 0.2037037, 0.5833333, 0.962963, 0.34876543, 0.72839504, 0.11419753, 0.49382716, 0.8734568, 0.25925925, 0.6388889, 0.021604938, 0.40432099, 0.7839506, 0.38271606, 0.7623457, 0.14814815, 0.5277778, 0.9074074, 0.29320988, 0.6728395, 0.058641978, 0.4382716, 0.81790125, 0.2037037, 0.5833333, 0.962963, 0.34876543, 0.72839504, 0.11419753, 0.49382716, 0.8734568, 0.25925925, 0.6388889, 0.021604938, 0.40432099, 0.7839506, 0.38271606, 0.7623457, 0.14814815, 0.5277778, 0.9074074, 0.29320988, 0.6728395, 0.058641978, 0.4382716, 0.81790125, 0.2037037, 0.5833333, 0.962963, 0.34876543, 0.72839504, 0.11419753, 0.49382716, 0.873

In [225]:
# Fit ROCKET on one raw series and print its feature matrix (a single row).
# NOTE(review): `ts` is whatever list the most recently executed indexing loop
# left behind (its final load / field), while `rand_idx` was drawn from
# `cur_20kg_nodes` -- confirm both refer to the same instance before comparing
# this output with the node's stored features.
rocket_features = rocket.fit_transform([ts[rand_idx]])
print(rocket_features)

[[0.59517484 0.1754386  2.78704352 ... 0.         2.27164949 0.44444444]]


In [226]:
# Fit BOSS on one raw series and print its feature matrix (a single row).
# NOTE(review): `ts` is whatever list the most recently executed indexing loop
# left behind (its final load / field), while `rand_idx` was drawn from
# `cur_20kg_nodes` -- confirm both refer to the same instance.
boss_features = boss.fit_transform([ts[rand_idx]])
print(boss_features)

[[ 2  7  9  6  3  6  6  4  6  8  9  5  6  8 10  4]]


In [177]:
# Display the fitted BOSS vocabulary: a dict from feature index to word.
# Only available after `boss.fit_transform(...)` has run.
# NOTE(review): this cell's execution count is lower than the fit cell above,
# so the displayed vocabulary may come from an earlier fit.
boss.vocabulary_

{8: 'ca',
 10: 'cc',
 11: 'cd',
 9: 'cb',
 12: 'da',
 13: 'db',
 14: 'dc',
 15: 'dd',
 7: 'bd',
 3: 'ad',
 0: 'aa',
 2: 'ac',
 1: 'ab',
 4: 'ba',
 5: 'bb',
 6: 'bc'}