# Reading Data

In [None]:
import json
from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
# Decode explicitly as UTF-8 (the JSON default) so that loading the corpus does
# not depend on the platform's locale encoding; the rest of this notebook
# treats the text as Vietnamese.
with open("/home/ltnga/LawVN-Instructction-Gen/src/data/data.json", encoding="utf-8") as f:
    all_data = json.load(f)


In [None]:
# Alias the loaded JSON payload. The following cell passes each entry as the
# `text` of a Document, so `all_data` is presumably a list of strings — TODO confirm.
documents = all_data

In [None]:
# Wrap every raw entry in a llama-index Document so node parsers can consume it.
documents = [Document(text=raw_text) for raw_text in documents]

# Token-based splitter configured for chunk_size=300 with an overlap of 50.
# NOTE(review): `base_node_parser` is assigned again in the Transformation
# section before any nodes are produced, so this configuration looks unused —
# confirm before relying on it.
base_node_parser = TokenTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separator=" ",
    backup_separators=["__", "..", "--"],
    include_prev_next_rel=False,
)

# Transformation

In [None]:
from pyvi import ViTokenizer
from transformers import AutoTokenizer
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter

# Load the tokenizer that ships with the "qducnguyen/vietnamese-bi-encoder" checkpoint.
# NOTE(review): `tokenizer` is not referenced by any other cell visible here —
# confirm whether it is still needed.
tokenizer = AutoTokenizer.from_pretrained("qducnguyen/vietnamese-bi-encoder")

In [None]:
# Splitter for the base ("parent") nodes: chunk_size=1000 and no overlap.
# This replaces the splitter configured in the Reading Data section.
base_node_parser = TokenTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separator=" ",
    backup_separators=["__", "..", "--"],
    include_prev_next_rel=False,
)

# Split every Document into base nodes, showing a progress bar while doing so.
base_nodes = base_node_parser.get_nodes_from_documents(
    documents,
    show_progress=True,
)

In [None]:
# Spot-check: print the text of one base node (index 3) to inspect the split.
print(base_nodes[3].text) 

In [None]:
# # Save as parent parquet 
# import os
# from datasets import Dataset

# dataset = Dataset.from_list([node.to_dict() for node in base_nodes])

# with open(os.path.join("/home/s/ducnq/law-rag/data/hf", "parent_nodes.parquet"), "wb") as fOut:
#     dataset.to_parquet(fOut)

In [None]:
# Child nodes
from tqdm import tqdm
# Store each base node's own text under metadata["parent_text"] and mark that
# key as excluded from both the embedding and the LLM metadata views.
for base_node in tqdm(base_nodes):
    base_node.metadata["parent_text"] = base_node.text
    # base_node.text = ViTokenizer.tokenize(base_node.text.lower())
    # Guard the appends: notebook cells get re-run, and an unconditional
    # append would add a duplicate "parent_text" entry on every run.
    for excluded_keys in (
        base_node.excluded_embed_metadata_keys,
        base_node.excluded_llm_metadata_keys,
    ):
        if "parent_text" not in excluded_keys:
            excluded_keys.append("parent_text")

In [None]:
# Splitter for the child nodes: chunk_size=100, no overlap, split on spaces.
child_node_parser = SentenceSplitter(
    chunk_overlap=0,
    chunk_size=100,
    separator=" ",
    include_prev_next_rel=False,
)

In [None]:
# Run the child splitter over the base nodes (passed in as the documents to split).
# Presumably each child keeps the parent's metadata, including "parent_text" —
# TODO confirm against the llama-index node parser behaviour.
child_nodes = child_node_parser.get_nodes_from_documents(base_nodes,
                                                         show_progress=True)

In [None]:
# Spot-check: print the text of the first child node.
print(child_nodes[0].text) 

In [None]:
from tqdm import tqdm
from llama_index.core.schema import NodeRelationship

# Normalise every child node in place: lower-case its text, word-segment it
# with ViTokenizer, and drop the metadata attribute of its SOURCE relationship.
for child_node in tqdm(child_nodes):
    lowered_text = child_node.text.lower()
    child_node.text = ViTokenizer.tokenize(lowered_text)
    try:
        del child_node.relationships[NodeRelationship.SOURCE].metadata
    except AttributeError:
        # Best-effort cleanup: an AttributeError here is ignored and the loop
        # simply moves on to the next node.
        pass

In [None]:
# Notebook display: the number of child nodes together with one sample node (index 2).
len(child_nodes), child_nodes[2]

In [None]:
# Notebook display: the dictionary form of the first child node, i.e. the
# representation the commented-out parquet export below would store.
child_nodes[0].to_dict()

In [None]:
# # Save as child parquet 
# import os
# from datasets import Dataset

# dataset = Dataset.from_list([node.to_dict() for node in child_nodes])

# with open(os.path.join("/home/s/ducnq/law-rag/data/hf", "child_nodes.parquet"), "wb") as fOut:
#     dataset.to_parquet(fOut)

In [None]:
# Upload the folder to the Hugging Face Hub
from huggingface_hub import HfApi

# Target repository on the Hugging Face Hub.
REPO_ID = "bkai-foundation-models/TVPL"
REPO_TYPE = "dataset"
api = HfApi()

# Repository creation, kept commented out for reference:
# api.create_repo(
#     repo_id=REPO_ID,
#     private=True,
#     repo_type=REPO_TYPE,
#     exist_ok=False
# )

# Push the whole local folder (the one the commented-out parquet cells write
# into) to the repository.
api.upload_folder(
    repo_id=REPO_ID,
    repo_type=REPO_TYPE,
    folder_path="/home/s/ducnq/law-rag/data/hf",
)

In [None]:
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt 

# Indexing

In [None]:
# Set up the embedding model and connect to the Weaviate Cloud cluster
import weaviate
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from weaviate.classes.init import Auth


import os

WEAVIATE_URL = "https://jd11sxlqap7tdknwzega.c0.asia-southeast1.gcp.weaviate.cloud"

# SECURITY: the API key must not live in source control. It is read from the
# WEAVIATE_API_KEY environment variable instead. The key that was previously
# hard-coded in this cell should be treated as leaked and rotated.
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
if not weaviate_api_key:
    raise RuntimeError(
        "Set the WEAVIATE_API_KEY environment variable before connecting to Weaviate."
    )

DATA_COLLECTION = "ND168"
DEVICE = "cuda:0"
MODEL_NAME = "qducnguyen/vietnamese-bi-encoder"

# Embedding model used when the index is built; inputs are capped at
# max_length=256 and the model is placed on DEVICE.
embed_model = HuggingFaceEmbedding(model_name=MODEL_NAME, 
                                   max_length=256,
                                   device=DEVICE)

# Open an authenticated connection to the Weaviate Cloud cluster.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

In [None]:
# Bind the DATA_COLLECTION index on the connected Weaviate client.
vector_store = WeaviateVectorStore(
    index_name=DATA_COLLECTION,
    weaviate_client=client,
)

# Route index storage through that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over the child nodes using embed_model.
index = VectorStoreIndex(
    child_nodes,
    embed_model=embed_model,
    storage_context=storage_context,
    insert_batch_size=32768,
    show_progress=True,
)

# Test Retrieval

In [None]:
## 
from pyvi import ViTokenizer
from llama_index.core.response.notebook_utils import display_source_node
# Retriever over the index in "hybrid" query mode, returning up to 100 hits.
# NOTE(review): alpha=0.7 presumably weights the vector score against the
# keyword score — confirm against the vector store documentation.
base_retriever = index.as_retriever(
    similarity_top_k=100,
    vector_store_query_mode="hybrid",
    alpha=0.7,
)

In [None]:
TEST_QUESTION = "đi xe máy không đội mũ bảo hiểm bị phạt bao nhiêu tiền?"

# Give the query the same lower-casing + ViTokenizer pass that was applied to
# the child node text before indexing.
segmented_query = ViTokenizer.tokenize(TEST_QUESTION.lower())
retrievals = base_retriever.retrieve(segmented_query)

# Render the first five hits, including their metadata.
for n in retrievals[:5]:
    display_source_node(
        n,
        show_source_metadata=True,
        source_length=1000,
    )