In [1]:
from llama_index import Document, VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.readers.chroma import ChromaReader
from llama_index.storage.storage_context import StorageContext
# from transformers import AutoTokenizer, AutoModel
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from IPython.display import Markdown, display
import chromadb
import pandas as pd 
import openai
import os
import getpass

In [2]:
# Root directory that holds the input data sets used by this notebook.
data_path = '/workspace/data/'

In [3]:
# Locate the scraped deposit-product CSV under the data root, load it into a
# DataFrame and preview the first row.
deposit_csv_path = os.path.join(
    data_path,
    'financial',
    'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_DEPOSIT_vec.csv',
)
data = pd.read_csv(deposit_csv_path)
data.head(1)

Unnamed: 0,보증_고유키,상품설명
0,보증_0,기업 전용 정기적금 상품


In [4]:
# Category label attached as metadata to every document built from this table.
category = 'deposit'

# Pull the product keys and the product descriptions out as parallel lists.
id_list = data['보증_고유키'].values.tolist()
desc_list = data['상품설명'].values.tolist()

In [5]:
# Wrap each product description in a Document keyed by its product key.
# 'category' is excluded from the LLM view only, so it is still part of the
# text handed to the embedding model.
documents = [
    Document(
        text=description,
        doc_id=product_id,
        metadata={"category": category},
        excluded_llm_metadata_keys=['category'],
    )
    for product_id, description in zip(id_list, desc_list)
]

In [6]:
# Compare what the embedding model and the LLM each receive for one sample
# document, to confirm the metadata exclusion behaves as configured.
sample_doc = documents[2]
embed_view = sample_doc.get_content(metadata_mode=MetadataMode.EMBED)
llm_view = sample_doc.get_content(metadata_mode=MetadataMode.LLM)

print(f'embedding model see this: {embed_view}', end='\n\n')
print(f'LLM see this: {llm_view}')

embedding model see this: category: deposit

누구나 쉽게 이해하고 가입할 수 있는 정기예금 대표상품

LLM see this: 누구나 쉽게 이해하고 가입할 수 있는 정기예금 대표상품


#### 직접 노드 정의하기 

In [7]:
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo

In [8]:
# Preview the first two product descriptions.
desc_list[:2]

['기업 전용 정기적금 상품', '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품']

In [9]:
# Build two nodes by hand from the first two product descriptions.
# Each node needs its own id: relationships refer to nodes by id, so two nodes
# sharing one id would make a NEXT/PREVIOUS/PARENT link point back at the node
# that holds it.
node1 = TextNode(text=desc_list[0], id_=id_list[0])
node2 = TextNode(text=desc_list[1], id_=id_list[1])

In [10]:
# Link the two hand-built nodes in order: node1 is followed by node2, and
# node2 is preceded by node1.
next_of_node1 = RelatedNodeInfo(node_id=node2.node_id)
prev_of_node2 = RelatedNodeInfo(node_id=node1.node_id)

node1.relationships[NodeRelationship.NEXT] = next_of_node1
node2.relationships[NodeRelationship.PREVIOUS] = prev_of_node2

nodes = [node1, node2]

In [11]:
# A relationship entry can carry its own metadata dictionary as well.
parent_link = RelatedNodeInfo(node_id=node1.node_id, metadata={'key': 'val'})
node2.relationships[NodeRelationship.PARENT] = parent_link

#### Node Parser 사용 

In [12]:
from llama_index.text_splitter import SentenceSplitter 

# Sentence-aware splitter used to chunk the documents into nodes.
# Per the SentenceSplitter documentation both sizes are counted in tokens,
# not characters.
splitter = SentenceSplitter(
    chunk_size=512,
    # overlap shared by consecutive chunks so context carries across the boundary
    chunk_overlap=20,
)

In [13]:
# Split every Document into nodes; this rebinds `nodes`, replacing the
# hand-built two-node list.
nodes = splitter.get_nodes_from_documents(documents)

In [14]:
# Inspect the first node produced by the splitter.
nodes[0]

TextNode(id_='90dd965e-940a-459c-8446-745e919ce026', embedding=None, metadata={'category': 'deposit'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['category'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='2ecd58205351e819e0afa38f030f468fd516c3820082cf5f9fd15f569846d656'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='1d475255-979e-4540-b2cf-c6e28f956084', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='35f2f0b6242fe666c5002acd9cc3722b0676152cd9973e45a062ac16ba45a687')}, hash='d86f9b0332ed973917c372f0aaa256a0937ddde44827e5fc20f7160a10a593e7', text='기업 전용 정기적금 상품', start_char_idx=0, end_char_idx=13, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [15]:
# Inspect the second node; its PREVIOUS relationship refers to the first node's id.
nodes[1]

TextNode(id_='1d475255-979e-4540-b2cf-c6e28f956084', embedding=None, metadata={'category': 'deposit'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['category'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='bd3da67018372aeeba75cb39050e5c942755b5618b9dc56ee5d29c93d020f27a'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='90dd965e-940a-459c-8446-745e919ce026', node_type=<ObjectType.TEXT: '1'>, metadata={'category': 'deposit'}, hash='d86f9b0332ed973917c372f0aaa256a0937ddde44827e5fc20f7160a10a593e7'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='ad7e49b2-179e-4b98-b045-547f68f1a9d3', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='b109485229c08a64655ca9fa6794291daab34ea8c57eed6290c166f9c12b2ae2')}, hash='35f2f0b6242fe666c5002acd9cc3722b0676152cd9973e45a062ac16ba45a687', text='복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', start_char_idx=0, end_char_idx=27

In [16]:
# Show the id of the first node.
nodes[0].node_id

'90dd965e-940a-459c-8446-745e919ce026'

In [17]:
from llama_index.node_parser import SentenceWindowNodeParser 

In [18]:
# Parser that stores a window of neighbouring sentences in each node's metadata.
node_parser = SentenceWindowNodeParser.from_defaults(
    # how many sentences on either side to capture
    window_size=3, 
    # the metadata key that holds the window of surrounding sentences
    window_metadata_key='window',
    # the metadata key that holds the original sentence
    original_text_metadata_key='original_sentence',
)

In [19]:
# Parse the documents into sentence-window nodes.
window_nodes = node_parser.get_nodes_from_documents(documents)

In [20]:
# Inspect the first window node, including its 'window' and 'original_sentence' metadata.
window_nodes[0]

TextNode(id_='72577f72-6d09-4cf7-bbff-1447caea0313', embedding=None, metadata={'window': '기업 전용 정기적금 상품', 'original_sentence': '기업 전용 정기적금 상품', 'category': 'deposit'}, excluded_embed_metadata_keys=['window', 'original_sentence'], excluded_llm_metadata_keys=['category', 'window', 'original_sentence'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='2ecd58205351e819e0afa38f030f468fd516c3820082cf5f9fd15f569846d656'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='d1235d94-0878-42ea-9ee4-0b165e582a67', node_type=<ObjectType.TEXT: '1'>, metadata={'window': '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', 'original_sentence': '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품'}, hash='35f2f0b6242fe666c5002acd9cc3722b0676152cd9973e45a062ac16ba45a687')}, hash='d86f9b0332ed973917c372f0aaa256a0937ddde44827e5fc20f7160a10a593e7', text='기업 전용 정기적금 상품', start_char_idx=0, end_char_idx=13, text_template='{metadata_str}\n\n{co

In [21]:
# Inspect the second window node.
window_nodes[1]

TextNode(id_='d1235d94-0878-42ea-9ee4-0b165e582a67', embedding=None, metadata={'window': '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', 'original_sentence': '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', 'category': 'deposit'}, excluded_embed_metadata_keys=['window', 'original_sentence'], excluded_llm_metadata_keys=['category', 'window', 'original_sentence'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='bd3da67018372aeeba75cb39050e5c942755b5618b9dc56ee5d29c93d020f27a'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='72577f72-6d09-4cf7-bbff-1447caea0313', node_type=<ObjectType.TEXT: '1'>, metadata={'window': '기업 전용 정기적금 상품', 'original_sentence': '기업 전용 정기적금 상품', 'category': 'deposit'}, hash='d86f9b0332ed973917c372f0aaa256a0937ddde44827e5fc20f7160a10a593e7'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b0570588-2a21-47b5-bc67-b6b66d832cf8', node_type=<ObjectType.TEXT: '1'>, metadata={'w

In [22]:
# Inspect the third window node.
window_nodes[2]

TextNode(id_='b0570588-2a21-47b5-bc67-b6b66d832cf8', embedding=None, metadata={'window': '누구나 쉽게 이해하고 가입할 수 있는 정기예금 대표상품', 'original_sentence': '누구나 쉽게 이해하고 가입할 수 있는 정기예금 대표상품', 'category': 'deposit'}, excluded_embed_metadata_keys=['window', 'original_sentence'], excluded_llm_metadata_keys=['category', 'window', 'original_sentence'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_2', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='2fd7834cc67c7eb2ea30cd4417fe229865502e0d0ad63320d28909fe95700f5c'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='d1235d94-0878-42ea-9ee4-0b165e582a67', node_type=<ObjectType.TEXT: '1'>, metadata={'window': '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', 'original_sentence': '복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', 'category': 'deposit'}, hash='35f2f0b6242fe666c5002acd9cc3722b0676152cd9973e45a062ac16ba45a687'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='78a736f2-5d2f-4e69-9d0c-dfbbd52fd4bc', node_type=<O

#### IngestionPipeline 활용 

In [23]:
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor 
from llama_index.embeddings import OpenAIEmbedding
from llama_index.schema import TransformComponent
from llama_index.ingestion import IngestionPipeline

In [25]:
# Make the OpenAI key available to both the environment and the openai client.
# Prompt only when the key is missing, so a kernel that already has
# OPENAI_API_KEY exported can run the whole notebook without stopping here.
# The key is read with getpass, so it is never echoed into the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
openai.api_key = os.environ["OPENAI_API_KEY"]

OpenAI API Key: ········


In [26]:
import re 

class TextCleaner(TransformComponent):
    """Pipeline transformation that strips unwanted characters from node text.

    Every character that is not an ASCII digit, an ASCII letter, a Hangul
    jamo/syllable or a space is removed from each node's ``text``.
    """

    def __call__(self, nodes, **kwargs):
        """Clean the text of each node in place and return the node list.

        Args:
            nodes: Sequence of nodes to clean. ``None`` or an empty sequence
                is returned unchanged.
            **kwargs: Accepted for compatibility with the transformation call
                signature; not used here.

        Returns:
            The same ``nodes`` object, so the next transformation in a
            pipeline receives the cleaned nodes.
        """
        # Nothing to clean: hand None / empty input straight back.
        if not nodes:
            return nodes

        for node in nodes:
            text = getattr(node, 'text', None)
            # Best effort: leave nodes without a string payload untouched
            # instead of failing the whole batch.
            if not isinstance(text, str):
                continue
            node.text = re.sub(r'[^0-9a-zA-Zㄱ-ㅣ가-힣 ]', '', text)

        # A transformation must return the nodes; without this the cleaned
        # nodes would be dropped and the caller would receive None.
        return nodes

In [27]:
# Chunking step used by the ingestion pipeline.
node_parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=30,
)

# Title extractor created with its default settings.
extractor = TitleExtractor()

In [28]:
# Ingestion pipeline: split the documents into chunks, then run TextCleaner
# over the resulting nodes. No embedding step is included; appending
# OpenAIEmbedding() to the list would compute the node embeddings as well.
pipeline = IngestionPipeline(transformations=[node_parser, TextCleaner()])

In [29]:
# Show the metadata carried by the library's example Document.
Document.example().metadata

{'filename': 'README.md', 'category': 'codebase'}

In [30]:
# Run the pipeline on the first four documents only, then inspect the second
# resulting node.
nodes = pipeline.run(documents=documents[:4])
nodes[1]

TextNode(id_='391a3217-cd94-497f-bd85-c18d97bf43ce', embedding=None, metadata={'category': 'deposit'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['category'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='bd3da67018372aeeba75cb39050e5c942755b5618b9dc56ee5d29c93d020f27a'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='09987cbc-4252-4144-a9dd-d8d69e61daa5', node_type=<ObjectType.TEXT: '1'>, metadata={'category': 'deposit'}, hash='d86f9b0332ed973917c372f0aaa256a0937ddde44827e5fc20f7160a10a593e7'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='12437f58-00f5-4c21-b021-28a7e3616c00', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='b109485229c08a64655ca9fa6794291daab34ea8c57eed6290c166f9c12b2ae2')}, hash='35f2f0b6242fe666c5002acd9cc3722b0676152cd9973e45a062ac16ba45a687', text='복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', start_char_idx=0, end_char_idx=27

In [31]:
# Rebuild the ingestion pipeline with the same two steps: chunking followed by
# text cleaning. An embedding model could be appended as a final step; none is
# configured here.
pipeline = IngestionPipeline(transformations=[node_parser, TextCleaner()])

In [32]:
# Run the rebuilt pipeline on the first four documents and inspect the second
# resulting node.
nodes = pipeline.run(documents=documents[:4])
nodes[1]

TextNode(id_='91986bcd-10d9-464c-9d4c-f7b9a4f6136f', embedding=None, metadata={'category': 'deposit'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['category'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='보증_1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'deposit'}, hash='bd3da67018372aeeba75cb39050e5c942755b5618b9dc56ee5d29c93d020f27a'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='082b613f-a498-4804-8035-0254e09fee0b', node_type=<ObjectType.TEXT: '1'>, metadata={'category': 'deposit'}, hash='d86f9b0332ed973917c372f0aaa256a0937ddde44827e5fc20f7160a10a593e7'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='daebaccb-1e2a-4b04-9736-ae260fa4924e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='b109485229c08a64655ca9fa6794291daab34ea8c57eed6290c166f9c12b2ae2')}, hash='35f2f0b6242fe666c5002acd9cc3722b0676152cd9973e45a062ac16ba45a687', text='복잡한 우대금리 조건 없이 쉽고 간편한 적금 상품', start_char_idx=0, end_char_idx=27