# 인덱스 만들기
- Case 1: Document 객체로 바로 인덱스 만들기
- Case 2: Node 객체 단위로 인덱스 만들기

In [1]:
# Notebook setup: imports, event-loop patching, OpenAI credentials, and the
# global default embedding model.
import os
from getpass import getpass

import nest_asyncio
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Patch asyncio so that nested event loops are allowed (a notebook kernel
# already runs one).
nest_asyncio.apply()

# Never hard-code the key, and never clobber one that is already exported in
# the environment: only prompt when it is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# Optional: uncomment to also pin the global default LLM.
#Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small"
)




In [2]:
# Case 1: build an index straight from Document objects.
# Step 2-1: obtain the documents. They are read from a local directory here;
# fetching "MiniCovidQaDataset" through download_llama_dataset is the
# alternative, left disabled below.
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.llama_dataset import download_llama_dataset

#rag_dataset, documents = download_llama_dataset("MiniCovidQaDataset", "./data")

documents = SimpleDirectoryReader(
    input_dir="./data_manual/source_files",
).load_data()


In [None]:
# Display the first two entries of `documents`.
documents[:2]

In [None]:
# Build a vector index directly from the first two entries of `documents`.
default_index = VectorStoreIndex.from_documents(documents[:2])

- IngestionPipeline으로 노드오브젝트 커스터마이즈
- 커스텀 노드를 벡터인덱스화

In [3]:
from llama_index.core import Document
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Assemble the ingestion pipeline from three transformations, in this order:
# a sentence splitter (chunk_size=2000), a title extractor backed by the
# "gpt-4o-mini" LLM, and the "text-embedding-3-small" embedding model.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=2000),
        TitleExtractor(llm=OpenAI(model="gpt-4o-mini")),
        OpenAIEmbedding(model="text-embedding-3-small"),
    ],
)


In [4]:
# Run the ingestion pipeline on the first two documents and keep the
# result in `nodes`.
nodes = pipeline.run(documents=documents[:2])

100%|██████████| 5/5 [00:03<00:00,  1.54it/s]
100%|██████████| 5/5 [00:02<00:00,  2.24it/s]


In [None]:
# Inspect the metadata of one node. Index 100 only exists when `nodes` has
# more than 100 entries, so clamp to the last node instead of raising
# IndexError on a shorter list; an empty list shows an empty dict.
nodes[min(100, len(nodes) - 1)].metadata if nodes else {}

In [None]:
# Case 2: build a vector index from the `nodes` list instead of from documents.
node_index = VectorStoreIndex(nodes)

In [None]:
# Display the index object.
node_index

# VectorStoreIndex의 백엔드 엔진으로 써드파티 VectorDB 사용하기 (Qdrant)

In [None]:
import os

import qdrant_client
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import models

# Create the Qdrant client. The endpoint and API key are read from the
# QDRANT_URL / QDRANT_API_KEY environment variables so that credentials are
# not written into the notebook; when a variable is unset the value falls
# back to the empty string.
client = qdrant_client.QdrantClient(
    url=os.environ.get("QDRANT_URL", ""),
    api_key=os.environ.get("QDRANT_API_KEY", ""),
)


In [None]:
# Display the current value of `nodes`.
nodes

In [None]:
# Display the current value of `documents`.
documents

In [None]:
from llama_index.core import StorageContext

# Use the "corona_sq" Qdrant collection as the vector store, wrap it in a
# storage context, and index the first two documents through it.
vector_store = QdrantVectorStore(
    collection_name="corona_sq",
    client=client,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents[:2],
    storage_context=storage_context,
)

# 생성된 Index를 활용해서 Retriever 생성하기

In [None]:
# Turn the index into a retriever and fetch results for a sample query.
retriever = index.as_retriever()
# NOTE(review): this rebinds `nodes`, which earlier held the pipeline output.
# Re-running an earlier cell that reads `nodes` after this one will see the
# retrieval results instead — consider a distinct name.
nodes = retriever.retrieve("What is corona?")

In [None]:
# Inspect the resulting NodeWithScore objects.
nodes

In [None]:
# Wrap the index directly in a query engine and run a sample query.
query_engine = index.as_query_engine()
response = query_engine.query("what is corona?")

In [None]:
# Display the query engine's response.
response