In [None]:
%%capture
# the first line of imports you should already have installed if you are running this notebook locally 
# !pip install llama-index==0.10.20 llama-index-embeddings-openai qdrant-client llama-index-vector-stores-qdrant
!pip install llama-index-storage-kvstore-redis

In [1]:
import os
import requests
import nest_asyncio

from tqdm import tqdm 
from pathlib import Path
from getpass import getpass

# Patch asyncio via nest_asyncio before any LlamaIndex call is made.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and later calls need to re-enter it — confirm which call depends on this.
nest_asyncio.apply()

In [2]:
# Read the key interactively (input is not echoed) and publish it through the
# process environment under OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = getpass(prompt="Enter your OpenAI API key: ")

In [3]:
# Endpoint of the Qdrant instance, entered interactively without echo.
QDRANT_URL = getpass(prompt="Enter your Qdrant URL:")

In [4]:
# API key for the Qdrant instance, entered interactively without echo.
QDRANT_API_KEY = getpass(prompt="Enter your Qdrant API Key:")

In [5]:
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

# Assign the chat model and the embedding model on the shared `Settings` object.
# NOTE(review): presumably these serve as the defaults for the index and query
# engine created later in the notebook — confirm against the LlamaIndex docs.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Ingestion Pipeline

- 🔄 **IngestionPipeline Overview**: An `IngestionPipeline` applies a sequence of `Transformations` to the input data, turning it into nodes. The resulting nodes are either returned or, when a vector store is attached, inserted into that vector database.

- 💾 **Caching Mechanism**: Each node + transformation pair is cached, so running the same transformation on the same nodes again reuses the cached result instead of recomputing it.


### Using an `IngestionPipeline`

First, let's read in some data. 

In [6]:
from llama_index.core import SimpleDirectoryReader

# Load the single source text file; `filename_as_id=True` is passed so the
# reader derives document ids from the file name.
documents = SimpleDirectoryReader(
    input_files=["../data/it_can_be_done.txt"],
    filename_as_id=True,
).load_data()

# Ingestion Pipeline with Document Management

 •  💾 **Caching in IngestionPipeline**: Hashes and stores each node + transformation combination to speed up future processes with identical data.

 •  📁 **Local Cache Management**: The input nodes list and transformation pair are cached in the pipeline. When we apply the same transformation to that list of nodes again, the output nodes are retrieved from the cache.

 •  📚 **Docstore Attachment**:  Enables document management in the ingestion pipeline, using `doc_id` or `node.ref_doc_id` for identification. Prevents running a transformation on the same document multiple times by using the document ID and the hash of the document content to manage duplicates.

 •  🗂️ **Duplicate Handling**:
  - Maintains a `doc_id` to `document_hash` map to identify duplicates.

  - Re-processes documents if the same `doc_id` is found with a changed hash.

  - Skips documents if the same `doc_id` is found but the hash remains unchanged.

 •  🚫 **Without Vector Store**:
  - Limited to checking and removing duplicate inputs.

 •  ✨ **With Vector Store**:
  - Enables handling of upserts for updated documents, offering advanced management capabilities.

In [None]:
from qdrant_client import QdrantClient
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache

# Client for the Qdrant instance identified by the URL / API key entered earlier.
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

# Vector store backed by the "it_can_be_done" collection on that Qdrant instance.
vector_store = QdrantVectorStore(client=client, collection_name="it_can_be_done")

# Redis-backed ingestion cache; expects a Redis server reachable at 127.0.0.1:6379.
ingest_cache = IngestionCache(
    cache=RedisCache.from_host_and_port(host="127.0.0.1", port=6379),
    collection="it_can_be_done",
)

# Create the pipeline: split documents into token chunks, then embed each chunk.
pipeline = IngestionPipeline(
    transformations=[
        TokenTextSplitter(chunk_size=256, chunk_overlap=16),
        OpenAIEmbedding(model="text-embedding-3-small"),
    ],
    # Fix: the Redis cache was constructed above but never handed to the
    # pipeline, so it was silently unused. Pass it explicitly.
    cache=ingest_cache,
    # A fresh SimpleDocumentStore is created each time this cell runs.
    # NOTE(review): presumably in-memory only, so duplicate detection would not
    # survive a kernel restart — confirm if persistence is wanted.
    docstore=SimpleDocumentStore(),
    vector_store=vector_store,
)

# Run the pipeline over the loaded documents; the resulting nodes are kept in `nodes`.
nodes = pipeline.run(documents=documents)

In [None]:

# Build the index on top of the existing `vector_store` (the same store the
# ingestion pipeline was configured with) rather than from raw documents.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
# Create a query engine from the index using the default arguments.
query_engine = index.as_query_engine()

In [None]:
# Ask the query engine a question about the ingested text and show the answer.
# NOTE(review): the question is kept verbatim. "says" should read "say", and the
# poet's name may be spelled "Berton" in the source text — confirm before editing.
response = query_engine.query("What does Burton Braley says about starting?")
# print() already converts its argument with str(), so no explicit cast is needed.
print(response)