In [None]:
%%capture
!pip install llama-index llama-index-embeddings-openai qdrant-client llama-index-vector-stores-qdrant

In [None]:
import os
import requests
from pathlib import Path
from getpass import getpass
from tqdm import tqdm 

# nest_asyncio.apply() patches asyncio to permit nested event loops.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the ingestion pipeline uses async code internally — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Prompt for the key only when it is not already set, so re-running this cell
# (or starting the kernel with OPENAI_API_KEY exported) does not ask again or
# overwrite a working key. getpass keeps the value out of the cell output.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API key: ")

# Ingestion Pipeline with Document Management

- 📚 **Docstore Attachment**: Enables document management in the ingestion pipeline, using `doc_id` or `node.ref_doc_id` for identification.

- 🗂️ **Duplicate Handling**:
  - Maintains a `doc_id` to `document_hash` map to identify duplicates.
  - Re-processes documents if the same `doc_id` is found with a changed hash.
  - Skips documents if the same `doc_id` is found but the hash remains unchanged.

- 🚫 **Without Vector Store**:
  - Limited to checking and removing duplicate inputs.

- ✨ **With Vector Store**:
  - Enables handling of upserts for updated documents, offering advanced management capabilities.

In [None]:
from llama_index.core import SimpleDirectoryReader, Document
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache

# Read the files found in the books directory into Document objects.
documents = SimpleDirectoryReader("gutenberg_books").load_data()

# Build each transformation up front; they are handed to the pipeline in this
# order: sentence splitting, title extraction, then embedding.
splitter = SentenceSplitter(chunk_size=128, chunk_overlap=16)
title_extractor = TitleExtractor()
embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=512)

# Attaching a docstore is what enables document management (duplicate
# detection) in the pipeline.
pipeline = IngestionPipeline(
    transformations=[splitter, title_extractor, embed_model],
    docstore=SimpleDocumentStore(),
)

# Load pipeline state from the storage directory.
# NOTE(review): presumably written by an earlier run or notebook — confirm.
pipeline.load("gutenberg_books/pipeline_storage")

nodes = pipeline.run(documents=documents)

# Test Document Management

In [None]:
# Create two small text files so the next pipeline run has new documents to
# detect. Written with pathlib instead of shell `echo` redirects so the cell
# behaves the same regardless of which shell the kernel uses.
books_dir = Path("gutenberg_books")
books_dir.mkdir(parents=True, exist_ok=True)  # no-op when the directory already exists

test_files = {
    "test1.txt": "I am adding this as a text file to the directory.",
    "test2.txt": "That way I can see if my document management is working",
}
for file_name, contents in test_files.items():
    # Trailing newline matches what `echo` would have written.
    (books_dir / file_name).write_text(contents + "\n", encoding="utf-8")

In [None]:
# Re-scan the directory so any files added since the first load are included.
reader = SimpleDirectoryReader("gutenberg_books")
documents = reader.load_data()

# Run the same pipeline again; with a docstore attached, unchanged documents
# are expected to be skipped, so `nodes` should hold only the new content.
nodes = pipeline.run(documents=documents)

In [None]:
# Report how many nodes the most recent pipeline run returned.
ingested_count = len(nodes)
print(f"Ingested {ingested_count} Nodes")

Double-check which nodes were ingested.

In [None]:
# Show the text of every node returned by the most recent run, one per line.
for node in nodes:
    node_text = node.text
    print(f"Node: {node_text}")

In [None]:
# Number of entries currently held in the pipeline's docstore.
docstore_size = len(pipeline.docstore.docs)
print(docstore_size)
