In [None]:
%%capture
!pip install llama-index==0.10.20 llama-index-embeddings-openai qdrant-client llama-index-vector-stores-qdrant

In [None]:
import os
import requests
from pathlib import Path
from getpass import getpass
from tqdm import tqdm 

import nest_asyncio
# Patch asyncio to permit nested event loops (nest_asyncio's documented purpose);
# presumably needed because the notebook kernel already runs an event loop.
nest_asyncio.apply()

In [None]:
# Reuse a key that is already present in the environment; only prompt when it is
# missing, so re-running the notebook does not ask for the key again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Ingestion Pipeline

- 🔄 **IngestionPipeline Overview**: An `IngestionPipeline` applies a sequence of `Transformations` to the input data. The resulting nodes are either returned or, if a vector store is configured, inserted into a vector database.

- 💾 **Caching Mechanism**: Each node + transformation pair is cached, so a later run with the same node and the same transformation reuses the cached result instead of recomputing it.


### How to use an `IngestionPipeline`:

In [None]:
# Base URL for Project Gutenberg plain-text files
base_url = "https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"

# Directory to save the downloaded files (created if it doesn't exist)
directory = Path("gutenberg_books")
directory.mkdir(parents=True, exist_ok=True)

# Book IDs to download: 1 through 19 (range() excludes the upper bound)
book_ids = range(1, 20)

# Generate the URL for each book ID
urls = [base_url.format(book_id=book_id) for book_id in book_ids]

# Download each book and save it as pg<book_id>.txt in the target directory.
# Iterating over (book_id, url) pairs avoids re-parsing the ID out of the URL.
for book_id, url in tqdm(list(zip(book_ids, urls)), desc="Downloading books"):
    filename = f"pg{book_id}.txt"
    file_path = directory / filename

    # Skip books that are already on disk so re-running the cell is cheap
    if file_path.exists():
        continue

    try:
        # A timeout keeps one stalled connection from hanging the whole cell
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report the failure and move on instead of aborting the remaining downloads
        tqdm.write(f"Failed to download {url}. Error: {exc}")
        continue

    if response.status_code == 200:
        # Write UTF-8 explicitly; the platform default encoding may not be able
        # to represent every character in the text
        file_path.write_text(response.text, encoding="utf-8")
    else:
        tqdm.write(f"Failed to download {url}. HTTP status code: {response.status_code}")  # Use tqdm.write for messages


In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the downloaded books as LlamaIndex documents
reader = SimpleDirectoryReader(input_dir="gutenberg_books")
documents = reader.load_data()

In [None]:
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache

# Transformation stages, listed in the order they are handed to the pipeline
splitter = SentenceSplitter(chunk_size=128, chunk_overlap=16)
title_extractor = TitleExtractor()
embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=512)

# No vector store is attached here; the nodes produced by the run are kept in `nodes`
pipeline = IngestionPipeline(
    transformations=[splitter, title_extractor, embed_model],
)
nodes = pipeline.run(documents=documents)

In [20]:
import qdrant_client
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Qdrant client backed by a local path, and the collection the nodes are written to
client = qdrant_client.QdrantClient(path="gutenberg_books/qdrant")
vector_store = QdrantVectorStore(collection_name="gutenberg_books", client=client)

# Split -> extract titles -> embed; this pipeline has a vector store attached
ingestion_stages = [
    SentenceSplitter(chunk_size=128, chunk_overlap=16),
    TitleExtractor(),
    OpenAIEmbedding(model="text-embedding-3-small", dimensions=512),
]
pipeline = IngestionPipeline(
    transformations=ingestion_stages,
    vector_store=vector_store,
)
pipeline.run(documents=documents)

# Build an index on top of the vector store
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)


# Caching

- 💾 **Caching in IngestionPipeline**: The pipeline hashes each node + transformation combination and stores the result, so later runs over identical data with identical transformation settings can skip the work.

- 📁 **Local Cache Management**: The cell below shows how to persist the pipeline cache to disk and load it into a new pipeline.

In [None]:
# Save the pipeline state (including its transformation cache) to disk
pipeline.persist("gutenberg_books/pipeline_storage")

# Build a fresh pipeline and restore the persisted state into it.
# Cache entries are keyed on node content + transformation settings, so the
# splitter must use the same chunk_size/chunk_overlap as the pipeline that
# populated the cache (128/16). With different settings every lookup misses
# and the expensive steps are recomputed.
new_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=128, chunk_overlap=16),
        TitleExtractor(),
    ],
)

new_pipeline.load("gutenberg_books/pipeline_storage")

# Runs quickly: both stages are served from the restored cache
nodes = new_pipeline.run(documents=documents)