In [None]:
from naive_rag.helpers.utils import retrieve_documents
# %%capture
!pip install llama-index==0.10.37 openai==1.30.1 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19

In [None]:
import os
import sys
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables; no path is passed, so python-dotenv's default
# .env lookup is used.
load_dotenv()

# Make the `helpers` directory that sits next to the current working directory
# importable.
current_dir = Path.cwd()       # or Path().resolve()
helpers_path = current_dir.parent / 'helpers'
helpers_dir = str(helpers_path)
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)


In [None]:
import os
import nest_asyncio
from getpass import getpass
from dotenv import load_dotenv

# Location of the .env file. The original developer path stays as the default
# so existing setups keep working; set DOTENV_PATH to point somewhere else.
dotenv_path = os.environ.get(
    "DOTENV_PATH",
    r"C:\Users\anteb\PycharmProjects\JupyterProject\.env",
)
load_dotenv(dotenv_path)
nest_asyncio.apply()

# Debug: report WHETHER each variable is set, never its value — printing the
# raw keys would store them in the notebook output and leak them when the
# notebook is shared or committed.
for _name in ("CO_API_KEY", "OPENAI_API_KEY", "QDRANT_URL", "QDRANT_API_KEY"):
    print(f"{_name}:", "set" if os.environ.get(_name) else "MISSING")

# Use the environment value when present (and non-empty); otherwise prompt
# interactively without echoing the input.
CO_API_KEY = os.environ.get('CO_API_KEY') or getpass("Enter CO_API_KEY: ")
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter OPENAI_API_KEY: ")
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter QDRANT_URL: ")
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter QDRANT_API_KEY: ")


In [None]:
from llama_index.core.settings import Settings
from llama_index.llms.cohere import Cohere
from llama_index.embeddings.openai import OpenAIEmbedding

# Global LLM used by llama-index components that do not receive one explicitly.
Settings.llm = Cohere(model="command-r-plus", api_key=CO_API_KEY)

# Global embedding model. The key is passed explicitly: when OPENAI_API_KEY was
# typed in through getpass it exists only as a Python variable and is not
# exported to os.environ, so relying on the environment would fail in that case.
Settings.embed_model = OpenAIEmbedding(
    model_name="text-embedding-3-small",
    api_key=OPENAI_API_KEY,
)

In [15]:
from llama_index.core import SimpleDirectoryReader

# PDF to ingest. The original absolute path stays as the default so the
# notebook behaves as before on the author's machine; set RAG_PDF_PATH to run
# it against a file located elsewhere.
pdf_path = os.environ.get(
    "RAG_PDF_PATH",
    r"C:\Users\anteb\PycharmProjects\JupyterProject\naive_rag\data\raw\2502.20364.pdf",
)

# filename_as_id=True is forwarded to the reader unchanged.
documents = SimpleDirectoryReader(
    input_files=[pdf_path],
    filename_as_id=True,
).load_data()

In [17]:
from naive_rag.helpers.IngestionCacheManager import SmartIngestionCache
from qdrant_client import QdrantClient
from datetime import datetime
from llama_index.core.ingestion import IngestionCache
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.qdrant import QdrantVectorStore




# Qdrant connection built from the URL / API key collected earlier.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Vector store bound to the "rag_articles" collection on that client.
vector_store = QdrantVectorStore(client=client, collection_name="rag_articles")

# Cache object supplied by the project's SmartIngestionCache helper.
ingest_cache = SmartIngestionCache().get_cache()

# Transformations are applied in list order: split first, then embed with the
# globally configured embedding model.
splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=16)
pipeline = IngestionPipeline(
    transformations=[splitter, Settings.embed_model],
    docstore=SimpleDocumentStore(),
    vector_store=vector_store,
    cache=ingest_cache,
)

# Run the pipeline over the loaded documents and keep the resulting nodes.
nodes = pipeline.run(documents=documents)

In [18]:
# List the attribute names stored on the first node produced by the pipeline.
vars(nodes[0]).keys()

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'mimetype', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator'])

In [19]:
# Wrap the already-populated vector store in an index object for querying.
index = VectorStoreIndex.from_vector_store(vector_store)

In [20]:
# Number of nearest neighbours requested from the vector store per query.
SIMILARITY_TOP_K = 7

# `return_sources=True` was dropped: it is not a documented retriever option
# (NOTE(review): it appears to be swallowed by **kwargs without effect —
# confirm against the VectorIndexRetriever signature).
retriever = index.as_retriever(similarity_top_k=SIMILARITY_TOP_K)

In [21]:
# Query text kept in its own variable so it is easy to tweak and re-run.
query = "How RAG can improve the user experience?"
retrieved_nodes = retriever.retrieve(query)

In [22]:
# Number of nodes the retriever returned for the query.
# NOTE(review): the recorded output is 5 although the retriever cell requests
# similarity_top_k=7 — possibly stale kernel state; confirm with
# Restart Kernel -> Run All.
len(retrieved_nodes)

5

In [23]:
# Print the text of the first retrieved node (print keeps the line breaks readable).
print(retrieved_nodes[0].get_text())

TELERAG: Efficient Retrieval-Augmented Generation Inference
with Lookahead Retrieval
Chien-Yu Lin1,∗Keisuke Kamahori1,∗Yiyu Liu2,†Xiaoxiang Shi2,†Madhav Kashyap1Yile Gu1
Rulin Shao1Zihao Ye1Kan Zhu1Stephanie Wang1Arvind Krishnamurthy1Rohan Kadekodi1
Luis Ceze1Baris Kasikci1
1University of Washington2Shanghai Jiao Tong University
Abstract
Retrieval-augmented generation (RAG) extends large lan-
guage models (LLMs) with external data sources to en-
hance factual correctness and domain coverage. Modern RAG
pipelines rely on large datastores, leading to system chal-
lenges in latency-sensitive deployments, especially when lim-
ited GPU memory is available. To address these challenges,
we propose TELERAG , an efficient inference system that re-
duces RAG latency with minimal GPU memory requirements.
The core innovation of TELERAG islookahead retrieval ,
a prefetching mechanism that anticipates required data and
transfers it from CPU to GPU in parallel with LLM gener-
ation. By leveraging the

In [24]:
# Print the score attached to the fourth retrieved node (index 3).
print(retrieved_nodes[3].get_score())

0.5403015


In [25]:
# Persist the pipeline under ./pipeline_storage; the path is relative, so it is
# resolved against the kernel's current working directory.
# NOTE(review): presumably this saves the ingestion cache and docstore for reuse
# in later runs — confirm against the IngestionPipeline.persist documentation.
pipeline.persist('./pipeline_storage')