In [5]:
pip install transformers sentence-transformers faiss-cpu datasets

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.9.0.post1-cp312-cp312-win_amd64.whl (13.8 MB)
   ---------------------------------------- 0.0/13.8 MB ? eta -:--:--
   -- ------------------------------------- 0.8/13.8 MB 4.8 MB/s eta 0:00:03
   ---- ----------------------------------- 1.6/13.8 MB 4.7 MB/s eta 0:00:03
   ------- -------------------------------- 2.6/13.8 MB 4.6 MB/s eta 0:00:03
   ---------- ----------------------------- 3.7/13.8 MB 4.6 MB/s eta 0:00:03
   ------------ --------------------------- 4.5/13.8 MB 4.6 MB/s eta 0:00:03
   --------------- ------------------------ 5.5/13.8 MB 4.7 MB/s eta 0:00:02
   ------------------ --------------------- 6.6/13.8 MB 4.7 MB/s eta 0:00:02
   --------------------- ------------------ 7.3/13.8 MB 4.7 MB/s eta 0:00:02
   ------------------------ --------------- 8.4/13.8 MB 4.7 MB/s eta 0:00:02
   -------------------------- ------------- 9.2/13.8 MB 4.7 MB/s e

In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare corpus
corpus = [
    "Content of document 1.",
    "Content of document 2.",
]
corpus_embeddings = embedding_model.encode(corpus, convert_to_tensor=False)

# Create FAISS index
d = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(d)  # L2 distance metric
index.add(np.array(corpus_embeddings))
print(f"Added {index.ntotal} documents to the index.")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Added 2 documents to the index.


In [7]:
query = "What is the content of document 1?"
query_embedding = embedding_model.encode(query, convert_to_tensor=False)

# Retrieve top 3 results
D, I = index.search(np.array([query_embedding]), k=3)  # Distances and indices
print(f"Top 3 documents: {I[0]}, Scores: {D[0]}")

Top 3 documents: [ 0  1 -1], Scores: [1.0664676e-01 3.2043615e-01 3.4028235e+38]


In [8]:
from transformers import pipeline

# Load a generative model (e.g., T5 or GPT)
generator = pipeline("text2text-generation", model="t5-small")

# Retrieve a document from the corpus and pass it to the generator
retrieved_docs = [corpus[i] for i in I[0]]
context = " ".join(retrieved_docs)
prompt = f"Based on the following documents: {context}. Answer the question: {query}"

# Generate a response
response = generator(prompt, max_length=100)
print(f"Generated Response: {response[0]['generated_text']}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


Generated Response: Answer the question: What is the content of document 1?


In [9]:
def rag_pipeline(query, corpus, index, embedding_model, generator):
    # Retrieve relevant documents
    query_embedding = embedding_model.encode(query, convert_to_tensor=False)
    D, I = index.search(np.array([query_embedding]), k=3)
    retrieved_docs = [corpus[i] for i in I[0]]
    
    # Generate response based on retrieved documents
    context = " ".join(retrieved_docs)
    prompt = f"Based on the following documents: {context}. Answer the question: {query}"
    response = generator(prompt, max_length=100)
    return response[0]['generated_text']

# Example usage
response = rag_pipeline(query, corpus, index, embedding_model, generator)
print(f"RAG Response: {response}")

RAG Response: Answer the question: What is the content of document 1?


In [10]:
pip install fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 (from fastapi)
  Downloading pydantic-2.10.5-py3-none-any.whl.metadata (30 kB)
Collecting click>=7.0 (from uvicorn)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting h11>=0.8 (from uvicorn)
  Using cached h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting annotated-types>=0.6.0 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.2 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Downloading pydantic_core-2.27.2-cp312-cp312-win_amd64.whl.metadata (6.7 kB)

In [11]:
from fastapi import FastAPI

app = FastAPI()

@app.post("/rag")
def rag_api(query: str):
    response = rag_pipeline(query, corpus, index, embedding_model, generator)
    return {"response": response}

# Run the app
# uvicorn filename:app --reload