In [1]:
# import necessary libraries
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
import faiss
import numpy as np
import sentencepiece
from google.protobuf import text_format

In [2]:
# Sentence-embedding model; the same encoder is used for the documents and for queries.
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [3]:
# Toy knowledge base: one short statement per entry. List position doubles as
# the document id, so the order here must match the order vectors are indexed in.
# NOTE(review): the Great Wall sentence is widely regarded as a myth; it is kept
# verbatim as retrieval sample data only -- confirm before treating this corpus as factual.
documents = [
    "The capital of France is Paris.",  # geography
    "Python is a programming language widely used for machine learning.",  # computing
    "The Great Wall of China is visible from space.",  # see review note above
    "Water boils at 100 degrees Celsius under standard atmospheric pressure.",  # physics
]

In [4]:
# Step 1: encode every document into a vector; the trailing axis of the
# resulting 2-D array is the embedding width.
document_embeddings = embedder.encode(documents)
dimension = document_embeddings.shape[-1]

In [5]:
# Build a flat L2 FAISS index sized to the embedding width and load the
# document vectors into it (row i of the index corresponds to documents[i]).
index = faiss.IndexFlatL2(dimension)
index.add(np.asarray(document_embeddings))

In [6]:
# Step 2: load the T5 tokenizer and generation model from the same checkpoint.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
generator = T5ForConditionalGeneration.from_pretrained(model_name)

In [7]:
def retrieve(query, top_k=2, verbose=True):
    """Return the indexed documents closest to ``query``.

    The query is embedded with the notebook-level ``embedder`` and searched
    against the notebook-level FAISS ``index``; each returned label is mapped
    back to the text stored at the same position in ``documents``.

    Args:
        query: Text to look up.
        top_k: Number of neighbours to request from the index.
        verbose: When True (the default, which keeps the original output),
            print the raw search result and every resolved hit.

    Returns:
        A list of ``(document, distance)`` tuples in the order the index
        returned them. The list is empty when no valid hit is found.
    """
    query_embedding = embedder.encode([query])
    # FAISS expects float32 input; asarray avoids a copy when it already is.
    distances, indices = index.search(
        np.asarray(query_embedding, dtype=np.float32), top_k
    )

    if verbose:
        print(f"Distances: {distances}")
        print(f"Indices: {indices}")

    # FAISS pads the result with label -1 when fewer than top_k neighbours
    # exist. A bare `idx < len(documents)` check lets -1 through, and Python
    # would then silently return documents[-1]; require a non-negative label.
    valid_hits = [
        (position, idx)
        for position, idx in enumerate(indices[0])
        if 0 <= idx < len(documents)
    ]

    if verbose:
        for position, idx in valid_hits:
            print(f"Index: {idx}, Distance: {distances[0][position]}, Document: {documents[idx]}")

    return [(documents[idx], distances[0][position]) for position, idx in valid_hits]

In [8]:
def generate_response(query, top_k=2, max_length=50):
    """Answer ``query`` with T5, using retrieved documents as context.

    The documents returned by ``retrieve`` are joined with spaces and placed
    after the question in a ``question: ... context: ...`` prompt, which is
    tokenized (truncated to 512 tokens) and passed to the generator.

    Args:
        query: The user question.
        top_k: Number of documents to retrieve as context (default 2, the
            value ``retrieve`` was previously always called with).
        max_length: ``max_length`` passed to ``generator.generate``
            (default 50, the previously hard-coded value).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    retrieved_docs = retrieve(query, top_k)
    # Each retrieved item is a (document, distance) pair; only the text is used.
    context = " ".join(doc for doc, _distance in retrieved_docs)
    input_text = f"question: {query} context: {context}"
    # The question comes first, so truncation drops trailing context rather than the query.
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True
    )
    outputs = generator.generate(
        input_ids, max_length=max_length, num_beams=2, early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [9]:
# Smoke-test the pipeline end to end with a question the corpus can answer.
query = "What is the capital of France?"
response = generate_response(query)
print(f"Query: {query}", f"Response: {response}", sep="\n")

Distances: [[0.24197832 1.7641159 ]]
Indices: [[0 1]]
Index: 0, Distance: 0.24197831749916077, Document: The capital of France is Paris.
Index: 1, Distance: 1.7641159296035767, Document: Python is a programming language widely used for machine learning.
Query: What is the capital of France?
Response: Paris


In [10]:
# Second smoke test: a question whose best match is the Python document.
query = "Is python important?"
response = generate_response(query)
print(f"Query: {query}\nResponse: {response}")

Distances: [[0.5197036 1.9164784]]
Indices: [[1 0]]
Index: 1, Distance: 0.5197036266326904, Document: Python is a programming language widely used for machine learning.
Index: 0, Distance: 1.9164783954620361, Document: The capital of France is Paris.
Query: Is python important?
Response: widely used for machine learning


In [11]:
# Inspect the raw embedding matrix (one row per document) via the cell's rich output.
document_embeddings

array([[ 0.10325703,  0.03042013,  0.0290958 , ...,  0.05853158,
         0.08585993, -0.0056698 ],
       [-0.0504921 ,  0.00360606, -0.02291742, ...,  0.11549734,
         0.14984636,  0.03146859],
       [ 0.04052198,  0.07120404,  0.0485715 , ...,  0.01960546,
        -0.09164685,  0.07383972],
       [-0.04159974,  0.0122102 , -0.04180289, ...,  0.04172173,
        -0.03254224,  0.03809972]], shape=(4, 384), dtype=float32)

In [12]:
# Sanity check: the index must hold exactly one vector per document, because
# search labels are used directly as positions in `documents`.
print(f"Number of documents: {len(documents)}")
print(f"FAISS index size: {index.ntotal}")
assert index.ntotal == len(documents), "FAISS index and document list are out of sync"

Number of docuemnts: 4
FAISS index size: 4
