In [None]:
pip install sentence-transformers faiss-cpu numpy transformers torch

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
# Toy corpus: each string is one document that will be embedded and indexed.
# A document's position in this list is the id used to look it up after retrieval.
documents: list[str] = [
    "Python is a high-level programming language.",
    "RAG combines information retrieval with text generation.",
    "FAISS is used for efficient similarity search.",
    "Transformers are deep learning models for NLP tasks.",
    "Scoring helps rank retrieved documents.",
]

In [None]:
# Create the document embeddings.
# The same encoder is reused later for queries so both live in one vector space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# normalize_embeddings=True is what lets the inner-product index below act as
# a cosine-similarity search.
doc_embeddings = embedder.encode(documents, convert_to_numpy=True, normalize_embeddings=True)

# Width of one embedding vector; the FAISS index must be created with this size.
embedding_dim = doc_embeddings.shape[-1]

In [None]:
# Build the FAISS (Facebook AI Similarity Search) index.
# IndexFlatIP ranks by inner product; the embeddings were normalized when they
# were encoded, so that inner product equals cosine similarity.
index = faiss.IndexFlatIP(embedding_dim)

# Vectors are added in `documents` order, so the ids returned by a search can
# be used directly as positions in `documents`.
index.add(doc_embeddings)

In [None]:
# Retrieval with scoring
def retrieve_documents(query, top_k=3):
    """Return the documents most similar to `query`, best match first.

    The query is embedded with the same encoder and normalization as the
    indexed documents, then searched against the module-level FAISS `index`.

    Args:
        query: Text to search for.
        top_k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 yield an empty
            list.

    Returns:
        A list of dicts, each with keys "document" (the document text) and
        "score" (the similarity score reported by the index, as a float), in
        the order returned by the index search.
    """
    # FAISS pads the result with label -1 when asked for more neighbours than
    # it stores; `documents[-1]` would then silently return the last document
    # with a meaningless score. Never ask for more than the index holds.
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    # Both results have one row per query; only one query is searched here.
    scores, indices = index.search(query_embedding, k)

    return [
        {"document": documents[idx], "score": float(score)}
        for idx, score in zip(indices[0], scores[0])
        if idx >= 0  # defensive: skip any "no result" padding entries
    ]


In [None]:
# Load the LLM used for generation.
model_name = "gpt2"

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and model are loaded from the same checkpoint name so they match.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Left as the cell's final expression, so the notebook echoes the model repr.
model.to(device)


In [None]:
def generate_answer(query, retrieved_docs):
    """Generate an answer to `query` conditioned on the retrieved documents.

    Builds a prompt from the retrieved documents (each prefixed with its
    score) and the question, samples a continuation from the module-level
    `model`, and returns only the newly generated text.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of dicts with keys "document" (str) and
            "score" (float), as produced by `retrieve_documents`.

    Returns:
        The generated continuation as a string, without the prompt and with
        surrounding whitespace stripped.
    """
    context = "\n".join(
        f"[Score: {doc['score']:.2f}] {doc['document']}" for doc in retrieved_docs
    )

    prompt = f"""
Use the following context to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # The generated sequence starts with the prompt tokens; remember how many
    # there are so they can be dropped before decoding.
    prompt_length = inputs["input_ids"].shape[1]

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        # GPT-2 defines no pad token; passing EOS explicitly avoids the
        # per-call fallback warning from generate().
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the continuation so the caller gets the answer, not an echo
    # of the instructions, context and question.
    answer_tokens = output[0][prompt_length:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

In [None]:
def rag_pipeline(query, top_k=3):
    """Run retrieval and generation for a single query.

    Retrieves up to `top_k` documents for `query`, prints each one with its
    score, and then asks the language model for an answer based on them.

    Args:
        query: The user's question.
        top_k: Number of documents to request from the retriever.

    Returns:
        Whatever `generate_answer` returns for the query and retrieved documents.
    """
    hits = retrieve_documents(query, top_k=top_k)

    # Assemble the report first, then emit it with a single print call.
    report = ["\nRetrieved Documents with Scores:"]
    report.extend(f"- {hit['document']} (Score: {hit['score']:.3f})" for hit in hits)
    print("\n".join(report))

    return generate_answer(query, hits)

In [None]:
SEED = 42  # arbitrary value; fixed so the sampled answer is repeatable

query = "What is RAG and why is scoring important?"

# generate_answer samples tokens (do_sample=True), so without a fixed seed this
# cell prints a different answer on every run. Seeding immediately before the
# call keeps the cell repeatable even when it is re-executed on its own.
torch.manual_seed(SEED)
response = rag_pipeline(query)

print("\nGenerated Answer:")
print(response)