<a href="https://colab.research.google.com/github/Avinash-Chitransh/Assignments/blob/main/RAG_pipeline_with_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install sentence-transformers faiss-cpu numpy transformers torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [2]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch



In [3]:
#load document
documents = [
    "Python is a high-level programming language.",
    "RAG combines information retrieval with text generation.",
    "FAISS is used for efficient similarity search.",
    "Transformers are deep learning models for NLP tasks.",
    "Scoring helps rank retrieved documents."
]

In [4]:
#create embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")

doc_embeddings = embedder.encode(
    documents,
    convert_to_numpy=True,
    normalize_embeddings=True
)

embedding_dim = doc_embeddings.shape[1]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
#building faiss (Facebook AI Similarity Search) index
index = faiss.IndexFlatIP(embedding_dim)  # Inner Product = Cosine (normalized)
index.add(doc_embeddings)

In [7]:
#retrival with scoring
def retrieve_documents(query, top_k=3):
    query_embedding = embedder.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    scores, indices = index.search(query_embedding, top_k)

    retrieved = []
    for idx, score in zip(indices[0], scores[0]):
        retrieved.append({
            "document": documents[idx],
            "score": float(score)
        })

    return retrieved


In [8]:
#loading LLm for generation
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [9]:
def generate_answer(query, retrieved_docs):
    context = "\n".join(
        [f"[Score: {doc['score']:.2f}] {doc['document']}" for doc in retrieved_docs]
    )

    prompt = f"""
Use the following context to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [10]:
def rag_pipeline(query, top_k=3):
    retrieved_docs = retrieve_documents(query, top_k=top_k)

    print("\nRetrieved Documents with Scores:")
    for doc in retrieved_docs:
        print(f"- {doc['document']} (Score: {doc['score']:.3f})")

    answer = generate_answer(query, retrieved_docs)
    return answer

In [11]:
query = "What is RAG and why is scoring important?"
response = rag_pipeline(query)

print("\nGenerated Answer:")
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Retrieved Documents with Scores:
- RAG combines information retrieval with text generation. (Score: 0.442)
- Scoring helps rank retrieved documents. (Score: 0.432)
- FAISS is used for efficient similarity search. (Score: 0.152)

Generated Answer:

Use the following context to answer the question.

Context:
[Score: 0.44] RAG combines information retrieval with text generation.
[Score: 0.43] Scoring helps rank retrieved documents.
[Score: 0.15] FAISS is used for efficient similarity search.

Question:
What is RAG and why is scoring important?

Answer:

RAG is the type of semantic search. You can use it to rank documents in a semantic context. You can use the following context to answer the question:

Context:

[Score: 0.16] RAG combines information retrieval with text generation.

[Score: 0.16] Scoring helps rank retrieved documents.

[Score: 0.16] FAISS is used for efficient similarity search.

Question:

What is RAG and why is scoring important?

Answer:

RAG and RAG combine semantic 