In [1]:
pip install datasets sentence-transformers faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.9.0.post1 sentence-transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [3]:
# Load a dataset
from datasets import load_dataset
dataset = load_dataset("squad", split="train[:1000]")
# dataset = load_dataset("squad")

# Preprocess the context passages.
# SQuAD has one row per *question*, and many questions share the same context
# paragraph, so the raw column contains repeated passages. Duplicates in the
# index make top-k retrieval return the same text several times, so keep only
# the first occurrence of each passage (dict.fromkeys preserves order).
passages = list(dict.fromkeys(item['context'] for item in dataset))

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
# Create embeddings using SentenceTransformers
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Request a NumPy array rather than a torch tensor: the embeddings are handed
# to FAISS via NumPy below, and a tensor that lives on a GPU cannot be turned
# into a NumPy array without first being moved to the CPU.
embeddings = embedder.encode(passages, convert_to_numpy=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [5]:
# Create FAISS index
# FAISS expects a C-contiguous float32 NumPy matrix of shape (n_vectors, dimension).
# `embeddings` may be a torch tensor (possibly on a GPU), so detach it and move
# it to the CPU before converting; NumPy input passes straight through.
if hasattr(embeddings, "detach"):
    embedding_matrix = embeddings.detach().cpu().numpy()
else:
    embedding_matrix = np.asarray(embeddings)
embedding_matrix = np.ascontiguousarray(embedding_matrix, dtype=np.float32)

dimension = embedding_matrix.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
faiss_index.add(embedding_matrix)

In [6]:
# Persist the retrieval artefacts in the current working directory:
#   - passages.npy     : the passage texts as a NumPy string array
#   - faiss_index.bin  : the serialized FAISS index
np.save("passages.npy", np.array(passages))
faiss.write_index(faiss_index, "faiss_index.bin")

# Both files are read back further down in this notebook with
# faiss.read_index(...) and np.load(...), using the same file names.

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import faiss
import numpy as np

In [8]:
# Load the FLAN-T5 tokenizer and seq2seq model from the same checkpoint
# (tokenizer first, then model).
tokenizer, model = (
    loader.from_pretrained("google/flan-t5-base")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# Load FAISS index and passages
faiss_index = faiss.read_index("faiss_index.bin")
# passages.npy was written from np.array(<list of str>), i.e. a plain string
# array rather than an object array, so it loads without pickle. Keeping
# pickle disabled avoids executing arbitrary code if the file is ever replaced.
passages = np.load("passages.npy", allow_pickle=False)

In [10]:
def retrieve(query, top_k=5):
    """Retrieve top-k contexts for a query.

    Args:
        query: The question text to embed and search for.
        top_k: Maximum number of passages to return.

    Returns:
        A list of passages taken from the module-level ``passages`` in the
        order FAISS returned them. It can be shorter than ``top_k`` when the
        index holds fewer vectors than requested.
    """
    # FAISS searches float32 matrices; make the dtype explicit.
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, indices = faiss_index.search(query_embedding, top_k)
    # FAISS pads missing results with -1. Without this filter, passages[-1]
    # would silently return the *last* passage as if it were a real hit.
    return [passages[i] for i in indices[0] if i >= 0]

In [11]:
def generate_answer(query, contexts):
    """Generate an answer to ``query`` with FLAN-T5, conditioned on ``contexts``.

    Args:
        query: The question text.
        contexts: Iterable of passage strings. They are joined with single
            spaces and placed after the question in the prompt.

    Returns:
        The first generated sequence, decoded to a string with special tokens
        removed.

    Note:
        Decoding uses sampling (``do_sample=True``), so repeated calls with
        the same arguments may return different text.
    """
    context_text = " ".join(contexts)
    input_text = f"question: {query} context: {context_text}"
    # truncation=True lets the tokenizer clip an over-long prompt; the question
    # comes first in the prompt, so it is the context tail that gets dropped.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    # Put the input tensors on the same device as the model so generation also
    # works when the model has been moved to a GPU.
    inputs = inputs.to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=150,          # Upper bound on the generated sequence length
        temperature=0.1,         # Low temperature: sharply peaked, near-greedy sampling
        top_k=50,                # Sample only from the 50 most likely tokens
        top_p=0.9,               # Nucleus sampling: keep the top 90% of probability mass
        repetition_penalty=1.2,  # Penalize repeated phrases
        num_return_sequences=1,  # Generate a single response per query
        do_sample=True,          # Enable sampling (required for temperature/top-k/top-p)
        # early_stopping is intentionally not passed: it only affects beam
        # search and has no effect when sampling with a single beam.
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [12]:
# End-to-end check: fetch supporting passages for the question, then let
# FLAN-T5 answer from them.
# NOTE: the index is built from only the first 1000 SQuAD training rows, so
# the answer can only be as good as what those passages happen to contain.
query = "who is first united states of america president"
contexts = retrieve(query)
response = generate_answer(query, contexts)
print(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Barack Obama
