In [2]:
%pip install faiss-cpu faiss-gpu-cu12 sentence_transformers transformers accelerate sentencepiece



In [5]:
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
# Path of the serialized FAISS index that search() reads.
INDEX_FILE = "faiss_index.index"
# JSON Lines file read by load_metadata(); search() uses each FAISS result id
# as a position in the loaded list, so line order must match the index order.
META_FILE = "meta.jsonl"
# SentenceTransformer model used to embed queries.
# NOTE(review): presumably the same model that built the index — confirm.
MODEL_NAME = "all-MiniLM-L6-v2"

def load_metadata(meta_file=None):
    """Read chunk metadata records from a JSON Lines file.

    Parameters
    ----------
    meta_file : str or path-like, optional
        File to read. When omitted, the module-level ``META_FILE`` is used.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    path = META_FILE if meta_file is None else meta_file
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding (e.g. cp1252 on Windows).
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at EOF) carry no record;
        # feeding them to json.loads would raise.
        return [json.loads(line) for line in f if line.strip()]

# Encoders already loaded in this kernel, keyed by model name.
_ENCODER_CACHE = {}


def _get_encoder(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it only on first use."""
    if model_name not in _ENCODER_CACHE:
        _ENCODER_CACHE[model_name] = SentenceTransformer(model_name)
    return _ENCODER_CACHE[model_name]


def search(query, top_k=5):
    """Return the metadata of the chunks nearest to ``query`` in the FAISS index.

    The query is embedded with the ``MODEL_NAME`` encoder, L2-normalized, and
    searched against the index stored in ``INDEX_FILE``. The index and the
    metadata file are re-read on every call; only the encoder is cached.

    Parameters
    ----------
    query : str
        Text to embed and look up.
    top_k : int, optional
        Maximum number of neighbours to request from the index.

    Returns
    -------
    list of dict
        Up to ``top_k`` dicts with keys ``"doc_id"``, ``"chunk_index"`` and
        ``"text"``, in the order returned by the index. Fewer entries are
        returned when the index holds fewer than ``top_k`` vectors.

    Raises
    ------
    IndexError
        If the index returns an id beyond the end of the metadata list
        (index and metadata file out of sync).
    """
    # Named `encoder` rather than `model` so it cannot be confused with the
    # module-level LLM that generate_answer() uses.
    encoder = _get_encoder(MODEL_NAME)
    query_vector = encoder.encode(query).astype("float32").reshape(1, -1)

    # Normalize for cosine similarity (assumes the index stores normalized
    # vectors under an inner-product metric — TODO confirm against the build step).
    # Clamp the norm so an all-zero embedding does not divide by zero.
    norm = np.linalg.norm(query_vector, axis=1, keepdims=True)
    query_vector /= np.maximum(norm, np.finfo(np.float32).eps)

    index = faiss.read_index(INDEX_FILE)
    metadata = load_metadata()

    _, neighbor_ids = index.search(query_vector, top_k)

    results = []
    for idx in neighbor_ids[0]:
        # FAISS pads the result with -1 when it finds fewer than top_k
        # neighbours; metadata[-1] would silently return the last chunk.
        if idx < 0:
            continue
        item = metadata[idx]
        results.append({
            "doc_id": item["doc_id"],
            "chunk_index": item["chunk_index"],
            "text": item["text"]
        })

    return results

# if __name__ == "__main__":
#     query = input("Enter your question: ")
#     results = search(query)

#     print("\nTop matching chunks:\n")
#     for i, r in enumerate(results, 1):
#         print(f"[{i}] {r['text']}\n")
#     print("\nSearch completed.")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the LLM for generation.
# `tokenizer` and `model` are module-level names that generate_answer() reads
# directly, so this cell must run before generate_answer() is called.
llm_model_id = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    # Half-precision weights.
    # NOTE(review): float16 may be slow or unsupported on a CPU-only machine — confirm the target hardware.
    torch_dtype=torch.float16,
    # Let the loader choose device placement automatically.
    device_map="auto"
)

def generate_answer(context_chunks, question):
    """Generate an answer to ``question`` grounded in the retrieved chunks.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    context_chunks : iterable of dict
        Retrieved chunks; each must have a ``"text"`` key. The texts are
        joined with blank lines to form the prompt context.
    question : str
        The user's question.

    Returns
    -------
    str
        The newly generated text only (prompt excluded), stripped of
        surrounding whitespace.
    """
    context = "\n\n".join(chunk['text'] for chunk in context_chunks)
    prompt = f"""You are a helpful assistant. Answer the following question using only the given context. Be brief and factual.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.2, top_p=0.9)

    # generate() returns the prompt ids followed by the continuation for a
    # causal LM. Slice off the prompt instead of splitting the decoded text on
    # "Answer:", which breaks whenever the generated text itself contains
    # that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return answer

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
if __name__ == "__main__":
    # Interactive round trip: ask, retrieve, show the evidence, then generate.
    query = input("Enter your question: ")
    results = search(query)

    # Show the retrieved chunks first so the answer can be checked against them.
    print("\nTop matching chunks:\n")
    for rank, chunk in enumerate(results, start=1):
        print(f"[{rank}] {chunk['text']}\n")

    print("Generating answer...\n")
    answer = generate_answer(results, query)
    print(answer)