In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup




In [2]:
# Load the sentence-embedding model; the same instance is used to embed
# both the indexed chunks and the search queries.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)


In [3]:
# STEP 1: Fetch large text from Wikipedia
url = "https://en.wikipedia.org/wiki/India"
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently indexing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
paragraphs = [p.get_text() for p in soup.find_all("p")]
# Only the first 20 <p> elements are kept, joined into one string.
large_text = " ".join(paragraphs[:20])


In [4]:
# STEP 2: Split into chunks
def split_text(text, chunk_size=100):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces, so the original spacing and
    line breaks are not preserved. The last chunk may be shorter than the
    others.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step would silently yield no chunks at all.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Chunk the fetched text with the default chunk size and report the count.
chunks = split_text(large_text)
chunk_count = len(chunks)
print(f"📄 Total Chunks Created: {chunk_count}")

📄 Total Chunks Created: 29


In [5]:
# STEP 3: Embed every chunk and load the vectors into a flat L2 index.
# NOTE: the following cell re-encodes the chunks and rebinds `index` to an
# ID-mapped index, so this plain index is replaced before it is searched.
embeddings = model.encode(chunks, convert_to_numpy=True)
_, dim = embeddings.shape  # second axis is the embedding width
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

In [6]:
# STEP 4: Create the FAISS index with explicit IDs
embeddings = model.encode(chunks, convert_to_numpy=True)
dim = embeddings.shape[1]
index = faiss.IndexIDMap(faiss.IndexFlatL2(dim))  # Use IDMap to track IDs
# FAISS IDs are 64-bit integers; request int64 explicitly because NumPy's
# default integer type is platform-dependent.
ids = np.arange(len(embeddings), dtype=np.int64)
index.add_with_ids(embeddings, ids)

In [7]:
# Map ID to chunk. Keys are converted to built-in ints so they are the same
# type as the plain-int IDs used when documents are added later (e.g. 999).
id_to_text = {int(i): chunk for i, chunk in zip(ids, chunks)}

In [8]:
# ----- READ -----
def search(query, k=3):
    """Print the ``k`` indexed documents closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level ``index``. Each hit is printed with its rank, the first
    100 characters of its text from ``id_to_text``, and its distance.

    Args:
        query: Query string to embed and search for.
        k: Number of neighbours to request from the index.

    Returns:
        None. Results are printed.
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, k)
    print(f"\n🔍 Query: {query}")
    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
        if idx in id_to_text:
            print(f"  Rank {rank}: {id_to_text[idx][:100]}... (Dist: {dist:.4f})")
        elif idx == -1:
            # FAISS pads the result with label -1 when the index holds fewer
            # than k vectors; those slots are not documents, so stop here
            # instead of reporting them as deleted.
            break
        else:
            print(f"  Rank {rank}: [DELETED] (Dist: {dist:.4f})")

In [9]:
# ----- CREATE -----
def add_new_document(text, doc_id):
    """Embed ``text`` and store it in the index under ``doc_id``.

    If ``doc_id`` is already present in ``id_to_text``, its old vector is
    removed first. ``IndexIDMap`` does not deduplicate IDs, so without this
    the stale vector would stay searchable under the same ID.

    Args:
        text: Document text to embed and store.
        doc_id: Integer ID to associate with the document.

    Returns:
        None. A confirmation line is printed.
    """
    # Encode first so a failure here leaves the index and mapping untouched.
    emb = model.encode([text], convert_to_numpy=True)
    # FAISS IDs are 64-bit; NumPy's default integer type is platform-dependent.
    id_array = np.array([doc_id], dtype=np.int64)
    if doc_id in id_to_text:
        index.remove_ids(id_array)
    index.add_with_ids(emb, id_array)
    id_to_text[doc_id] = text
    print(f"✅ Document added with ID {doc_id}")

In [10]:
# ----- UPDATE -----
def update_document(doc_id, new_text):
    """Replace the text (and vector) stored under ``doc_id``.

    The existing vector is removed from the index and ``new_text`` is added
    again under the same ID via ``add_new_document``, which also prints its
    own confirmation line. Unknown IDs are reported and left untouched.

    Args:
        doc_id: ID of the document to replace; must be in ``id_to_text``.
        new_text: Replacement document text.

    Returns:
        None. Status lines are printed.
    """
    if doc_id not in id_to_text:
        print(f"⚠️ Document ID {doc_id} not found.")
        return
    # FAISS IDs are 64-bit; NumPy's default integer type is platform-dependent.
    index.remove_ids(np.array([doc_id], dtype=np.int64))
    add_new_document(new_text, doc_id)
    print(f"🔁 Document ID {doc_id} updated.")

In [11]:
# ----- DELETE -----
def delete_document(doc_id):
    """Remove the document stored under ``doc_id`` from index and mapping.

    Unknown IDs are reported and nothing is changed.

    Args:
        doc_id: ID of the document to delete; must be in ``id_to_text``.

    Returns:
        None. A status line is printed.
    """
    if doc_id not in id_to_text:
        print(f"⚠️ Document ID {doc_id} not found.")
        return
    # FAISS IDs are 64-bit; NumPy's default integer type is platform-dependent.
    index.remove_ids(np.array([doc_id], dtype=np.int64))
    del id_to_text[doc_id]
    print(f"❌ Document ID {doc_id} deleted.")

In [12]:
# DEMO OUTPUT: exercise read, create, update and delete in sequence,
# searching after each change to show its effect on the results.
demo_id = 999

search("Tell me about Indian independence")

add_new_document("India gained independence in 1947 from British rule.", demo_id)
search("independence")

update_document(demo_id, "India became independent on 15th August 1947.")
search("independence from British")

delete_document(demo_id)
search("independence")


🔍 Query: Tell me about Indian independence
  Rank 1: the public life took root.[53] A nationalist movement emerged in India, the first in the non-Europea... (Dist: 0.8480)
  Rank 2: It was marked by British reforms but also repressive legislation, by more strident Indian calls for ... (Dist: 0.9668)
  Rank 3: in the second millennium.[43] The resulting Delhi Sultanate drew northern India into the cosmopolita... (Dist: 0.9736)
✅ Document added with ID 999

🔍 Query: independence
  Rank 1: India gained independence in 1947 from British rule.... (Dist: 1.0294)
  Rank 2: the public life took root.[53] A nationalist movement emerged in India, the first in the non-Europea... (Dist: 1.3472)
  Rank 3: It was marked by British reforms but also repressive legislation, by more strident Indian calls for ... (Dist: 1.3553)
✅ Document added with ID 999
🔁 Document ID 999 updated.

🔍 Query: independence from British
  Rank 1: India became independent on 15th August 1947.... (Dist: 0.9894)
  Rank 2: th