In [1]:
!pip install pymupdf langchain faiss-cpu sentence-transformers

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>

In [2]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import json

In [3]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every non-blank page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list of ``{"page": int, "text": str}`` dicts in document order.
        ``page`` is 1-based. Pages whose text is empty or whitespace-only
        are omitted, so page numbers in the result may have gaps.
    """
    pages = []
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if text.strip():
                pages.append({"page": page_num, "text": text})
    return pages

In [4]:
def chunk_text(pages, chunk_size=500, chunk_overlap=50):
    """Split each page's text into chunks tagged with the source page number.

    Args:
        pages: Iterable of dicts with a ``"page"`` key and a ``"text"`` key.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A flat list of ``{"text": str, "metadata": {"page": ...}}`` dicts, in
        page order and then in the order the splitter produced the pieces.
        Each piece is stripped of leading/trailing whitespace.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Pages are split one at a time, so every chunk maps to exactly one page.
    return [
        {"text": piece.strip(), "metadata": {"page": entry["page"]}}
        for entry in pages
        for piece in text_splitter.split_text(entry["text"])
    ]

In [5]:
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """Encode the ``"text"`` of every chunk with a sentence-transformers model.

    Args:
        chunks: Sequence of dicts, each with a ``"text"`` key.
        model_name: Name handed to ``SentenceTransformer`` to load the model.

    Returns:
        A ``(embeddings, chunks)`` tuple: whatever ``model.encode`` returned
        for the chunk texts (in chunk order), and the ``chunks`` argument
        itself, unmodified.
    """
    # A new model instance is loaded on every call.
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(
        [item['text'] for item in chunks],
        show_progress_bar=True,
    )
    return vectors, chunks

In [6]:
def store_in_faiss(embeddings, chunks, faiss_index_path="faiss.index", metadata_path="metadata.json"):
    """Build a flat L2 FAISS index from ``embeddings`` and persist it with its metadata.

    Row ``i`` of the index corresponds to ``chunks[i]``; readers rely on that
    positional pairing to map search hits back to chunk text.

    Args:
        embeddings: 2-D array-like of shape ``(len(chunks), dimension)``. It is
            converted to a C-ordered float32 array before being added to the
            index, so lists and float64 arrays are accepted.
        chunks: JSON-serialisable list of chunk records, one per embedding row.
        faiss_index_path: Destination file for the FAISS index.
        metadata_path: Destination file for the JSON dump of ``chunks``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, the result of
            encoding zero texts) or its row count differs from ``len(chunks)``.
    """
    # Same float32 conversion the rest of the notebook applies before calling FAISS.
    vectors = np.array(embeddings, dtype="float32", order="C")
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_chunks, dimension); got shape {vectors.shape}"
        )
    # A mismatch would silently shift every search hit onto the wrong chunk.
    if vectors.shape[0] != len(chunks):
        raise ValueError(
            f"got {vectors.shape[0]} embeddings for {len(chunks)} chunks; they must pair up one-to-one"
        )

    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    faiss.write_index(index, faiss_index_path)

    with open(metadata_path, "w") as f:
        json.dump(chunks, f, indent=2)

    print("✅ FAISS index and metadata saved.")

In [7]:
def process_pdf(pdf_path, chunk_size=500, chunk_overlap=50, model_name='all-MiniLM-L6-v2',
                faiss_index_path="faiss.index", metadata_path="metadata.json"):
    """Run the full pipeline: PDF -> page text -> chunks -> embeddings -> FAISS index.

    The keyword parameters default to the same values the helper functions use,
    so ``process_pdf(path)`` behaves as before.

    Args:
        pdf_path: Path of the PDF to index.
        chunk_size: Forwarded to ``chunk_text``.
        chunk_overlap: Forwarded to ``chunk_text``.
        model_name: Forwarded to ``embed_chunks``.
        faiss_index_path: Forwarded to ``store_in_faiss``.
        metadata_path: Forwarded to ``store_in_faiss``.

    Raises:
        ValueError: If no text chunks could be produced from the PDF (for
            example, every page has no extractable text).
    """
    pages = extract_text_from_pdf(pdf_path)
    print(f"✅ Extracted {len(pages)} pages.")

    chunks = chunk_text(pages, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"✅ Created {len(chunks)} text chunks.")

    # Stop before loading the embedding model: with nothing to embed, the
    # FAISS step cannot determine a vector dimension and would fail anyway.
    if not chunks:
        raise ValueError(f"No extractable text found in {pdf_path!r}; nothing to index.")

    embeddings, enriched_chunks = embed_chunks(chunks, model_name=model_name)
    embeddings = np.array(embeddings).astype("float32")

    store_in_faiss(
        embeddings,
        enriched_chunks,
        faiss_index_path=faiss_index_path,
        metadata_path=metadata_path,
    )

In [8]:
# Entry point: run the indexing pipeline on the example paper when this
# cell/file is executed directly.
if __name__ == "__main__":
    # NOTE(review): absolute "/content/..." path — presumably a Google Colab
    # upload location (the recorded output mentions Colab secrets); adjust
    # when running elsewhere.
    pdf_file = "/content/ESC_Soft_Common_Sense_Constraints_ICML_2023.pdf"
    process_pdf(pdf_file)

✅ Extracted 14 pages.
✅ Created 143 text chunks.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ FAISS index and metadata saved.


In [16]:
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer

In [17]:
# Load the persisted search state into module-level names used by `search`.
# The file names match the default output paths of `store_in_faiss`.
index = faiss.read_index("faiss.index")

# `metadata` is the JSON list of chunk records; `search` subscripts it with the
# ids returned by the index, so list order must match the index row order.
with open("metadata.json", "r") as f:
    metadata = json.load(f)

print("✅ FAISS index and metadata loaded")
print(f"📄 Total chunks loaded: {len(metadata)}")

# Query encoder. Same model name as the default of `embed_chunks`, so queries
# are embedded by the same model as the stored chunks.
model = SentenceTransformer("all-MiniLM-L6-v2")

✅ FAISS index and metadata loaded
📄 Total chunks loaded: 143


In [18]:
def search(query, k=5):
    """Print the stored chunks nearest to ``query`` in embedding space.

    Relies on the module-level ``model`` (query encoder), ``index`` (FAISS
    index) and ``metadata`` (list of chunk records, in index row order).

    Args:
        query: Natural-language query string.
        k: Maximum number of neighbours to request from the index.

    Returns:
        None. Results are printed, best match first; fewer than ``k`` results
        are printed when the index holds fewer than ``k`` vectors.
    """
    print(f"\n🔍 Searching for: '{query}'\n")

    query_vector = model.encode([query])

    _distances, neighbor_ids = index.search(np.array(query_vector).astype("float32"), k)

    for rank, idx in enumerate(neighbor_ids[0], start=1):
        # FAISS fills missing neighbours with label -1 when the index holds
        # fewer than k vectors. Used as a subscript, -1 would silently return
        # the LAST chunk and present it as a hit, so skip those slots.
        if idx < 0:
            continue

        chunk = metadata[idx]
        text = chunk["text"]
        page = chunk["metadata"].get("page", "?")

        print(f"--- Result {rank} (Page {page}) ---")
        print(text[:500] + "...\n")

In [19]:
search("What is the key idea in this paper?")
search("How does the model handle constraints?")


🔍 Searching for: 'What is the key idea in this paper?'

--- Result 1 (Page 11) ---
from human demonstrations at scale. In CVPR, 2022.
Sarch, G., Fang, Z., Harley, A. W., Schydlo, P., Tarr, M. J.,
Gupta, S., and Fragkiadaki, K. Tidee: Tidying up novel
rooms using visuo-semantic commonsense priors. In
Avidan, S., Brostow, G., Ciss´e, M., Farinella, G. M.,
and Hassner, T. (eds.), Computer Vision – ECCV 2022,
pp. 480–496, Cham, 2022. Springer Nature Switzerland.
ISBN 978-3-031-19842-7.
Selvaraju, R. R., Cogswell, M., Das, A., Vedantam, R.,...

--- Result 2 (Page 13) ---
the object navigation process, we visualize an example in
Fig. 5. The agent chooses 3 frontiers during the navigation
process as the green points show. First, the agent detects a
kitchen and performs commonsense reasoning that the toilet
is not likely in the kitchen. Therefore it selects a frontier
with a certain distance from the kitchen. When it gets out of
the kitchen, it detects a table and several chairs, which are
al