In [None]:
!pip install -Uq sentence-transformers hnswlib tqdm
!pip install --upgrade setuptools

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
Collecting setuptools
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Using cached setuptools-80.9.0-py3-none-any.whl (1.2 MB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.2.0
    Uninstalling setuptools-75.2.0:
      Successfully uninstalled setuptools-75.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0mSuccessfully installed

In [None]:
import json
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import hnswlib

KeyboardInterrupt: 

In [1]:
# Mount Google Drive at /content/drive; the chunk file and the index directory
# configured in this notebook both live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Model identifier handed to SentenceTransformer when building embeddings.
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
# Input: JSONL file with one chunk record per line (read by load_chunks).
CHUNKS_PATH = "/content/drive/MyDrive/NLP/codes/data/chunks/chunks.jsonl"
# Output directory for the HNSW index file and the metadata JSONL file.
INDEX_DIR = "/content/drive/MyDrive/NLP/codes/data/index"

In [None]:
def load_chunks(chunks_path: str):
    """
    Read a JSONL chunk file and collect the embeddable texts with their metadata.

    Every non-blank line is parsed as one JSON object. Records whose ``text``
    field is missing, empty, or whitespace-only are skipped, so the two
    returned lists always have the same length and are aligned by position.

    Args:
        chunks_path: Path to the ``chunks.jsonl`` file.

    Returns:
        A ``(texts, metadatas)`` tuple:
        - texts: list of stripped chunk texts.
        - metadatas: list of dicts, one per kept text. Each holds ``idx``
          (zero-based line index in the file, counting skipped lines too),
          the descriptive fields copied from the record (``chunk_id``,
          ``paper_id``, ``title``, ...) and the stripped ``text``.

    Raises:
        AssertionError: If ``chunks_path`` does not exist.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """

    texts = []
    metadatas = []

    path = Path(chunks_path)
    assert path.exists(), f"Chunks file not found: {path}"

    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f, desc="Loading chunks")):
            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, but say which line of the
                # (potentially very large) file is broken.
                raise json.JSONDecodeError(
                    f"{path} (file line {i + 1}): {exc.msg}", exc.doc, exc.pos
                ) from exc

            # `or ""` also covers an explicit null in the record.
            text = (obj.get("text") or "").strip()
            if not text:
                continue

            texts.append(text)

            # Key order is kept stable because it is the order in which the
            # fields appear when the metadata is serialized later.
            metadatas.append(
                {
                    "idx": i,
                    "chunk_id": obj.get("chunk_id"),
                    "paper_id": obj.get("paper_id"),
                    "title": obj.get("title"),
                    "section_title": obj.get("section_title"),
                    "section_path": obj.get("section_path"),
                    "para_index": obj.get("para_index"),
                    "reference_ids": obj.get("reference_ids", []),
                    "inline_citations": obj.get("inline_citations", []),
                    "references": obj.get("references", []),
                    "year": obj.get("year"),
                    "url": obj.get("url"),
                    "venue": obj.get("venue"),
                    "authors": obj.get("authors"),
                    "text": text,
                }
            )

    return texts, metadatas

In [None]:
def build_embeddings(
    texts,
    model_name: str,
    batch_size: int = 256,
    device: str = "cuda",
    max_seq_length: int = 8192,
):
    """
    Encode texts into embedding vectors with a SentenceTransformer model.

    Args:
        texts: Sequence of strings to embed.
        model_name: Model identifier passed to ``SentenceTransformer``
            (the notebook uses BGE-M3).
        batch_size: Number of texts encoded per forward pass.
        device: Device the model is loaded on. Defaults to ``"cuda"``, which
            was previously hard-coded.
        max_seq_length: Value assigned to ``model.max_seq_length`` before
            encoding. Defaults to 8192, which was previously hard-coded.

    Returns:
        The result of ``model.encode`` with ``convert_to_numpy=True`` and
        ``normalize_embeddings=True``; expected to be a NumPy array with one
        row per input text.
    """

    print(f"Loading embedding model: {model_name} on {device.upper()}...")
    # NOTE(review): trust_remote_code=True allows the model repository to run
    # its own Python code at load time. Kept for compatibility; confirm the
    # chosen model actually needs it and drop it otherwise.
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

    model.max_seq_length = max_seq_length

    print(f"Encoding {len(texts)} chunks with batch_size={batch_size}...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    return embeddings

In [None]:
def build_hnsw_index(
        embeddings: np.ndarray,
        index_dir: str,
        space: str = "cosine",
        M: int = 32,
        ef_construction: int = 200,
        ef: int = 96,
):
    """
    Build an HNSW index over the embeddings and save it to disk.

    The index is written to ``<index_dir>/hnsw_index.bin``; the directory is
    created if needed. Row ``i`` of ``embeddings`` is stored under label ``i``.

    Args:
        embeddings: 2-D array with one vector per row.
        index_dir: Directory that receives ``hnsw_index.bin``.
        space: Distance space passed to ``hnswlib.Index``.
        M: Graph connectivity parameter passed to ``init_index``.
        ef_construction: Build-time search width passed to ``init_index``.
        ef: Query-time search width passed to ``set_ef``.

    Returns:
        The integer labels ``0 .. n-1`` (NumPy array) assigned to the rows.

    Raises:
        ValueError: If ``embeddings`` is not a non-empty 2-D array.
    """

    embeddings = np.asarray(embeddings)
    # Validate before touching the filesystem: an empty or 1-D input would
    # otherwise fail later with an unhelpful IndexError on `shape[1]`.
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {embeddings.shape}"
        )

    index_dir = Path(index_dir)
    index_dir.mkdir(parents=True, exist_ok=True)

    num_elements, dim = embeddings.shape

    print(f"Building HNSW index | dim={dim}, n={num_elements}")

    index = hnswlib.Index(space=space, dim=dim)

    index.init_index(
        max_elements=num_elements,
        ef_construction=ef_construction,
        M=M,
    )

    labels = np.arange(num_elements)

    index.add_items(embeddings, labels)

    # NOTE: per the hnswlib documentation the query-time `ef` is not stored
    # in the saved file; whoever loads the index must call set_ef() again.
    index.set_ef(ef)

    index_path = index_dir / "hnsw_index.bin"
    print(f"Saving HNSW index to: {index_path}")
    index.save_index(str(index_path))

    return labels

In [None]:
def save_metadata(metadatas, labels, index_dir: str):
    """
    Write one JSON line per label to ``<index_dir>/metadatas.jsonl``.

    Each line is the corresponding metadata dict plus a ``"label"`` key, so an
    HNSW label can be mapped back to its metadata. The input dicts are
    not modified.

    Args:
        metadatas: Iterable of metadata dicts, in label order.
        labels: Iterable of integer-like labels, aligned with ``metadatas``.
        index_dir: Output directory; created if it does not exist.

    Raises:
        ValueError: If ``labels`` and ``metadatas`` differ in length. A plain
            ``zip`` would silently drop the surplus entries and leave the
            index and the metadata file out of sync.
    """

    labels = list(labels)
    metadatas = list(metadatas)
    # Check alignment before opening the file so a mismatch never leaves a
    # truncated metadata file behind.
    if len(labels) != len(metadatas):
        raise ValueError(
            f"labels ({len(labels)}) and metadatas ({len(metadatas)}) must have the same length"
        )

    index_dir = Path(index_dir)
    # Do not rely on another step having created the directory already.
    index_dir.mkdir(parents=True, exist_ok=True)
    meta_path = index_dir / "metadatas.jsonl"

    print(f"Saving metadata to: {meta_path}")

    with meta_path.open("w", encoding="utf-8") as f:
        for label, meta in zip(labels, metadatas):
            meta_out = dict(meta)
            # int() turns NumPy integers into plain ints that json can serialize.
            meta_out["label"] = int(label)
            f.write(json.dumps(meta_out, ensure_ascii=False) + "\n")

In [None]:
# End-to-end run: load chunks -> embed -> build and save HNSW index -> save metadata.
# The assertions between stages make a misaligned or empty input fail right away
# instead of producing an index whose labels no longer match the metadata file.

print("Step 1: Loading chunks")
texts, metadatas = load_chunks(CHUNKS_PATH)
# Stop here if nothing was loaded, before the embedding model is loaded.
assert len(texts) > 0, f"No non-empty chunks found in {CHUNKS_PATH}"
assert len(texts) == len(metadatas), "texts and metadatas are not aligned"

print("Step 2: Building embeddings")
embeddings = build_embeddings(texts, EMBEDDING_MODEL_NAME, batch_size=64)
assert embeddings.shape[0] == len(texts), "expected one embedding row per chunk"

print(f"Embeddings shape: {embeddings.shape}")

print("Step 3: Building HNSW index")
labels = build_hnsw_index(embeddings, INDEX_DIR, space="cosine")
assert len(labels) == len(metadatas), "expected one index label per metadata record"

print("Step 4: Saving metadata")
save_metadata(metadatas, labels, INDEX_DIR)

print("Done! Index + metadata are ready.")

Step 1: Loading chunks


Loading chunks: 0it [00:00, ?it/s]

Step 2: Building embeddings
Loading embedding model: BAAI/bge-m3 on CUDA...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Encoding 602123 chunks with batch_size=64...


Batches:   0%|          | 0/9409 [00:00<?, ?it/s]

Embeddings shape: (602123, 1024)
Step 3: Building HNSW index
Building HNSW index | dim=1024, n=602123
Saving HNSW index to: /content/drive/MyDrive/NLP/codes/data/index/hnsw_index.bin
Step 4: Saving metadata
Saving metadata to: /content/drive/MyDrive/NLP/codes/data/index/metadatas.jsonl
Done! Index + metadata are ready.
