## Embeddings des articles (data/raw)

Ce notebook extrait le texte des fichiers HTML, crée des chunks, génère des embeddings avec un modèle open‑source `sentence-transformers`, puis sauvegarde:
- `data/vectors_base/embeddings.npy`
- `data/vectors_base/metadata.jsonl`

Paramètres principaux : `CHUNK_SIZE`, `OVERLAP` et `BATCH_SIZE` (cellule d’exécution), `MODEL_NAME` (cellule de chargement du modèle).


In [1]:
# Embeddings pipeline: HTML -> chunks -> embeddings (sentence-transformers; a hashing variant is also defined) -> save

from __future__ import annotations

import os
import re
import json
import math
from pathlib import Path
from typing import List, Dict, Tuple, Iterable, Optional
from html import unescape
import numpy as np

In [2]:
# Project layout, resolved relative to the notebook's working directory.
# The project root is taken to be the parent of the current working directory,
# so this only resolves correctly when the notebook is run from a direct
# subfolder of the project (the original note says cwd == ntb/).
ROOT = Path.cwd().parent
RAW_DIR = ROOT.joinpath("data", "raw")            # input: HTML articles
OUT_DIR = ROOT.joinpath("data", "vectors_base")   # output: embeddings + metadata

# Create the output folder up front so the save step cannot fail on a missing directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"ROOT: {ROOT}")
print(f"RAW_DIR: {RAW_DIR}")
print(f"OUT_DIR: {OUT_DIR}")

ROOT: /Users/perso/Documents/Agents/Agentic_Times
RAW_DIR: /Users/perso/Documents/Agents/Agentic_Times/data/raw
OUT_DIR: /Users/perso/Documents/Agents/Agentic_Times/data/vectors_base


In [3]:
# Compiled once; html_to_text runs for every input file.
# A single left-to-right alternation removes whichever non-content region
# starts first: an HTML comment, a <script> element or a <style> element.
# Closing tags may carry whitespace/junk before '>' (e.g. "</script >").
_NON_CONTENT_RE = re.compile(
    r"<!--[\s\S]*?-->"
    r"|<script\b[\s\S]*?</script\b[^>]*>"
    r"|<style\b[\s\S]*?</style\b[^>]*>",
    flags=re.IGNORECASE,
)
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")


def html_to_text(html: str) -> str:
    """Return the visible text of an HTML document as one whitespace-normalised line.

    Steps, in order:
      1. drop HTML comments and whole ``<script>``/``<style>`` elements
         (their content is never visible text);
      2. replace every remaining tag with a space, so words separated only
         by markup do not get glued together;
      3. decode HTML entities (``&amp;`` -> ``&``);
      4. collapse every whitespace run to a single space and strip the ends.

    Comments are removed as a unit because the generic tag pattern stops at
    the first ``>`` and would otherwise leak the tail of a comment such as
    ``<!-- a > b -->`` into the output.

    This is a regex-based best-effort extraction, not a full HTML parser.
    """
    text = _NON_CONTENT_RE.sub(" ", html)
    text = _TAG_RE.sub(" ", text)
    # Entities are decoded only after tag stripping, so an escaped "&lt;b&gt;"
    # stays as literal text instead of being treated as markup.
    text = unescape(text)
    return _WHITESPACE_RE.sub(" ", text).strip()


# Runs of Unicode letters/digits; "[^\W_]" is "\w" without the underscore,
# so underscores and punctuation still act as separators.
_TOKEN_RE = re.compile(r"[^\W_]+")


def simple_tokenize(text: str) -> List[str]:
    """Lower-case *text* and split it into alphanumeric tokens.

    A token is a maximal run of letters and digits. Everything else
    (whitespace, punctuation, underscores, symbols) separates tokens and is
    discarded. Letters are matched Unicode-aware, so accented words such as
    "café" stay whole instead of being cut at the first non-ASCII character.

    Returns an empty list for empty or separator-only input.
    """
    return _TOKEN_RE.findall(text.lower())


def chunk_tokens(tokens: List[str], chunk_size: int = 500, overlap: int = 50) -> List[List[str]]:
    """Split *tokens* into consecutive windows that share *overlap* tokens.

    Each window holds at most ``chunk_size`` tokens and starts
    ``chunk_size - overlap`` tokens after the previous one. The last window
    may be shorter. Iteration stops as soon as a window reaches the end of
    the input, so no trailing window made only of already-seen tokens is
    produced. Empty input yields an empty list.

    Raises:
        ValueError: if ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be > 0")
    if overlap >= chunk_size or overlap < 0:
        raise ValueError("overlap must be >= 0 and < chunk_size")

    total = len(tokens)
    stride = chunk_size - overlap  # > 0 thanks to the validation above
    windows = []
    for begin in range(0, total, stride):
        stop = begin + chunk_size
        windows.append(tokens[begin:stop])
        if stop >= total:
            # This window already covers the tail; further windows would
            # only repeat overlap tokens.
            break
    return windows


def det_hash(token: str) -> int:
    """Return a deterministic 64-bit hash of *token*.

    The scheme is FNV-1a-like: for each character, XOR its code point into
    the state, then multiply by the 64-bit FNV prime and truncate to 64 bits.
    Unlike the built-in ``hash``, the result does not depend on the process.

    NOTE: the seed below is not the standard 64-bit FNV offset basis
    (14695981039346656037), so values differ from reference FNV-1a. It is
    kept as-is because changing it would change every hash.

    The empty string hashes to the seed itself.
    """
    seed = 1469598103934665603
    prime = 1099511628211
    mask_64 = (1 << 64) - 1

    state = seed
    for code_point in map(ord, token):
        state = ((state ^ code_point) * prime) & mask_64
    return state


def hashing_embedding(tokens: List[str], dim: int = 768) -> np.ndarray:
    """Embed *tokens* as a signed feature-hashing vector of length *dim*.

    Every token is hashed with ``det_hash``; the hash modulo *dim* selects
    the bucket and the top bit of the 64-bit hash selects whether the bucket
    is incremented or decremented. The result is scaled to unit L2 norm
    unless it is all zeros (e.g. no tokens), in which case the zero vector
    is returned unchanged.

    Returns:
        A float32 array of shape ``(dim,)``.
    """
    buckets = np.zeros(dim, dtype=np.float32)
    for token in tokens:
        digest = det_hash(token)
        slot = digest % dim
        # Bit 63 (the most significant bit of the 64-bit digest) carries the sign.
        if (digest >> 63) & 1:
            buckets[slot] += -1.0
        else:
            buckets[slot] += 1.0

    length = float(np.linalg.norm(buckets))
    if length > 0:
        buckets /= length
    return buckets

def collect_html_files(raw_dir: Path) -> List[Path]:
    """Return every ``*.html`` regular file under *raw_dir*, recursively, in sorted order.

    Directories whose name happens to end in ``.html`` are skipped. Sorting
    makes the processing order deterministic across runs.
    """
    html_paths = list(filter(Path.is_file, raw_dir.glob("**/*.html")))
    html_paths.sort()
    return html_paths


def process_file(
    path: Path,
    chunk_size: int,
    overlap: int,
    base_dir: Optional[Path] = None,
) -> List[Dict]:
    """Read one HTML file and return its text as overlapping token chunks.

    The file is decoded as UTF-8 (undecodable bytes are dropped), reduced to
    plain text with ``html_to_text``, tokenised with ``simple_tokenize`` and
    windowed with ``chunk_tokens``.

    Args:
        path: HTML file to read.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks.
        base_dir: Directory that ``source`` is reported relative to.
            Defaults to the module-level ``RAW_DIR``.

    Returns:
        One dict per chunk with keys ``source`` (file path as a string),
        ``chunk_index`` (0-based position within the file), ``num_tokens``
        and ``text``. ``text`` is the chunk's tokens joined with single
        spaces, i.e. the tokenised form, not the original article text.
        A file with no tokens yields an empty list.
    """
    root = RAW_DIR if base_dir is None else base_dir
    try:
        source = str(path.relative_to(root))
    except ValueError:
        # The file is not located under `root` (e.g. the caller scanned a
        # directory other than RAW_DIR). Keep the full path rather than
        # aborting the whole run over a labelling detail.
        source = str(path)

    html = path.read_text(encoding="utf-8", errors="ignore")
    tokens = simple_tokenize(html_to_text(html))
    token_chunks = chunk_tokens(tokens, chunk_size=chunk_size, overlap=overlap)

    return [
        {
            "source": source,
            "chunk_index": i,
            "num_tokens": len(tok_chunk),
            "text": " ".join(tok_chunk),
        }
        for i, tok_chunk in enumerate(token_chunks)
    ]


def build_embeddings(raw_dir: Path, chunk_size: int = 500, overlap: int = 50, dim: int = 768) -> Tuple[np.ndarray, List[Dict]]:
    """Embed every HTML file under *raw_dir* with the hashing embedding.

    Files are discovered with ``collect_html_files``, chunked with
    ``process_file`` and each chunk is embedded with ``hashing_embedding``.
    Progress is printed per file.

    Returns:
        ``(embeddings, metadata)`` where ``embeddings`` is a float32 array
        with one row of length *dim* per chunk, and ``metadata`` is a list
        aligned with the rows holding each chunk's fields except ``text``.
        When no chunk is produced, the array has shape ``(0, dim)`` and the
        list is empty.
    """
    html_files = collect_html_files(raw_dir)
    print(f"Found {len(html_files)} HTML files.")

    rows: List[np.ndarray] = []
    records: List[Dict] = []
    for html_file in html_files:
        file_chunks = process_file(html_file, chunk_size=chunk_size, overlap=overlap)
        for chunk in file_chunks:
            # The chunk text is space-joined tokens, so splitting on
            # whitespace recovers the token list.
            rows.append(hashing_embedding(chunk["text"].split(), dim=dim))
            records.append({key: value for key, value in chunk.items() if key != "text"})
        print(f"Processed {html_file.name}: {len(file_chunks)} chunks")

    if not rows:
        return np.zeros((0, dim), dtype=np.float32), []

    return np.vstack(rows), records


### Notes
- Pipeline: extraction HTML → tokenisation → chunking → embeddings (open‑source `sentence-transformers`).
- Ajustez `CHUNK_SIZE`, `OVERLAP`, `BATCH_SIZE`, et `MODEL_NAME` si besoin.
- Sorties:
  - `data/vectors_base/embeddings.npy`: matrice float32 (n_chunks × dim_modèle)
  - `data/vectors_base/metadata.jsonl`: JSONL aligné (par ligne: `source`, `chunk_index`, `num_tokens`, `row_index`).


In [4]:
# Load the sentence-transformers model used to embed every chunk.
try:
    from sentence_transformers import SentenceTransformer
except ImportError as e:
    # Only a failed import gets the "please install" hint. Any other failure
    # raised while importing the package propagates unchanged, so its real
    # cause is not relabelled as "package missing".
    raise RuntimeError(
        "sentence-transformers is required. Install with: pip install sentence-transformers"
    ) from e

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # small, fast, high-quality
print(f"Loading model: {MODEL_NAME}")
model: SentenceTransformer = SentenceTransformer(MODEL_NAME)
print("Model loaded.")


Loading model: sentence-transformers/all-MiniLM-L6-v2
Model loaded.


In [5]:
def build_embeddings_model(
    raw_dir: Path,
    model: "SentenceTransformer",
    chunk_size: int = 500,
    overlap: int = 50,
    batch_size: int = 64,
    convert_to_numpy: bool = True,
    normalize_embeddings: bool = True,
) -> Tuple[np.ndarray, List[Dict]]:
    """Embed every HTML file under *raw_dir* with a sentence-transformers model.

    All files are chunked first (``collect_html_files`` + ``process_file``,
    with per-file progress printed), then every chunk text is sent to
    ``model.encode`` in a single call.

    Args:
        raw_dir: Directory scanned recursively for ``*.html`` files.
        model: Loaded model; only ``encode`` and
            ``get_sentence_embedding_dimension`` are used.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks.
        batch_size: Forwarded to ``model.encode``.
        convert_to_numpy: Forwarded to ``model.encode``.
        normalize_embeddings: Forwarded to ``model.encode``.

    Returns:
        ``(embeddings, metadata)``: a float32 array with one row per chunk
        and a list, aligned with the rows, of each chunk's fields except
        ``text``. When there are no chunks, the array has zero rows and the
        model's embedding dimension as width, and the list is empty.
    """
    html_files = collect_html_files(raw_dir)
    print(f"Found {len(html_files)} HTML files.")

    prepared: List[Dict] = []
    for html_file in html_files:
        file_chunks = process_file(html_file, chunk_size=chunk_size, overlap=overlap)
        prepared.extend(file_chunks)
        print(f"Prepared {html_file.name}: {len(file_chunks)} chunks")

    if len(prepared) == 0:
        width = model.get_sentence_embedding_dimension()
        return np.zeros((0, width), dtype=np.float32), []

    vectors = model.encode(
        [chunk["text"] for chunk in prepared],
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=convert_to_numpy,
        normalize_embeddings=normalize_embeddings,
    )

    # Same order as the texts passed to encode, so row k describes records[k].
    records = [
        {key: value for key, value in chunk.items() if key != "text"}
        for chunk in prepared
    ]

    # NOTE(review): presumably encode returns a list when convert_to_numpy is
    # False; in that case it is stacked into an array here. Confirm against
    # the installed sentence-transformers version.
    if isinstance(vectors, list):
        vectors = np.asarray(vectors, dtype=np.float32)

    return vectors.astype(np.float32, copy=False), records


In [6]:
# Run configuration.
CHUNK_SIZE = 500   # max tokens per chunk
OVERLAP = 50       # tokens shared by consecutive chunks
BATCH_SIZE = 64    # texts per model.encode batch

# Chunk every article under RAW_DIR and embed the chunks with the loaded model.
embeddings, metadata = build_embeddings_model(
    RAW_DIR,
    model=model,
    chunk_size=CHUNK_SIZE,
    overlap=OVERLAP,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
print(f"Embeddings shape: {embeddings.shape}")

# Persist the embedding matrix.
emb_path = OUT_DIR / "embeddings.npy"
np.save(emb_path, embeddings)
print(f"Saved embeddings to {emb_path}")

# Persist the metadata as JSON Lines: line k describes row k of the matrix,
# and row_index stores k explicitly so a line can be matched to its row
# without relying on its position in the file.
meta_path = OUT_DIR / "metadata.jsonl"
with meta_path.open("w", encoding="utf-8") as f:
    for row_index, record in enumerate(metadata):
        line = json.dumps({**record, "row_index": row_index}, ensure_ascii=False)
        f.write(line + "\n")
print(f"Saved metadata to {meta_path}")

Found 9856 HTML files.
Prepared 1-000-for-a-one-hour-appointment-why-are-fees-for-australia-s-specialist-doctors-skyrocketing.html: 4 chunks
Prepared 1-5m-foreign-workers-already-in-uk-could-face-longer-wait-for-permanent-settlement.html: 3 chunks
Prepared 10-of-the-best-chocolate-advent-calendars.html: 1 chunks
Prepared 100-gaza-children-hope-to-be-evacuated-to-uk-for-urgent-medical-care.html: 2 chunks
Prepared 2020s-on-course-to-be-weakest-decade-for-global-economy-since-1960s-says-world-bank.html: 2 chunks
Prepared 2025-on-track-to-beat-uk-record-for-wildfires-warn-firefighters.html: 1 chunks
Prepared 23andme-back-on-the-auction-block-after-former-ceo-makes-11th-hour-bid.html: 2 chunks
Prepared 30-of-the-best-uk-pubs-for-an-autumn-escape-with-great-food.html: 10 chunks
Prepared 37-000-more-children-affected-by-brutal-two-child-benefit-cap-data-shows.html: 2 chunks
Prepared 38-londres-street-by-philippe-sands-review-pinochet-and-the-nazis.html: 2 chunks
Prepared 48-dead-after-soviet-

Batches:   0%|          | 0/426 [00:00<?, ?it/s]

Embeddings shape: (27251, 384)
Saved embeddings to /Users/perso/Documents/Agents/Agentic_Times/data/vectors_base/embeddings.npy
Saved metadata to /Users/perso/Documents/Agents/Agentic_Times/data/vectors_base/metadata.jsonl
