# Encodeur amélioré

Mise en place d'un encodeur amélioré. Les embeddings calculés par l'encodeur sont enregistrés dans le fichier `encoder_e5-small-v2.pkl` afin de ne pas avoir à refaire les calculs à chaque exécution.

## Ajout des importations

In [2]:
import json
import numpy as np
import networkx as nx
from collections import defaultdict, Counter
from typing import Dict, List, Tuple
import os
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import warnings

# Blanket filter: suppresses every warning category for the rest of the
# session, not just those of one library.
warnings.filterwarnings("ignore")

# Seed NumPy's legacy global random generator with a fixed value.
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_corpus(file_path: str) -> dict[str, dict]:
    """
    Load corpus data from a JSONL file (one JSON object per line).

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        Dictionary mapping each record's ``_id`` (converted to ``str``) to the
        full parsed JSON object. If an ID occurs more than once, the last
        occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``_id`` field.
    """
    corpus = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises, so empty lines must be skipped.
                continue
            obj = json.loads(line)
            corpus[str(obj["_id"])] = obj
    return corpus


def load_queries(file_path: str) -> dict[str, dict]:
    """
    Load query data from a JSONL file (one JSON object per line).

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        Dictionary mapping each record's ``_id`` (converted to ``str``) to the
        full parsed JSON object. If an ID occurs more than once, the last
        occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``_id`` field.
    """
    queries = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises, so empty lines must be skipped.
                continue
            obj = json.loads(line)
            queries[str(obj["_id"])] = obj
    return queries


def load_qrels(file_path: str) -> dict[str, dict[str, int]]:
    """
    Load relevance judgments from a tab-separated file.

    The first line is treated as a header and skipped. Every following
    non-blank line must contain exactly three tab-separated fields:
    query ID, document ID and an integer relevance score.

    Args:
        file_path: Path to the UTF-8 encoded TSV file.

    Returns:
        A ``defaultdict(dict)`` mapping query IDs to ``{document ID: score}``.
        Because it is a defaultdict, looking up an unknown query ID yields
        (and stores) an empty dict rather than raising ``KeyError``.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            its score is not an integer.
    """
    qrels = defaultdict(dict)
    # Explicit encoding keeps this loader consistent with the JSONL loaders
    # instead of depending on the platform's default locale.
    with open(file_path, "r", encoding="utf-8") as f:
        next(f, None)  # skip the header row; tolerates an empty file
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise fail the 3-way unpacking below.
                continue
            qid, docid, score = line.split("\t")
            qrels[qid][docid] = int(score)
    return qrels


# Read the corpus, the queries and the validation relevance judgments from
# the ./data directory into module-level variables.
print("Loading dataset...")
corpus = load_corpus("./data/corpus.jsonl")
queries = load_queries("./data/queries.jsonl")
qrels_valid = load_qrels("./data/valid.tsv")

# Report the size of each loaded collection, one line per collection.
print(
    f"Loaded {len(corpus)} documents in corpus",
    f"Loaded {len(queries)} queries",
    f"Loaded relevance for {len(qrels_valid)} queries (dataset)",
    sep="\n",
)

Loading dataset...
Loaded 25657 documents in corpus
Loaded 1000 queries
Loaded relevance for 700 queries (dataset)


In [4]:
model_name = "intfloat/e5-small-v2"  # default model: fast with good performance
encoder_path = "encoder_e5-small-v2.pkl"  # pickle file used to cache the computed embeddings


def _get_text_corpus(item):
    return item.get("title") + item.get("text")


def _get_text_queries(item):
    return item.get("text")


# Load the cached embeddings when they match the current setup, otherwise
# compute them and write the cache file.
encoder = None
if os.path.exists(encoder_path):
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files written by this notebook, never a file from an untrusted source.
    try:
        with open(encoder_path, "rb") as f:
            cached = pickle.load(f)
    except (pickle.UnpicklingError, EOFError):
        # A truncated or corrupt cache (e.g. an interrupted dump) is discarded.
        cached = None

    # Reuse the cache only if it was built with the same model for exactly
    # the documents and queries currently loaded; a stale cache would silently
    # pair embeddings with the wrong IDs.
    if (
        isinstance(cached, dict)
        and cached.get("model") == model_name
        and cached.get("corpus_ids") == list(corpus.keys())
        and cached.get("query_ids") == list(queries.keys())
    ):
        encoder = cached
        print(f"Loaded encoder from {encoder_path}")
    else:
        print(f"Ignoring unusable encoder cache at {encoder_path}; recomputing embeddings")

if encoder is None:
    model = SentenceTransformer(model_name)
    # NOTE(review): the E5 model card recommends prefixing inputs with
    # "passage: " / "query: ". The texts are encoded without prefixes here;
    # confirm whether that is intended before changing it, since it alters
    # every embedding.

    # Corpus embeddings
    corpus_ids = list(corpus.keys())
    corpus_texts = [_get_text_corpus(corpus[_id]) for _id in corpus_ids]
    corpus_emb = model.encode(
        corpus_texts, show_progress_bar=True, batch_size=64, convert_to_numpy=True
    )
    # Queries embeddings
    query_ids = list(queries.keys())
    query_texts = [_get_text_queries(queries[_id]) for _id in query_ids]
    query_emb = model.encode(
        query_texts, show_progress_bar=True, batch_size=64, convert_to_numpy=True
    )

    encoder = {
        "model": model_name,
        "corpus_ids": corpus_ids,
        "corpus_emb": corpus_emb,
        "query_ids": query_ids,
        "query_emb": query_emb,
    }
    # Write to a temporary file first and rename it into place, so an
    # interrupted dump never leaves a half-written cache at encoder_path.
    tmp_path = f"{encoder_path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(encoder, f)
    os.replace(tmp_path, encoder_path)
    print(f"Saved encoder to {encoder_path}")

Batches: 100%|██████████| 401/401 [33:09<00:00,  4.96s/it]  
Batches: 100%|██████████| 16/16 [00:04<00:00,  3.25it/s]


Saved encoder to encoder_e5-small-v2.pkl
