**Ejercicio 10: Re-ranking**

**Nombre:** Aaròn Yumancela

**Objetivo:** Implementar y evaluar un pipeline de Recuperación de Información en dos etapas, y analizar el impacto del re-ranking en la calidad del ranking.

**Parte 1. Preparación del corpus**

In [1]:
# Instalar dependencias del proyecto
!pip -q install beir rank_bm25 sentence-transformers lightgbm scikit-learn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/304.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Importar librerías necesarias
import os, re, math, numpy as np, pandas as pd
from tqdm import tqdm

from beir import util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from rank_bm25 import BM25Okapi

from sentence_transformers import CrossEncoder, SentenceTransformer
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tokenización básica para BM25
def simple_tokenize(text: str):
    text = text.lower()
    return re.findall(r"[a-z0-9]+", text)

# Construir DataFrame del corpus
def build_df_corpus(corpus: dict):
    df = (pd.DataFrame.from_dict(corpus, orient="index")
            .reset_index()
            .rename(columns={"index": "doc_id"}))
    if "title" not in df.columns: df["title"] = ""
    if "text" not in df.columns: df["text"] = ""
    df["full_text"] = (df["title"].fillna("") + " " + df["text"].fillna("")).str.strip()
    return df[["doc_id","title","text","full_text"]]

# Construir DataFrame de queries
def build_df_queries(queries: dict):
    df = (pd.DataFrame.from_dict(queries, orient="index", columns=["query"])
            .reset_index()
            .rename(columns={"index": "query_id"}))
    return df

# Construir DataFrame de qrels
def build_df_qrels(qrels: dict):
    rows = []
    for qid, docs in qrels.items():
        for doc_id, rel in docs.items():
            rows.append({"query_id": str(qid), "doc_id": str(doc_id), "relevance": int(rel)})
    return pd.DataFrame(rows)

# Convertir qrels a formato BEIR
def qrels_to_beir_format(df_qrels: pd.DataFrame):
    qrels = {}
    for r in df_qrels.itertuples(index=False):
        qrels.setdefault(str(r.query_id), {})[str(r.doc_id)] = int(r.relevance)
    return qrels

# Seleccionar queries con muchos relevantes
def pick_queries_with_many_rels(df_qrels, min_rels=3, top_n=10):
    cnt = (df_qrels[df_qrels["relevance"] > 0]
           .groupby("query_id")["doc_id"].count()
           .sort_values(ascending=False))
    return cnt[cnt >= min_rels].head(top_n)

# Comparar cambios en top-10
def compare_top10_changes(run_a, run_b, qid):
    a = run_a[qid][:10]
    b = run_b[qid][:10]
    pos_a = {d:i+1 for i,d in enumerate(a)}
    pos_b = {d:i+1 for i,d in enumerate(b)}
    shared = sorted(set(a).intersection(b), key=lambda d: pos_a[d])
    moved = [(d, pos_a[d], pos_b[d]) for d in shared if pos_a[d] != pos_b[d]]
    new_in_b = [d for d in b if d not in a]
    dropped = [d for d in a if d not in b]
    return moved, new_in_b, dropped


  from tqdm.autonotebook import tqdm


In [3]:
# Descargar y cargar dataset BEIR
DATASET_NAME = "scifact"
DATA_DIR = "/content/data/beir_datasets"
os.makedirs(DATA_DIR, exist_ok=True)

url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{DATASET_NAME}.zip"
util.download_and_unzip(url, DATA_DIR)

dataset_path = os.path.join(DATA_DIR, DATASET_NAME)

# Cargar datos para entrenamiento
corpus_train, queries_train, qrels_train = GenericDataLoader(dataset_path).load(split="train")

# Cargar datos para evaluación
corpus_test, queries_test, qrels_test = GenericDataLoader(dataset_path).load(split="test")

# Crear DataFrames principales
df_corpus = build_df_corpus(corpus_test)
df_queries_test = build_df_queries(queries_test)
df_qrels_test = build_df_qrels(qrels_test)

df_queries_train = build_df_queries(queries_train)
df_qrels_train = build_df_qrels(qrels_train)

print("Corpus test:", df_corpus.shape)
print("Queries train:", df_queries_train.shape, "Qrels train:", df_qrels_train.shape)
print("Queries test:", df_queries_test.shape, "Qrels test:", df_qrels_test.shape)


/content/data/beir_datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

Corpus test: (5183, 4)
Queries train: (809, 2) Qrels train: (919, 3)
Queries test: (300, 2) Qrels test: (339, 3)


In [4]:
# Elegir queries con muchos relevantes
topq = pick_queries_with_many_rels(df_qrels_test, min_rels=3, top_n=10)
topq


Unnamed: 0_level_0,doc_id
query_id,Unnamed: 1_level_1
133,5
873,5
1379,4
179,4
971,4
1274,3
275,3
1029,3
597,3


In [5]:
# Analizar una query representativa
qid = str(topq.index[0])
print("Query ID:", qid)
print(df_queries_test.loc[df_queries_test["query_id"] == qid, "query"].values[0])

print("\nDocs relevantes (qrels):")
print(df_qrels_test[(df_qrels_test["query_id"] == qid) & (df_qrels_test["relevance"] > 0)])


Query ID: 133
Assembly of invadopodia is triggered by focal generation of phosphatidylinositol-3,4-biphosphate and the activation of the nonreceptor tyrosine kinase Src.

Docs relevantes (qrels):
   query_id    doc_id  relevance
31      133  38485364          1
32      133   6969753          1
33      133  17934082          1
34      133  16280642          1
35      133  12640810          1


**Parte 2. Retrieval inicial (baseline)**

In [6]:
# Construir índice BM25
doc_ids = df_corpus["doc_id"].astype(str).tolist()
docs = df_corpus["full_text"].tolist()
tokenized_docs = [simple_tokenize(t) for t in tqdm(docs, desc="Tokenizando corpus")]

bm25 = BM25Okapi(tokenized_docs)


Tokenizando corpus: 100%|██████████| 5183/5183 [00:00<00:00, 5654.34it/s]


In [7]:
# Ejecutar retrieval BM25
def bm25_run(queries_df, top_k=100):
    run = {}
    scores_run = {}
    for row in tqdm(queries_df.itertuples(index=False), total=len(queries_df), desc="BM25 retrieval"):
        qid = str(row.query_id)
        qtext = row.query
        qtok = simple_tokenize(qtext)
        scores = bm25.get_scores(qtok)
        top_idx = np.argsort(scores)[::-1][:top_k]
        ranked_doc_ids = [doc_ids[i] for i in top_idx]
        run[qid] = ranked_doc_ids
        scores_run[qid] = {doc_ids[i]: float(scores[i]) for i in top_idx}
    return run, scores_run

bm25_ranked, bm25_scores_beir = bm25_run(df_queries_test, top_k=100)


BM25 retrieval: 100%|██████████| 300/300 [00:16<00:00, 18.71it/s]


In [8]:
qrels_test_beir = qrels_test

evaluator = EvaluateRetrieval()
# Evaluar baseline BM25
ndcg, _map, recall, precision = evaluator.evaluate(qrels_test_beir, bm25_scores_beir, k_values=[10])

print("BM25 baseline")
print("nDCG@10:", ndcg["NDCG@10"])
print("Recall@10:", recall["Recall@10"])
print("MAP@10:", _map["MAP@10"])


BM25 baseline
nDCG@10: 0.65228
Recall@10: 0.77567
MAP@10: 0.60711


**Parte 3. Implementación del re-ranking cross-encoder**

In [9]:
# Inicializar Cross-Encoder en GPU
import torch
from sentence_transformers import CrossEncoder

device = "cuda" if torch.cuda.is_available() else "cpu"

cross = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device=device
)

print("CrossEncoder usando:", device)

doc_text_by_id = dict(zip(df_corpus["doc_id"].astype(str), df_corpus["full_text"]))

# Re-rankear candidatos con Cross-Encoder
def cross_encoder_rerank(queries_df, initial_ranked, top_k=50):
    reranked = {}
    reranked_scores_beir = {}

    for row in tqdm(
        queries_df.itertuples(index=False),
        total=len(queries_df),
        desc="Cross-Encoder reranking"
    ):
        qid = str(row.query_id)
        qtext = row.query

        candidates = initial_ranked[qid][:top_k]
        pairs = [(qtext, doc_text_by_id[d]) for d in candidates]

        ce_scores = cross.predict(
            pairs,
            batch_size=128,
            show_progress_bar=False
        )

        order = np.argsort(ce_scores)[::-1]
        ranked_docs = [candidates[i] for i in order]

        reranked[qid] = ranked_docs
        reranked_scores_beir[qid] = {
            ranked_docs[i]: float(ce_scores[order[i]])
            for i in range(len(ranked_docs))
        }

    return reranked, reranked_scores_beir


ce_ranked, ce_scores_beir = cross_encoder_rerank(
    df_queries_test,
    bm25_ranked,
    top_k=50
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CrossEncoder usando: cuda


Cross-Encoder reranking: 100%|██████████| 300/300 [01:34<00:00,  3.19it/s]


In [10]:
# Analizar cambios en top-10
moved, new_in_b, dropped = compare_top10_changes(
    run_a=bm25_ranked,
    run_b=ce_ranked,
    qid=qid
)

print("Query:", df_queries_test.loc[df_queries_test["query_id"] == qid, "query"].values[0])
print("\nMovidos (doc_id, pos_BM25, pos_CE):")
print(moved[:20])

print("\nNuevos en Top10 (CE pero no BM25):")
print(new_in_b)

print("\nSalieron del Top10 (BM25 pero no CE):")
print(dropped)


Query: Assembly of invadopodia is triggered by focal generation of phosphatidylinositol-3,4-biphosphate and the activation of the nonreceptor tyrosine kinase Src.

Movidos (doc_id, pos_BM25, pos_CE):
[('19752008', 3, 9), ('16280642', 5, 3), ('35660758', 10, 1)]

Nuevos en Top10 (CE pero no BM25):
['12640810', '6969753', '9507605', '86694016', '14328288', '42708716', '17934082']

Salieron del Top10 (BM25 pero no CE):
['5270265', '26688294', '45764440', '12785130', '5914739', '11200685', '37964706']


In [11]:
# Evaluar re-ranking Cross-Encoder
ndcg_ce, map_ce, recall_ce, _ = evaluator.evaluate(qrels_test_beir, ce_scores_beir, k_values=[10])

print("Cross-Encoder rerank")
print("nDCG@10:", ndcg_ce["NDCG@10"])
print("Recall@10:", recall_ce["Recall@10"])
print("MAP@10:", map_ce["MAP@10"])


Cross-Encoder rerank
nDCG@10: 0.68274
Recall@10: 0.80628
MAP@10: 0.63732


**Parte 4. Implementación del re-ranking LTR**

In [12]:
# Preparar features TF-IDF
tfidf = TfidfVectorizer(max_features=50000, stop_words="english")
X_tfidf_docs = tfidf.fit_transform(df_corpus["full_text"].tolist())

# Generar embeddings bi-encoder
biencoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
doc_emb = biencoder.encode(df_corpus["full_text"].tolist(), batch_size=64, show_progress_bar=True, normalize_embeddings=True)

doc_index = {doc_id: i for i, doc_id in enumerate(doc_ids)}


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/81 [00:00<?, ?it/s]

In [13]:
# Construir dataset para LTR
def build_ltr_dataset(queries_df, qrels_df, initial_ranked, top_k=100):
    """
    Devuelve:
      X: features (N x F)
      y: labels (N,)
      group: tamaño de cada query (list)
      meta: DataFrame con (query_id, doc_id, bm25_score, ...)
    """
    qrels_map = qrels_to_beir_format(qrels_df)

    X_list, y_list, group = [], [], []
    meta_rows = []

    for row in tqdm(queries_df.itertuples(index=False), total=len(queries_df), desc="Construyendo dataset LTR"):
        qid = str(row.query_id)
        qtext = row.query
        candidates = initial_ranked[qid][:top_k]

        # query TF-IDF y embedding
        q_tfidf = tfidf.transform([qtext])
        q_emb = biencoder.encode([qtext], normalize_embeddings=True)

        qtok = simple_tokenize(qtext)
        bm25_scores_full = bm25.get_scores(qtok)

        for d in candidates:
            di = doc_index[d]
            bm25_s = float(bm25_scores_full[di])

            doc_len = len(simple_tokenize(doc_text_by_id[d]))
            tfidf_cos = float(cosine_similarity(q_tfidf, X_tfidf_docs[di])[0,0])

            bi_cos = float(np.dot(q_emb[0], doc_emb[di]))  # embeddings normalizados => dot = cosine

            feats = [bm25_s, doc_len, tfidf_cos, bi_cos]

            label = int(qrels_map.get(qid, {}).get(d, 0))
            X_list.append(feats)
            y_list.append(label)
            meta_rows.append({"query_id": qid, "doc_id": d, "bm25": bm25_s})

        group.append(len(candidates))

    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.int32)
    meta = pd.DataFrame(meta_rows)
    return X, y, group, meta


In [14]:
# Candidatos BM25 para TRAIN queries (para entrenar LTR)
bm25_ranked_train, _ = bm25_run(df_queries_train, top_k=100)

X_train, y_train, group_train, meta_train = build_ltr_dataset(
    df_queries_train, df_qrels_train, bm25_ranked_train, top_k=100
)

print("X_train:", X_train.shape, "y_train:", y_train.shape, "num_queries:", len(group_train))
print("Relevantes (y=1) en train:", int((y_train > 0).sum()))


BM25 retrieval: 100%|██████████| 809/809 [00:17<00:00, 46.74it/s]
Construyendo dataset LTR: 100%|██████████| 809/809 [01:34<00:00,  8.55it/s]


X_train: (80900, 4) y_train: (80900,) num_queries: 809
Relevantes (y=1) en train: 830


In [15]:
# Entrenar modelo Learning-to-Rank
ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=20,
    random_state=42
)

ranker.fit(
    X_train, y_train,
    group=group_train
)

print("Entrenado.")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 80900, number of used features: 4
Entrenado.


In [16]:
X_test, y_test, group_test, meta_test = build_ltr_dataset(
    df_queries_test, df_qrels_test, bm25_ranked, top_k=100
)

# Aplicar re-ranking LTR
pred_test = ranker.predict(X_test)

# reconstruir ranking por query con esas predicciones
ltr_ranked = {}
ltr_scores_beir = {}

start = 0
for (qid, g) in zip(df_queries_test["query_id"].astype(str).tolist(), group_test):
    chunk_pred = pred_test[start:start+g]
    chunk_meta = meta_test.iloc[start:start+g].copy()

    order = np.argsort(chunk_pred)[::-1]
    ranked_docs = chunk_meta.iloc[order]["doc_id"].tolist()

    ltr_ranked[qid] = ranked_docs
    ltr_scores_beir[qid] = {ranked_docs[i]: float(chunk_pred[order[i]]) for i in range(len(ranked_docs))}

    start += g

print("LTR reranking listo.")


Construyendo dataset LTR: 100%|██████████| 300/300 [00:35<00:00,  8.51it/s]


LTR reranking listo.


In [17]:
# Comparar top-10 tras LTR
moved_ltr, new_in_ltr, dropped_ltr = compare_top10_changes(
    run_a=bm25_ranked,
    run_b=ltr_ranked,
    qid=qid
)

print("Movidos (doc_id, pos_BM25, pos_LTR):")
print(moved_ltr[:20])

print("\nNuevos en Top10 (LTR pero no BM25):")
print(new_in_ltr)

print("\nSalieron del Top10 (BM25 pero no LTR):")
print(dropped_ltr)


Movidos (doc_id, pos_BM25, pos_LTR):
[('928281', 2, 8), ('19343151', 3, 1), ('18218379', 9, 2)]

Nuevos en Top10 (LTR pero no BM25):
['12650610', '26008462', '1976183', '25419778', '31803596', '6308416', '7506409']

Salieron del Top10 (BM25 pero no LTR):
['13923069', '7711685', '16630060', '1084345', '10698739', '38793927', '4387484']


In [18]:
# Evaluar re-ranking LTR
ndcg_ltr, map_ltr, recall_ltr, _ = evaluator.evaluate(qrels_test_beir, ltr_scores_beir, k_values=[10])

print("LTR rerank")
print("nDCG@10:", ndcg_ltr["NDCG@10"])
print("Recall@10:", recall_ltr["Recall@10"])
print("MAP@10:", map_ltr["MAP@10"])


LTR rerank
nDCG@10: 0.73362
Recall@10: 0.82572
MAP@10: 0.70049


**Parte 5. Evaluación post re-ranking**

In [19]:
# Comparar resultados finales
results = pd.DataFrame([
    ["BM25", ndcg["NDCG@10"], recall["Recall@10"], _map["MAP@10"]],
    ["CrossEncoder", ndcg_ce["NDCG@10"], recall_ce["Recall@10"], map_ce["MAP@10"]],
    ["LTR", ndcg_ltr["NDCG@10"], recall_ltr["Recall@10"], map_ltr["MAP@10"]],
], columns=["Modelo", "nDCG@10", "Recall@10", "MAP@10"])

results


Unnamed: 0,Modelo,nDCG@10,Recall@10,MAP@10
0,BM25,0.65228,0.77567,0.60711
1,CrossEncoder,0.68274,0.80628,0.63732
2,LTR,0.73362,0.82572,0.70049
