# Other LLMs usage

- Bi-encoder as first stage retriever
- Cross-encoder
- Causal LM

---

In this notebook we have some informal trial of other models.

In [1]:
### FOR COLAB

# !pip install -q condacolab
# import condacolab
# condacolab.install()

# !conda install -c pytorch faiss-gpu -y
# !conda install -c conda-forge py-xgboost -y

# !pip install --upgrade -q python-terrier
# !pip install -q sentence_transformers ipdb emoji
# !pip Install -q pyterrier-caching

# !gdown 1HhgXzyEpsZNcenU9XhJuOYyDUKEzUse4
# !unzip pir_data.zip

In [2]:
import pandas as pd
import re
import os
import warnings
import shutil
import torch
import numpy as np
import random
import time
import faiss
import joblib
from functools import partial
import math

# Hugging Face
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, QuantoConfig
# VISUALIZATION
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

# TEXT PROCESSING
from textblob import TextBlob
import emoji
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# TERRIER
from pyterrier.measures import *
import pyterrier as pt
from pyterrier_caching import RetrieverCache

# Move to the parent directory
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

if not pt.java.started():
    pt.utils.set_tqdm('notebook')
    pt.java.init()


# FILTER WARNINGS
warnings.filterwarnings(
    "ignore",
    message="The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead",
)
warnings.filterwarnings(
    "ignore",
    message="`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48",
)

# GLOBAL VARIABLES
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SET SEED
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [3]:
##### UTILITY function
### DISPLAY STYLED df
# set with colors highest values in each column
def display_styled(df, ignore_cols=[], color="#37614a"):
    """ignore_cols: list of columns to not color"""
    def highlight_max(s):
        if s.name in ignore_cols:  # Skip styling for the 'Name' column
            return ['' for _ in s]
        is_max = s == s.max()
        return [f'font-weight: bold; background-color: {color};' if v else '' for v in is_max]

    styled_df = (
        df.style
        .apply(highlight_max, axis=0)  # Apply styling
        .format({col: "{:.3f}" for col in df.select_dtypes(include='number').columns})  # Format numeric columns only
    )
    display(styled_df)
    return None

Load data

In [4]:
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def preprocess_text(text, apply_stemmer=False, remove_stopwords=False):
    # remove emojis
    text = emoji.replace_emoji(text, "")
    # remove links
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # remove html tags
    # text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # lowercase verything
    text = text.lower()
    # remove backslashes
    text = re.sub(r"\\", "", text)
    # remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # remove whitespaces
    text = re.sub(r"\s+", " ", text)
    # remove leading and trailing whites
    text = text.strip()
    # apply spelling correction
    # text = TextBlob(text).correct()
    tokens = text.split()
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stop_words]
    if apply_stemmer:
        tokens = [stemmer.stem(t) for t in tokens]
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /home/ghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# COLLECTION OF DOCUMENTS (ANSWERS)
def preprocess_corpus(df):
    df = df.reset_index()
    df.columns = ['docno', 'text']
    df = df.reset_index(drop=True)
    return df

corpus_df = preprocess_corpus(pd.read_json('PIR_data/answer_retrieval/subset_answers.json', orient='index'))

# SAMPLES (QUERIES)
def preprocess_queries_df(path):
    df = pd.read_json(path, lines=True)
    df = df[['id', 'text', 'user_id', 'timestamp']]
    df.columns = ['qid', 'query_unprocessed', 'user_id', 'timestamp']
    df['query'] = df['query_unprocessed'].apply(lambda x: preprocess_text(x, apply_stemmer=True, remove_stopwords=True))
    df['timestamp'] = df["timestamp"].astype(int) // 10**9
    df['user_id'] = df['user_id'].astype(str)
    df = df.reset_index(drop=True)
    return df

train_queries = preprocess_queries_df('PIR_data/answer_retrieval/train/subset_data.jsonl')
val_queries = preprocess_queries_df('PIR_data/answer_retrieval/val/subset_data.jsonl')
test_queries = preprocess_queries_df('PIR_data/answer_retrieval/test/subset_data.jsonl')

# QRELS
def preprocess_qrels_df(path):
    df = pd.read_json(path, orient='index').reset_index()
    df.columns = ['qid', 'docno']
    df['label'] = 1
    df = df.reset_index(drop=True)
    return df

train_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/train/qrels.json')
val_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/val/qrels.json')
test_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/test/qrels.json')

print("ANSWERS")
display(corpus_df.head(3))
print("QUERIES")
display(train_queries.head(3))
print("QRELS")
display(train_qrels.head(3)) 

# Create an index:docno dictionary for the corpus_df dataframe
index_docno_dict = {index: row['docno'] for index, row in corpus_df.iterrows()}
k = 0
print(f"index = {k}, docno = {index_docno_dict[k]}")

ANSWERS


Unnamed: 0,docno,text
0,writers_2010,TL;DRIf you're going to do present tense do it...
1,writers_2018,"Your writing style is stream-of-consciousness,..."
2,writers_2023,Place emphasis on uncomfortable things. Depend...


QUERIES


Unnamed: 0,qid,query_unprocessed,user_id,timestamp,query
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...
1,academia_100456,Is there a free (as in freedom) alternative to...,1106095,1513191752,free freedom altern publon review journal allo...
2,academia_103390,Search for StackExchange citations with Google...,1532620,1517935259,search stackexchang citat googl scholar possib...


QRELS


Unnamed: 0,qid,docno,label
0,academia_100305,academia_100217,1
1,academia_100456,academia_100462,1
2,academia_103390,academia_103391,1


index = 0, docno = writers_2010


In [6]:
##### REPORTED FROM PREVIOUS NOTEBOOK
### TAGS SCORE

path = './index_sepqa/user_tags_full.joblib'
USER_TAGS = joblib.load(path)

def get_user_tags(user_id, timestamp, include_curr_timestamp, user_tags=USER_TAGS):
    """
    Get the tags of a user at a given timestamp.

    include_curr_timestamp: if True, the tags at the given timestamp are included.
                            if False, the tags at the given timestamp are excluded.
    """
    tags = set()
    timestamp = int(timestamp)
    
    if include_curr_timestamp == False:
        timestamp -= 1 # exclude the question at the given timestamp

    # if the user_id is not in the user_tags dictionary, return an empty set (no profile for the user)
    if user_id not in user_tags:
        return tags
    
    for ts, user_tags in user_tags[user_id]:
        if ts <= timestamp:
            tags = tags.union(user_tags)
        # the ts are sorted, so we can break when we reach the timestamp
        else:
            break
    return tags

def _get_tags_score(df, get_user_tags_fn=get_user_tags):
    """
    get scores based on the tags of the user that asked the question
    and the user that have written the answer.

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    
    def compute_score(tags_uq, tags_ua):
        """
        tags_uq: set of tags of the user that asked the question
        tags_ua: set of tags of the user that wrote the answer
        """
        return len(tags_uq.intersection(tags_ua)) / (len(tags_uq) + 1)

    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the tags of the user that asked the question
    tags_uq = get_user_tags_fn(uq, tq, include_curr_timestamp=True)

    # users that have written the answers
    uaS = df['doc_user_id'].values
    # get the tags of the users that have written the answers
    tags_uaS = [get_user_tags_fn(ua, tq, include_curr_timestamp=False) for ua in uaS]
    # compute the score for each answer
    scores = [compute_score(tags_uq, tags_ua) for tags_ua in tags_uaS]
    return scores


##### REPORTED FROM PREVIOUS NOTEBOOK
### NUM QUESTIONS


def get_user_num_questions(user_id, timestamp, user_tags=USER_TAGS):
    """
    Get the number of questions that a user have written at a given timestamp.
    the current timestamp is included.
    """
    num_questions = 0
    timestamp = int(timestamp)
    
    # if the user_id is not in the user_tags dictionary, return 0 (no profile for the user)
    if user_id not in user_tags:
        return num_questions
    
    for ts, user_tags in user_tags[user_id]:
        if ts <= timestamp:
            num_questions += 1
        # early stopping since the timestamps are sorted
        else:
            break
    return num_questions

# THE NUM_QUESTIONS IS REFERRED TO THE USER THAT ASKED THE QUESTION
def _get_num_questions(df, get_user_num_questions_fn=get_user_num_questions):
    """
    get the number of questions of the user that asked the question
    """
    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the number of questions of the user that asked the question
    scores = [get_user_num_questions_fn(uq, tq)]
    scores = scores * len(df) # repeat the score for each document in the batch
    return scores


---
# Bi-Encoder as first stage retriever

In [7]:

def _transform_biencoder(df, FAISS_INDEX, biencoder_model, text_field='query_unprocessed', k=100, index_docno_dict=None):
    """
    Transform queries with a biencoder model.

    Used as an argument of pyterrier.apply.generic()
        => The input is a dataframe of queries.
        => The output is a dataframe of ranked documents.
           The size of the output dataframe is the number of queries * k.

    Args:
        df (pd.DataFrame): Input dataframe with queries.
        FAISS_INDEX: FAISS index for document embeddings.
        biencoder_model: Bi-encoder model for encoding queries.
        text_field (str): Column name for query text in the dataframe.
        k (int): Number of top results to return per query.
        index_docno_dict (dict): Dictionary mapping FAISS indices to document IDs.

    Returns:
        pd.DataFrame: A dataframe with ranked documents for each query.
    """
    def get_dense_scores(df):
        """
        Get cosine similarity scores with a biencoder model using a FAISS FlatIndex.

        Args:
            df (pd.DataFrame): Input dataframe with query details.

        Returns:
            pd.DataFrame: Dataframe with top-k document scores and ranks.
        """
        if not all(df['qid'] == df['qid'].iloc[0]):
            raise ValueError("Not all qids in the batch are equal.")

        # Get the query unprocessed text
        query_text = df[text_field].iloc[0]

        # Encode the query to get its embedding
        query_embedding = biencoder_model.encode(query_text).astype('float32')

        # Normalize for cosine similarity
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Search the FAISS index
        scores, indices = FAISS_INDEX.search(np.array([query_embedding]), k)

        # Expand the input dataframe for the top-k results
        res_df = pd.concat([df] * k, ignore_index=True)

        # Map FAISS indices to document IDs
        docnos = [index_docno_dict[i] for i in indices[0]]
        res_df['docno'] = docnos
        res_df['score'] = scores[0]

        return res_df

    results = []
    cols = ['Index'] + df.columns.tolist()

    # Process each query row-by-row
    # I get the top-k results for each query
    for row in df.itertuples():
        row_df = pd.DataFrame([row], columns=cols)
        res_df = get_dense_scores(row_df)
        results.append(res_df)

    # Concatenate all results and add ranks
    results_df = pd.concat(results, ignore_index=True)
    return pt.model.add_ranks(results_df)


In [8]:
### BM25
path = "./index_sepqa/index_bm25/data.properties"
bm25_index = pt.IndexFactory.of(path)

bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    properties={"termpipelines": ""},
) % 100

### BI-ENCODER
index_path = "./index_sepqa/MiniLM_faiss_IndexFlatIP.index"
faiss_index = faiss.read_index(index_path)
biencoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

transform_biencoder = partial(_transform_biencoder, FAISS_INDEX=faiss_index, biencoder_model=biencoder_model, k=100, index_docno_dict=index_docno_dict)
bi_enc = pt.apply.generic(transform_biencoder)

In [56]:
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']

res = pt.Experiment(
    [bm25, bi_enc],
    val_queries,
    val_qrels,
    names=["BM25", "Bi-Encoder"],
    eval_metrics=metrics,
)
display(res.round(3))

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,BM25,0.755,0.286,0.814,0.841,0.969,0.815,13.172
1,Bi-Encoder,0.918,0.316,0.938,0.952,1.0,0.941,12.257


### Best Pipeline with BiEncoder as first stage retriever

In [64]:
### BM25
path = "./index_sepqa/index_bm25_users/data.properties"
bm25_index = pt.IndexFactory.of(path)
bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    properties={"termpipelines": ""}, 
    metadata=["docno", "doc_user_id"] # ADD doc_user_id TO THE METADATA TO BE RETRIEVED
)
norm_bm25 = bm25 >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES


### BI-ENCODER
transform_biencoder = partial(_transform_biencoder, FAISS_INDEX=faiss_index, biencoder_model=biencoder_model, k=100, index_docno_dict=index_docno_dict)
bi_enc = pt.apply.generic(transform_biencoder)
norm_bi_enc = bi_enc >> pt.pipelines.PerQueryMaxMinScoreTransformer()

### TAGS-SCORE
tags_score = pt.apply.doc_score(_get_tags_score, batch_size=64)
norm_tags_score = tags_score >> pt.pipelines.PerQueryMaxMinScoreTransformer()

### NUM QUESTIONS
num_questions = pt.apply.doc_score(_get_num_questions, batch_size=64)
def _score_wa(df, k=512):
    """We keep the tags score only if the user that asked the question has written at least k questions."""
    # FEATURES = [bi_enc, bm25, tags_score, num_questions]
    num_questions = df['features'][-1]
    if num_questions >= k:
        weights = [0.7, 0.1, 0.2]
    else:
        weights = [0.9, 0.1, 0]

    return np.dot(df['features'][:-1], weights)
score_wa = partial(_score_wa, k=512)
wa_with_heuristics = pt.apply.doc_score(score_wa)

In [73]:
### GET doc_user_id
path = './index_sepqa/subset_answers_with_users.json'
corpus_df_WITH_USERS = pd.read_json(path, orient="records", lines=True)
corpus_df_WITH_USERS['doc_user_id'] = corpus_df_WITH_USERS['doc_user_id'].astype(str)
DOC_USER_DICT = corpus_df_WITH_USERS.set_index('docno')['doc_user_id'].to_dict()

# this block add the doc_user_id to the dataframe in the retrieval pipeline
# here is done separately, because we are using the biencoder at first stage
# with BM25 we retrieve it from the metadata (here we cant)
get_doc_user_id = pt.apply.doc_user_id(lambda row: DOC_USER_DICT[row['docno']])


### SCORE TO FEATURE
# this block add score to the features
# so I don't need to rerun the biencoder for the second stage
# we also normalize it
score_to_feature_norm = pt.apply.doc_score(lambda x: x['score'], batch_size=64) >> pt.pipelines.PerQueryMaxMinScoreTransformer()

In [79]:
### PERSONALIZED RERANKING WA with heuristics
reranking_WAH = bi_enc % 100 \
                >> get_doc_user_id \
                >> score_to_feature_norm ** norm_bm25 ** norm_tags_score ** num_questions \
                >> wa_with_heuristics

In [80]:
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']

res = pt.Experiment(
    [bm25, bi_enc, reranking_WAH],
    val_queries,
    val_qrels,
    names=["BM25", "Bi-Encoder", "Best model with Bi-Encoder as first stage"],
    eval_metrics=metrics,
)
display(res.round(3))

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,BM25,0.755,0.286,0.814,0.841,0.969,0.815,15.575
1,Bi-Encoder,0.918,0.316,0.938,0.952,1.0,0.941,12.185
2,Best model with Bi-Encoder as first stage,0.918,0.32,0.943,0.954,1.0,0.943,40.784


In [81]:
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']

res = pt.Experiment(
    [bm25, bi_enc, reranking_WAH],
    test_queries,
    test_qrels,
    names=["BM25", "Bi-Encoder", "Best model with Bi-Encoder as first stage"],
    eval_metrics=metrics,
)
display(res.round(3))

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,BM25,0.806,0.286,0.836,0.861,0.969,0.843,13.482
1,Bi-Encoder,0.857,0.31,0.898,0.912,1.0,0.895,11.614
2,Best model with Bi-Encoder as first stage,0.878,0.31,0.908,0.923,1.0,0.911,40.793


---
---
Next we have some fun methods, but they need to be refined. this is just a trial.

---

# Cross-Encoder

In [9]:
from sentence_transformers import CrossEncoder

model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
cross_model = CrossEncoder(model_name, max_length=512, device='cuda')

In [10]:
# corpus_df is indexed by 'docno'
corpus_df_docno = corpus_df.set_index('docno')

def _get_cross_scores(df, cross_model, query_field='query_unprocessed', corpus_df=corpus_df_docno):
    """
    get cosine similarity score with a crossencoder model

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    query_text = df[query_field].values
    docno_list = df['docno'].tolist()
    doc_text = corpus_df.reindex(docno_list)['text'].values

    scores = cross_model.predict(list(zip(query_text, doc_text)))
    return scores

cross_scores = partial(_get_cross_scores, cross_model=cross_model)
cross_enc = pt.apply.doc_score(cross_scores, batch_size=64)

In [11]:
# BI-ENCODER % 10 >> CROSS-ENCODER
cross_pipe = bi_enc % 10 >> cross_enc 

res = pt.Experiment(
    [bm25, bi_enc, cross_pipe],
    val_queries,
    val_qrels,
    names=["BM25", "Bi-Encoder", "Cross-Encoder"],
    eval_metrics=metrics,
)
display(res.round(3))

del cross_model

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,BM25,0.755,0.286,0.814,0.841,0.969,0.815,15.38
1,Bi-Encoder,0.918,0.316,0.938,0.952,1.0,0.941,11.18
2,Cross-Encoder,0.857,0.313,0.903,0.924,0.99,0.902,48.693


The cross encoder model should be trained.

---

# Causal LM

```python
input_str = f"""
    Answer: {correct_answer}
    Question: {question}
    Is the Answer relevant to the Question? (Yes/No)
"""
```

We take as score the output of the model for the token associated to "_Yes"

In [12]:
torch.set_grad_enabled(False)
torch.cuda.empty_cache()

# QUANTO CONFIG
quantization_config = QuantoConfig(weights="int8")

# MODEL
model_name = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="cuda",
    torch_dtype=torch.float16,  # Efficient data type for GPU
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    use_safetensors=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def print_top_k_next_tokens(input_string, tokenizer=tokenizer, model=model, k=10):

    # Tokenize to ids and move to the same device as the model
    input_ids = tokenizer.encode(input_string, return_tensors="pt").to(model.device)

    # Call model() to get logits
    logits = model(input_ids)['logits']
    print("shape of logits:", logits.shape, "\n")

    # Only care about the last projection in the last batch
    logits = logits[-1, -1]

    # softmax() to get probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Keep only the top k
    probs, ids = torch.topk(probs, k)

    # Convert ids to tokens
    texts = tokenizer.convert_ids_to_tokens(ids)

    # Print
    for prob, text in zip(probs, texts):
        print(f"{prob:.4f}: \"{text}\"")

    del input_ids, logits, probs, ids, texts
    torch.cuda.empty_cache()
    return None

In [14]:
input_string = "Merry Christmas to all, and to all a good"
print_top_k_next_tokens(input_string)

shape of logits: torch.Size([1, 11, 32064]) 

0.7862: "▁night"
0.0335: "night"
0.0197: "▁day"
0.0158: "▁rest"
0.0144: "▁one"
0.0064: "▁har"
0.0062: "▁health"
0.0058: "▁evening"
0.0045: "▁hol"
0.0034: "▁hunting"


In [15]:
true_token = "Yes"
true_id = tokenizer.convert_tokens_to_ids(true_token)
print(f"Token ID for '{true_token}': {true_id}")

token = tokenizer.tokenize(true_token)
specific_token_id = tokenizer.encode(true_token, add_special_tokens=False)[0]
print(f"Token ID for '{token[0]}': {specific_token_id}")

Token ID for 'Yes': 8241
Token ID for '▁Yes': 3869


here we do an example for our task.

Give the Question and the Anwer, we ask if the Answer is relevant to the question.

In [16]:
sample = train_queries.head(1)
sample_qrel = train_qrels[train_qrels['qid'] == sample['qid'].values[0]]
correct_doc = corpus_df[corpus_df['docno'].isin(sample_qrel['docno'])]
incorrect_docs = corpus_df[~corpus_df['docno'].isin(sample_qrel['docno'])].iloc[:5]

question = sample['query_unprocessed'].iloc[0]
correct_answer = correct_doc['text'].iloc[0]
incorrect_answer = incorrect_docs['text'].iloc[0]

input_string = f"""
        Answer: {correct_answer}
        Question: {question}
        Is the Answer relevant to the Question? (Yes/No)
    """

print_top_k_next_tokens(input_string)

shape of logits: torch.Size([1, 866, 32064]) 

0.3724: "▁No"
0.1505: "1"
0.1136: "▁Yes"
0.0554: "▁-"
0.0405: "<0x0A>"
0.0325: "2"
0.0224: "▁["
0.0197: "▁no"
0.0185: "<|end|>"
0.0144: "▁Answer"


In [17]:
def get_yes_prob(question, answer, tokenizer=tokenizer, model=model):
    # '_Yes' id is 3869
    yes_id = 3869
    input_string = f"""
        Answer: {answer}
        Question: {question}
        Is the Answer relevant to the Question? (Yes/No)
    """

    input_ids = tokenizer.encode(input_string, return_tensors="pt").to(model.device)
    logits = model(input_ids)['logits']
    logits = logits[-1, -1]
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Get the 'Yes' probability
    yes_prob = probs[yes_id].item()

    del input_ids, logits, probs
    torch.cuda.empty_cache()

    return yes_prob


In [18]:
correct_prob = get_yes_prob(question, correct_answer)
incorrect_prob = get_yes_prob(question, incorrect_answer)

print(f"Yes probability for the correct answer: {correct_prob:.4f}")
print(f"Yes probability for the incorrect answer: {incorrect_prob:.4f}")

Yes probability for the correct answer: 0.1136
Yes probability for the incorrect answer: 0.0775


In [39]:
# corpus_df is indexed by 'docno'
corpus_df_docno = corpus_df.set_index('docno')

def _get_YesLLM_scores_single(df, tokenizer, model, yes_id=3869, query_field='query_unprocessed', corpus_df=corpus_df_docno, max_doc_length=512):
    """
    Get probability for '_Yes' token for a (question, answer) pairs.

    This will be used as an argument for pyterrier.apply.doc_score().
    """
    
    # Extract the question and answer texts
    query_text = df[query_field]
    docno = df['docno']
    doc_text = corpus_df.loc[docno]['text']
    doc_text = doc_text.split()
    doc_text = " ".join(doc_text[:max_doc_length])

    # Get the probability for the 'Yes' token
    yes_prob = get_yes_prob(query_text, doc_text, tokenizer=tokenizer, model=model)

    return yes_prob


In [40]:
### BM25
path = "./index_sepqa/index_bm25/data.properties"
bm25_index = pt.IndexFactory.of(path)

bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    properties={"termpipelines": ""},
)

### BI-ENCODER
index_path = "./index_sepqa/MiniLM_faiss_IndexFlatIP.index"
faiss_index = faiss.read_index(index_path)
biencoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

transform_biencoder = partial(_transform_biencoder, FAISS_INDEX=faiss_index, biencoder_model=biencoder_model, k=100, index_docno_dict=index_docno_dict)
bi_enc = pt.apply.generic(transform_biencoder)

### LLM SCORE
# llm_score = partial(_get_YesLLM_scores, model=model, tokenizer=tokenizer)
# llm_score_transformer = pt.apply.doc_score(llm_score, batch_size=16)
llm_score = partial(_get_YesLLM_scores_single, model=model, tokenizer=tokenizer)
llm_score_transformer = pt.apply.doc_score(llm_score)
pipeline = bi_enc % 10 >> llm_score_transformer

In [43]:
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']

res = pt.Experiment(
    [bm25, bi_enc, pipeline],
    val_queries,
    val_qrels,
    names=["BM25", "Bi-Encoder", "LLM rerank"],
    eval_metrics=metrics,
)

display(res.round(3))

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,BM25,0.755,0.286,0.814,0.841,0.969,0.815,17.491
1,Bi-Encoder,0.918,0.316,0.938,0.952,1.0,0.941,12.386
2,LLM rerank,0.071,0.085,0.175,0.425,0.99,0.259,2828.864


the idea is fun, but in this raw way is not useful.