# Cold start problem - Personalized IR

Armanni Luca - 509085

Ghiotto Alessandro - 513944

---

In this bonus notebook, we explore some simple approaches for "addressing" the cold start problem. Unfortunately, most users have written only a single question, resulting in an insignificant tags score in the majority of cases. 

We will experiment with methods that leverage the tags score only in meaningful scenarios, when users have demonstrated some level of activity on the platform. More than fixing the cold start problem, we will just put all of our attention on the topicality and not on the personalization, if no user data is available.

### Table of contents:
- Num questions
- Setting a treshold
- Simple Heuristics for Weight Adjustment

In [1]:
### FOR COLAB

# !pip install -q condacolab
# import condacolab
# condacolab.install()

# !conda install -c pytorch faiss-gpu -y

# !pip install --upgrade -q python-terrier
# !pip install -q sentence_transformers ipdb emoji
# !pip Install -q pyterrier-caching

# !gdown 1HhgXzyEpsZNcenU9XhJuOYyDUKEzUse4
# !unzip pir_data.zip

In [2]:
import pandas as pd
import re
import os
import warnings
import shutil
import torch
import numpy as np
import random
import time
import faiss
import joblib
from functools import partial
import math

# Hugging Face
from sentence_transformers import SentenceTransformer

# VISUALIZATION
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

# TEXT PROCESSING
from textblob import TextBlob
import emoji
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# TERRIER
from pyterrier.measures import *
import pyterrier as pt
from pyterrier_caching import RetrieverCache

if not pt.java.started():
    pt.utils.set_tqdm('notebook')
    pt.java.init()

# SET SEED
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]


In [3]:
##### UTILITY function
### DISPLAY STYLED df
# set with colors highest values in each column
def display_styled(df, ignore_cols=[], color="#37614a"):
    """ignore_cols: list of columns to not color"""
    def highlight_max(s):
        if s.name in ignore_cols:  # Skip styling for the 'Name' column
            return ['' for _ in s]
        is_max = s == s.max()
        return [f'font-weight: bold; background-color: {color};' if v else '' for v in is_max]

    styled_df = (
        df.style
        .apply(highlight_max, axis=0)  # Apply styling
        .format({col: "{:.3f}" for col in df.select_dtypes(include='number').columns})  # Format numeric columns only
    )
    display(styled_df)
    return None

Load data

In [4]:
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def preprocess_text(text, apply_stemmer=False, remove_stopwords=False):
    # remove emojis
    text = emoji.replace_emoji(text, "")
    # remove links
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # remove html tags
    # text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # lowercase verything
    text = text.lower()
    # remove backslashes
    text = re.sub(r"\\", "", text)
    # remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # remove whitespaces
    text = re.sub(r"\s+", " ", text)
    # remove leading and trailing whites
    text = text.strip()
    # apply spelling correction
    # text = TextBlob(text).correct()
    tokens = text.split()
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stop_words]
    if apply_stemmer:
        tokens = [stemmer.stem(t) for t in tokens]
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /home/ghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# COLLECTION OF DOCUMENTS (ANSWERS)
def preprocess_corpus(df):
    df = df.reset_index()
    df.columns = ['docno', 'text']
    df = df.reset_index(drop=True)
    return df

corpus_df = preprocess_corpus(pd.read_json('PIR_data/answer_retrieval/subset_answers.json', orient='index'))

# SAMPLES (QUERIES)
def preprocess_queries_df(path):
    df = pd.read_json(path, lines=True)
    df = df[['id', 'text', 'user_id', 'timestamp']]
    df.columns = ['qid', 'query_unprocessed', 'user_id', 'timestamp']
    df['query'] = df['query_unprocessed'].apply(lambda x: preprocess_text(x, apply_stemmer=True, remove_stopwords=True))
    df['timestamp'] = df["timestamp"].astype(int) // 10**9
    df['user_id'] = df['user_id'].astype(str)
    df = df.reset_index(drop=True)
    return df

train_queries = preprocess_queries_df('PIR_data/answer_retrieval/train/subset_data.jsonl')
val_queries = preprocess_queries_df('PIR_data/answer_retrieval/val/subset_data.jsonl')
# test_queries = preprocess_queries_df('PIR_data/answer_retrieval/test/subset_data.jsonl')

# QRELS
def preprocess_qrels_df(path):
    df = pd.read_json(path, orient='index').reset_index()
    df.columns = ['qid', 'docno']
    df['label'] = 1
    df = df.reset_index(drop=True)
    return df

train_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/train/qrels.json')
val_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/val/qrels.json')
#btest_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/test/qrels.json')

print("ANSWERS")
display(corpus_df.head(3))
print("QUERIES")
display(train_queries.head(3))
print("QRELS")
display(train_qrels.head(3)) 

ANSWERS


Unnamed: 0,docno,text
0,writers_2010,TL;DRIf you're going to do present tense do it...
1,writers_2018,"Your writing style is stream-of-consciousness,..."
2,writers_2023,Place emphasis on uncomfortable things. Depend...


QUERIES


Unnamed: 0,qid,query_unprocessed,user_id,timestamp,query
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...
1,academia_100456,Is there a free (as in freedom) alternative to...,1106095,1513191752,free freedom altern publon review journal allo...
2,academia_103390,Search for StackExchange citations with Google...,1532620,1517935259,search stackexchang citat googl scholar possib...


QRELS


Unnamed: 0,qid,docno,label
0,academia_100305,academia_100217,1
1,academia_100456,academia_100462,1
2,academia_103390,academia_103391,1


In [6]:
##### REPORTED FROM PREVIOUS NOTEBOOK
### BIENCODER MODEL

def _get_dense_scores(df, FAISS_INDEX, biencoder_model, text_field='query_unprocessed', k=1000):
    """
    get cosine similarity score with a biencoder model, with FAISS FlatIndex

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # get the query unprocessed text
    query_text = df[text_field].iloc[0]
    # get the query embedding
    query_embedding = biencoder_model.encode(query_text).astype('float32')
    query_embedding = query_embedding / np.linalg.norm(query_embedding) # normalize for cosine similarity

    # if we are reranking
    if 'docid' in df.columns:
        # select the retrieved documents
        filter_ids = df['docid'].values
        id_selector = faiss.IDSelectorArray(np.array(filter_ids, dtype=np.int64))
        search_params = faiss.SearchParametersIVF(sel=id_selector)
        # rerank them
        k = len(filter_ids)
        distances, indices = FAISS_INDEX.search(np.array([query_embedding]), k, params=search_params)
    else:
        distances, indices = FAISS_INDEX.search(np.array([query_embedding]), k)

    # mapping {docid: score}
    score_mapping = {docid: score for docid, score in zip(indices[0], distances[0])}
    # get the scores in the original order (same as the input docids)
    scores_original_order = [score_mapping[docid] for docid in df['docid']]
    return scores_original_order

In [7]:
##### REPORTED FROM PREVIOUS NOTEBOOK
### TAGS SCORE

path = './index_sepqa/user_tags_full.joblib'
USER_TAGS = joblib.load(path)

def get_user_tags(user_id, timestamp, include_curr_timestamp, user_tags=USER_TAGS):
    """
    Get the tags of a user at a given timestamp.

    include_curr_timestamp: if True, the tags at the given timestamp are included.
                            if False, the tags at the given timestamp are excluded.
    """
    tags = set()
    timestamp = int(timestamp)
    
    if include_curr_timestamp == False:
        timestamp -= 1 # exclude the question at the given timestamp

    # if the user_id is not in the user_tags dictionary, return an empty set (no profile for the user)
    if user_id not in user_tags:
        return tags
    
    for ts, user_tags in user_tags[user_id]:
        if ts <= timestamp:
            tags = tags.union(user_tags)
    return tags

def _get_tags_score(df, get_user_tags_fn=get_user_tags):
    """
    get scores based on the tags of the user that asked the question
    and the user that have written the answer.

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    
    def compute_score(tags_uq, tags_ua):
        """
        tags_uq: set of tags of the user that asked the question
        tags_ua: set of tags of the user that wrote the answer
        """
        return len(tags_uq.intersection(tags_ua)) / (len(tags_uq) + 1)

    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the tags of the user that asked the question
    tags_uq = get_user_tags_fn(uq, tq, include_curr_timestamp=True)

    # users that have written the answers
    uaS = df['doc_user_id'].values
    # get the tags of the users that have written the answers
    tags_uaS = [get_user_tags_fn(ua, tq, include_curr_timestamp=False) for ua in uaS]
    # compute the score for each answer
    scores = [compute_score(tags_uq, tags_ua) for tags_ua in tags_uaS]
    return scores


In [8]:
### BM25
path = "./index_sepqa/index_bm25_users/data.properties"
bm25_index = pt.IndexFactory.of(path)
bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    properties={"termpipelines": ""}, 
    metadata=["docno", "doc_user_id"] # ADD doc_user_id TO THE METADATA TO BE RETRIEVED
)
norm_bm25 = bm25 >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

### BI-ENCODER
index_path = "./index_sepqa/MiniLM_faiss_IndexFlatIP.index"
faiss_index = faiss.read_index(index_path)
biencoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
get_dense_score = partial(_get_dense_scores, FAISS_INDEX=faiss_index, biencoder_model=biencoder_model)
bi_enc = pt.apply.doc_score(get_dense_score, batch_size=64)
norm_bi_enc = bi_enc >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

### TAGS-SCORE
tags_score = pt.apply.doc_score(_get_tags_score, batch_size=64)
norm_tags_score = tags_score >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

---

# Num questions

We count the number of questions that the user have written until the timestamp t. then we compute the interaction between num_questions and tags_score

1. we get the following features : norm_bi_enc ** norm_bm25 ** tags_score ** num_questions
2. we apply the function that add the intercaction between tags_score and num_questions. instead of using the raw num_questions, we use the log(num_questions). because in the long tail we have a few users that have written really a lot of questions. 
3. finally we normalize in the range [0, 1] this interaction score

$\text{final\_score} = \lambda_1 \cdot \text{BiEncoder\_score} + \lambda_2 \cdot \text{BM25\_score} + \lambda_3 \cdot \text{tags\_numQuestions\_interaction\_score}$

for $\lambda_1, \lambda_2, \lambda_3$ such that $\lambda_1 + \lambda_2 + \lambda_3 = 1$

In [9]:
def get_user_num_questions(user_id, timestamp, user_tags=USER_TAGS):
    """
    Get the number of questions that a user have written at a given timestamp.
    the current timestamp is included.
    """
    num_questions = 0
    timestamp = int(timestamp)
    
    # if the user_id is not in the user_tags dictionary, return 0 (no profile for the user)
    if user_id not in user_tags:
        return num_questions
    
    for ts, user_tags in user_tags[user_id]:
        if ts <= timestamp:
            num_questions += 1
        # early stopping since the timestamps are sorted
        else:
            break
    return num_questions

# THE NUM_QUESTIONS IS REFERRED TO THE USER THAT ASKED THE QUESTION
def _get_num_questions(df, get_user_num_questions_fn=get_user_num_questions):
    """
    get the number of questions of the user that asked the question
    """
    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the number of questions of the user that asked the question
    scores = [get_user_num_questions_fn(uq, tq)]
    scores = scores * len(df) # repeat the score for each document in the batch
    return scores


In [10]:
### NUM QUESTIONS AND TAGS-SCORE INTERACTION
def _get_interaction(df):
    # get the tags score (for the full batch directly with the fucntion)
    tags_score = _get_tags_score(df)
    # get the number of questions of the user that asked the question
    num_questions = _get_num_questions(df)
    # return the interaction
    # the tags_score is multiplied by the log of the number of questions, item-wise
    return tags_score * np.log(num_questions)

interaction_score = pt.apply.doc_score(_get_interaction, batch_size=64)
norm_interaction_score = interaction_score >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

##### PIPELINE #####
path_to_cache = "./cache/features_interaction"

features_pipeline = (bm25 % 100).compile() \
                    >> (norm_bi_enc ** norm_bm25 ** norm_interaction_score) \
                    >> pt.apply.generic(lambda x: x[['qid', 'docno','features']]) # keep only useful columns
# cache the features
cached_features_retriever = RetrieverCache(path_to_cache, features_pipeline)

In [11]:
# smaller range defined for lambda associated to BiEncoder
lambda_1_values = np.arange(0.5, 1.1, 0.1)
valid_combinations = []

# Loop over lambda_1 values
for lambda_1 in lambda_1_values:
    # Loop over lambda_2 values
    for lambda_2 in np.arange(0.0, 1.1, 0.1):
        # Calculate lambda_3 as the remainder
        lambda_3 = 1.0 - lambda_1 - lambda_2
        if 0.0 <= lambda_3 <= 1.0:
            combination = (round(float(lambda_1), 1), round(float(lambda_2), 1), round(float(lambda_3), 1))
            valid_combinations.append(combination)
        
valid_combinations_str = [(f"{l1:.1f}".lstrip("0"), f"{l2:.1f}".lstrip("0"), f"{l3:.1f}".lstrip("0"))
                           for l1, l2, l3 in valid_combinations]


### DEFINE THE PIPELINES

def _score_from_features(x, l1, l2, l3):
    """given the features and the values of the three lambdas, compute the score"""
    features = x['features']
    return l1*features[0] + l2*features[1] + l3*features[2]

# create a pipeline for each configuration of (l1, l2, l3)
pipelines = [
    cached_features_retriever >> pt.apply.doc_score(partial(_score_from_features, l1=l1, l2=l2, l3=l3))
    for l1, l2, l3 in valid_combinations
]

print("Number of pipelines:", len(pipelines))

Number of pipelines: 21


In [12]:
names = [f"val_BiEncoder_BM25_Tags*nQuestions_ ({l1},{l2},{l3})" for l1, l2, l3 in valid_combinations_str]
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
results = pt.Experiment(
    pipelines,
    val_queries,
    val_qrels,
    eval_metrics=metrics,
    names=names,
    save_dir=save_dir,
    save_mode="overwrite",
)

path = "./experiments/personalized_ir/results_val_BiEncoder_BM25_Tags*nQuestions_.csv"
results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_Tags*nQuestions_ (.5,.0,.5)",0.214,0.218,0.475,0.587,0.969,0.472,1.564
1,"val_BiEncoder_BM25_Tags*nQuestions_ (.5,.1,.4)",0.551,0.303,0.763,0.783,0.969,0.724,0.602
2,"val_BiEncoder_BM25_Tags*nQuestions_ (.5,.2,.3)",0.878,0.313,0.915,0.925,0.969,0.911,0.616
3,"val_BiEncoder_BM25_Tags*nQuestions_ (.5,.3,.2)",0.888,0.313,0.919,0.923,0.969,0.916,0.609
4,"val_BiEncoder_BM25_Tags*nQuestions_ (.5,.4,.1)",0.888,0.313,0.919,0.923,0.969,0.916,0.618
5,"val_BiEncoder_BM25_Tags*nQuestions_ (.5,.5,.0)",0.888,0.313,0.919,0.925,0.969,0.915,0.675
6,"val_BiEncoder_BM25_Tags*nQuestions_ (.6,.0,.4)",0.551,0.296,0.75,0.778,0.969,0.718,0.627
7,"val_BiEncoder_BM25_Tags*nQuestions_ (.6,.1,.3)",0.857,0.313,0.907,0.918,0.969,0.901,0.621
8,"val_BiEncoder_BM25_Tags*nQuestions_ (.6,.2,.2)",0.918,0.316,0.936,0.944,0.969,0.935,0.626
9,"val_BiEncoder_BM25_Tags*nQuestions_ (.6,.3,.1)",0.908,0.316,0.934,0.94,0.969,0.931,0.616


this results are quite the same compared to using the simple Tags_score

---
# Setting a treshold

Instead of multiplying the two scores and looking at the interaction, we use the tags score only if the number of questions surpasses a certain treshold.

$\text{final\_score} = \lambda_1 \cdot \text{BiEncoder\_score} + \lambda_2 \cdot \text{BM25\_score} + \lambda_3 \cdot \text{tags\_score\_tresholded}$

for $\lambda_1, \lambda_2, \lambda_3$ such that $\lambda_1 + \lambda_2 + \lambda_3 = 1$

where 

$
\text{tags\_score\_tresholded} = 
    \begin{cases}
       \text{tags\_score}, &\quad\text{if num\_questions}\ge k\\
       0, &\quad\text{otherwise.} \\ 
     \end{cases}
$

for a certain treshold $k$

In [13]:
### NUM QUESTIONS
num_questions = pt.apply.doc_score(_get_num_questions, batch_size=64)

### TRSHOLDING THE TAGS SCORE VIA THE NUMBER OF QUESTIONS
def _tags_score_filter(df, k=3):
    """
    We keep the tags score only if the user that asked the question has written at least k questions.
    """
    # Extract number of questions and tags score
    num_questions = df['features'][-1]
    tags_score = df['features'][-2]
    # if the user has written at least k questions, keep the tags score
    tags_score = tags_score if num_questions >= k else 0

    return np.append(df['features'][:-2], tags_score)

##### PIPELINE #####
path_to_cache = "./cache/features_with_questions"

features_pipeline = (bm25 % 100).compile() \
                    >> (norm_bi_enc ** norm_bm25 ** norm_tags_score ** num_questions) \
                    >> pt.apply.generic(lambda x: x[['qid', 'docno','features']]) # keep only useful columns

# cache the features
cached_features_retriever = RetrieverCache(path_to_cache, features_pipeline)

we do a search over the value of k

In [14]:
# smaller range defined for lambda associated to BiEncoder
lambda_1_values = np.arange(0.6, 1.1, 0.1)
valid_combinations = []

# Loop over lambda_1 values
for lambda_1 in lambda_1_values:
    # Loop over lambda_2 values
    for lambda_2 in np.arange(0.0, 1.1, 0.1):
        # Calculate lambda_3 as the remainder
        lambda_3 = 1.0 - lambda_1 - lambda_2
        # WE FILTER OUT SOME COMBINATIONS
        # since we are thresholding the tags score, we want to keep only the combinations
        # where lambda_3 is high
        if 0.1 <= lambda_3 <= 1.0 and lambda_3 >= lambda_2:
            combination = (round(float(lambda_1), 1), round(float(lambda_2), 1), round(float(lambda_3), 1))
            valid_combinations.append(combination)


k_values = [10, 100, 500]

save_dir = "./experiments/personalized_ir/"
results = []

for k in k_values:

    results_at_k = []
    for lambda_1, lambda_2, lambda_3 in valid_combinations:
        tags_score_filter_at_k = partial(_tags_score_filter, k=k)
        linear_combination = partial(_score_from_features, l1=lambda_1, l2=lambda_2, l3=lambda_3)
        
        rf_pipe = cached_features_retriever \
                    >> pt.apply.doc_features(tags_score_filter_at_k) \
                    >> pt.apply.doc_score(linear_combination) \

        results_at_k.append(
            pt.Experiment(
                [rf_pipe],
                val_queries,
                val_qrels,
                eval_metrics=metrics,
                names=[f'val_BiEncoder_BM25_Tags_tresh_k={k} ({lambda_1},{lambda_2},{lambda_3})'],
                save_dir=save_dir,
                save_mode="overwrite",
            )
        )
    results.append(results_at_k)

res_df = []
for i, res in enumerate(results):
    print(f"Results for treshold k = {k_values[i]}")
    res = pd.concat(res).reset_index(drop=True)
    res_df.append(res)
    display_styled(res, ignore_cols=['name', 'mrt'])

path = "./experiments/personalized_ir/results_val_BiEncoder_BM25_Tags_tresh_k.csv"
results = pd.concat(res_df).reset_index(drop=True)
results.to_csv(path)


Results for treshold k = 10


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.6,0.0,0.4)",0.551,0.296,0.75,0.778,0.969,0.718,1.541
1,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.6,0.1,0.3)",0.857,0.313,0.907,0.918,0.969,0.901,1.417
2,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.6,0.2,0.2)",0.918,0.316,0.936,0.944,0.969,0.935,1.394
3,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.7,0.0,0.3)",0.837,0.313,0.898,0.909,0.969,0.889,1.383
4,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.7,0.1,0.2)",0.918,0.32,0.943,0.946,0.969,0.938,1.395
5,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.8,0.0,0.2)",0.898,0.316,0.93,0.938,0.969,0.927,1.396
6,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.8,0.1,0.1)",0.908,0.32,0.939,0.942,0.969,0.933,1.385
7,"val_BiEncoder_BM25_Tags_tresh_k=10 (0.9,0.0,0.1)",0.908,0.32,0.938,0.941,0.969,0.932,1.403


Results for treshold k = 100


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.6,0.0,0.4)",0.551,0.296,0.75,0.778,0.969,0.718,1.392
1,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.6,0.1,0.3)",0.857,0.313,0.907,0.918,0.969,0.901,1.39
2,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.6,0.2,0.2)",0.918,0.316,0.936,0.944,0.969,0.935,1.379
3,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.7,0.0,0.3)",0.837,0.313,0.898,0.909,0.969,0.889,1.49
4,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.7,0.1,0.2)",0.918,0.32,0.943,0.946,0.969,0.938,1.539
5,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.8,0.0,0.2)",0.898,0.316,0.93,0.938,0.969,0.927,1.463
6,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.8,0.1,0.1)",0.908,0.32,0.939,0.942,0.969,0.933,1.547
7,"val_BiEncoder_BM25_Tags_tresh_k=100 (0.9,0.0,0.1)",0.908,0.32,0.938,0.941,0.969,0.932,1.455


Results for treshold k = 500


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.6,0.0,0.4)",0.745,0.306,0.848,0.868,0.969,0.834,1.367
1,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.6,0.1,0.3)",0.898,0.313,0.924,0.936,0.969,0.925,1.407
2,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.6,0.2,0.2)",0.918,0.313,0.931,0.942,0.969,0.933,1.363
3,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.7,0.0,0.3)",0.878,0.313,0.915,0.927,0.969,0.913,1.366
4,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.7,0.1,0.2)",0.918,0.316,0.938,0.945,0.969,0.938,1.378
5,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.8,0.0,0.2)",0.908,0.313,0.927,0.94,0.969,0.93,1.353
6,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.8,0.1,0.1)",0.918,0.316,0.936,0.944,0.969,0.936,1.64
7,"val_BiEncoder_BM25_Tags_tresh_k=500 (0.9,0.0,0.1)",0.908,0.313,0.927,0.94,0.969,0.93,1.438


Both the interaction and the treshold din't improve the results.

---

# Simple Heuristics for Weight Adjustment

we have seen that linearly combining scores from different systems gives very good results. So in this method we just use the best non personalized model if the number of question is low, otherwise we use the personalized one. The input are the Biencoder score, BM25 score and the tags score.

$
\text{final\_score} = 
    \begin{cases}
       .7 \cdot \text{BiEncoder\_score} + .1 \cdot \text{BM25\_score} + .2 \cdot \text{tags\_score}, &\quad\text{if \, num\_questions}\ge k\\
       .9 \cdot \text{BiEncoder\_score} + .1 \cdot \text{BM25\_score}, &\quad\text{otherwise.} \\ 
     \end{cases}
$

This final score give birth to the following pipeline:

$\text{BM25 \% 100} >> 
\begin{cases}
    .7 \cdot \text{BiEncoder\_score} + .1 \cdot \text{BM25\_score} + .2 \cdot \text{tags\_score}, &\quad\text{if \, num\_questions}\ge 512\\
    .9 \cdot \text{BiEncoder\_score} + .1 \cdot \text{BM25\_score}, &\quad\text{otherwise.} \\ 
\end{cases}$

In [15]:
### NUM QUESTIONS
num_questions = pt.apply.doc_score(_get_num_questions, batch_size=64)

### THE WEIGHTS ARE CHOSEN BASED ON THE NUMBER OF QUESTIONS
def _score_wa(df, k=3):
    """
    We keep the tags score only if the user that asked the question has written at least k questions.
    """
    num_questions = df['features'][-1]
    if num_questions >= k:
        weights = [0.7, 0.1, 0.2]
    else:
        weights = [0.9, 0.1, 0]

    return np.dot(df['features'][:-1], weights)

##### PIPELINE #####
path_to_cache = "./cache/features_with_questions"

features_pipeline = (bm25 % 100).compile() \
                    >> (norm_bi_enc ** norm_bm25 ** norm_tags_score ** num_questions) \
                    >> pt.apply.generic(lambda x: x[['qid', 'docno','features']]) # keep only useful columns

# cache the features
cached_features_retriever = RetrieverCache(path_to_cache, features_pipeline)

In [16]:
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"

k_values = [2, 8, 32, 128, 256, 512, 1024, 2048]
results = []

for k in k_values:
    wa_score = partial(_score_wa, k=k)
    pipe = cached_features_retriever >> pt.apply.doc_score(wa_score)

    results.append(
            pt.Experiment(
                [pipe],
                val_queries,
                val_qrels,
                eval_metrics=metrics,
                names=[f'val_BiEncoder_BM25_Tags_WA_{k}'],
                save_dir=save_dir,
                save_mode="overwrite",
            )
    )

results = pd.concat(results).reset_index(drop=True)
path = "./experiments/personalized_ir/results_BiEncoder_BM25_Tags_WA.csv"
results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,val_BiEncoder_BM25_Tags_WA_2,0.918,0.32,0.943,0.946,0.969,0.938,0.958
1,val_BiEncoder_BM25_Tags_WA_8,0.918,0.32,0.943,0.946,0.969,0.938,0.853
2,val_BiEncoder_BM25_Tags_WA_32,0.918,0.32,0.943,0.946,0.969,0.938,0.877
3,val_BiEncoder_BM25_Tags_WA_128,0.918,0.32,0.943,0.946,0.969,0.938,0.858
4,val_BiEncoder_BM25_Tags_WA_256,0.918,0.32,0.943,0.946,0.969,0.938,0.872
5,val_BiEncoder_BM25_Tags_WA_512,0.929,0.316,0.94,0.948,0.969,0.941,0.876
6,val_BiEncoder_BM25_Tags_WA_1024,0.929,0.316,0.94,0.948,0.969,0.941,0.828
7,val_BiEncoder_BM25_Tags_WA_2048,0.918,0.316,0.935,0.943,0.969,0.934,0.839


**k = 512** (alongside with 1024) gives the highest P@1