# Personalized IR

Armanni Luca - 509085

Ghiotto Alessandro - 513944

---

In this notebook we try other methods for computing the score for the personalization part of our information retrieval system. We present them separately in this notebook because they gives slightly worse results than the previous method (*notebook5.1*).

### Table of contents:
- Tags as count vectors
- Set of Communities as user profile


In [1]:
### FOR COLAB

# !pip install -q condacolab
# import condacolab
# condacolab.install()

# !conda install -c pytorch faiss-gpu -y

# !pip install --upgrade -q python-terrier
# !pip install -q sentence_transformers ipdb emoji
# !pip Install -q pyterrier-caching

# !gdown 1HhgXzyEpsZNcenU9XhJuOYyDUKEzUse4
# !unzip pir_data.zip

In [2]:
import pandas as pd
import re
import os
import warnings
import shutil
import torch
import numpy as np
import random
import time
import faiss
import joblib
from functools import partial
import math

# Hugging Face
from sentence_transformers import SentenceTransformer

# VISUALIZATION
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

# TEXT PROCESSING
from textblob import TextBlob
import emoji
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# TERRIER
from pyterrier.measures import *
import pyterrier as pt
from pyterrier_caching import RetrieverCache

if not pt.java.started():
    pt.utils.set_tqdm('notebook')
    pt.java.init()

# SET SEED
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]


In [3]:
##### UTILITY function
### DISPLAY STYLED df
# set with colors highest values in each column
def display_styled(df, ignore_cols=[], color="#37614a"):
    """ignore_cols: list of columns to not color"""
    def highlight_max(s):
        if s.name in ignore_cols:  # Skip styling for the 'Name' column
            return ['' for _ in s]
        is_max = s == s.max()
        return [f'font-weight: bold; background-color: {color};' if v else '' for v in is_max]

    styled_df = (
        df.style
        .apply(highlight_max, axis=0)  # Apply styling
        .format({col: "{:.3f}" for col in df.select_dtypes(include='number').columns})  # Format numeric columns only
    )
    display(styled_df)
    return None

Load data

In [4]:
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def preprocess_text(text, apply_stemmer=False, remove_stopwords=False):
    # remove emojis
    text = emoji.replace_emoji(text, "")
    # remove links
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # remove html tags
    # text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # lowercase verything
    text = text.lower()
    # remove backslashes
    text = re.sub(r"\\", "", text)
    # remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # remove whitespaces
    text = re.sub(r"\s+", " ", text)
    # remove leading and trailing whites
    text = text.strip()
    # apply spelling correction
    # text = TextBlob(text).correct()
    tokens = text.split()
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stop_words]
    if apply_stemmer:
        tokens = [stemmer.stem(t) for t in tokens]
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /home/ghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# COLLECTION OF DOCUMENTS (ANSWERS)
def preprocess_corpus(df):
    df = df.reset_index()
    df.columns = ['docno', 'text']
    df = df.reset_index(drop=True)
    return df

corpus_df = preprocess_corpus(pd.read_json('PIR_data/answer_retrieval/subset_answers.json', orient='index'))

# SAMPLES (QUERIES)
def preprocess_queries_df(path):
    df = pd.read_json(path, lines=True)
    df = df[['id', 'text', 'user_id', 'timestamp']]
    df.columns = ['qid', 'query_unprocessed', 'user_id', 'timestamp']
    df['query'] = df['query_unprocessed'].apply(lambda x: preprocess_text(x, apply_stemmer=True, remove_stopwords=True))
    df['timestamp'] = df["timestamp"].astype(int) // 10**9
    df['user_id'] = df['user_id'].astype(str)
    df = df.reset_index(drop=True)
    return df

train_queries = preprocess_queries_df('PIR_data/answer_retrieval/train/subset_data.jsonl')
val_queries = preprocess_queries_df('PIR_data/answer_retrieval/val/subset_data.jsonl')
# test_queries = preprocess_queries_df('PIR_data/answer_retrieval/test/subset_data.jsonl')

# QRELS
def preprocess_qrels_df(path):
    df = pd.read_json(path, orient='index').reset_index()
    df.columns = ['qid', 'docno']
    df['label'] = 1
    df = df.reset_index(drop=True)
    return df

train_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/train/qrels.json')
val_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/val/qrels.json')
# test_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/test/qrels.json')

print("ANSWERS")
display(corpus_df.head(3))
print("QUERIES")
display(train_queries.head(3))
print("QRELS")
display(train_qrels.head(3)) 

ANSWERS


Unnamed: 0,docno,text
0,writers_2010,TL;DRIf you're going to do present tense do it...
1,writers_2018,"Your writing style is stream-of-consciousness,..."
2,writers_2023,Place emphasis on uncomfortable things. Depend...


QUERIES


Unnamed: 0,qid,query_unprocessed,user_id,timestamp,query
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...
1,academia_100456,Is there a free (as in freedom) alternative to...,1106095,1513191752,free freedom altern publon review journal allo...
2,academia_103390,Search for StackExchange citations with Google...,1532620,1517935259,search stackexchang citat googl scholar possib...


QRELS


Unnamed: 0,qid,docno,label
0,academia_100305,academia_100217,1
1,academia_100456,academia_100462,1
2,academia_103390,academia_103391,1


In [6]:
##### REPORTED FROM PREVIOUS NOTEBOOK
### BIENCODER MODEL

def _get_dense_scores(df, FAISS_INDEX, biencoder_model, text_field='query_unprocessed', k=1000):
    """
    get cosine similarity score with a biencoder model, with FAISS FlatIndex

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # get the query unprocessed text
    query_text = df[text_field].iloc[0]
    # get the query embedding
    query_embedding = biencoder_model.encode(query_text).astype('float32')
    query_embedding = query_embedding / np.linalg.norm(query_embedding) # normalize for cosine similarity

    # if we are reranking
    if 'docid' in df.columns:
        # select the retrieved documents
        filter_ids = df['docid'].values
        id_selector = faiss.IDSelectorArray(np.array(filter_ids, dtype=np.int64))
        search_params = faiss.SearchParametersIVF(sel=id_selector)
        # rerank them
        k = len(filter_ids)
        distances, indices = FAISS_INDEX.search(np.array([query_embedding]), k, params=search_params)
    else:
        distances, indices = FAISS_INDEX.search(np.array([query_embedding]), k)

    # mapping {docid: score}
    score_mapping = {docid: score for docid, score in zip(indices[0], distances[0])}
    # get the scores in the original order (same as the input docids)
    scores_original_order = [score_mapping[docid] for docid in df['docid']]
    return scores_original_order

In [7]:
##### REPORTED FROM PREVIOUS NOTEBOOK
### TAGS SCORE

path = './index_sepqa/user_tags_full.joblib'
USER_TAGS = joblib.load(path)

def get_user_tags(user_id, timestamp, include_curr_timestamp, user_tags=USER_TAGS):
    """
    Get the tags of a user at a given timestamp.

    include_curr_timestamp: if True, the tags at the given timestamp are included.
                            if False, the tags at the given timestamp are excluded.
    """
    tags = set()
    timestamp = int(timestamp)
    
    if include_curr_timestamp == False:
        timestamp -= 1 # exclude the question at the given timestamp

    # if the user_id is not in the user_tags dictionary, return an empty set (no profile for the user)
    if user_id not in user_tags:
        return tags
    
    for ts, user_tags in user_tags[user_id]:
        if ts <= timestamp:
            tags = tags.union(user_tags)
    return tags

def _get_tags_score(df, get_user_tags_fn=get_user_tags):
    """
    get scores based on the tags of the user that asked the question
    and the user that have written the answer.

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    
    def compute_score(tags_uq, tags_ua):
        """
        tags_uq: set of tags of the user that asked the question
        tags_ua: set of tags of the user that wrote the answer
        """
        return len(tags_uq.intersection(tags_ua)) / (len(tags_uq) + 1)

    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the tags of the user that asked the question
    tags_uq = get_user_tags_fn(uq, tq, include_curr_timestamp=True)

    # users that have written the answers
    uaS = df['doc_user_id'].values
    # get the tags of the users that have written the answers
    tags_uaS = [get_user_tags_fn(ua, tq, include_curr_timestamp=False) for ua in uaS]
    # compute the score for each answer
    scores = [compute_score(tags_uq, tags_ua) for tags_ua in tags_uaS]
    return scores


---

# Tags as count vectors

Instead of saving the tags as a set, and computing the len of the intersection between two sets, we save the tags as **count vectors**. Now we don't just store the tags associated to each user (given a query at time t) but we count how many of them. Now we compute the score as the **cosine similarity** between two count vectors.

```python
USERS_TAGS_LIST = {
    "user1_id": [(t1, ["tag1", "tag2"]), (t2, ["tag3"]), ...],
    "user2_id": [(t3, ["tag1"]), ...],
}
```

In [8]:
### CONVERT THE USER_TAGS FOR CONTAINING LISTS ISNTEAD OF SETS
path = './index_sepqa/user_tags_full.joblib'
USER_TAGS = joblib.load(path)

USER_TAGS_LIST = {}
for user, tags in USER_TAGS.items():
    USER_TAGS_LIST[user] = [(ts, list(tag)) for ts, tag in tags]

path = './index_sepqa/user_tags_full_list.joblib'
joblib.dump(USER_TAGS_LIST, path)

def get_user_tags_list(user_id, timestamp, include_curr_timestamp, user_tags=USER_TAGS_LIST):
    """
    Get the tags of a user at a given timestamp.

    include_curr_timestamp: if True, the tags at the given timestamp are included.
                            if False, the tags at the given timestamp are excluded.
    """
    tags = []
    timestamp = int(timestamp)
    
    if include_curr_timestamp == False:
        timestamp -= 1 # exclude the question at the given timestamp

    # if the user_id is not in the user_tags dictionary, return an empty set (no profile for the user)
    if user_id not in user_tags:
        return tags
    
    for ts, user_tags in user_tags[user_id]:
        if ts <= timestamp:
            # EXTEND INSTEAD OF UNION
            tags.extend(user_tags)
        # the ts are sorted, so we can break when we reach the timestamp
        else:
            break
    return tags

### EXAMPLE
users = train_queries.head(3)['user_id'].values
for user in users:
    print(f"USER {user} TAGS AT EACH TIMESTAMP t:")
    print(USER_TAGS_LIST[user])
    print()

print("-"*50)
t = 1338195742
for user in users:
    print(f"USER = {user} , timestamp = {t} TAGS: {get_user_tags_list(user, t, include_curr_timestamp=True)}")

USER 1106095 TAGS AT EACH TIMESTAMP t:
[(1337954525, ['united-kingdom', 'reputation', 'bibliometrics']), (1338195742, ['grades', 'time-management']), (1338973715, ['website']), (1340113715, ['publications']), (1340631521, ['disreputable-publishers', 'publishers']), (1341223088, ['teaching', 'software']), (1343056478, ['conference', 'networking']), (1343647084, ['publications', 'peer-review', 'journals', 'conference']), (1343650594, ['publications', 'journals']), (1346750645, ['paperwork']), (1349425429, ['publications', 'peer-review']), (1353929506, ['responsibilities', 'contract']), (1355405618, ['publications']), (1359623571, ['job-search']), (1359655998, ['funding', 'publications']), (1360500458, ['publications', 'peer-review', 'application']), (1360586056, ['poster', 'travel']), (1361798551, ['sexual-misconduct', 'united-kingdom']), (1362391150, ['seminars', 'etiquette']), (1363091345, ['working-time', 'sabbatical', 'career-path']), (1366622104, ['version-control', 'job-search']), 

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def _get_tags_score_cosine(df, count_vectorizer, get_tags_fn=get_user_tags_list):
    """
    get scores based on the tags of the user that asked the question
    and the user that have written the answer.
        - the list of tags is transformed into a count vector.
        - the score is the cosine similarity between the two count vectors.

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    
    def compute_score(tags_uq, tags_uaS, count_vectorizer=count_vectorizer):
        """
        tags_uq : list of tags of the user that asked the question
        tags_uaS: list of tags of the user that wrote the answer, FOR EACH USER
        """
        # transform the tags into a count vector
        tags_uq = count_vectorizer.transform([tags_uq])
        tags_uaS = count_vectorizer.transform(tags_uaS)
        # compute the cosine similarity
        scores = cosine_similarity(tags_uq, tags_uaS)
        return scores[0]

    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the tags of the user that asked the question
    tags_uq = get_tags_fn(uq, tq, include_curr_timestamp=True)

    # users that have written the answers
    uaS = df['doc_user_id'].values
    # get the tags of the users that have written the answers
    tags_uaS = [get_tags_fn(ua, tq, include_curr_timestamp=False) for ua in uaS]
    # compute the score for each answer
    scores = compute_score(tags_uq, tags_uaS)
    return scores

# use all the list of unique tags tha we have stored as the vocabulary of the CountVectorizer
all_tags = joblib.load('./index_sepqa/tags_vocabulary_full.joblib')
# tokenizer and preprocessor are identity functions, since the input is already a list of tags (not a document)
vectorizer = CountVectorizer(vocabulary=all_tags, tokenizer=lambda x: x, preprocessor=lambda x: x)


In [10]:
### BM25
path = "./index_sepqa/index_bm25_users/data.properties"
bm25_index = pt.IndexFactory.of(path)
bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    properties={"termpipelines": ""}, 
    metadata=["docno", "doc_user_id"] # ADD doc_user_id TO THE METADATA TO BE RETRIEVED
)
norm_bm25 = bm25 >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

### BI-ENCODER
index_path = "./index_sepqa/MiniLM_faiss_IndexFlatIP.index"
faiss_index = faiss.read_index(index_path)
biencoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
get_dense_score = partial(_get_dense_scores, FAISS_INDEX=faiss_index, biencoder_model=biencoder_model)
bi_enc = pt.apply.doc_score(get_dense_score, batch_size=64)
norm_bi_enc = bi_enc >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

### TAGS-SCORE
get_tags_score_cosine = partial(_get_tags_score_cosine, count_vectorizer=vectorizer)
tags_score_cosine = pt.apply.doc_score(get_tags_score_cosine, batch_size=64)
norm_tags_score_cosine = tags_score_cosine >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

In [11]:
#### smaller range defined for lambda associated to BiEncoder
lambda_1_values = np.arange(0.5, 1.1, 0.1)
valid_combinations = []

# Loop over lambda_1 values
for lambda_1 in lambda_1_values:
    # Loop over lambda_2 values
    for lambda_2 in np.arange(0.0, 1.1, 0.1):
        # Calculate lambda_3 as the remainder
        lambda_3 = 1.0 - lambda_1 - lambda_2
        if 0.0 <= lambda_3 <= 1.0:
            combination = (round(float(lambda_1), 1), round(float(lambda_2), 1), round(float(lambda_3), 1))
            valid_combinations.append(combination)
        
valid_combinations_str = [(f"{l1:.1f}".lstrip("0"), f"{l2:.1f}".lstrip("0"), f"{l3:.1f}".lstrip("0"))
                           for l1, l2, l3 in valid_combinations]

##### DEFINE ALL THE PIPELINES
# we cache the features_pipeline, since the features are the same for all the combinations
# we compute it once, then reuse it for all the combinations of the lambdas
path_to_cache = "./cache/features_BiEncoder_BM25_TagsCosine"
features_pipeline = (bm25 % 100).compile() \
                    >> (norm_bi_enc ** norm_bm25 ** norm_tags_score_cosine) \
                    >> pt.apply.generic(lambda x: x[['qid', 'docno','features']]) # keep only useful columns
# cache the features
cached_features_retriever = RetrieverCache(path_to_cache, features_pipeline)

def _score_from_features(x, l1, l2, l3):
    """given the features and the values of the three lambdas, compute the score"""
    features = x['features']
    return l1*features[0] + l2*features[1] + l3*features[2]

# create a pipeline for each configuration of (l1, l2, l3)
pipelines = [
    cached_features_retriever >> pt.apply.doc_score(partial(_score_from_features, l1=l1, l2=l2, l3=l3))
    for l1, l2, l3 in valid_combinations
]

print("Number of pipelines:", len(pipelines))

Number of pipelines: 21


In [12]:
names = [f"train_BiEncoder_BM25_TagsCosine ({l1},{l2},{l3})" for l1, l2, l3 in valid_combinations_str]
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
SAVE_MODE = "overwrite" # reuse warn overwrite error

t0 = time.time()
results = pt.Experiment(
    pipelines,
    train_queries,
    train_qrels,
    eval_metrics=metrics,
    names=names,
    save_dir=save_dir,
    save_mode=SAVE_MODE,
)

path = "./experiments/personalized_ir/results_train_BiEncoder_BM25_TagsCosine.csv"
if SAVE_MODE == "overwrite" or not os.path.exists(path):
    print("Experiment duration :", round(time.time()-t0, 2), "seconds")
    # Experiment duration : ~11 minutes

    results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Experiment duration : 289.92 seconds


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"train_BiEncoder_BM25_TagsCosine (.5,.0,.5)",0.264,0.206,0.471,0.567,0.946,0.472,0.693
1,"train_BiEncoder_BM25_TagsCosine (.5,.1,.4)",0.56,0.272,0.712,0.752,0.946,0.698,0.648
2,"train_BiEncoder_BM25_TagsCosine (.5,.2,.3)",0.772,0.296,0.841,0.859,0.946,0.835,0.644
3,"train_BiEncoder_BM25_TagsCosine (.5,.3,.2)",0.813,0.301,0.868,0.881,0.946,0.862,0.64
4,"train_BiEncoder_BM25_TagsCosine (.5,.4,.1)",0.811,0.3,0.865,0.879,0.946,0.86,0.641
5,"train_BiEncoder_BM25_TagsCosine (.5,.5,.0)",0.797,0.297,0.853,0.869,0.946,0.849,0.625
6,"train_BiEncoder_BM25_TagsCosine (.6,.0,.4)",0.513,0.264,0.679,0.724,0.946,0.664,0.782
7,"train_BiEncoder_BM25_TagsCosine (.6,.1,.3)",0.76,0.295,0.836,0.853,0.946,0.828,0.658
8,"train_BiEncoder_BM25_TagsCosine (.6,.2,.2)",0.828,0.303,0.877,0.888,0.946,0.872,0.647
9,"train_BiEncoder_BM25_TagsCosine (.6,.3,.1)",0.832,0.304,0.88,0.891,0.946,0.875,0.679


In [13]:
names = [f"val_BiEncoder_BM25_TagsCosine ({l1},{l2},{l3})" for l1, l2, l3 in valid_combinations_str]
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"

t0 = time.time()
results = pt.Experiment(
    pipelines,
    val_queries,
    val_qrels,
    eval_metrics=metrics,
    names=names,
    save_dir=save_dir,
    save_mode="overwrite",
)

print("Experiment duration :", round(time.time()-t0, 2), "seconds")

path = "./experiments/personalized_ir/results_val_BiEncoder_BM25_TagsCosine.csv"
results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Experiment duration : 2.83 seconds


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_TagsCosine (.5,.0,.5)",0.296,0.218,0.504,0.593,0.969,0.501,0.66
1,"val_BiEncoder_BM25_TagsCosine (.5,.1,.4)",0.612,0.286,0.759,0.794,0.969,0.744,0.611
2,"val_BiEncoder_BM25_TagsCosine (.5,.2,.3)",0.857,0.306,0.893,0.913,0.969,0.895,0.624
3,"val_BiEncoder_BM25_TagsCosine (.5,.3,.2)",0.878,0.313,0.914,0.921,0.969,0.909,0.611
4,"val_BiEncoder_BM25_TagsCosine (.5,.4,.1)",0.888,0.313,0.919,0.925,0.969,0.915,0.631
5,"val_BiEncoder_BM25_TagsCosine (.5,.5,.0)",0.888,0.313,0.919,0.925,0.969,0.915,0.615
6,"val_BiEncoder_BM25_TagsCosine (.6,.0,.4)",0.571,0.279,0.725,0.77,0.969,0.712,0.629
7,"val_BiEncoder_BM25_TagsCosine (.6,.1,.3)",0.857,0.313,0.905,0.915,0.969,0.898,0.616
8,"val_BiEncoder_BM25_TagsCosine (.6,.2,.2)",0.888,0.316,0.924,0.931,0.969,0.918,0.644
9,"val_BiEncoder_BM25_TagsCosine (.6,.3,.1)",0.898,0.316,0.929,0.935,0.969,0.924,0.619


The results are slightly worse than the previous method, and is also more costly.

---

# Set of Communities as user profile

As we have done in the first method with the Tags, we do it also with the Communities. The formula used is the same, what changes is just the user profile that now use the communities instead of the tags.

$$CommunityScore(u_q, u_a, t) = \frac{len(intersection(Community(u_q, t), Community(u_a, t))}{len(Community(u_q, t)) + 1}$$

Where $Community(u, t)$ is the sets of Communities associated to all the questions before $t$, written by the user $u$. $u_q$ is the user who wrote the question and $u_a$ is the one who has written the answer. The Community for the question at time $t$ is included for $u_q$, but excluded for $u_a$.


In [14]:
# LOAD
path = './index_sepqa/user_community_full.joblib'
USER_COMMUNITY = joblib.load(path)

def get_user_community(user_id, timestamp, include_curr_timestamp, user_community=USER_COMMUNITY):
    """
    Get the communities of a user at a given timestamp.

    include_curr_timestamp: if True, the tags at the given timestamp are included.
                            if False, the tags at the given timestamp are excluded.
    """
    communities = set()
    timestamp = int(timestamp)
    
    if include_curr_timestamp == False:
        timestamp -= 1 # exclude the question at the given timestamp

    # if the user_id is not in the user_community dictionary, return an empty set (no profile for the user)
    if user_id not in user_community:
        return communities
    
    for ts, user_community in user_community[user_id]:
        if ts <= timestamp:
            # ADD INSTEAD OF UNION (at each timestamp we have a community, not a set of them)
            communities.add(user_community)
        # the ts are sorted, so we can break when we reach the timestamp
        else:
            break
    return communities

### EXAMPLE
users = train_queries.head(3)['user_id'].values
for user in users:
    print(f"USER {user} COMMUNITY AT EACH TIMESTAMP t:")
    print(USER_COMMUNITY[user])
    print()

print("-"*50)
t = 1338195742
for user in users:
    print(f"USER = {user} , timestamp = {t} COMMUNITIES: {get_user_community(user, t, include_curr_timestamp=True)}")

USER 1106095 COMMUNITY AT EACH TIMESTAMP t:
[(1337954525, 'academia'), (1338195742, 'academia'), (1338973715, 'academia'), (1340113715, 'academia'), (1340631521, 'academia'), (1341223088, 'academia'), (1343056478, 'academia'), (1343647084, 'academia'), (1343650594, 'academia'), (1346750645, 'academia'), (1349425429, 'academia'), (1353929506, 'academia'), (1355405618, 'academia'), (1359623571, 'academia'), (1359655998, 'academia'), (1360500458, 'academia'), (1360586056, 'academia'), (1361798551, 'academia'), (1362391150, 'academia'), (1363091345, 'academia'), (1366622104, 'academia'), (1367225030, 'academia'), (1370509865, 'academia'), (1371471877, 'academia'), (1375350804, 'academia'), (1377078634, 'academia'), (1379871860, 'academia'), (1380631137, 'academia'), (1380821299, 'academia'), (1387804346, 'academia'), (1388146049, 'academia'), (1389609878, 'travel'), (1389975678, 'academia'), (1392282649, 'academia'), (1394465484, 'travel'), (1394708555, 'expatriates'), (1394808223, 'expatr

In [15]:
def _get_community_score(df):
    """
    get scores based on the communitites of the user that asked the question
    and the user that have written the answer.

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a ranked documents dataframe (batch), by query
            the output are the scores for each document in the batch
    """
    
    def compute_score(communities_uq, communities_ua):
        """
        communities_uq: set of tags of the user that asked the question
        communities_ua: set of tags of the user that wrote the answer
        """
        return len(communities_uq.intersection(communities_ua)) / (len(communities_uq) + 1)

    if not all(df['qid'] == df['qid'].iloc[0]):
        assert "Not all qids in the batch are equal"
    # user of the query
    uq = df['user_id'].iloc[0]
    # timestamp of the query
    tq = df['timestamp'].iloc[0]
    # get the communities of the user that asked the question
    communities_uq = get_user_community(uq, tq, include_curr_timestamp=True)

    # users that have written the answers
    uaS = df['doc_user_id'].values
    # get the communities of the users that have written the answers
    communities_uaS = [get_user_community(ua, tq, include_curr_timestamp=False) for ua in uaS]
    # compute the score for each answer
    scores = [compute_score(communities_uq, communities_ua) for communities_ua in communities_uaS]
    return scores

In [16]:
### BM25
path = "./index_sepqa/index_bm25_users/data.properties"
bm25_index = pt.IndexFactory.of(path)
bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    properties={"termpipelines": ""}, 
    metadata=["docno", "doc_user_id"] # ADD doc_user_id TO THE METADATA TO BE RETRIEVED
)
norm_bm25 = bm25 >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

### BI-ENCODER
index_path = "./index_sepqa/MiniLM_faiss_IndexFlatIP.index"
faiss_index = faiss.read_index(index_path)
biencoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
get_dense_score = partial(_get_dense_scores, FAISS_INDEX=faiss_index, biencoder_model=biencoder_model)
bi_enc = pt.apply.doc_score(get_dense_score, batch_size=64)
norm_bi_enc = bi_enc >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

### COMMUNITY-SCORE
community_score = pt.apply.doc_score(_get_community_score, batch_size=64)
norm_community_score = community_score >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

In [17]:
#### smaller range defined for lambda associated to BiEncoder
lambda_1_values = np.arange(0.5, 1.1, 0.1)
valid_combinations = []

# Loop over lambda_1 values
for lambda_1 in lambda_1_values:
    # Loop over lambda_2 values
    for lambda_2 in np.arange(0.0, 1.1, 0.1):
        # Calculate lambda_3 as the remainder
        lambda_3 = 1.0 - lambda_1 - lambda_2
        if 0.0 <= lambda_3 <= 1.0:
            combination = (round(float(lambda_1), 1), round(float(lambda_2), 1), round(float(lambda_3), 1))
            valid_combinations.append(combination)
        
valid_combinations_str = [(f"{l1:.1f}".lstrip("0"), f"{l2:.1f}".lstrip("0"), f"{l3:.1f}".lstrip("0"))
                           for l1, l2, l3 in valid_combinations]

##### DEFINE ALL THE PIPELINES
# we cache the features_pipeline, since the features are the same for all the combinations
# we compute it once, then reuse it for all the combinations of the lambdas
path_to_cache = "./cache/features_BiEncoder_BM25_Community"
features_pipeline = (bm25 % 100).compile() \
                    >> (norm_bi_enc ** norm_bm25 ** norm_community_score) \
                    >> pt.apply.generic(lambda x: x[['qid', 'docno','features']]) # keep only useful columns
# cache the features
cached_features_retriever = RetrieverCache(path_to_cache, features_pipeline)

def _score_from_features(x, l1, l2, l3):
    """given the features and the values of the three lambdas, compute the score"""
    features = x['features']
    return l1*features[0] + l2*features[1] + l3*features[2]

# create a pipeline for each configuration of (l1, l2, l3)
pipelines = [
    cached_features_retriever >> pt.apply.doc_score(partial(_score_from_features, l1=l1, l2=l2, l3=l3))
    for l1, l2, l3 in valid_combinations
]

print("Number of pipelines:", len(pipelines))

Number of pipelines: 21


In [18]:
names = [f"train_BiEncoder_BM25_Community ({l1},{l2},{l3})" for l1, l2, l3 in valid_combinations_str]
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
SAVE_MODE = "overwrite" # reuse warn overwrite error

t0 = time.time()
results = pt.Experiment(
    pipelines,
    train_queries,
    train_qrels,
    eval_metrics=metrics,
    names=names,
    save_dir=save_dir,
    save_mode=SAVE_MODE,
)

path = "./experiments/personalized_ir/results_train_BiEncoder_BM25_Community.csv"
if SAVE_MODE == "overwrite" or not os.path.exists(path):
    print("Experiment duration :", round(time.time()-t0, 2), "seconds")
    # Experiment duration : ~11 minutes

    results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Experiment duration : 294.61 seconds


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"train_BiEncoder_BM25_Community (.5,.0,.5)",0.306,0.189,0.457,0.56,0.946,0.476,0.657
1,"train_BiEncoder_BM25_Community (.5,.1,.4)",0.585,0.267,0.712,0.752,0.946,0.703,0.625
2,"train_BiEncoder_BM25_Community (.5,.2,.3)",0.769,0.295,0.839,0.855,0.946,0.832,0.616
3,"train_BiEncoder_BM25_Community (.5,.3,.2)",0.811,0.301,0.867,0.879,0.946,0.86,0.776
4,"train_BiEncoder_BM25_Community (.5,.4,.1)",0.812,0.3,0.865,0.879,0.946,0.86,0.636
5,"train_BiEncoder_BM25_Community (.5,.5,.0)",0.797,0.297,0.853,0.869,0.946,0.849,0.641
6,"train_BiEncoder_BM25_Community (.6,.0,.4)",0.534,0.258,0.675,0.722,0.946,0.667,0.692
7,"train_BiEncoder_BM25_Community (.6,.1,.3)",0.762,0.294,0.833,0.851,0.946,0.826,0.706
8,"train_BiEncoder_BM25_Community (.6,.2,.2)",0.824,0.303,0.876,0.886,0.946,0.869,0.65
9,"train_BiEncoder_BM25_Community (.6,.3,.1)",0.833,0.304,0.88,0.891,0.946,0.875,0.642


In [19]:
names = [f"val_BiEncoder_BM25_Community ({l1},{l2},{l3})" for l1, l2, l3 in valid_combinations_str]
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"

t0 = time.time()
results = pt.Experiment(
    pipelines,
    val_queries,
    val_qrels,
    eval_metrics=metrics,
    names=names,
    save_dir=save_dir,
    save_mode= "overwrite",
)

print("Experiment duration :", round(time.time()-t0, 2), "seconds")

path = "./experiments/personalized_ir/results_val_BiEncoder_BM25_Community.csv"
results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Experiment duration : 3.01 seconds


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_Community (.5,.0,.5)",0.286,0.173,0.42,0.56,0.969,0.46,0.875
1,"val_BiEncoder_BM25_Community (.5,.1,.4)",0.602,0.299,0.773,0.793,0.969,0.741,0.662
2,"val_BiEncoder_BM25_Community (.5,.2,.3)",0.857,0.316,0.914,0.918,0.969,0.904,0.661
3,"val_BiEncoder_BM25_Community (.5,.3,.2)",0.888,0.316,0.925,0.929,0.969,0.919,0.653
4,"val_BiEncoder_BM25_Community (.5,.4,.1)",0.888,0.313,0.92,0.924,0.969,0.918,0.649
5,"val_BiEncoder_BM25_Community (.5,.5,.0)",0.888,0.313,0.919,0.925,0.969,0.915,0.656
6,"val_BiEncoder_BM25_Community (.6,.0,.4)",0.551,0.296,0.745,0.767,0.969,0.707,0.643
7,"val_BiEncoder_BM25_Community (.6,.1,.3)",0.867,0.316,0.914,0.917,0.969,0.903,0.643
8,"val_BiEncoder_BM25_Community (.6,.2,.2)",0.908,0.32,0.939,0.942,0.969,0.933,0.65
9,"val_BiEncoder_BM25_Community (.6,.3,.1)",0.898,0.316,0.93,0.934,0.969,0.926,0.65


We have seen that the communities gives similar results to the tags.

Now we put both of them in a single pipeline, so we can actually see what of the two is more important. and if the communities can add information to the tags, or if they convey the same informations.

`PIPELINE = BM25 % 100 >> l1*norm_bi_enc + l2*norm_bm25 + l3*tags_score + l4*community_score`

$\text{final\_score} = \lambda_1 \cdot \text{BiEncoder\_score} + \lambda_2 \cdot \text{BM25\_score} + \lambda_3 \cdot \text{tags\_score} + \lambda_4 \cdot \text{community\_score}$

for $\lambda_1, \lambda_2, \lambda_3, \lambda_4$ such that $\lambda_1 + \lambda_2 + \lambda_3 + \lambda_4 = 1$

In [20]:
### TAGS-SCORE
tags_score = pt.apply.doc_score(_get_tags_score, batch_size=64)
norm_tags_score = tags_score >> pt.pipelines.PerQueryMaxMinScoreTransformer() ## NORMALIZE THE SCORES

#### smaller range defined for lambda associated to BiEncoder
lambda_1_values = np.arange(0.5, 1.1, 0.1)
valid_combinations = []

# Loop over lambda_1 values
for lambda_1 in lambda_1_values:
    # Loop over lambda_2 values
    for lambda_2 in np.arange(0.0, 1.1, 0.1):
        # loop over lambda_3 values
        for lambda_3 in np.arange(0.0, 1.1, 0.1):
            # Calculate lambda_4 as the remainder
            lambda_4 = 1.0 - lambda_1 - lambda_2 - lambda_3
            if 0.0 <= lambda_4 <= 1.0:
                combination = (round(float(lambda_1), 1), round(float(lambda_2), 1), 
                               round(float(lambda_3), 1), round(float(lambda_4), 1))
                valid_combinations.append(combination)
        
valid_combinations_str = [(f"{l1:.1f}".lstrip("0"), f"{l2:.1f}".lstrip("0"), f"{l3:.1f}".lstrip("0"), f"{l4:.1f}".lstrip("0"))
                           for l1, l2, l3, l4 in valid_combinations]

##### DEFINE ALL THE PIPELINES
# we cache the features_pipeline, since the features are the same for all the combinations
# we compute it once, then reuse it for all the combinations of the lambdas
path_to_cache = "./cache/features_BiEncoder_BM25_Tags_Community"
features_pipeline = (bm25 % 100).compile() \
                    >> (norm_bi_enc ** norm_bm25 ** norm_tags_score ** norm_community_score) \
                    >> pt.apply.generic(lambda x: x[['qid', 'docno','features']]) # keep only useful columns
# cache the features
cached_features_retriever = RetrieverCache(path_to_cache, features_pipeline)

def _score_from_features(x, l1, l2, l3, l4):
    """given the features and the values of the three lambdas, compute the score"""
    features = x['features']
    return l1*features[0] + l2*features[1] + l3*features[2] + l4*features[3]

# create a pipeline for each configuration of (l1, l2, l3, l4)
pipelines = [
    cached_features_retriever >> pt.apply.doc_score(partial(_score_from_features, l1=l1, l2=l2, l3=l3, l4=l4))
    for l1, l2, l3, l4 in valid_combinations
]

print("Number of pipelines:", len(pipelines))

Number of pipelines: 52


In [21]:
names = [f"val_BiEncoder_BM25_Tags_Community ({l1},{l2},{l3},{l4})" for l1, l2, l3, l4 in valid_combinations_str]
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/personalized_ir/"

t0 = time.time()
results = pt.Experiment(
    pipelines,
    val_queries,
    val_qrels,
    eval_metrics=metrics,
    names=names,
    save_dir=save_dir,
    save_mode="overwrite",
)

print("Experiment duration :", round(time.time()-t0, 2), "seconds")

path = "./experiments/personalized_ir/results_val_BiEncoder_BM25_Tags_Community.csv"
results.to_csv(path)

display_styled(results, ignore_cols=['name', 'mrt'])

Experiment duration : 7.83 seconds


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,"val_BiEncoder_BM25_Tags_Community (.5,.0,.0,.5)",0.286,0.173,0.42,0.56,0.969,0.46,1.283
1,"val_BiEncoder_BM25_Tags_Community (.5,.0,.1,.4)",0.316,0.184,0.454,0.601,0.969,0.496,0.676
2,"val_BiEncoder_BM25_Tags_Community (.5,.0,.2,.3)",0.306,0.214,0.508,0.624,0.969,0.521,0.689
3,"val_BiEncoder_BM25_Tags_Community (.5,.0,.3,.2)",0.276,0.231,0.527,0.625,0.969,0.521,0.683
4,"val_BiEncoder_BM25_Tags_Community (.5,.0,.4,.1)",0.245,0.241,0.529,0.615,0.969,0.506,0.684
5,"val_BiEncoder_BM25_Tags_Community (.5,.0,.5,.0)",0.214,0.218,0.475,0.587,0.969,0.472,0.67
6,"val_BiEncoder_BM25_Tags_Community (.5,.1,.0,.4)",0.602,0.299,0.773,0.793,0.969,0.741,0.693
7,"val_BiEncoder_BM25_Tags_Community (.5,.1,.1,.3)",0.622,0.306,0.798,0.811,0.969,0.765,0.7
8,"val_BiEncoder_BM25_Tags_Community (.5,.1,.2,.2)",0.643,0.306,0.809,0.821,0.969,0.778,0.674
9,"val_BiEncoder_BM25_Tags_Community (.5,.1,.3,.1)",0.612,0.306,0.795,0.807,0.969,0.76,0.687


The best result is given by the configuration (.7,.1,.2,.0). So we can say that the communities don't add any information to the tags.