# Query expansion with LLMs

Armanni Luca - 509085

Ghiotto Alessandro - 513944

---

As a first, simple personalization method, we expand the query with the user's tags via an LLM before feeding it to the bi-encoder.

### Table of contents:
- Set up the LLM
- Get Tags
- Reranking pipeline with query expansion

In [1]:
### FOR COLAB

# !pip install -q condacolab
# import condacolab
# condacolab.install()

# !conda install -c pytorch faiss-gpu -y

# !pip install --upgrade -q python-terrier
# !pip install -q sentence_transformers ipdb emoji

# !pip install -q flash-attn
# !pip install -q quanto optimum-quanto

# !gdown 1HhgXzyEpsZNcenU9XhJuOYyDUKEzUse4
# !unzip pir_data.zip

In [1]:
import pandas as pd
import re
import os
import warnings
import shutil
import torch
import numpy as np
import random
import time
import faiss
import joblib
from functools import partial

# Hugging Face
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, QuantoConfig

# VISUALIZATION
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

# TEXT PROCESSING
from textblob import TextBlob
import emoji
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# TERRIER
from pyterrier.measures import *
import pyterrier as pt

# Start PyTerrier's Java backend only when it is not running yet, so that
# re-executing this cell does not try to initialise it a second time.
if not pt.java.started():
    # choose the notebook progress bar, then launch Java
    pt.utils.set_tqdm('notebook')
    pt.java.init()

# FILTER WARNINGS
# `message` is interpreted by warnings.filterwarnings as a regular expression
# matched against the start of the warning text. The literal messages are
# therefore escaped: left unescaped, "()" is an empty regex group and the
# second message could never match the text it is meant to silence.
_SILENCED_WARNING_MESSAGES = (
    "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead",
    "`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48",
)
for _message in _SILENCED_WARNING_MESSAGES:
    warnings.filterwarnings("ignore", message=re.escape(_message))

# GLOBAL VARIABLES
# switch off the tokenizers library's own parallelism for this process
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SET SEED
# Seed every random number generator used in the notebook so runs are repeatable.
seed = 42
random.seed(seed)
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# only affects child processes, not the hashing of the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
# cuDNN benchmarking auto-tunes the convolution algorithm and may select a
# different one on each run; it has to be off for deterministic results.
torch.backends.cudnn.benchmark = False

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]


We load the data in exactly the same way as in the previous notebook; the only difference is that we also keep the *timestamp* and the *user_id*. Timestamps are stored as Unix timestamps (seconds elapsed since January 1st, 1970).

In [2]:
# Module-level resources used by preprocess_text: the Porter stemmer and the
# English stopword set (the stopword corpus is downloaded first if missing).
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def preprocess_text(text, apply_stemmer=False, remove_stopwords=False):
    """
    Normalize a raw text into a lowercase, space-separated token string.

    Steps, in order: drop emojis, drop links, lowercase, drop backslashes,
    drop every character that is not a letter, digit or whitespace, collapse
    whitespace runs and trim. HTML stripping and spelling correction are
    not applied.

    Args:
        text: the raw text.
        apply_stemmer: if True, stem every token with the module-level `stemmer`.
        remove_stopwords: if True, drop the tokens found in `stop_words`
            (this happens before stemming).

    Returns:
        the processed tokens joined by single spaces.
    """
    cleaned = emoji.replace_emoji(text, "")
    # links are removed before lowercasing, punctuation is stripped afterwards
    cleaned = re.sub(r"https?://\S+|www\.\S+", "", cleaned).lower()
    substitutions = (
        (r"\\", ""),                # backslashes
        (r"[^a-zA-Z0-9\s]", ""),    # special characters and punctuation
        (r"\s+", " "),              # runs of whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    words = cleaned.strip().split()
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]
    if apply_stemmer:
        words = [stemmer.stem(word) for word in words]
    return " ".join(words)

[nltk_data] Downloading package stopwords to /home/ghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# COLLECTION OF DOCUMENTS (ANSWERS)
def preprocess_corpus(df):
    """
    Turn the raw answers frame into the two-column corpus table.

    The index of `df` becomes the 'docno' column and its single data column
    becomes 'text'; the returned frame has a fresh 0..n-1 index.
    """
    corpus = df.reset_index()
    corpus.columns = ['docno', 'text']
    return corpus.reset_index(drop=True)

# Answers are read with orient='index', i.e. the JSON keys become the frame
# index, which preprocess_corpus then turns into the 'docno' column.
corpus_df = preprocess_corpus(pd.read_json('PIR_data/answer_retrieval/subset_answers.json', orient='index'))

# SAMPLES (QUERIES)
def preprocess_queries_df(path):
    """
    Load a JSON-lines file of questions and build the queries table.

    Args:
        path: path of the .jsonl file; each record must provide the fields
            'id', 'text', 'user_id' and 'timestamp'.

    Returns:
        a dataframe with the columns 'qid', 'query_unprocessed', 'user_id',
        'timestamp' (integer seconds) and 'query' (the text after
        preprocess_text with stemming and stopword removal).
    """
    df = pd.read_json(path, lines=True)
    df = df[['id', 'text', 'user_id', 'timestamp']]
    df.columns = ['qid', 'query_unprocessed', 'user_id', 'timestamp']
    df['query'] = df['query_unprocessed'].apply(
        lambda x: preprocess_text(x, apply_stemmer=True, remove_stopwords=True)
    )
    # Presumably read_json parses 'timestamp' as datetime64[ns], so the integer
    # view is in nanoseconds and // 10**9 yields Unix seconds - TODO confirm.
    # 'int64' is requested explicitly: plain `int` maps to a 32-bit integer on
    # some platforms, which cannot hold nanosecond timestamps.
    df['timestamp'] = df["timestamp"].astype('int64') // 10**9
    df = df.reset_index(drop=True)
    return df

# Only the train and validation queries are loaded; the test split is left commented out.
train_queries = preprocess_queries_df('PIR_data/answer_retrieval/train/subset_data.jsonl')
val_queries = preprocess_queries_df('PIR_data/answer_retrieval/val/subset_data.jsonl')
# test_queries = preprocess_queries_df('PIR_data/answer_retrieval/test/subset_data.jsonl')

# QRELS
def preprocess_qrels_df(path):
    """
    Load a qrels JSON file into a (qid, docno, label) table.

    The file is read with orient='index': its keys become the 'qid' column
    and its values the 'docno' column. Every pair gets the label 1.
    """
    raw = pd.read_json(path, orient='index')
    qrels = raw.reset_index()
    qrels.columns = ['qid', 'docno']
    qrels['label'] = 1
    return qrels.reset_index(drop=True)

# Relevance judgements for the train and validation splits (test left commented out).
train_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/train/qrels.json')
val_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/val/qrels.json')
# test_qrels = preprocess_qrels_df('PIR_data/answer_retrieval/test/qrels.json')

# Quick visual check of the three tables.
print("ANSWERS")
display(corpus_df.head(3))
print("QUERIES")
display(train_queries.head(3))
print("QRELS")
display(train_qrels.head(3)) 

ANSWERS


Unnamed: 0,docno,text
0,writers_2010,TL;DRIf you're going to do present tense do it...
1,writers_2018,"Your writing style is stream-of-consciousness,..."
2,writers_2023,Place emphasis on uncomfortable things. Depend...


QUERIES


Unnamed: 0,qid,query_unprocessed,user_id,timestamp,query
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...
1,academia_100456,Is there a free (as in freedom) alternative to...,1106095,1513191752,free freedom altern publon review journal allo...
2,academia_103390,Search for StackExchange citations with Google...,1532620,1517935259,search stackexchang citat googl scholar possib...


QRELS


Unnamed: 0,qid,docno,label
0,academia_100305,academia_100217,1
1,academia_100456,academia_100462,1
2,academia_103390,academia_103391,1


---

### Set up the LLM

As a first personalization method we expand the query with an LLM. We give the original query as input and ask the model to personalize it given the interests of the user. The interests of a user at time *t* are given by the set of tags of all the questions that the user has written up to time *t* (the current question is included).

As LLM we use ['Phi-3-mini-4k-instruct'](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct). We quantize it for resources limitations.

NOTES on [Quantization](https://huggingface.co/docs/transformers/main//quantization):

- Quantization techniques focus on representing data with less information while also trying to not lose too much accuracy. This often means converting a data type to represent the same information with fewer bits.

- `"microsoft/Phi-3-mini-4k-instruct"` has 3.82B parameters. With 8-bit quantization, the weights need roughly 3.82B params * 1 byte (= 8 bits) ≈ 3.8 GB, which is about half of the GPU memory needed to load the model without quantizing the weights.

In [5]:
# QUANTO CONFIG
# store the model weights as 8-bit integers
quantization_config = QuantoConfig(weights="int8")

# MODEL
# Phi-3-mini is loaded quantized on the GPU with flash-attention 2.
# NOTE: trust_remote_code=True allows code shipped with the model repository
# to be executed - keep it only for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    quantization_config=quantization_config,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation="flash_attention_2"
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# text-generation pipeline wrapping the model and its tokenizer; called with
# chat-style message lists in the cells below
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Keyword arguments passed to every call of `pipe`.
generation_args = {
    "max_new_tokens": 500,
    # only the generated continuation is returned, not the prompt
    "return_full_text": False,
    #"max_new_tokens": 256,
    "do_sample": False, # Deterministic decoding
    # "temperature": 0.0,
}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


In [6]:
# The user interests are placed before the query so that they are not cut off.

# The same query is expanded twice, once per set of interests, to compare the outputs.
for run_idx, user_interests in enumerate(({'cats', 'medicins'}, {'cats', 'no-vax'})):
    user_interests_str = ", ".join(user_interests)
    query = 'I like petting cats, but I am allergic, what should I do?'
    prompt = f"""
    Given the user interests as comma separated values and the query,
    provide an expanded version of the query with personalized interests.
    Just say New Query: and put the generated query. Do non mention anything else.
    Keywords: {user_interests_str} Query: {query}
"""
    # collapse the line breaks and indentation of the template into single spaces
    prompt = re.sub(r"\s+", " ", prompt)

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]

    output = pipe(messages, **generation_args)
    if run_idx > 0:
        # separator between the two runs
        print("\n"+"="*50+"\n")
    print(f"INPUT PROMPT   :{prompt}")
    print(f"USER INTERESTS : {user_interests_str}")
    print(f"OUTPUT         :{output[0]['generated_text']}")

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


INPUT PROMPT   : Given the user interests as comma separated values and the query, provide an expanded version of the query with personalized interests. Just say New Query: and put the generated query. Do non mention anything else. Keywords: medicins, cats Query: I like petting cats, but I am allergic, what should I do? 
USER INTERESTS : medicins, cats
OUTPUT         : New Query: I enjoy petting cats, but I have allergies, what are some hypoallergenic cat breeds I can consider?


INPUT PROMPT   : Given the user interests as comma separated values and the query, provide an expanded version of the query with personalized interests. Just say New Query: and put the generated query. Do non mention anything else. Keywords: no-vax, cats Query: I like petting cats, but I am allergic, what should I do? 
USER INTERESTS : no-vax, cats
OUTPUT         : New Query: I enjoy petting cats, but I have allergies, what are some hypoallergenic cat breeds I can consider?


### Get Tags

We want to preprocess the Tags in such a way that given a user and the timestamp, we get the set of tags. 

`USER_TAGS`: a dictionary that maps each user to a list of `(timestamp, {tags})` pairs, one per query, sorted by increasing timestamp.

```python
USER_TAGS = {
    "user1_id": [(t1, {"tag1", "tag2"}), (t2, {"tag3"}), ...],
    "user2_id": [(t3, {"tag1"}), ...],
}
```

In [6]:
# Gather the questions of all three splits: the tag history of a user is built
# from every question, whatever the split it belongs to.
user_tags_df = pd.concat([
    pd.read_json('PIR_data/answer_retrieval/train/subset_data.jsonl', lines=True),
    pd.read_json('PIR_data/answer_retrieval/val/subset_data.jsonl', lines=True),
    pd.read_json('PIR_data/answer_retrieval/test/subset_data.jsonl', lines=True)
])[['id', 'user_id', 'timestamp', 'tags']]
user_tags_df.columns = ['qid', 'user_id', 'timestamp', 'tags']
# Same conversion as for the queries table. 'int64' is requested explicitly
# because plain `int` maps to a 32-bit integer on some platforms, which cannot
# hold the pre-division timestamp values.
user_tags_df['timestamp'] = user_tags_df["timestamp"].astype('int64') // 10**9
user_tags_df['tags'] = user_tags_df['tags'].apply(set)

display(user_tags_df.head(3))

# Build the per-user history: user_id -> [(timestamp, {tags}), ...]
USER_TAGS = {}
for user, group in user_tags_df.groupby("user_id"):
    # list of tuples (timestamp, tags), sorted by timestamp
    USER_TAGS[user] = sorted(zip(group["timestamp"], group["tags"]), key=lambda x: x[0])

### SAVE
joblib.dump(USER_TAGS, './index_sepqa/user_tags_subset.joblib')

# Preview: users of the first three rows (the same user may appear more than once).
users = user_tags_df.head(3)['user_id'].values
for user in users:
    print(f"USER {user} TAGS:")
    print(USER_TAGS[user])
    print()

Unnamed: 0,qid,user_id,timestamp,tags
0,academia_100305,1106095,1513009820,"{funding, france}"
1,academia_100456,1106095,1513191752,"{peer-review, open-access}"
2,academia_103390,1532620,1517935259,"{citations, google-scholar}"


USER 1106095 TAGS:
[(1341223088, {'software', 'teaching'}), (1343056478, {'conference', 'networking'}), (1343647084, {'publications', 'journals', 'peer-review', 'conference'}), (1343650594, {'publications', 'journals'}), (1346750645, {'paperwork'}), (1349425429, {'publications', 'peer-review'}), (1353929506, {'responsibilities', 'contract'}), (1355405618, {'publications'}), (1359623571, {'job-search'}), (1359655998, {'publications', 'funding'}), (1360500458, {'publications', 'peer-review', 'application'}), (1360586056, {'poster', 'travel'}), (1361798551, {'united-kingdom', 'sexual-misconduct'}), (1362391150, {'seminars', 'etiquette'}), (1363091345, {'career-path', 'working-time', 'sabbatical'}), (1366622104, {'job-search', 'version-control'}), (1367225030, {'ethics', 'peer-review'}), (1370509865, {'copyright', 'creative-commons'}), (1371471877, {'assessment'}), (1375350804, {'writing-style', 'funding', 'writing'}), (1377078634, {'authorship'}), (1379871860, {'faculty-application', 'rec

In [7]:
# LOAD
# NOTE: joblib.load unpickles the file, so it must only be used on files from a
# trusted source (here, the dump written by the cell that builds USER_TAGS).
USER_TAGS = joblib.load('./index_sepqa/user_tags_subset.joblib')

def get_user_tags(user_id, timestamp):
    """
    Return the set of tags used by `user_id` up to `timestamp` (included).

    Reads the module-level USER_TAGS, whose per-user lists are sorted by
    increasing timestamp; an unknown `user_id` raises KeyError.
    """
    collected = set()
    for entry_ts, entry_tags in USER_TAGS[user_id]:
        if entry_ts > timestamp:
            # entries are sorted: everything from here on is in the future
            break
        collected.update(entry_tags)
    return collected

# Demo: tags known at a fixed point in time for the users previewed when
# USER_TAGS was built (`users` may contain the same user more than once).
t = 1391025990
for user in users:
    print(f"USER = {user} , timestamp = {t} TAGS: {get_user_tags(user, t)}")

USER = 1106095 , timestamp = 1391025990 TAGS: {'publications', 'paperwork', 'responsibilities', 'contract', 'networking', 'sabbatical', 'faculty-application', 'etiquette', 'creative-commons', 'seminars', 'conference', 'recommendation-letter', 'teaching', 'software', 'working-time', 'sexual-misconduct', 'copyright', 'poster', 'career-path', 'authorship', 'job-search', 'version-control', 'tenure-track', 'writing', 'travel', 'united-kingdom', 'writing-style', 'peer-review', 'assessment', 'united-states', 'funding', 'journals', 'open-access', 'ethics', 'application'}
USER = 1106095 , timestamp = 1391025990 TAGS: {'publications', 'paperwork', 'responsibilities', 'contract', 'networking', 'sabbatical', 'faculty-application', 'etiquette', 'creative-commons', 'seminars', 'conference', 'recommendation-letter', 'teaching', 'software', 'working-time', 'sexual-misconduct', 'copyright', 'poster', 'career-path', 'authorship', 'job-search', 'version-control', 'tenure-track', 'writing', 'travel', 'uni

### Reranking pipeline with query expansion

`PIPELINE = BM25 % 100 >> expand_query >> BiEncoder`

We define the function that expands the query.

In [9]:
def _expand_query(df, pipeline, generation_args):
    """
    Rewrite the query of a single-query dataframe with an LLM, using the user's tags.

    Used as argument of pyterrier.apply.by_query()
        => the input is a dataframe for one query at a time (in our case the
           100 rows given by BM25 % 100), so the query, user and timestamp
           are read from the first row only.

    Args:
        df: dataframe with the columns 'user_id', 'timestamp' and
            'query_unprocessed'.
        pipeline: callable invoked as pipeline(messages, **generation_args);
            the first item of its result must have a 'generated_text' entry.
        generation_args: keyword arguments forwarded to `pipeline`.

    Returns:
        a new dataframe in which the column 'query_unprocessed' is replaced
        by 'query_expanded', holding the generated query on every row. The
        input dataframe is not modified.
    """
    user_id = df['user_id'].iloc[0]
    timestamp = df['timestamp'].iloc[0]
    tags = get_user_tags(user_id, timestamp)
    # sets have no stable iteration order across interpreter runs: sorting
    # makes the prompt (and so the greedy generation) reproducible
    user_tags_str = ", ".join(sorted(tags))
    query = df['query_unprocessed'].iloc[0]
    prompt = f"""
        Given the user interests as comma separated values and the query,
        provide an expanded version of the query with personalized interests.
        Just say New Query: and put the generated query. Do non mention anything else.
        Keywords: {user_tags_str} Query: {query}
    """
    prompt = re.sub(r"\s+", " ", prompt)
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]
    output = pipeline(messages, **generation_args)
    generated_text = output[0]['generated_text']
    # keep what follows the requested "New Query:" marker; if the model did not
    # emit the marker, fall back to the whole generation
    _, marker, after_marker = generated_text.partition("New Query:")
    expanded_query = (after_marker if marker else generated_text).strip()
    # assign() returns a new frame, leaving the caller's dataframe untouched
    expanded_df = df.assign(query_unprocessed=expanded_query)
    return expanded_df.rename(columns={'query_unprocessed': 'query_expanded'})

### EXAMPLE
# The first training query is repeated three times to mimic the per-query
# dataframe (several rows, one query) that _expand_query receives in the pipeline.
sample_df = pd.concat([train_queries.head(1)]*3)
print("BEFORE EXPANSION")
display(sample_df)
print("AFTER EXPANSION")
sample_df = _expand_query(sample_df, pipe, generation_args)
display(sample_df)

BEFORE EXPANSION


Unnamed: 0,qid,query_unprocessed,user_id,timestamp,query
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...
0,academia_100305,What are CNRS research units and how are they ...,1106095,1513009820,cnr research unit staf centr nation de la rech...


AFTER EXPANSION


Unnamed: 0,qid,query_expanded,user_id,timestamp,query
0,academia_100305,What are the different types of research unit...,1106095,1513009820,cnr research unit staf centr nation de la rech...
0,academia_100305,What are the different types of research unit...,1106095,1513009820,cnr research unit staf centr nation de la rech...
0,academia_100305,What are the different types of research unit...,1106095,1513009820,cnr research unit staf centr nation de la rech...


In [10]:
##### REPORTED FROM PREVIOUS NOTEBOOK
## GET SCORES BIENCODER

def _get_dense_scores(df, FAISS_INDEX, biencoder_model, text_field='query_unprocessed', k=1000):
    """
    Score the documents of one query with a bi-encoder and a FAISS index.

    used as argument of pyterrier.apply.doc_score()
        =>  the input is a batch of ranked documents that must all belong to
            the same query; the output is one score per input row, in the
            input order

    Args:
        df: batch dataframe; the columns 'qid', `text_field` and 'docid' are read.
        FAISS_INDEX: FAISS index searched with the normalized query embedding
            (presumably an inner-product index over normalized document
            embeddings, so that the scores are cosine similarities - confirm
            against the notebook that builds the index).
        biencoder_model: model exposing encode(text) and returning a numpy vector.
        text_field: column that holds the query text to embed.
        k: number of neighbours searched when `df` has no 'docid' column.

    Returns:
        list of scores aligned with the rows of `df`.

    Raises:
        ValueError: if the batch mixes documents of several queries.
    """
    # the previous `assert "<message>"` was always true (a non-empty string is
    # truthy), so mixed batches went through silently: raise explicitly
    first_qid = df['qid'].iloc[0]
    if (df['qid'] != first_qid).any():
        raise ValueError("Not all qids in the batch are equal")

    # embed the query text of the batch (identical on every row)
    query_text = df[text_field].iloc[0]
    query_embedding = biencoder_model.encode(query_text).astype('float32')
    query_embedding = query_embedding / np.linalg.norm(query_embedding) # normalize for cosine similarity
    query_batch = np.array([query_embedding])

    # if we are reranking
    if 'docid' in df.columns:
        # restrict the search to the documents already retrieved
        filter_ids = df['docid'].values
        id_selector = faiss.IDSelectorArray(np.array(filter_ids, dtype=np.int64))
        search_params = faiss.SearchParametersIVF(sel=id_selector)
        # ask for as many results as candidates, so that each one gets a score
        k = len(filter_ids)
        distances, indices = FAISS_INDEX.search(query_batch, k, params=search_params)
    else:
        distances, indices = FAISS_INDEX.search(query_batch, k)

    # mapping {docid: score}
    score_mapping = dict(zip(indices[0], distances[0]))
    # get the scores in the original order (same as the input docids)
    # NOTE(review): this lookup needs a 'docid' column, so the non-reranking
    # branch above ends in a KeyError here - confirm whether it is still needed.
    scores_original_order = [score_mapping[docid] for docid in df['docid']]
    return scores_original_order

In [11]:
### FIRST STAGE
# BM25 retriever over the Terrier index stored on disk.
path = "./index_sepqa/index_bm25/data.properties"
bm25_index = pt.IndexFactory.of(path)
bm25 = pt.terrier.Retriever(
    bm25_index, 
    wmodel="BM25", 
    controls={'c': 1.0, 'bm25.k_1': 2.5},
    # empty term pipeline: presumably because the 'query' column is already
    # stemmed and stopword-free (see preprocess_queries_df) - TODO confirm
    properties={"termpipelines": ""},
)

### SECOND STAGE
# Bi-encoder reranker: MiniLM embeddings scored against the FAISS index on disk.
biencoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
index_path = "./index_sepqa/MiniLM_faiss_IndexFlatIP.index"
FAISS_INDEX = faiss.read_index(index_path)
get_dense_score = partial(_get_dense_scores, FAISS_INDEX=FAISS_INDEX, 
                          biencoder_model=biencoder_model, text_field='query_expanded') ### CHANGE TO query_expanded, not text unprocessed
bi_enc = pt.apply.doc_score(get_dense_score, batch_size=64)

### QUERY EXPANSION
# _expand_query is bound to the LLM pipeline and applied once per query.
expand_query = partial(_expand_query, pipeline=pipe, generation_args=generation_args)
query_expander = pt.apply.by_query(expand_query)

### PIPELINE
# BM25 cut at rank 100 -> LLM query expansion -> bi-encoder scoring on the expanded query
pipeline_QE = bm25 % 100 >> query_expander >> bi_enc

In [12]:
# The metrics are listed here only to have all of them visible in one place.
metrics = [P@1, P@3, nDCG@3, nDCG@10, R@100, MAP@100, 'mrt']
save_dir = "./experiments/query_expansion/"
# exist_ok=True replaces the separate existence check before creating the directory
os.makedirs(save_dir, exist_ok=True)

SAVE_MODE = "reuse" # reuse warn overwrite error

t0 = time.time()
results = pt.Experiment(
    [pipeline_QE],
    val_queries,
    val_qrels,
    eval_metrics=metrics,
    names=["BM25_QE_MiniLM"],
    save_dir=save_dir,
    save_mode=SAVE_MODE,
)

# The duration is reported and the table exported only when the experiment is
# recomputed ("overwrite"); in the other modes the elapsed time is not printed.
if SAVE_MODE == "overwrite":
    print("Experiment duration :", round(time.time()-t0, 2), "seconds")
    # Experiment duration : 1274.12 seconds

    # the CSV goes next to the saved runs, so the path is derived from save_dir
    path = os.path.join(save_dir, "results_val_BM25_QE_MiniLM.csv")
    results.to_csv(path)


display(results.round(3))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Experiment duration : 1274.12 seconds


Unnamed: 0,name,P@1,P@3,nDCG@3,nDCG@10,R@100,AP@100,mrt
0,BM25_QE_MiniLM,0.857,0.31,0.898,0.902,0.969,0.891,12740.013


This method was implemented as a straightforward approach to personalize a query. However, it’s evident that utilizing LLMs solely for query rewriting is an inefficient use of resources.