Mount Google Drive to save and reload the index (if necessary)

In [None]:
# Mount Google Drive under /content/drive so the index and data files used
# below are reachable; force_remount=True asks Colab for a fresh mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

If indexing has already been done and the index is stored in Drive, list the saved index directories

In [None]:
# Show what is already stored in the index folder (create_index writes one
# `index_<id>` directory per user into this location).
!ls -l '/content/drive/MyDrive/646_Project/LaMP_2/index/'

Install required packages -
1. pyterrier for indexing & retrieval
2. transformers, torch, sentencepiece for flan-T5-base and GPU access
3. dask for parallelizing the index creation process with map_partitions (not installed by the cell below; it is assumed to be available in the runtime already)

In [None]:
# Install retrieval (PyTerrier) and LLM (transformers + torch) dependencies.
# NOTE(review): versions are unpinned, so re-runs may pick up different releases.
# NOTE(review): dask is imported later but is not installed here - presumably
# it is preinstalled in the Colab runtime; confirm.
!pip install python-terrier
!pip install transformers[torch,sentencepiece]
!pip install torch torchvision

In [None]:
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyterrier as pt
import torch

from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer

# Start PyTerrier only once per kernel so the cell can be re-run safely.
# The terrier-prf artifact is loaded as a boot package; presumably it provides
# the RM3 rewriter used in query_expansion - confirm against PyTerrier docs.
if not pt.started():
    pt.init(boot_packages = ["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

In [None]:
import torch

# Report the name of CUDA device 0 when a GPU is visible, otherwise say so.
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "GPU not available.")

Load LaMP 2 data from source or google drive (if already saved)

In [None]:
# Load the training questions from Drive into pandas (one row per record;
# later cells read the 'id', 'input' and 'profile' fields of each row).
df = pd.read_json('/content/drive/MyDrive/646_Project/train_questions.json')
# Dask view of the same frame, split into 4 partitions so index creation can
# be run per partition with map_partitions.
ddf = dd.from_pandas(df, npartitions = 4)

In [None]:
# Inspect the 'profile' column of the Dask frame (nothing is computed here).
ddf['profile']

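PPEP (per profile entry prompt, implemented below as `ppef`) and AIP (aggregated input prompt) functions for turning the top-K retrieved profile documents into user-personalized LLM inputs. These functions are defined in the appendix of the LaMP paper.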

In [None]:
def ppef(profile):
    """Render a single profile entry as a per profile entry prompt (PPEP).

    Parameters
    ----------
    profile : mapping
        Must provide the keys ``'text'`` and ``'category'``.

    Returns
    -------
    str
        A sentence stating the category of the given article text.
    """
    template = 'the category for the article: "{text}" is "{category}"'
    return template.format(text=profile['text'], category=profile['category'])

def aip(topk, inputQ):
    """Build the aggregated input prompt (AIP) for the LLM.

    Parameters
    ----------
    topk : iterable of mapping
        Retrieved profile entries; each one is rendered with ``ppef``.
    inputQ : str
        The task input that is appended after the user context.

    Returns
    -------
    str
        ``"<entry>, and <entry>, ... . <inputQ>"``. When ``topk`` is empty the
        input is returned unchanged, so the prompt never starts with a stray
        ``". "`` separator.
    """
    entries = [ppef(doc) for doc in topk]
    if not entries:
        # Nothing was retrieved for this user: fall back to the plain input.
        return inputQ
    user_context = ", and ".join(entries)
    return f'{user_context}. {inputQ}'

In [None]:
# Placeholder globals. The original bound `pd.util.testing.makeDataFrame`
# (the function itself, never called); `pandas.util.testing` no longer exists
# in pandas >= 2.0, so that line raised AttributeError. Empty frames keep the
# names defined without depending on pandas' private testing helpers.
query = ""
index_df = pd.DataFrame()
user_profile_df = pd.DataFrame()

Function for Index creation [user profiles] using pyterrier

In [None]:
def create_index(x, task = 'LaMP_2', k=1):
    """Build (or reuse) the PyTerrier index for one user's profile.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide ``x['id']`` (used in the index directory name) and
        ``x['profile']``, a list of records with ``id``, ``title`` and
        ``text`` fields.
    task, k :
        Not used by the body; kept so existing callers keep working.

    Returns
    -------
    str
        Path of the index directory. The path is returned even when indexing
        fails; in that case the error is printed and not raised.
    """
    path = f'/content/drive/MyDrive/646_Project/LaMP_2/index/index_{x["id"]}'

    # Check for a finished index first so a cache hit skips building the
    # profile frame. Per the original author's note, pyterrier creates 10
    # files in the index creation process, so 10 entries means "complete".
    if os.path.isdir(path) and len(os.listdir(path)) == 10:
        return path

    user_profile_df = pd.DataFrame(x['profile'])
    # One searchable field per document: "<title> <text>".
    user_profile_df['context'] = user_profile_df[['title', 'text']].agg(' '.join, axis = 1)

    df = user_profile_df[['id', 'context']].rename(columns = {'id': 'docno', 'context': 'text'})

    try:
        # index the text, record the docnos as metadata
        pt.IterDictIndexer(path, overwrite = True).index(df.to_dict(orient = "records"))
    except Exception as e:
        # Best effort: report the failing user and continue with the others.
        print(f"[ERROR] Creating index for {x['id']} with profile length of {df.shape[0]}")
        print(e)

    return path

In [None]:
def partition_func(dataframe):
    """Run ``create_index`` on every row of one partition.

    Returns the row-wise results (one index path per row) as produced by
    ``DataFrame.apply(..., axis=1)``.
    """
    index_paths = dataframe.apply(create_index, axis = 1)
    return index_paths

Create Index - run index creation process

In [None]:
%%time
# Build (or reuse) one index per row of the question frame and time the run.
print("Started indexing : ", df.shape[0])
# partition_func is applied to each Dask partition; meta declares the result
# as an unnamed Series of strings (the index paths).
p = ddf.map_partitions(partition_func, meta = (None, 'str'))
# compute() triggers the actual work and gathers the paths.
indexreferences = p.compute()
print("Finished indexing : ", len(indexreferences), '\n')

In [None]:
# Sanity check: each entry should be what create_index returns (a path string).
type(indexreferences[0])

In [None]:
# Display the collected index paths.
indexreferences

In [None]:
def query_expansion(x, indexref, k=4):
    """Retrieve the top-k profile entries for a user and build the LLM prompt.

    The retrieval pipeline is tokenise >> BM25 >> RM3 >> BM25, i.e. a first
    BM25 pass, RM3 query rewriting, then a second BM25 pass on the rewritten
    query.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide ``x['profile']`` (list of records with ``id``, ``text``
        and ``category`` fields) and ``x['input']`` (the full task input).
    indexref :
        Index location for this user, as returned by ``create_index``.
    k : int
        Number of retrieved documents to keep.

    Returns
    -------
    str
        The aggregated input prompt produced by ``aip``.
    """
    profile, full_input = x['profile'], x['input']

    # The article text after '] article: ' is the retrieval query. The original
    # extracted it but then searched with the whole instruction prompt, and
    # its two-way unpack raised ValueError when the marker was missing or
    # repeated. Fall back to the full input when the marker is absent.
    _, marker, article = full_input.partition('] article: ')
    query = article if marker else full_input

    # Profile ids become 'docno' so they can be joined with retrieval results.
    user_profile_df = pd.DataFrame(profile).rename(columns = {'id': 'docno'})

    # define retriever pipeline (bm25, rm3) with default tokenizer for preprocessing query
    bm25 = pt.BatchRetrieve(indexref, wmodel = 'BM25')
    rm3 = pt.rewrite.RM3(indexref, fb_docs=10, fb_terms=10)
    pipeline = pt.rewrite.tokenise() >> bm25 >> rm3 >> bm25

    # topk documents for a user query
    topK = pipeline.search(query).head(k)
    merged = pd.merge(topK, user_profile_df, on = 'docno', how = 'inner')

    # The prompt still carries the full input (instructions + article).
    return aip(merged.to_dict('records'), full_input)

In [None]:
%%time
prompts = []
for row_1, (index_df2, row_df2) in  zip(indexreferences, ddf.iterrows()):
  prompt = query_expansion(row_df2,row_1)
  prompts.append(prompt)

In [None]:
# Eyeball the first generated prompt.
print(prompts[0], '\n')

Load flan-t5-base model to CUDA

In [None]:
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# Loaded for inspection only; no cell visible in this notebook reads `config`.
config = T5Config.from_pretrained(MODEL_NAME)

# device_map="auto" already places the weights (GPU when one is visible,
# otherwise CPU), so the former unconditional `model.to('cuda')` was redundant
# on GPU runtimes and crashed on CPU-only ones.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")
print(f"Model loaded on: {model.device}")

Function to generate llm outputs

In [None]:
def generate_llm_output(input_data):
    """Run the loaded seq2seq model on one prompt and return the raw decoding.

    Parameters
    ----------
    input_data : str
        Prompt text passed to the tokenizer.

    Returns
    -------
    str
        Decoded first generated sequence. Special tokens are kept on purpose:
        ``process`` relies on the ``<pad>`` / ``</s>`` markers to extract the
        label.
    """
    encoded = tokenizer(input_data, return_tensors="pt")
    # Follow the model's actual device instead of assuming "cuda", so the
    # function also works when the model was loaded on CPU.
    input_ids = encoded.input_ids.to(model.device)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0])

In [None]:
# Load the gold outputs for the training questions from Drive.
train_y = pd.read_json('/content/drive/MyDrive/646_Project/train_outputs.json')

In [None]:
# Pull the 'output' field out of every record in the 'golds' column.
# NOTE(review): the metrics cell compares these positionally with the
# predictions, which assumes the gold records are in the same order as the
# question rows - confirm, or join on the record id instead.
truth_values = train_y['golds'].map(lambda x: x['output'])

Generate LLM outputs

In [None]:
%%time
outputs = []

for prompt in prompts:
  output = generate_llm_output(prompt)
  outputs.append(output)

print("Computed LLM Output \n")

Process LLM output to extract category

In [None]:
def process(x):
    """Extract the predicted category from a raw model decoding.

    Expects exactly one ``<pad>`` marker followed by exactly one ``</s>``
    marker; the text between them is stripped of surrounding ``[``, ``]``,
    ``|`` and space characters and lower-cased.

    Parameters
    ----------
    x : str
        Raw decoded model output, special tokens included.

    Returns
    -------
    str
        The cleaned label, or ``''`` when ``x`` does not have the expected
        shape (the offending value is printed).
    """
    try:
        _, after_pad = x.split('<pad>')
        label, _ = after_pad.split('</s>')
    except (ValueError, AttributeError, TypeError):
        # Only malformed input is handled; the previous bare `except:` also
        # swallowed KeyboardInterrupt / SystemExit.
        print(x, '\n')
        return ''
    # Same character set as the original '[|]| ' argument to strip().
    return label.strip('[]| ').lower()

In [None]:
# Clean every raw decoding into a lower-cased category label.
processed_outputs = [process(raw_output) for raw_output in outputs]

Processed Classification Labels - categories

In [None]:
# Dump all predicted labels (an empty string marks an output that could not be parsed).
print(processed_outputs)

Compute Accuracy, F1 scores

In [None]:
# scikit-learn's documented argument order is (y_true, y_pred); the original
# passed the predictions first.
y_true = truth_values.to_list()
f1 = f1_score(y_true, processed_outputs, average = "macro")
acc = accuracy_score(y_true, processed_outputs)

print(f'F1 Score is: {f1}')
print(f'Accuracy is: {acc}')