## Create Queries for Fine Tuning

### Imports

In [9]:
import pandas as pd


In [10]:
# !pip install transformers
# !pip install transformers[sentencepiece]
# from huggingface_hub import notebook_login

# notebook_login()

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-base-v1')
model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-base-v1')

## Load Data

In [12]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/fdic_paragraphs.csv')
df.columns

Index(['SECTNO', 'SUBJECT', 'PARAGRAPH', 'TEXT', 'preprocessed_text'], dtype='object')

## Removing Short Text to ensure generated queries make sense

In [13]:
def filter_short_text(df, column):
  return df[column][df[column].apply(lambda x: len(list(x.split()))>25)]

In [16]:
passages = list(filter_short_text(df, "preprocessed_text"))


In [19]:
import torch
from tqdm.auto import tqdm

pairs = []
file_count = 0

# set to no_grad as we don't need to calculate gradients for back prop
with torch.no_grad():
    # loop through each passage individually
    for p in tqdm(passages):
        p = p.replace('\t', ' ')
        # create input tokens
        input_ids = tokenizer.encode(p, return_tensors='pt')
        # generate output tokens (query generation)
        outputs = model.generate(
            input_ids=input_ids,
            max_length=64,
            do_sample=True,
            top_p=0.95,
            num_return_sequences=3
        )
        # decode output tokens to human-readable language
        for output in outputs:
            query = tokenizer.decode(output, skip_special_tokens=True)
            # append (query, passage) pair to pairs list, separate by \t
            pairs.append(query.replace('\t', ' ')+'\t'+p)
        # once we have 1024 pairs write to file
        if len(pairs) > 1024:
            with open(f'/content/drive/MyDrive/Colab Notebooks/data/pairs_{file_count}.tsv', 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(pairs))
            file_count += 1
            pairs = []
    with open(f'/content/drive/MyDrive/Colab Notebooks/data/pairs_{file_count}.tsv', 'w', encoding='utf-8') as fp:
      fp.write('\n'.join(pairs))

  0%|          | 0/5043 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


In [20]:
print("Paragraph:")
print(p)

print("\nGenerated Queries:")
for i in range(len(outputs)):
    query = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(f'{i + 1}: {query}')

Paragraph:
 Term of agreement An agreement that does not have a fixed termination date is considered to terminate on the last date on which any party to the agreement makes any payment or provides any loan or other resources under the agreement unless the relevant supervisory agency for the agreement otherwise notifies each party in writing 

Generated Queries:
1: definition of a fixed termination date
2: when is a term agreement in credit
3: if a loan or financing agreement has a fixed termination date, that is called


In [None]:
if pairs is not None:
    with open(f'/content/drive/MyDrive/Colab Notebooks/data/pairs_{file_count}.tsv', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(pairs))