In [1]:
!pip install sentence_transformers
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install nltk==3.8.1
!pip install evaluate

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l- \ done
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=5501c904132a4d50c9e0b28d64b0c3daa04b81063248a573c31c3a7788488777
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
Processing /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collecte

In [2]:
import ctypes
import gc

import torch
import faiss
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction



In [3]:
def clean_memory():
    gc.collect()
    ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()

# Prepare training data

In [4]:
# Number of candidates to find in faiss index
N_NEGATIVES = 20

In [5]:
df_70k = pd.read_csv('/kaggle/input/llm-70kdataset-with-context/len70021_with_context.csv')
df_70k = df_70k.fillna('None')

In [6]:
df_70k.head()

Unnamed: 0.1,Unnamed: 0,prompt,A,B,C,D,E,answer,reference_sentence,wiki_text,page_id,page_title,stem_label,context,id
0,0,Who created the character Comet Man?,Bill Mumy and Miguel Ferrer,Jim Shooter,Kelley Jones,Halley's Comet,Comico,A,The character was created partly due to wide p...,Comet Man (Stephen Beckley) is a fictional cha...,3416576,Comet Man,M,"The character first appeared in Comet Man #1, ...",0
1,1,When did Comet Man first appear in a comic?,February 1987,The Golden Age of Comic Books,Lost in Space,San Diego Comic-Con,RoboCop,A,"The character first appeared in Comet Man #1, ...",Comet Man (Stephen Beckley) is a fictional cha...,3416576,Comet Man,M,Comet is the name of two fictional comic book ...,1
2,2,What was the first comics work of Bill Mumy?,Comet Man,Lost in Space,The Twilight Zone,RoboCop,Micronauts,A,"It was the first comics work of Bill Mumy, who...",Comet Man (Stephen Beckley) is a fictional cha...,3416576,Comet Man,M,"In 1996, Mumy was a writer and co-creator of S...",2
3,3,Who did Bill Mumy play in the sci-fi series Lo...,Will Robinson,Comet Man,Golden Age of Comic Books,Kelley Jones,San Diego Comic-Con,A,"It was the first comics work of Bill Mumy, who...",Comet Man (Stephen Beckley) is a fictional cha...,3416576,Comet Man,M,"His parents, played by James Stewart and Glyni...",3
4,4,Which comic company showed interest in the pro...,Comico,Bill Mumy and Miguel Ferrer,Jim Shooter,Kelley Jones,Halley's Comet,A,After meeting Marvel editor-in-chief Jim Shoot...,Comet Man (Stephen Beckley) is a fictional cha...,3416576,Comet Man,M,"In late 1994, Marvel acquired the comic book d...",4


In [7]:
def encode_questions_minilm(df):
    """
    Make embeddings of questions in 70k data using all-MiniLM-L6-v2 model.

    Args:
        df: pandas dataframe with columns 'prompt', 'A', 'B', 'C', 'D', 'E'
            to form question.

    Returns:
        float16 numpy array of questions embeddings, shape [len(df) x 384].
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    texts = []
    for _, row in df.iterrows():
        text = f"{row.prompt}\n{row.A}\n{row.B}\n{row.C}\n{row.D}\n{row.E}"
        texts.append(text)
    with torch.autocast(device_type='cuda'):
        embs = model.encode(texts, device=f'cuda', show_progress_bar=True).astype(np.float16)
    return embs

In [8]:
embs_70k = encode_questions_minilm(df_70k)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/2189 [00:00<?, ?it/s]

In [9]:
def search_faiss_index(query_embs):
    """
    Read passage level quantized faiss index of whole en-wikipedia
    and find top N_NEGATIVES relevant passages for each question.

    Args:
        query_embs: numpy array of query embeddings, shape [n_query x emb_dim].

    Returns:
        numpy array with indices of the most relevant passages
        shape [n_query x N_NEGATIVES]
    """
    index = faiss.read_index('/kaggle/input/wiki-ivfqp-index/wiki_ivfqp.index')
    index.nprobe = 5  # larger value gives better search quality, but is slower
    res = faiss.StandardGpuResources()
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.useFloat16LookupTables = True
    index = faiss.index_cpu_to_all_gpus(index, co=co)
    _, I = index.search(query_embs.astype(np.float32), N_NEGATIVES)
    return I

In [10]:
indices = search_faiss_index(embs_70k)

In [11]:
def get_passages_texts_pyarrow(indices):
    """
    Read passages in pyarrow format and take those specified in indices.

    Args:
        indices: numpy array with indices of passages to read.
                 shape [n_queries x n_passages]

    Returns:
        Dict with indices of passages as keys and texts as values.
    """
    unique_inds = np.unique(indices)

    df_selected = pd.read_parquet('/kaggle/input/wiki-pyarrow/wiki_passages.parquet',
                                  engine='pyarrow', use_threads=True,
                                  filters=[('index', 'in', unique_inds)],
                                  columns=['index', 'passage'])
    ids2passages = dict()
    for _, row in df_selected.iterrows():
        ids2passages[row['index']] = row['passage']
    return ids2passages

In [12]:
id2text = get_passages_texts_pyarrow(indices)

In [13]:
def build_df_for_reranker(df_70k, id2text, indices, bleu_threshold=0.1):
    """
    Create dataframe with columns:
    question - concatenation of prompt and answers from initial dataframe;
    candidate - passage from Wikipedia found by retrieval model;
    label - 1 if candidate is similar to 'reference_sentence'
            from initial dataframe, 0 otherwise.
            Similarity is measured by bleu score.
    short_question - prompt from initial dataframe.
    
    Args:
        df_70k: initial dataframe with questions, answers and contexts.
        id2text: dict to translate passage index to passage text.
        indices: numpy array with indices of passages for eqch question.
                 shape [n_questions x n_passages]
        bleu_threshold: float number to categorize if retrieved passage
                        is similar to 'reference_sentence' or not.
                        If bleu score is large than this value
                        then label will be 1, otherwise 0.

    Returns:
        pandas dataframe in the format suitable for reranker training.
    """
    reranker_data = {'question': [], 'candidate': [], 'label': [], 'short_question': []}
    for idx, row in tqdm(df_70k.iterrows(), total=len(df_70k)):
        text = f"{row.prompt}\n{row.A}\n{row.B}\n{row.C}\n{row.D}\n{row.E}"
        truth = row.reference_sentence
        candidates = [id2text[i] for i in indices[idx]]
        bleu_scores = [
            sentence_bleu(
                [truth.split()], candidate.split(),
                smoothing_function=SmoothingFunction().method4,
            )
            for candidate in candidates
        ]
        for score, candidate in zip(bleu_scores, candidates):
            reranker_data['question'].append(text)
            reranker_data['candidate'].append(candidate)
            reranker_data['short_question'].append(row.prompt)
            if score > bleu_threshold:
                reranker_data['label'].append(1)
            else:
                reranker_data['label'].append(0)
    reranker_data = pd.DataFrame.from_dict(reranker_data)
    return reranker_data

In [14]:
reranker_data = build_df_for_reranker(df_70k, id2text, indices)

  0%|          | 0/70021 [00:00<?, ?it/s]

In [15]:
reranker_data

Unnamed: 0,question,candidate,label,short_question
0,Who created the character Comet Man?\nBill Mum...,Comet Man. Fictional comic book character\n,0,Who created the character Comet Man?
1,Who created the character Comet Man?\nBill Mum...,Comet Man. Creation. The character was created...,1,Who created the character Comet Man?
2,Who created the character Comet Man?\nBill Mum...,Comet Man. Comet Man (Stephen Beckley) is a fi...,0,Who created the character Comet Man?
3,Who created the character Comet Man?\nBill Mum...,Comet Man. Powers and abilities. Reception. Ad...,0,Who created the character Comet Man?
4,Who created the character Comet Man?\nBill Mum...,Comet (DC Comics). Captain Comet\n,0,Who created the character Comet Man?
...,...,...,...,...
1400415,What was James Keith Louden's role at the Arms...,Keith Ricken. Management career. Honours. ;Cork\n,0,What was James Keith Louden's role at the Arms...
1400416,What was James Keith Louden's role at the Arms...,"Peter Corke. Career. From 2009 to 2013, he ser...",0,What was James Keith Louden's role at the Arms...
1400417,What was James Keith Louden's role at the Arms...,John Egan (industrialist). Career. Roles. * ch...,0,What was James Keith Louden's role at the Arms...
1400418,What was James Keith Louden's role at the Arms...,John Horgan (hurler). Coaching career. Biograp...,0,What was James Keith Louden's role at the Arms...


In [16]:
# Distribution of labels is highly skewed towards negatives.
reranker_data['label'].value_counts()

label
0    1327278
1      73142
Name: count, dtype: int64

In [17]:
reranker_data['short_question'].nunique()

58519

In [18]:
def train_test_split(reranker_data, n_train_q):
    """
    Split given data into train and test,
    making sure that there is no intersection between
    prompts in train and test.
    Also, prompts without positive passage are excluded
    from train to avoid overfitting such prompts.
    
    Args:
        reranker_data: pandas dataframe with columns 'question', 'candidate', 'label', 'short_question'.
        n_train_q: number of prompts (short_questions) to try to take into train part.

    Returns:
        tuple of pandas dataframes (train, test).
    """
    n_valid_q = reranker_data['short_question'].nunique() - n_train_q
    reranker_data = reranker_data.sample(frac=1, random_state=0)
    train_indices, valid_indices = [], []
    unique_questions = reranker_data['short_question'].unique()
    valid_added = 0
    for question, q_df in tqdm(reranker_data.groupby('short_question'), total=len(unique_questions)):
        if q_df['label'].sum() == 0 or valid_added < n_valid_q:
            valid_indices += q_df.index.tolist()
            valid_added += 1
            continue
        train_indices += q_df.index.tolist()

    df_train = reranker_data.iloc[train_indices]
    df_valid = reranker_data.iloc[valid_indices]
    return df_train, df_valid

In [19]:
# Split data into train and validation.
# Try to take 55k out of 58.5k prompts for train,
# but in reality their amount will be lower, because
# some prompts do not have positive candidates and we
# do not want such prompt in train.
df_train, df_valid = train_test_split(reranker_data, 55_000)
print(len(df_train), len(df_valid))

  0%|          | 0/58519 [00:00<?, ?it/s]

1065120 335300


In [20]:
df_train.label.value_counts()

label
0    1009404
1      55716
Name: count, dtype: int64

In [21]:
df_valid.label.value_counts()

label
0    317874
1     17426
Name: count, dtype: int64

In [22]:
def make_balanced_train_valid(
    df_train, df_valid,
    train_neg_multiplier=3,
    num_positives_valid=8000
):
    """
    Remove some negatives from train and valid to have less unbalanced data.
    For train we will still keep more negatives than positives.
    Fraction n_negatives/n_positives is controlled by train_neg_multiplier.
    For validation we will take equal number of positives and negatives
    for convenience.
    
    Args:
        df_train: dataframe for training
        df_valid: dataframe for validation
        train_neg_multiplier: int to get negatives for training - n_positives * train_neg_multiplier.
        num_positives_valid: take this number of positive for validation instead
                             of full df_valid.label.sum() to speed up training loop

    Returns:
        tuple of pandas dataframes (train, test).
    """
    # Train part
    num_positives = df_train.label.sum()
    df_train_balanced = pd.concat([
        df_train[df_train.label == 1].reset_index(drop=True),
        df_train[df_train.label == 0].sample(n=num_positives*train_neg_multiplier).reset_index(drop=True),
    ]).reset_index(drop=True)

    # Valid part
    df_valid = df_valid.sample(frac=1, random_state=0).reset_index(drop=True)
    one_inds = df_valid[df_valid.label == 1].index.values[:num_positives_valid]
    zero_inds = df_valid[df_valid.label == 0].index.values[:num_positives_valid]
    df_valid_balanced = pd.concat([
        df_valid.iloc[one_inds].reset_index(drop=True),
        df_valid.iloc[zero_inds].reset_index(drop=True)
    ]).reset_index(drop=True)

    return df_train_balanced, df_valid_balanced

In [23]:
df_train_balanced, df_valid_balanced = make_balanced_train_valid(df_train, df_valid)

In [24]:
df_train_balanced.label.value_counts()

label
0    167148
1     55716
Name: count, dtype: int64

In [25]:
df_valid_balanced.label.value_counts()

label
1    8000
0    8000
Name: count, dtype: int64

In [26]:
del embs_70k
del indices
del id2text
del reranker_data
del df_train
del df_valid
clean_memory()

# Train model

In [27]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments, Trainer
)
import evaluate

In [28]:
MODEL_NAME = "ibm/re2g-reranker-nq"

In [29]:
def preprocess_function(tokenizer, text):
    return tokenizer(text=text[0], text_pair=text[1], truncation=True, max_length=512)


def compute_metrics(eval_pred):
    clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return clf_metrics.compute(predictions=predictions, references=labels)


def make_dataset(df):
    return [{'label': row.label, 'text': (row.question, row.candidate)} for _, row in df.iterrows()]


def train(df_train, df_valid):
    dataset = {
        'train': make_dataset(df_train),
        'test': make_dataset(df_valid)
    }

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenized_texts = {
        'train': [{'label': item['label'], **preprocess_function(tokenizer, item['text'])} for item in dataset['train']],
        'test': [{'label': item['label'], **preprocess_function(tokenizer, item['text'])} for item in dataset['test']]
    }
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id
    )
    model.cuda()
    training_args = TrainingArguments(
        output_dir="./reranker",
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=1000,
        save_steps=1000,
        warmup_steps=1000,
        save_total_limit=2,
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        report_to="none",
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_texts["train"],
        eval_dataset=tokenized_texts["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    eval_res = trainer.evaluate()
    print(f"{eval_res=}")
    trainer.train()


In [30]:
train(df_train_balanced, df_valid_balanced)

Downloading (…)okenizer_config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

eval_res={'eval_loss': 2.4789488315582275, 'eval_accuracy': 0.5718125, 'eval_f1': 0.2532970027247956, 'eval_precision': 0.9889361702127659, 'eval_recall': 0.14525, 'eval_runtime': 144.603, 'eval_samples_per_second': 110.648, 'eval_steps_per_second': 6.915}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1000,0.2209,0.265648,0.905438,0.901053,0.944864,0.861125
2000,0.199,0.212623,0.918125,0.914869,0.952761,0.879875
3000,0.1942,0.258131,0.914562,0.909953,0.961844,0.863375
4000,0.1838,0.204371,0.927937,0.926217,0.948866,0.904625
5000,0.1725,0.201542,0.932125,0.932049,0.933099,0.931
6000,0.1779,0.222747,0.923875,0.920548,0.962619,0.882
7000,0.1701,0.188347,0.93425,0.933787,0.940416,0.92725
8000,0.1679,0.214767,0.934,0.93209,0.959868,0.905875
9000,0.1471,0.199099,0.934937,0.933111,0.960069,0.907625
10000,0.1549,0.190275,0.937438,0.936303,0.953597,0.919625
