In [1]:
!pip install --quiet datasets
!pip install --quiet transformers
!pip install --quiet lightning
!pip install --quiet accelerate
!pip install --quiet bitsandbytes
!pip install --quiet peft

In [2]:
import os
from typing import List, Dict
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import json
import pandas as pd
import numpy as np

import torch

from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    Adafactor
)

In [3]:
L.seed_everything(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

INFO: Seed set to 42


device(type='cuda')

In [4]:
def create_dataset(dataset_split):
    data_rows = []

    for i in tqdm(range(len(dataset_split))):
        context = dataset_split[i]['article']
        question = dataset_split[i]['question']

        all_answers = dataset_split[i]['options']
        correct_answer_index =  ord(dataset_split[i]['answer']) - 65
        correct = all_answers.pop(correct_answer_index)
        incorrect1 = all_answers[0]
        incorrect2 = all_answers[1]
        incorrect3 = all_answers[2]
        data_rows.append({
            'context': context,
            'question': question,
            'correct': correct,
            'incorrect1': incorrect1,
            'incorrect2': incorrect2,
            'incorrect3': incorrect3
        })
    return pd.DataFrame(data_rows)

In [5]:
train_path = os.path.join(os.getcwd(), 'data', 'race_train_df.csv')
dev_path = os.path.join(os.getcwd(), 'data', 'race_dev_df.csv')
test_path = os.path.join(os.getcwd(), 'data', 'race_test_df.csv')
HAVE_TRAIN_DATA = os.path.isfile(train_path)
HAVE_DEV_DATA = os.path.isfile(dev_path)
HAVE_TEST_DATA = os.path.isfile(test_path)

In [6]:
if not HAVE_TRAIN_DATA or not HAVE_DEV_DATA or not HAVE_TEST_DATA:
    from datasets import load_dataset
    dataset = load_dataset("race", 'all')
    race_train_df = create_dataset(dataset['train'])
    race_dev_df = create_dataset(dataset['validation'])
    race_test_df = create_dataset(dataset['test'])
    if not os.path.isdir(os.path.join(os.getcwd(), 'data')):
        os.mkdir(os.path.join(os.getcwd(), 'data'))
    race_train_df.to_csv(train_path, index=False)
    race_dev_df.to_csv(dev_path, index=False)
    race_test_df.to_csv(test_path, index=False)

Downloading builder script:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading and preparing dataset race/all (download: 24.26 MiB, generated: 166.64 MiB, post-processed: Unknown size, total: 190.90 MiB) to /root/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b...


Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

Dataset race downloaded and prepared to /root/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/87866 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

  0%|          | 0/4934 [00:00<?, ?it/s]

In [7]:
train_df = pd.read_csv(train_path)
dev_df = pd.read_csv(dev_path)
test_df = pd.read_csv(test_path)

In [8]:
train_df.head()

Unnamed: 0,context,question,correct,incorrect1,incorrect2,incorrect3
0,Last week I talked with some of my students ab...,We can know from the passage that the author w...,teacher,doctor,model,reporter
1,Last week I talked with some of my students ab...,Many graduates today turn to cosmetic surgery ...,get an advantage over others in job-hunting,marry a better man/woman,become a model,attract more admirers
2,Last week I talked with some of my students ab...,"According to the passage, the author believes ...",media are to blame for misleading young people...,"everyone should purchase perfection, whatever ...",it's right for graduates to ask for others to ...,it is one's appearance instead of skills that ...
3,Last week I talked with some of my students ab...,Which' s the best title for the passage?.,Young Graduates Look to Surgery for Better Jobs,Young Graduates Have Higher Expectations,Young Graduates' Opinion About Cosmetic Surgery,Young Graduates Face a Different Situation in ...
4,"YUZHOU, HENAN -An accident in a central China ...",What could be the best title for this passage?,A Coal Mine Accident in Central China,Death Toll Rises in an Accident in China,An Accident in Central China,Coal Mine Accidents in China


In [9]:
PROMPT_PLACEHOLDER = """
generate distractors for given context, question and answer:
context: {context};
question: {question};
answer: {correct};
</s>
"""

In [10]:
import random

class QGDataset(Dataset):

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int,
        target_max_token_len: int
        ):
        
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        
        source_encoding = tokenizer(
            PROMPT_PLACEHOLDER.format(
                context=data_row['context'],
                question=data_row['question'],
                correct=data_row['correct'],
            ),
            max_length= self.source_max_token_len,
            padding='max_length',
            truncation= True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
            )

        target_encoding = tokenizer(
            '{}; {}; {}; </s>'.format(data_row['incorrect1'], data_row['incorrect2'], data_row['incorrect3']),
            max_length=self.target_max_token_len,
            padding='max_length',
            truncation = True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
            )

        labels = target_encoding['input_ids']
        labels[labels == 0] = -100

        return dict(
            input_ids = source_encoding['input_ids'].flatten(),
            attention_mask = source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
            )

In [11]:
class QGDataModule(L.LightningDataModule):

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size,
        source_max_token_len: int,
        target_max_token_len: int
        ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        self.train_dataset = QGDataset(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        self.val_dataset = QGDataset(self.val_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        self.test_dataset = QGDataset(self.test_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size = self.batch_size, shuffle=True, num_workers = 2)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=1, num_workers=2)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=2)

In [12]:
MODEL_NAME = 't5-small'
SOURCE_MAX_TOKEN_LEN = 512
TARGET_MAX_TOKEN_LEN = 64

N_EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.0001

In [13]:
DF_TAKE_PERCENTAGE = 1

TAKE_TRAIN = int(len(train_df) * DF_TAKE_PERCENTAGE)
TAKE_DEV = int(len(dev_df) * DF_TAKE_PERCENTAGE)
TAKE_TEST = int(len(test_df) * DF_TAKE_PERCENTAGE)

print('Taking', DF_TAKE_PERCENTAGE * 100, '%')
print(TAKE_TRAIN, 'of', len(train_df))
print(TAKE_DEV, 'of', len(dev_df))
print(TAKE_TEST, 'of', len(test_df))

Taking 100 %
87866 of 87866
4887 of 4887
4934 of 4934


In [14]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print('tokenizer len: ', len(tokenizer))
TOKENIZER_LEN = len(tokenizer)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

tokenizer len:  32100


In [15]:
print(train_df[:TAKE_TRAIN].shape, dev_df[:TAKE_DEV].shape, test_df[:TAKE_TEST].shape)

data_module = QGDataModule(train_df[:TAKE_TRAIN], dev_df[:TAKE_DEV], test_df[:TAKE_TEST], tokenizer, BATCH_SIZE, SOURCE_MAX_TOKEN_LEN, TARGET_MAX_TOKEN_LEN)
data_module.setup()

(87866, 6) (4887, 6) (4934, 6)


In [16]:
from peft.peft_model import PeftModel
from peft.config import PeftConfig
from peft.mapping import get_peft_model
from peft.tuners import LoraConfig
from peft.utils import TaskType
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
from transformers import AutoModelForSeq2SeqLM

bnb_config=BitsAndBytesConfig(
    load_in_8bit=True
)

peft_config=LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16, # dimension of the low-rank matrices
    lora_alpha=8, # the scaling factor for the low-rank matrices
    lora_dropout=0.001, # the dropout probability of the LoRA layers
    target_modules=["k","q","v","o"],
)

In [17]:
import bitsandbytes as bnb

In [18]:
class QGModel(L.LightningModule):
    def __init__(self, model_name=None, peft_config=None, bnb_config=None):
        super().__init__()
        if model_name is None:
            model_name = 't5-small'
        if bnb_config is None:
            bnb_config=BitsAndBytesConfig(
                load_in_8bit=True
            )
        if peft_config is None:
            peft_config=LoraConfig(
                task_type=TaskType.SEQ_2_SEQ_LM,
                r=16, # dimension of the low-rank matrices
                lora_alpha=8, # the scaling factor for the low-rank matrices
                lora_dropout=0.001, # the dropout probability of the LoRA layers
                target_modules=["k","q","v","o"],
            )
        self.model_name = model_name
        self.peft_config = peft_config
        self.bnb_config = bnb_config
        self.peft_model = None
        self.initialize_model()

    def initialize_model(self):
        model = T5ForConditionalGeneration.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            return_dict=True
        )
        model = prepare_model_for_kbit_training(model)
        self.peft_model = get_peft_model(model, self.peft_config)

    def load_peft_model(self, peft_path):
        t5model = T5ForConditionalGeneration.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            return_dict=True
        )
        t5model = prepare_model_for_kbit_training(t5model)
        self.peft_model = PeftModel.from_pretrained(t5model, peft_path, is_trainable=True)
        
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.peft_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        loss, output = self(input_ids, attention_mask, labels)    
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        loss, output = self(input_ids, attention_mask, labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, output = self(input_ids, attention_mask, labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return Adafactor(self.parameters(), 
                          scale_parameter=False, 
                          relative_step=False, 
                          warmup_init=False, lr=LEARNING_RATE)


In [19]:
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint-modif',
    save_top_k=-1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

In [20]:
trainer = L.Trainer(
    callbacks=checkpoint_callback,
    max_epochs=N_EPOCHS,
    log_every_n_steps=30
)

INFO: Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [21]:
model = QGModel(MODEL_NAME, peft_config, bnb_config)
peft_path = '/kaggle/input/lorat5/best-checkpoint-modif-v5.ckpt'
if os.path.exists(peft_path):
    model.load_peft_model(peft_path)
    model.peft_model.to(device)
    model.peft_model.print_trainable_parameters()
    print(f"Load checkpoint file successfully!")
else:
    model.initialize_model()
    print(f"The checkpoint file at {peft_path} does not exist, initialize model")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 1,179,648 || all params: 61,686,272 || trainable%: 1.9123
Load checkpoint file successfully!


In [22]:
trainer.fit(model, data_module)

2024-05-18 16:08:27.308393: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 16:08:27.308516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 16:08:27.473979: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: 
  | Name       | Type                  | Params
-----------------------------------------------------
0 | peft_model | PeftModelForSeq2SeqLM | 61.7 M
-----------------------------------------------------
1.2 M     Trainable params
60.5 M    Non-trainable params
61.7 M    Total params
246.745   Total estimat

Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 0, global step 2746: 'val_loss' reached 1.95388 (best 1.95388), saving model to '/kaggle/working/checkpoints/best-checkpoint-modif.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 1, global step 5492: 'val_loss' reached 1.95620 (best 1.95388), saving model to '/kaggle/working/checkpoints/best-checkpoint-modif-v1.ckpt' as top 2


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 2, global step 8238: 'val_loss' reached 1.95870 (best 1.95388), saving model to '/kaggle/working/checkpoints/best-checkpoint-modif-v2.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 3, global step 10984: 'val_loss' reached 1.96023 (best 1.95388), saving model to '/kaggle/working/checkpoints/best-checkpoint-modif-v3.ckpt' as top 4


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 4, global step 13730: 'val_loss' reached 1.96241 (best 1.95388), saving model to '/kaggle/working/checkpoints/best-checkpoint-modif-v4.ckpt' as top 5
INFO: `Trainer.fit` stopped: `max_epochs=5` reached.


In [23]:
# Save adapter
for file_name in os.listdir('checkpoints'):
    model_path = 'checkpoints/' + file_name
    print("==> Test model ", model_path)
    test_model = QGModel.load_from_checkpoint(model_path)
    peft_path = 'pefts/' + file_name
    test_model.peft_model.save_pretrained(peft_path)

==> Test model  checkpoints/best-checkpoint-modif-v4.ckpt
==> Test model  checkpoints/best-checkpoint-modif-v2.ckpt
==> Test model  checkpoints/best-checkpoint-modif-v1.ckpt
==> Test model  checkpoints/best-checkpoint-modif-v3.ckpt
==> Test model  checkpoints/best-checkpoint-modif.ckpt


In [24]:
import itertools
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothing_function = SmoothingFunction().method1

def generate(qgmodel, answer: str, context: str, question: str) -> str:
    formatted_distractor = PROMPT_PLACEHOLDER.format(
        context=context,
        question=question,
        correct=answer,
    )
    source_encoding = tokenizer(
        formatted_distractor,
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    generated_ids = qgmodel.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=10,
        temperature=1.5,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        early_stopping=True,
        use_cache=True,
        num_return_sequences=10,
        do_sample=True,
    )

    preds = {
        tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    }
    
    formated_options = []
    for option in preds:
        option = option.replace('<pad>', '')
        option = option.replace('</s>', '')
        option = option.strip()
        distractors = option.split(';')
        for distractor in distractors:
            if distractor:
                formated_options.append(distractor)
    
    for option in formated_options:
        option = option.strip()
    
    formated_options = list(set(formated_options))
    

    best_combination = None
    best_similarity = float('inf')  # Initialize with a high value
    for list_opts in itertools.combinations(formated_options, 3):
        total_similarity = 0.0
        for i in range(len(list_opts)):
            for j in range(i+1, len(list_opts)):
                similarity = sentence_bleu([list_opts[i].split()], list_opts[j].split(), smoothing_function=smoothing_function)
                total_similarity += similarity

        for options in list_opts:
            total_similarity += sentence_bleu([answer.split()], options.split(), smoothing_function=smoothing_function)

        if total_similarity < best_similarity:
            best_similarity = total_similarity
            best_combination = list_opts
    return list(best_combination)

In [25]:
context = '''
 The Yanomami live along the rivers of the rainforest in the
 north of Brazil. They have lived in the rainforest for about
 10,000 years and they use more than 2,000 different plants for
 food and for medicine. But in 1988, someone found gold in
 their forest, and suddenly 45,000 people came to the forest and
 began looking for gold. They cut down the forest to make
 roads. They made more than a hundred airports. The
 Yanomami people lost land and food. Many died because new
 diseases came to the forest with the strangers.
 In 1987, they closed fifteen roads for eight months. No one cut
 down any trees during that time. In Panama, the Kuna people
 saved their forest. They made a forest park which tourists pay
 to visit. The Gavioes people of Brazil use the forest, but they
 protect it as well. They find and sell the Brazil nuts which grow
 on the forest trees.
'''

question = '''
Those people built roads and airports in order to   _  .
'''

answer = '''carry away the gold conveniently'''

t5model = T5ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    return_dict=True
)
for file_name in os.listdir('pefts'):
    print('Predict for model ' + file_name)
    peft_path = 'pefts/' + file_name
    peft_model = PeftModel.from_pretrained(t5model, peft_path).to(device)
    options = generate(peft_model, answer, context, question)
    print(options)

Predict for model best-checkpoint-modif-v4.ckpt
[' build a park for tourists', ' save the land and food', ' build a forest park which tourists should visit']
Predict for model best-checkpoint-modif-v2.ckpt
[' save the trees', ' make a forest park for tourists', ' find and sell the nuts']
Predict for model best-checkpoint-modif-v1.ckpt
[' find and sell the nuts that grow on them', ' protect their forest', ' make more airports']
Predict for model best-checkpoint-modif-v3.ckpt
[' protect the trees in the forests', ' make more airports', ' help the people to find gold']
Predict for model best-checkpoint-modif.ckpt
[' build a forest park', ' find and sell nuts', ' protect the trees from diseases']


In [26]:
import shutil

shutil.make_archive('archive_shutil', format='zip', root_dir='pefts')

'/kaggle/working/archive_shutil.zip'