In [1]:
!pip install --quiet datasets
!pip install --quiet transformers
!pip install --quiet peft

In [2]:
import os
import itertools
import random
from typing import List, Dict
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import json
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from peft.peft_model import PeftModel
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
)

In [3]:
def create_dataset(dataset_split):
    data_rows = []

    for i in tqdm(range(len(dataset_split))):
        context = dataset_split[i]['article']
        question = dataset_split[i]['question']

        all_answers = dataset_split[i]['options']
        correct_answer_index =  ord(dataset_split[i]['answer']) - 65
        correct = all_answers.pop(correct_answer_index)
        incorrect1 = all_answers[0]
        incorrect2 = all_answers[1]
        incorrect3 = all_answers[2]
        data_rows.append({
            'context': context,
            'question': question,
            'correct': correct,
            'incorrect1': incorrect1,
            'incorrect2': incorrect2,
            'incorrect3': incorrect3
        })
    return pd.DataFrame(data_rows)

In [4]:
train_path = os.path.join(os.getcwd(), 'data', 'race_train_df.csv')
dev_path = os.path.join(os.getcwd(), 'data', 'race_dev_df.csv')
test_path = os.path.join(os.getcwd(), 'data', 'race_test_df.csv')
HAVE_TRAIN_DATA = os.path.isfile(train_path)
HAVE_DEV_DATA = os.path.isfile(dev_path)
HAVE_TEST_DATA = os.path.isfile(test_path)

In [5]:
if not HAVE_TRAIN_DATA or not HAVE_DEV_DATA or not HAVE_TEST_DATA:
    from datasets import load_dataset
    dataset = load_dataset("race", 'all')
    race_train_df = create_dataset(dataset['train'])
    race_dev_df = create_dataset(dataset['validation'])
    race_test_df = create_dataset(dataset['test'])
    if not os.path.isdir(os.path.join(os.getcwd(), 'data')):
        os.mkdir(os.path.join(os.getcwd(), 'data'))
    race_train_df.to_csv(train_path, index=False)
    race_dev_df.to_csv(dev_path, index=False)
    race_test_df.to_csv(test_path, index=False)

Downloading builder script:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading and preparing dataset race/all (download: 24.26 MiB, generated: 166.64 MiB, post-processed: Unknown size, total: 190.90 MiB) to /root/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b...


Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

Dataset race downloaded and prepared to /root/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/87866 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

  0%|          | 0/4934 [00:00<?, ?it/s]

In [6]:
train_df = pd.read_csv(train_path)
dev_df = pd.read_csv(dev_path)
test_df = pd.read_csv(test_path)

In [7]:
test_df.head()

Unnamed: 0,context,question,correct,incorrect1,incorrect2,incorrect3
0,The rain had continued for a week and the floo...,What did Nancy try to do before she fell over?,Protect her cows from being drowned,Measure the depth of the river,Look for a fallen tree trunk,Run away from the flooded farm
1,The rain had continued for a week and the floo...,The following are true according to the passag...,Nancy took hold of the rope and climbed into t...,It took Lizzie and Nancy about 20 minutes to g...,It was raining harder when Nancy managed to ge...,The bad weather made it difficult for rescuers...
2,The rain had continued for a week and the floo...,What did the local people do to help those in ...,They put up shelter for them in a school.,They used helicopters to help carry cows.,They helped farmers gather their cows.,They set up an organization called Red Cross.
3,There is probably no field of human activity i...,The passage tells us that _ .,the clothes that we choose to wear have someth...,our values and lifestyles are in no field of h...,our values and lifestyles are from the sign la...,the clothes we choose to wear depend on a set ...
4,There is probably no field of human activity i...,"Traditionally,people usually thought that _ .",women were concerned greatly about what they w...,men cared very much for clothes,both men and women paid great attention to the...,neither men nor women showed interest in clothes


In [8]:
PROMPT_PLACEHOLDER = """
generate distractors for given context, question and answer:
context: {context};
question: {question};
answer: {correct};
</s>
"""
MODEL_NAME = 't5-small'
SOURCE_MAX_TOKEN_LEN = 512
TARGET_MAX_TOKEN_LEN = 64

device = "cpu"

In [9]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print('tokenizer len: ', len(tokenizer))
TOKENIZER_LEN = len(tokenizer)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

tokenizer len:  32100


In [10]:
smoothing_function = SmoothingFunction().method1

def generate(qgmodel, answer: str, context: str, question: str) -> str:
    formatted_distractor = PROMPT_PLACEHOLDER.format(
        context=context,
        question=question,
        correct=answer,
    )
    source_encoding = tokenizer(
        formatted_distractor,
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    generated_ids = qgmodel.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=10,
        temperature=1.5,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        early_stopping=True,
        use_cache=True,
        num_return_sequences=1,
        do_sample=True,
    )

    preds = {
        tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    }
    
    formated_options = []
    for option in preds:
        option = option.replace('<pad>', '')
        option = option.replace('</s>', '')
        distractors = option.split(';')
        for distractor in distractors:
            if distractor:
                formated_options.append(distractor)
    
    for option in formated_options:
        option = option.strip()
    
    formated_options = list(set(formated_options))
    if len(formated_options) == 0:
        formated_options.append("-")
        formated_options.append("-")
        formated_options.append("-")
    if len(formated_options) == 1:
        formated_options.append("-")
        formated_options.append("-")
    if len(formated_options) == 2:
        formated_options.append("-")
    if len(formated_options) > 3:
        formated_options = formated_options[:3]
    return formated_options
#     best_combination = None
#     best_similarity = float('inf')  # Initialize with a high value
#     for list_opts in itertools.combinations(formated_options, 3):
#         total_similarity = 0.0
#         for i in range(len(list_opts)):
#             for j in range(i+1, len(list_opts)):
#                 similarity = sentence_bleu([list_opts[i].split()], list_opts[j].split(), smoothing_function=smoothing_function)
#                 total_similarity += similarity

#         for options in list_opts:
#             total_similarity += sentence_bleu([answer.split()], options.split(), smoothing_function=smoothing_function)

#         if total_similarity < best_similarity:
#             best_similarity = total_similarity
#             best_combination = list_opts
#     return list(best_combination)

In [11]:
t5model = T5ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    return_dict=True
)
peft_path = '/kaggle/input/lorat5/best-checkpoint-modif-v5.ckpt'
peft_model = PeftModel.from_pretrained(t5model, peft_path).to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
from typing import List
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_nltk_bleu_single(references: List[str], hypothesis: str):
    if hypothesis == '': 
        return 0, 0, 0, 0 

    refs_tokenized = list(map(lambda x: word_tokenize(x), references))
    hyp_tokenized = word_tokenize(hypothesis)
    chencherry = SmoothingFunction()

    bleu_1 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(1, 0, 0, 0), smoothing_function=chencherry.method2)
    bleu_2 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method2)
    bleu_3 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.33, 0.33, 0.33, 0), smoothing_function=chencherry.method2)
    bleu_4 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method2)

    return bleu_1, bleu_2, bleu_3, bleu_4

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
result_df = test_df.copy()

In [14]:
def calculate_nltk_bleu(references: List[List[str]], hypothesis: List[str]):

    assert len(references) == len(hypothesis)
        
    bleu_totals = [0, 0, 0, 0]

    for i in tqdm(range(len(references))):
        try:
            curr_bleu = calculate_nltk_bleu_single(references[i], hypothesis[i])
        except:
            print(references[i])
            print(hypothesis[i])
            break

        bleu_totals[0] += curr_bleu[0]
        bleu_totals[1] += curr_bleu[1]
        bleu_totals[2] += curr_bleu[2]
        bleu_totals[3] += curr_bleu[3]

    return (round(bleu_totals[0] / len(references) * 100, 2),
            round(bleu_totals[1] / len(references) * 100, 2),
            round(bleu_totals[2] / len(references) * 100, 2),
            round(bleu_totals[3] / len(references) * 100, 2))

In [15]:
from tqdm import tqdm
import torch

peft_model.to(device)

results = []
for i in tqdm(range(len(result_df))):
    sample = result_df.iloc[i]
    results.append(generate(peft_model, sample['correct'], sample['context'], sample['question']))

100%|██████████| 4934/4934 [3:18:54<00:00,  2.42s/it]


In [16]:
path = 'results/test_{}.txt'.format('best-checkpoint-modif-v5')
import os
directory = os.path.dirname(path)
if not os.path.exists(directory):
    os.makedirs(directory)
with open(path, "w") as output:
    for row in results:
        output.write(str(row) + '\n')

from typing import List
def load_lines_from_txt(file_path: str) -> List[str]:
    lines = []
    with open(file_path) as f:
        lines = f.readlines()
    for i in range(len(lines)):
        lines[i] = lines[i].strip()
    return lines

results = load_lines_from_txt(path)

import ast
incorrect1s = []
incorrect2s = []
incorrect3s = []

for result in results:  
    distractors = ast.literal_eval(result)
    if len(distractors) != 3:
        if len(distractors) == 2:
            print('2 answers at', result)
            distractors.append('')
        else:
            print('1 distractor', result, 'not enough distractors??')
            distractors.append('')
            distractors.append('')
            distractors.append('')

    incorrect1s.append(distractors[0])
    incorrect2s.append(distractors[1])
    incorrect3s.append(distractors[2])

In [17]:
reference_correct = list(result_df['correct'])
reference_incorrects = []

for i in range(len(result_df)):
    row = result_df.iloc[i]
    reference_incorrects.append([row['incorrect1'], row['incorrect2'], row['incorrect3']])

In [18]:
bleu_scores = []

bleu_scores.append(calculate_nltk_bleu(reference_incorrects, incorrect1s))
bleu_scores.append(calculate_nltk_bleu(reference_incorrects, incorrect2s))
bleu_scores.append(calculate_nltk_bleu(reference_incorrects, incorrect3s))

bleu_scores.append(calculate_nltk_bleu(reference_correct, incorrect1s))
bleu_scores.append(calculate_nltk_bleu(reference_correct, incorrect2s))
bleu_scores.append(calculate_nltk_bleu(reference_correct, incorrect3s))

print('###', 'bleu_1', 'bleu_2', 'bleu_3', 'bleu_4')
labels = ['d1i', 'd2i', 'd3i', 'd1c', 'd2c', 'd3c']

for i in range(len(bleu_scores)):
    print(labels[i], "{:<7}".format(bleu_scores[i][0]), "{:<7}".format(bleu_scores[i][1]), "{:<7}".format(bleu_scores[i][2]), "{:<7}".format(bleu_scores[i][3]))

100%|██████████| 4934/4934 [00:05<00:00, 874.80it/s]
100%|██████████| 4934/4934 [00:05<00:00, 899.56it/s] 
100%|██████████| 4934/4934 [00:05<00:00, 920.07it/s] 
100%|██████████| 4934/4934 [00:26<00:00, 183.41it/s]
100%|██████████| 4934/4934 [00:26<00:00, 185.26it/s]
100%|██████████| 4934/4934 [00:25<00:00, 191.29it/s]

### bleu_1 bleu_2 bleu_3 bleu_4
d1i 39.61   29.69   26.01   24.07  
d2i 38.43   28.65   25.0    23.12  
d3i 34.45   25.4    22.19   20.55  
d1c 14.19   10.68   10.41   10.44  
d2c 13.92   10.48   10.2    10.2   
d3c 13.16   9.87    9.58    9.57   



