In [1]:
! pip install datasets
! pip install rouge_score
! pip install evaluate
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from google.colab import userdata
import nltk
import evaluate
import numpy as np
import spacy
import re
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer



In [2]:
from huggingface_hub import login

token = userdata.get("HF_TOKEN")
login('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')

In [3]:
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
df = load_dataset("wiki_qa")

In [6]:
df['train'] = df['train'].select(range(10000))
df['validation'] = df['validation'].select(range(1000))
df['test'] = df['test'].select(range(1000))

In [7]:
df['test'] = df['test'].remove_columns(['label', 'document_title', 'question_id'])
df['validation'] = df['validation'].remove_columns(['label', 'document_title', 'question_id'])
df['train'] = df['train'].remove_columns(['label', 'document_title', 'question_id'])

In [9]:
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def preprocess_dataset(dataset):
    dataset = dataset.map(lambda example: {
        'question': remove_punctuation(example['question']),
        'answer': remove_punctuation(example['answer']),
    })
    return dataset

processed_df = preprocess_dataset(df)

prefix = "Please answer this question: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    labels = tokenizer(text_target=examples["answer"],
                       max_length=512,
                       truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_df = processed_df.map(preprocess_function, batched=True)

In [10]:
nltk.download("punkt", quiet=True)
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")

In [11]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {
        **result_rouge,
        **result_bleu
    }

    return result

In [12]:
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   eval_strategy="epoch",
   learning_rate=0.0001,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=4,
   weight_decay=0.01,
   save_total_limit=5,
   num_train_epochs=7,
   predict_with_generate=True,
   push_to_hub=False
)

In [13]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_df["train"],
   eval_dataset=tokenized_df["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [15]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malinkamyronets66[0m ([33malinkamyronets66-[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5474,3.34054,0.204125,0.037309,0.170449,0.170372,0.013856,"[0.22392573317392223, 0.03737045162266435, 0.00941157214174646, 0.0036545146626259027]",0.598206,0.660581,14219,21525
2,3.2711,3.323027,0.205574,0.037617,0.169033,0.169078,0.015074,"[0.22379487179487179, 0.04, 0.01005940594059406, 0.00378494623655914]",0.623881,0.679443,14625,21525
3,3.0644,3.326957,0.204644,0.036629,0.166714,0.166691,0.014919,"[0.22458429804251737, 0.03976458160416509, 0.010446421284583368, 0.004087798809206434]",0.60037,0.66216,14253,21525
4,2.941,3.336313,0.205654,0.035155,0.168747,0.168767,0.014253,"[0.22472378569939547, 0.0368904488089015, 0.00936163344362844, 0.003862698621718901]",0.609128,0.668571,14391,21525
5,2.8298,3.357401,0.205876,0.034117,0.168047,0.16802,0.013935,"[0.2243647754959972, 0.03569023569023569, 0.009138697937727456, 0.003783545974483062]",0.607481,0.667364,14365,21525
6,2.7602,3.361717,0.203199,0.033669,0.166652,0.166547,0.01385,"[0.22414536052858375, 0.03605694831321572, 0.009812143575981215, 0.004119370194068107]",0.579325,0.646876,13924,21525
7,2.7043,3.370099,0.202716,0.033135,0.166162,0.166132,0.013403,"[0.221312049289365, 0.035157720394489195, 0.008466986892452984, 0.003722414251528849]",0.602278,0.663554,14283,21525


Trainer is attempting to log a value of "[0.22392573317392223, 0.03737045162266435, 0.00941157214174646, 0.0036545146626259027]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.22379487179487179, 0.04, 0.01005940594059406, 0.00378494623655914]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.22458429804251737, 0.03976458160416509, 0.010446421284583368, 0.004087798809206434]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.22472378569939547, 0.0368904488089015, 0.00936163344362844, 0.003862698621718901]" of type <class 'list'> for ke

TrainOutput(global_step=8750, training_loss=3.0161517926897323, metrics={'train_runtime': 2224.0762, 'train_samples_per_second': 31.474, 'train_steps_per_second': 3.934, 'total_flos': 1949771665760256.0, 'train_loss': 3.0161517926897323, 'epoch': 7.0})

Модель досягла трохи кращих результатів при зменшенні learning rate, при збільшенні кількосі епох особливих покращень помічено не було, можливо варто спробувати більше епох, і тоді буде результат кращий.

In [52]:
import warnings
warnings.filterwarnings("ignore")

In [57]:
my_question = "how are glacier caves formed?"
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt").to('cuda')
outputs = trainer.model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
from textwrap import fill

print(fill(answer, width=80))

<pad> The glacier caves are a type of cave in the Antarctica</s>


In [121]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("./results/checkpoint-8500")
tokenizer = T5Tokenizer.from_pretrained("./results/checkpoint-8500")
for my_question in ["How are the directions of the velocity and force vectors related in a circular motion",
                    "how much does baby ruth candy?"]:
  inputs = tokenizer("Please answer this question: " + my_question, max_length=512,
                        truncation=True, return_tensors='pt')

  inputs = {key: value for key, value in inputs.items()}
  outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)

  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print(f"Question: {my_question}")
  print(f"Generated Answer: {answer}")
  print("-------------------------------")

Question: How are the directions of the velocity and force vectors related in a circular motion
Generated Answer: In physics circular motion refers to the motion of a body in a circular motion
-------------------------------
Question: how much does baby ruth candy?
Generated Answer: Baby Ruth is a brand name of a candy company based in New York City
-------------------------------


In [54]:
# Відповіді в датасеті достатньо специфічні.
# !!!!!!!!!! example of real answer from dataset:

# how much does baby ruth candy? It is owned by the Swiss company Nestlé'
# how are pointe shoes made? The edge of the toe pad, which is inserted between the foot and toe box for cushioning, can be seen on the right foot.

REAL MODEL WO FINE TUNING

In [100]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

predictions = []
references = []
for example in df['test']:
    question = example['question']
    true_answer = example['answer']
    inputs = tokenizer("Please answer this question: " + question, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)

    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(generated_answer)
    references.append([true_answer])

bleu_results = metric_bleu.compute(predictions=predictions, references=references)

rouge_results = metric_rouge.compute(predictions=predictions, references=references)

print(f"BLEU Score: {с}")
print(f"ROUGE Score: {rouge_results}")

BLEU Score: {'bleu': 1.322091241496486e-05, 'precisions': [0.17799009200283086, 0.03271537622682661, 0.015241320914479255, 0.014175257731958763], 'brevity_penalty': 0.00039476268906210257, 'length_ratio': 0.11315768399135101, 'translation_length': 2826, 'reference_length': 24974}
ROUGE Score: {'rouge1': 0.043865721319756114, 'rouge2': 0.008630067146450301, 'rougeL': 0.04072683204371634, 'rougeLsum': 0.040756615994795076}


In [101]:
bleu_results

{'bleu': 1.322091241496486e-05,
 'precisions': [0.17799009200283086,
  0.03271537622682661,
  0.015241320914479255,
  0.014175257731958763],
 'brevity_penalty': 0.00039476268906210257,
 'length_ratio': 0.11315768399135101,
 'translation_length': 2826,
 'reference_length': 24974}

In [102]:
rouge_results

{'rouge1': 0.043865721319756114,
 'rouge2': 0.008630067146450301,
 'rougeL': 0.04072683204371634,
 'rougeLsum': 0.040756615994795076}

In [107]:
my_question = "how much does baby ruth candy?"
inputs = tokenizer("Please answer this question: " + my_question, max_length=512,
                       truncation=True, return_tensors='pt')#, padding=True, )

inputs = {key: value for key, value in inputs.items()}
outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)

answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Question: {my_question}")
print(f"Generated Answer: {answer}")

Question: how much does baby ruth candy?
Generated Answer: $1.25


Висновки:
1.  Fine-tuned модель має значно вищі показники по всіх ROUGE метриках, що вказує на її здатність краще генерувати тексти, які збігаються з оригінальними відповідями.
2. Fine-tuned модель має кращі значення precision для всіх n-грам, що свідчить про її більшу точність у генеруванні відповідей.
3.  Fine-tuned модель генерує текст з більш природною довжиною, тоді як звичайна модель генерує дуже короткі відповіді.
4. Fine-tuned модель показує значно кращі результати в порівнянні зі звичайною моделлю за всіма основними метриками: ROUGE, BLEU, precision, brevity penalty і length ratio. Вона більш точна, генерує тексти, які краще відповідають оригіналам, і має більш природну довжину відповіді. Звичайна модель має низькі показники на всіх метриках, що свідчить про її слабку здатність генерувати відповіді, що відповідають оригінальним даним.

In [120]:
import shutil

# Вказуємо шлях до папки, яку потрібно заархівувати
folder_path = '/content/results/checkpoint-8500'  # Замість my_folder вкажіть вашу папку
zip_name = '/content/results/checkpoint-8500.zip'

# Архівуємо папку
shutil.make_archive(zip_name.replace('.zip', ''), 'zip', folder_path)
from google.colab import files

# Завантажуємо zip-файл
files.download(zip_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>