In [None]:
# installing needed libraries
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install transformers[torch]
!pip install accelerate -U
!pip install peft
!pip install sentencepiece

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [None]:
# import needed libraries

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM,T5ForConditionalGeneration,T5Tokenizer, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType


In [None]:
# Load the SQuAD train (87,599 samples) and validation (10,570 samples) splits.
squad_splits = ('train', 'validation')
train_dataset, valid_dataset = (
    load_dataset('squad', split=split_name) for split_name in squad_splits
)

In [None]:
# Display the validation split's summary (feature names and row count).
valid_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [None]:
# Peek at one validation record (row 1034) to see the raw fields.
sample_record = valid_dataset[1034]
print(sample_record)

{'id': '573382d24776f41900660c39', 'title': 'Warsaw', 'context': "Warsaw, especially its city centre (Śródmieście), is home not only to many national institutions and government agencies, but also to many domestic and international companies. In 2006, 304,016 companies were registered in the city. Warsaw's ever-growing business community has been noticed globally, regionally, and nationally. MasterCard Emerging Market Index has noted Warsaw's economic strength and commercial center. Moreover, Warsaw was ranked as the 7th greatest emerging market. Foreign investors' financial participation in the city's development was estimated in 2002 at over 650 million euro. Warsaw produces 12% of Poland's national income, which in 2008 was 305.1% of the Polish average, per capita (or 160% of the European Union average). The GDP per capita in Warsaw amounted to PLN 94 000 in 2008 (c. EUR 23 800, USD 33 000). Total nominal GDP of the city in 2010 amounted to 191.766 billion PLN, 111696 PLN per capita

In [None]:
# Hugging Face checkpoint used throughout: Google's FLAN-T5 base model.
model_name = 'google/flan-t5-base'

# Load the base seq2seq model in bfloat16, then the tokenizer for the same checkpoint.
model_load_options = {'torch_dtype': torch.bfloat16}
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **model_load_options)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite the name, nothing is printed here: the summary is returned as a
    string for the caller to print.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` offers ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage to two decimals.
    """
    # Collect once so the parameters can be walked twice.
    params = [param for _, param in model.named_parameters()]
    all_model_params = sum(p.numel() for p in params)
    trainable_model_params = sum(p.numel() for p in params if p.requires_grad)
    percentage = 100 * trainable_model_params / all_model_params
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Report the parameter counts of the base model before any adapters are added.
base_model_summary = print_number_of_trainable_model_parameters(original_model)
print(base_model_summary)

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [None]:
# Zero-shot baseline: ask the untuned model to write a question for one
# validation context, with no worked examples in the prompt.
index = 200

# Index the row first: `valid_dataset['context'][index]` would read the
# entire column just to get a single string.
example = valid_dataset[index]
context = example['context']
question = example['question']  # human-written reference question

prompt = f"""
Generate Question from the following context.

{context}

question:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

# A rule of 99 dashes, the same string the join over range(100) produced.
dash_line = '-' * 99
print(dash_line)
print(f'INPUT:\n{prompt}')
print(dash_line)
print(f'HUMAN quesion:\n{question}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT:

Generate Question from the following context.

Despite waiving longtime running back DeAngelo Williams and losing top wide receiver Kelvin Benjamin to a torn ACL in the preseason, the Carolina Panthers had their best regular season in franchise history, becoming the seventh team to win at least 15 regular season games since the league expanded to a 16-game schedule in 1978. Carolina started the season 14–0, not only setting franchise records for the best start and the longest single-season winning streak, but also posting the best start to a season by an NFC team in NFL history, breaking the 13–0 record previously shared with the 2009 New Orleans Saints and the 2011 Green Bay Packers. With their NFC-best 15–1 regular season record, the Panthers clinched home-field advantage throughout the NFC playoffs for the first time in franchise history. Ten players were selected to the Pro B

In [None]:
def tokenize_function(example):
    """Tokenize a batch of SQuAD rows into model inputs and labels for training.

    Args:
        example: A batch as passed by ``Dataset.map(..., batched=True)``; the
            "context" and "question" columns are read as lists of strings.

    Returns:
        The same batch with three added columns: ``input_ids`` and
        ``attention_mask`` for the prompt built around each context, and
        ``labels`` for the target question, with padding positions set to -100.
    """
    start_prompt = 'Generate Question from the following context .\n\n'
    end_prompt = '\n\nQuestion: '
    prompt = [start_prompt + context + end_prompt for context in example["context"]]

    # Keep the attention mask alongside the ids: inputs are padded to max
    # length, and without the mask the model would attend to the padding.
    model_inputs = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")
    example['input_ids'] = model_inputs.input_ids
    example['attention_mask'] = model_inputs.attention_mask

    # Replace label padding with -100, the index the loss ignores; otherwise the
    # loss is dominated by the trivially predictable pad tokens.
    labels = tokenizer(example["question"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    return example

# Tokenize both splits in batches, training split first.
tokenized_train_datasets, tokenized_valid_datasets = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset)
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Drop the raw SQuAD columns so only the tokenized fields remain.
raw_columns = ['id', 'title', 'context', 'question', 'answers']
tokenized_train_datasets = tokenized_train_datasets.remove_columns(raw_columns)
tokenized_valid_datasets = tokenized_valid_datasets.remove_columns(raw_columns)

In [None]:
# Check the (rows, columns) shape of the tokenized training set.
tokenized_train_datasets.shape

(87599, 2)

Keep only 20% of the data (every 5th example) for training and validation


In [None]:
# Keep every 5th training row (indices 0, 5, 10, ...), i.e. 20% of the data.
# `select` picks rows by index, so no Python callback runs per row as it does
# with `filter`.
tokenized_train_datasets_sample = tokenized_train_datasets.select(
    range(0, len(tokenized_train_datasets), 5)
)


Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [None]:
# Keep every 5th validation row (indices 0, 5, 10, ...), i.e. 20% of the data.
# `select` picks rows by index, so no Python callback runs per row as it does
# with `filter`.
tokenized_valid_datasets_sample = tokenized_valid_datasets.select(
    range(0, len(tokenized_valid_datasets), 5)
)

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Confirm the sizes of the down-sampled training and validation sets.
for sampled_split in (tokenized_train_datasets_sample, tokenized_valid_datasets_sample):
    print(sampled_split.shape)

(17520, 2)
(2114, 2)


FULL FINE-TUNING

In [None]:
# output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     learning_rate=1e-5,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     logging_steps=1,
#     max_steps=1
# )

# trainer = Trainer(
#     model=original_model,
#     args=training_args,
#     train_dataset=tokenized_train_datasets_sample,
#     eval_dataset=tokenized_valid_datasets_sample
# )

In [None]:
# trainer.train()

APPLY PEFT (LORA)

In [None]:
# Define the LoRA configuration.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    target_modules=["q", "v"],
    r=8,  # rank of the 2 matrices
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Build the PEFT model from the base model and the LoRA configuration
# using Hugging Face's get_peft_model.
peft_model = get_peft_model(model=original_model, peft_config=lora_config)

# Show how many parameters are trainable in the resulting model.
peft_model_summary = print_number_of_trainable_model_parameters(peft_model)
print(peft_model_summary)

trainable model parameters: 884736
all model parameters: 248462592
percentage of trainable model parameters: 0.36%


DEFINE TRAINING PARAMETERS AND TRAIN THE MODEL

In [None]:
# Mount Google Drive at /content/drive (Colab only); the training output
# directory and the saved-model path used later both live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training configuration for the LoRA run: one epoch, loss logged every
# 200 steps, output written to Google Drive.
peft_training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/QG_model_check",
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=200,
)

# Trainer over the PEFT-wrapped model and the down-sampled datasets.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_train_datasets_sample,
    eval_dataset=tokenized_valid_datasets_sample
)


In [None]:
# Run the fine-tuning; the returned TrainOutput is displayed as the cell result.
peft_trainer.train()

Step,Training Loss
200,2.7297
400,0.0931
600,0.0825
800,0.0787
1000,0.0763
1200,0.0746
1400,0.0742
1600,0.0708
1800,0.0733
2000,0.0719


TrainOutput(global_step=2190, training_loss=0.319037171803653, metrics={'train_runtime': 5322.8918, 'train_samples_per_second': 3.291, 'train_steps_per_second': 0.411, 'total_flos': 1.20445665804288e+16, 'train_loss': 0.319037171803653, 'epoch': 1.0})

In [None]:
# Persist the trained model and the tokenizer side by side in one Drive folder.
peft_model_path = '/content/drive/MyDrive/QG_Model'

trained_model = peft_trainer.model
trained_model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('/content/drive/MyDrive/QG_Model/tokenizer_config.json',
 '/content/drive/MyDrive/QG_Model/special_tokens_map.json',
 '/content/drive/MyDrive/QG_Model/spiece.model',
 '/content/drive/MyDrive/QG_Model/added_tokens.json',
 '/content/drive/MyDrive/QG_Model/tokenizer.json')

LOAD THE MODEL FOR INFERENCE

In [None]:
# Reload the saved model and tokenizer from disk for inference (model first).
loaded_model, loaded_tokenizer = (
    loader.from_pretrained(peft_model_path)
    for loader in (T5ForConditionalGeneration, T5Tokenizer)
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

GENERATE QUESTIONS USING THE PEFT MODEL

In [None]:
# Generate several candidate questions for one validation context.
index = 1034
num_questions_to_generate = 5

# Fall back to the CPU when no GPU is present instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
loaded_model.to(device)

# Index the row first so the whole column is not read for a single string.
example = valid_dataset[index]
context = example['context']
question = example['question']  # human-written reference question

prompt = f"""
Generate Question from the following context.

{context}

question:
"""

# The prompt never changes, so tokenize it once rather than on every iteration.
input_ids = loaded_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Greedy decoding is deterministic and would repeat the same question on every
# pass; sampling lets the candidates differ from one another.
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1, do_sample=True, top_p=0.95)

for _ in range(num_questions_to_generate):
  peft_model_outputs = loaded_model.generate(input_ids=input_ids, generation_config=generation_config)
  peft_model_text_output = loaded_tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

  print(f'PEFT MODEL: {peft_model_text_output}')

PEFT MODEL: In 2006, how many companies were registered in Warsaw?
PEFT MODEL: In what year was the GDP per capita in Warsaw estimated at over 650 million euro?
PEFT MODEL: In what year was the GDP per capita in Warsaw estimated at over 650 million euro?
PEFT MODEL: In what year was the GDP per capita in Warsaw estimated at over 650 million euro?
PEFT MODEL: In what year was the GDP per capita in Warsaw estimated at over 650 million euro?


LOAD MODEL AND GENERATE QUESTION FUNCTIONS FOR DEPLOYMENT

In [None]:
def model_loading(path):
    """Load the saved T5 model and its tokenizer from one directory.

    Args:
        path: Location handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    model = T5ForConditionalGeneration.from_pretrained(path)
    model_tokenizer = T5Tokenizer.from_pretrained(path)
    return model, model_tokenizer

In [None]:
def generate_question(context, model, tokenizer):
    """Generate one question for a context passage.

    Args:
        context: The passage to ask a question about.
        model: A seq2seq model exposing ``device`` and ``generate``.
        tokenizer: The tokenizer matching ``model``; called on the prompt and
            used to decode the generated ids.

    Returns:
        The decoded question text with special tokens stripped.
    """
    # Build the prompt as a flat string: a triple-quoted literal indented with
    # the function body would put stray leading spaces on every prompt line.
    prompt_template = (
        "\nGenerate Question from the following context.\n\n"
        f"{context}\n\n"
        "question:\n"
    )

    # Send the ids to whatever device the model lives on, not just 'cuda'.
    input_ids = tokenizer(prompt_template, return_tensors="pt").input_ids.to(model.device)

    model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    question_output = tokenizer.decode(model_outputs[0], skip_special_tokens=True)

    return question_output


In [None]:
# Load the saved model and tokenizer, then generate a question for one context.
loaded_model, loaded_tokenizer = model_loading(peft_model_path)

index = 145
# Index the row first so the whole column is not read for a single string.
context = valid_dataset[index]['context']

# Move the loaded model to the GPU if one is available; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
loaded_model.to(device)

generated_question = generate_question(context, loaded_model, loaded_tokenizer)
print(f'Generated Question: {generated_question}')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Question: The San Francisco Bay Area last hosted the Super Bowl XIX in 1985, when which team won the game?
