 **Few shot text generation with T5 Transformer**

## 1. Install libraries

In [14]:
!pip install transformers==2.9.0

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==2.9.0
  Using cached transformers-2.9.0-py3-none-any.whl (635 kB)
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8 MB)
     |████████████████████████████████| 3.8 MB 19.6 MB/s            
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.9.2
    Uninstalling tokenizers-0.9.2:
      Successfully uninstalled tokenizers-0.9.2
  Attempting uninstall: transformers
    Found existing installation: transformers 3.4.0
    Uninstalling transformers-3.4.0:
      Successfully uninstalled transformers-3.4.0
[31mERROR: pip's dependency resolver does not currently take into account 

In [32]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [45]:
# Check we have a GPU and check the memory size of the GUP
!nvidia-smi

Sat Jun 17 19:13:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0 Off |                  N/A |
| 75%   82C    P2   227W / 250W |   4588MiB / 11018MiB |     79%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:21:00.0 Off |                  N/A |
| 30%   40C    P8     9W / 250W |      1MiB / 11019MiB |      0%      Default |
|       

## 2. Prepare Model

In [101]:

import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

set_seed(42)

In [102]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')


In [103]:
# optimizer
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in t5_model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in t5_model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



In [135]:
# dataset preparation 

true_false_adjective_tuples = [('The Shawshank Redemption', 'The sawshank Redemption'),
 ('The Dark Knight', 'The darke Knight'),
 ('Fight Club', 'fright Club'),
 ('Pulp Fiction', 'Pulp friction'),
 ('Forrest Gump', 'Forrest gumpp'),
 ('The Lord of the Rings: The Fellowship of the Ring',
  "The Lord of the ring's: The Fellowship of the Ring"),
 ('The Lord of the Rings: The Return of the King',
  "The Lord of the ring's: The Return of the King"),
 ('The Godfather', 'The grandfather'),
 ('Game of Thrones', 'Aim of Thrones'),
 ('The Dark Knight Rises', 'The darke Knight Rises'),
 ('The Lord of the Rings: The Two Towers',
  "The Lord of the ring's: The Two Towers"),
 ('Gladiator', 'generator'),
 ('Batman Begins', 'bethann Begins'),
 ('Breaking Bad', 'baking Bad'),
 ('Star Wars: Episode IV - A New Hope', 'spahr Wars: Episode IV - A New Hope'),
 ('The Silence of the Lambs', "The Silence of the lamb's")]

## 3. Train Loop

In [136]:
t5_model.train()

epochs = 10

for epoch in range(epochs):
  print ("epoch ",epoch)
  for input,output in true_false_adjective_tuples:
    input_sent = "sound_change: "+input+ " </s>"
    ouput_sent = output+" </s>"

    tokenized_inp = tokenizer.encode_plus(input_sent,  max_length=96, pad_to_max_length=True,return_tensors="pt")
    tokenized_output = tokenizer.encode_plus(ouput_sent, max_length=96, pad_to_max_length=True,return_tensors="pt")


    input_ids  = tokenized_inp["input_ids"]
    attention_mask = tokenized_inp["attention_mask"]

    lm_labels= tokenized_output["input_ids"]
    decoder_attention_mask=  tokenized_output["attention_mask"]


    # the forward function automatically creates the correct decoder_input_ids
    output = t5_model(input_ids=input_ids, lm_labels=lm_labels,decoder_attention_mask=decoder_attention_mask,attention_mask=attention_mask)
    loss = output[0]

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()




epoch  0
epoch  1
epoch  2
epoch  3
epoch  4
epoch  5
epoch  6
epoch  7
epoch  8
epoch  9


## 4. Test model

In [138]:
sentence = ['Avengers: Infinity War', 'As Good as It Gets', 'Blue Velvet', 'In the Heart of the Sea', 'Peaky Blinders', 'Ghost Rider', 'Die Hard 2', 'The Girl Next Door', 'Men in Black II', 'Enemy of the State']
def get_result(sentence):
    test_sent = f"Sound change: {sentence} </s>"
    test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")
    test_input_ids  = test_tokenized["input_ids"]
    test_attention_mask = test_tokenized["attention_mask"]

    t5_model.eval()
    beam_outputs = t5_model.generate(
        input_ids=test_input_ids,attention_mask=test_attention_mask,
        max_length=64,
        early_stopping=True,
        num_beams=10,
        num_return_sequences=5,
        no_repeat_ngram_size=2
    )
    sent_list = []
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        sent_list.append(sent)
    return sent_list
results = []
for s in sentence:
    results.append(get_result(s))
results

[['Avengers: Infinity War',
  'aims: Infinity War',
  'auris: Infinity War',
  'Aims: Infinity War',
  'aars: Infinity War'],
 ['As Good as It Gets',
  'As Good as it Gets',
  'as Good as It Gets',
  'as good as It Gets',
  'as Good as it Gets'],
 ['blue Velvet', 'blau Velvet', 'blue Velvet', 'blaue Velvet', 'bleak Velvet'],
 ['In the Heart of the Sea',
  'In the heart of the sea',
  'irve of the sea',
  'in the Heart of the Sea',
  'In The Heart of the Sea'],
 ['Peaky Blinders',
  "Peaky Blinders'",
  'peaky Blinders',
  "peaky Blinders'",
  'spitzey Blinders'],
 ['ghost Rider', 'ghoul Rider', 'gypsy Rider', 'phantom Rider', 'ghast Rider'],
 ['Die Hard 2',
  'Die suchard 2',
  'The die Hard 2',
  'The diehard 2',
  'Die Hart 2'],
 ['The Girl Next Door',
  'The girl next door',
  'The girl Next Door',
  'The glean Next Door',
  'The girl next Door'],
 ['Men in Black II',
  'men in Black II',
  'men in black II',
  "Men's in Black II",
  'Hommes in Black II'],
 ['enemy of the State',
  

In [99]:
test_sent = "generate humor: Avengers: Infinity War </s>"
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

test_input_ids  = test_tokenized["input_ids"]
test_attention_mask = test_tokenized["attention_mask"]

t5_model.eval()
beam_outputs = t5_model.generate(
    input_ids=test_input_ids,attention_mask=test_attention_mask,
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=5,
    no_repeat_ngram_size=2
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print (sent)

enemy of the State
espion of the State
eye of the State
saar of the State
emy of the State
