In [1]:
pip list

Package                   Version
------------------------- --------------
absl-py                   2.1.0
aiohappyeyeballs          2.4.0
aiohttp                   3.10.5
aiosignal                 1.3.1
anyio                     4.4.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
astunparse                1.6.3
async-lru                 2.0.4
attrs                     24.2.0
babel                     2.16.0
beautifulsoup4            4.12.3
bleach                    6.1.0
certifi                   2024.7.4
cffi                      1.17.0
charset-normalizer        3.3.2
colorama                  0.4.6
comm                      0.2.2
contourpy                 1.2.1
cycler                    0.12.1
datasets                  2.21.0
debugpy                   1.8.5
decorator                 5.1.1
defusedxml                0.7.1
dill                      0.3.8
executing                 2.0.1
fastjsonschema   

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
model = model.cuda()

In [4]:
input_sentence = "This is a test text."
token_ids = tokenizer.encode(input_sentence, return_tensors = "pt").cuda()
token_ids

tensor([[1494,  339,  259,  262, 2978, 7461,  260,    1]], device='cuda:0')

In [5]:
model_out = model.generate(token_ids)
print(model_out)

tensor([[     0, 250099,      1]], device='cuda:0')




In [6]:
example_input_str = '<jp> This is just a test nbuig.'
# example_input_str = 'これは普通のテスト'
input_ids = tokenizer.encode(example_input_str, return_tensors='pt')
print('Input IDs:', input_ids)

tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print('Tokens:', tokens)

Input IDs: tensor([[1042, 3889,  669, 1494,  339, 1627,  259,  262, 2978,  259,  272, 1982,
         1315,  260,    1]])
Tokens: ['▁<', 'jp', '>', '▁This', '▁is', '▁just', '▁', 'a', '▁test', '▁', 'n', 'bu', 'ig', '.', '</s>']


In [7]:
# sorted(tokenizer.vocab.items(), key=lambda x: x[1])

In [8]:
# from datasets import load_dataset

# ds = load_dataset("odunola/spanish-english-pairs")

In [9]:
# type(ds)

In [10]:
# ds

In [11]:
import pandas as pd

df = pd.read_csv("hf://datasets/odunola/spanish-english-pairs/data.csv")

In [12]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139013 entries, 0 to 139012
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  139013 non-null  object
 1   spanish  139013 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


In [13]:
# df['english']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df["english"], df["spanish"], test_size=0.2, random_state=42)

In [16]:
X_train

91785                    She's concerned about your safety.
63860                          I'll be there rain or shine.
137495    There are usually never many cars on the road ...
132868    Tom's French improved after he started studyin...
114924           It's getting harder for me to concentrate.
                                ...                        
110268             No matter where you go, I'll follow you.
119879         You can see the setting sun from the window.
103694               He does not have any relatives at all.
131932    Is there something in particular that you want...
121958       His aunt takes care of his dog during the day.
Name: english, Length: 111210, dtype: object

In [17]:
# for i in range(len(df['english'])):
#     df['english'][i]="<en> "+df['english'][i]
    

In [18]:
df['english'] = df['english'].apply(lambda x: "<en> " + x)
df['spanish'] = df['spanish'].apply(lambda x: "<esp> " + x)

In [19]:
df['english']

0                                                  <en> Go.
1                                                  <en> Go.
2                                                  <en> Go.
3                                                  <en> Go.
4                                                  <en> Hi.
                                ...                        
139008    <en> A carbon footprint is the amount of carbo...
139009    <en> Since there are usually multiple websites...
139010    <en> If you want to sound like a native speake...
139011    <en> It may be impossible to get a completely ...
139012    <en> One day, I woke up to find that God had p...
Name: english, Length: 139013, dtype: object

In [20]:
df['spanish']

0                                                 <esp> Ve.
1                                               <esp> Vete.
2                                               <esp> Vaya.
3                                             <esp> Váyase.
4                                               <esp> Hola.
                                ...                        
139008    <esp> Una huella de carbono es la cantidad de ...
139009    <esp> Como suele haber varias páginas web sobr...
139010    <esp> Si quieres sonar como un hablante nativo...
139011    <esp> Puede que sea imposible obtener un corpu...
139012    <esp> Un día, me desperté y vi que Dios me hab...
Name: spanish, Length: 139013, dtype: object

In [21]:
special_tokens_dict = {'additional_special_tokens': ["<en>","<esp>","<jp>"]}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(250103, 768)

In [22]:
token_ids = tokenizer.encode(
    example_input_str, return_tensors='pt', padding='max_length',
    truncation=True, max_length=20)
print(token_ids)

tokens = tokenizer.convert_ids_to_tokens(token_ids[0])
print(tokens)

tensor([[250102,   1494,    339,   1627,    259,    262,   2978,    259,    272,
           1982,   1315,    260,      1,      0,      0,      0,      0,      0,
              0,      0]])
['<jp>', '▁This', '▁is', '▁just', '▁', 'a', '▁test', '▁', 'n', 'bu', 'ig', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [23]:
def encode_input_str(text, tokenizer, seq_len=128):
  # target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
      text =  text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)

  return input_ids[0]

In [24]:
def encode_target_str(text, tokenizer, seq_len=128):
    token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  
    return token_ids[0]

In [25]:
def format_input_data(input_text, target_text, tokenizer ):
    # input_text = df["spanish"][120]
    # target_text = df["english"][120]
    
    input_token_ids = encode_input_str(
      input_text, tokenizer)
    target_token_ids = encode_target_str(
      target_text, tokenizer)
    return input_token_ids, target_token_ids


In [26]:
    # Testing `data_transform`
in_ids, out_ids = format_input_data(
    df["spanish"][120], df["english"][120], tokenizer)

print(' '.join(tokenizer.convert_ids_to_tokens(in_ids)))
print(' '.join(tokenizer.convert_ids_to_tokens(out_ids)))

<esp> ▁¡ Gracias ! </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<en> ▁Thanks ! </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

In [27]:
import torch
def transform_batch(batch, tokenizer):
  inputs = []
  targets = []
  for input_text, target_text in batch.items():
    formatted_data = format_input_data(
        input_text, target_text, tokenizer)
    # print(f"F {formatted_data}")
    if formatted_data is None:
      continue
    
    input_ids, target_ids = formatted_data
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))
    
  batch_input_ids = torch.cat(inputs).cuda()
  batch_target_ids = torch.cat(targets).cuda()

  return batch_input_ids, batch_target_ids

def get_data_generator(dataset, tokenizer, batch_size=32):
  dataset = dataset.sample(frac=1).reset_index(drop=True)
  for i in range(0, len(dataset), batch_size):
    raw_batch = {}
    for j in range(0, batch_size):
        raw_batch[dataset["spanish"][j]] = dataset["english"][j]
    yield transform_batch(raw_batch, tokenizer)

In [28]:
# Testing data generator
data_gen = get_data_generator(df, tokenizer)
data_batch = next(data_gen)
print('Input shape:', data_batch[0].shape)
print('Output shape:', data_batch[1].shape)

Input shape: torch.Size([32, 128])
Output shape: torch.Size([32, 128])


In [29]:
# model.load_state_dict(torch.load(model_path))
model_path = '/modelsave/mt5_translation.pt'



In [30]:
import numpy as np

In [39]:
# Constants
n_epochs = 8
batch_size = 8
print_freq = 50
checkpoint_freq = 1000
lr = 5e-4
n_batches = int(np.ceil(len(df) / batch_size))
total_steps = n_epochs * n_batches
n_warmup_steps = int(total_steps * 0.01)

In [40]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [41]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer, n_warmup_steps, total_steps)

In [42]:
losses = []

In [43]:
def eval_model(model, gdataset, max_iters=8):
  test_generator = get_data_generator(gdataset,
                                      tokenizer, batch_size)
  eval_losses = []
  for i, (input_batch, label_batch) in enumerate(test_generator):
    if i >= max_iters:
      break

    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)
    eval_losses.append(model_out.loss.item())

  return np.mean(eval_losses)

In [44]:
from tqdm import tqdm_notebook

In [45]:
for epoch_idx in range(n_epochs):
  # Randomize data order
  data_generator = get_data_generator(df,
                                      tokenizer, batch_size)
                
  for batch_idx, (input_batch, label_batch) \
      in tqdm_notebook(enumerate(data_generator), total=n_batches):
    optimizer.zero_grad()

    # Forward pass
    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)

    # Calculate loss and update weights
    loss = model_out.loss
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Print training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))
      
    if (batch_idx + 1) % checkpoint_freq == 0:
      test_loss = eval_model(model, test_dataset)
      print('Saving model with test loss of {:.3f}'.format(test_loss))
      torch.save(model.state_dict(), model_path)

torch.save(model.state_dict(), model_path)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  in tqdm_notebook(enumerate(data_generator), total=n_batches):


  0%|          | 0/17377 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 