In [None]:
!pip install transformers sentencepiece datasets  # sentencepiece: an unsupervised text tokenizer and detokenizer, mainly for neural-network-based text generation systems

In [None]:
from datasets import load_dataset
from google.colab import drive
from IPython.display import display
from IPython.html import widgets
import matplotlib.pyplot as plt          # we have used the matplotlib for our graphs
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook
# Activate seaborn's default theme for the matplotlib figures drawn later on.
sns.set()

In [None]:
# Mount Google Drive (Colab only) at /content/gdrive; `model_path` in the
# configuration cell points inside this mount.
drive.mount("/content/gdrive")

In [None]:
# Notebook configuration.
# Hub identifier handed to the tokenizer and model `from_pretrained` calls.
model_repo = "google/mt5-base"
# Checkpoint file located under the Google Drive mount point
# (presumably where the fine-tuned weights are stored; usage not shown here).
model_path = "/content/gdrive/My Drive/mt5_translation.pt"
# Fixed token length that every encoded sequence is padded/truncated to.
max_seq_len = 20

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `model_repo`.
tokenizer = AutoTokenizer.from_pretrained(model_repo)

In [None]:
# Load the pretrained seq2seq checkpoint named by `model_repo` and move it to
# the GPU. `.cuda()` has to be *called*: assigning the bare attribute
# `model.cuda` would rebind `model` to a bound method instead of the model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
model = model.cuda()

In [None]:
# Sanity-check the tokenizer on a short sentence that ends with an emoji.
exinput = "this is just a test 🎌"
inputid = tokenizer.encode(exinput, return_tensors="pt")
print("inputid=", inputid)

# Map the ids of the single encoded row back to their string pieces.
tokens = tokenizer.convert_ids_to_tokens(inputid[0])
print("tokens", tokens)

In [None]:
# Preview the vocabulary ordered by token id. Only the first 20 entries are
# shown: the full vocabulary has hundreds of thousands of items and would
# flood the cell output. `get_vocab()` is the documented accessor and works
# for both slow and fast tokenizers.
sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])[:20]

Dataset


In [None]:
# Fetch the "alt" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("alt")

In [None]:
# Pick out the two splits used in the rest of the notebook.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Display the first training record to inspect the layout of one example.
train_dataset[0]

In [None]:
# Language code -> language tag that is prepended to the encoder input to
# select the target language. Note that the Japanese tag is spelled "<jp>"
# while its key is "ja".
LANG_TOKEN_MAPPING = dict(
    en="<en>",
    ja="<jp>",
    zh="<zh>",
)

In [None]:
# Register the language tags as additional special tokens of the tokenizer,
# then resize the model's token embeddings to the new vocabulary size so the
# added ids have embedding rows.
s_dict = {"additional_special_tokens": list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(s_dict)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Encode the sample sentence again, this time padded/truncated to exactly
# `max_seq_len` ids, and show both the ids and their string pieces.
tokenid = tokenizer.encode(
    exinput,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=max_seq_len,
)
print(tokenid)

tokens = tokenizer.convert_ids_to_tokens(tokenid[0])
print(tokens)

tensor([[   714,    339,   1627,    259,    262,   2978,    259, 247100,      1,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0]])
['▁this', '▁is', '▁just', '▁', 'a', '▁test', '▁', '🎌', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [None]:
# Encoder-side helper: tag the source text with the target language and tokenize it.
def encodeinput(text, targetlang, tokenizer, seq_len, lang_token_map=LANG_TOKEN_MAPPING):
  """Tokenize `text` prefixed with the language tag of `targetlang`.

  Args:
    text: Source sentence to encode.
    targetlang: Key into `lang_token_map` naming the language to translate into.
    tokenizer: Object whose `encode` method performs the tokenization.
    seq_len: Length the id sequence is padded/truncated to.
    lang_token_map: Language code -> language tag mapping.

  Returns:
    The first (and only) row of the encoded batch returned by
    `tokenizer.encode(..., return_tensors="pt")`.

  Raises:
    KeyError: if `targetlang` is not a key of `lang_token_map`.
  """
  # The tag goes directly in front of the text, with no separator.
  tagged_text = lang_token_map[targetlang] + text
  encoded = tokenizer.encode(
      text=tagged_text,
      return_tensors="pt",
      padding="max_length",
      truncation=True,
      max_length=seq_len,
  )
  return encoded[0]

# Decoder-side helper: tokenize the target sentence as-is (no language tag).
def encodetargetstring(text, tokenizer, seq_len, lang_token_map=LANG_TOKEN_MAPPING):
  """Tokenize the target `text` to a fixed-length id sequence.

  Args:
    text: Target sentence to encode.
    tokenizer: Object whose `encode` method performs the tokenization.
    seq_len: Length the id sequence is padded/truncated to.
    lang_token_map: Accepted for signature parity with `encodeinput`; it is
      not used by this function.

  Returns:
    The first (and only) row of the encoded batch returned by
    `tokenizer.encode(..., return_tensors="pt")`.
  """
  encoded = tokenizer.encode(
      text=text,
      return_tensors="pt",
      padding="max_length",
      truncation=True,
      max_length=seq_len,
  )
  return encoded[0]

# Build one (input ids, target ids) training pair from a translation set.
def formattranslationdata(translations, lang_token_map, seq_len=128):
  """Encode a randomly chosen source/target language pair of one translation set.

  Two distinct language codes are drawn at random from the keys of
  `lang_token_map`; the first is the source language, the second the target.

  Args:
    translations: Mapping indexed by language code that yields the sentence in
      that language (or None when it is missing).
    lang_token_map: Language code -> language tag mapping.
    seq_len: Length both id sequences are padded/truncated to.

  Returns:
    A tuple `(input_token_id, target_token_id)`, or None when either of the
    two selected sentences is None.

  Note:
    Encoding uses the notebook-level `tokenizer`; it is not a parameter here.
  """
  source_lang, dest_lang = np.random.choice(
      list(lang_token_map.keys()), size=2, replace=False
  )
  source_text = translations[source_lang]
  dest_text = translations[dest_lang]

  # Skip translation sets that lack either side of the chosen pair.
  if source_text is None or dest_text is None:
    return None

  source_ids = encodeinput(source_text, dest_lang, tokenizer, seq_len, lang_token_map)
  dest_ids = encodetargetstring(dest_text, tokenizer, seq_len, lang_token_map)
  return source_ids, dest_ids
  
# Turn one raw dataset batch into stacked input/target id tensors on the GPU.
def transform_batch(batch, lang_tokens, tokenizer):
  """Build model-ready input and target id tensors from a dataset batch.

  Args:
    batch: Mapping whose "translation" entry iterates over translation sets;
      each set is handed to `formattranslationdata`.
    lang_tokens: Language code -> language tag mapping forwarded to
      `formattranslationdata`.
    tokenizer: Kept for interface compatibility. `formattranslationdata` takes
      no tokenizer argument and reads the notebook-level `tokenizer` instead.

  Returns:
    A tuple `(batch_input_ids, batch_target_ids)` of CUDA tensors with one row
    per translation set that was not skipped.

  Raises:
    RuntimeError: from `torch.stack` when every translation set was skipped,
      leaving nothing to stack.
  """
  inputs = []
  targets = []
  for translation_set in batch["translation"]:
    # The second argument must be the language-tag mapping; passing the
    # tokenizer here breaks `formattranslationdata`, which calls `.keys()` on it.
    format_data = formattranslationdata(translation_set, lang_tokens, max_seq_len)
    if format_data is None:
      continue
    inputid, targetid = format_data
    inputs.append(inputid)
    targets.append(targetid)

  # Every row has the same fixed length, so stacking adds the batch dimension.
  batch_input_ids = torch.stack(inputs).cuda()
  batch_target_ids = torch.stack(targets).cuda()
  return batch_input_ids, batch_target_ids
