In [190]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer

# Download configuration from huggingface.co and cache.
# model_id = "facebook/mbart-large-50-many-to-many-mmt"
model_id = "google-t5/t5-base"

# does not download the pretrained weights, just affects configuration
# use AutoModelForSeq2SeqLM.from_pretrained to also download the weights
config = AutoConfig.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_config(config)



In [191]:
# now, need to add a tokenizer and embedding layer to the top of the model
auto_tk = AutoTokenizer.from_pretrained(model_id)



In [197]:
print(auto_tk.vocab_size)
print(list(auto_tk.get_vocab())[:100])

32100
['”', '▁punct', '▁taken', 'elecommunication', '▁cables', '▁helpful', 'cita', 'gasesc', '▁buyer', 'aide', '▁bisher', '▁upgrading', '▁Haftung', '▁crunchy', '▁Colegi', '▁Bollywood', '▁historical', '▁NASA', '▁Minute', 'teamed', '▁peace', '▁Diesel', '▁--', 'gate', '▁zip', '▁zuständig', '▁define', '▁Digi', '▁Diversity', '▁engage', '▁peninsula', 'moni', '▁phone', '▁reflecting', '▁experienta', 'blin', '▁poems', 'zugleich', '▁force', 'brücke', 'Lib', 'cul', 'ministerium', '▁song', '▁themes', '▁suis', 'admi', 'gesagt', 'â', 'OC', '▁birouri', '▁activités', '▁franchi', '▁Cushion', '▁Versand', '▁mittels', '▁strig', '▁diffusion', 'lebt', '▁payment', '▁crashes', '▁Qualcomm', '▁Strange', 'lov', '▁nächste', '▁Januar', '▁bestellen', '▁Sat', '▁aplicat', '▁revolution', 'soluble', '▁legend', 'terribly', '▁fitted', '▁run', '▁everyone', '▁ramp', '▁Fotos', 'absorbed', '▁ignor', '▁Bangalore', '▁Commissioner', 'ani', '▁funnel', '▁Proceedings', 'erweise', 'tech', '▁GREAT', '▁dilemma', 'identifying', '▁("',

In [205]:
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from datasets import load_dataset

# dataset = load_dataset("csv", data_files="data/v3.csv")

# dataset = load_dataset("csv", data_files="data/v3.csv")
# data_splits = dataset['train'].train_test_split(0.1)
# train, test = data_splits['train'], data_splits['test']

In [208]:
# train.to_csv('./data/train.csv')
# test.to_csv('./data/test.csv')
train = load_dataset('csv', data_files='data/train.csv')['train']
test = load_dataset('csv', data_files='data/test.csv')['train']

In [187]:


tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])



obolo_data = train['Obolo']

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.train_from_iterator(obolo_data, trainer)

In [188]:
print(tokenizer.get_vocab_size())
unique_words_verse = [set(verse.split()) for verse in obolo_data]
unique_words = set()
for s in unique_words_verse:
    unique_words.update(s)
print(len(unique_words))

15852
30505


In [163]:
encoded_inputs = tokenizer.encode_batch(obolo_data[:5])
for idx, enc in enumerate(encoded_inputs):
    print(obolo_data[idx])
    print(enc.tokens)

jisọs oneninan̄a me agan̄ galili isi agba okwaan̄ jodan, inyi jọn igwook ọmọ mun̄.
['Ġjiso', 'Ì£', 's', 'Ġoneninan', 'ÌĦ', 'a', 'Ġme', 'Ġagan', 'ÌĦ', 'Ġgalili', 'Ġisi', 'Ġagba', 'Ġokwaan', 'ÌĦ', 'Ġjodan', ',', 'Ġinyi', 'Ġjo', 'Ì£', 'n', 'Ġigwook', 'Ġo', 'Ì£', 'mo', 'Ì£', 'Ġmun', 'ÌĦ.']
eyiyi ore adasi ikan, mè ire si eyi òmimin ichit.
['Ġeyiyi', 'Ġore', 'Ġadasi', 'Ġikan', ',', 'Ġme', 'ÌĢ', 'Ġire', 'Ġsi', 'Ġeyi', 'Ġo', 'ÌĢ', 'mimin', 'Ġichit', '.']
mije ema kpetet inyan̄a-ibot awaji onyan̄abe melek kiban̄ inye; kpesusun̄ si ikan kan̄.
['Ġmije', 'Ġema', 'Ġkpetet', 'Ġinyan', 'ÌĦ', 'a', '-', 'ibot', 'Ġawaji', 'Ġonyan', 'ÌĦ', 'abe', 'Ġmelek', 'Ġkiban', 'ÌĦ', 'Ġinye', ';', 'Ġkpesusun', 'ÌĦ', 'Ġsi', 'Ġikan', 'Ġkan', 'ÌĦ.']
ebilene mêlilim inin̄ emen ǹkororok lek ewuuk, mè emen udọn̄ me inyọn̄ ijọn̄, inyi ema enan̄a me lek ikpele ukpook okumugwem, mè òrirọ eyi umin ubọọn̄ kan̄, me mgbọ okumugwem môjibibe isibi inu inisisik linyọn̄.
['Ġebilene', 'Ġme', 'ÌĤ', 'lilim', 'Ġinin'

In [185]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

english_data = train['English']

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.train_from_iterator(english_data, trainer)

In [189]:
print(tokenizer.get_vocab_size())
unique_words_verse = [set(verse.split()) for verse in english_data]
unique_words = set()
for s in unique_words_verse:
    unique_words.update(s)
print(len(unique_words))

15852
28190


In [167]:
encoded_inputs = tokenizer.encode_batch(english_data[:5])
for idx, enc in enumerate(encoded_inputs):
    print(english_data[idx])
    print(enc.tokens)

and he said unto him, we [are] passing from bethlehemjudah toward the side of mount ephraim; from thence [am] i: and i went to bethlehemjudah, but i [am now] going to the house of the lord; and there [is] no man that receiveth me to house.
['Ġand', 'Ġhe', 'Ġsaid', 'Ġunto', 'Ġhim', ',', 'Ġwe', 'Ġ[', 'are', ']', 'Ġpassing', 'Ġfrom', 'Ġbethlehemjudah', 'Ġtoward', 'Ġthe', 'Ġside', 'Ġof', 'Ġmount', 'Ġephraim', ';', 'Ġfrom', 'Ġthence', 'Ġ[', 'am', ']', 'Ġi', ':', 'Ġand', 'Ġi', 'Ġwent', 'Ġto', 'Ġbethlehemjudah', ',', 'Ġbut', 'Ġi', 'Ġ[', 'am', 'Ġnow', ']', 'Ġgoing', 'Ġto', 'Ġthe', 'Ġhouse', 'Ġof', 'Ġthe', 'Ġlord', ';', 'Ġand', 'Ġthere', 'Ġ[', 'is', ']', 'Ġno', 'Ġman', 'Ġthat', 'Ġreceiveth', 'Ġme', 'Ġto', 'Ġhouse', '.']
and ye shall appoint the possession of the city five thousand broad, and five and twenty thousand long, over against the oblation of the holy [portion]: it shall be for the whole house of israel.
['Ġand', 'Ġye', 'Ġshall', 'Ġappoint', 'Ġthe', 'Ġpossession', 'Ġof', 'Ġthe', 'Ġcity'

In [176]:
from transformers import AutoTokenizer
bpe_gpt2 = AutoTokenizer.from_pretrained('gpt2')

In [177]:
bpe_gpt2.vocab_size

50257

In [184]:
print(list(bpe_gpt2.vocab)[:10])

['Ġrumours', 'Ġaudi', 'Ġunregulated', 'Ġ19', 'Ġhistorian', 'essa', 'Ġrace', 'ĠGorsuch', 'ĠHeaven', 'real']


In [127]:
tokenizer.save('data/obolo-bpe-tokenizer.json')
# tokenizer = Tokenizer.from_file('data/obolo-bpe-tokenizer.json')

In [128]:
out = tokenizer.encode(obolo_data[100])
print(out.tokens)
print(tokenizer.decode(out.ids))

['Ġjaban', 'Ġonenikaan', 'ÌĦ', 'Ġngwan', 'ÌĦ', 'Ġenerieen', 'ÌĦ', 'Ġge', 'Ġekigwen', 'Ġjuban', '.', 'Ġjuban', 'Ġore', 'Ġadasi', 'Ġene', 'Ġo', 'ÌĢ', 'bebene', 'Ġikikwak', 'Ġuneen', 'ÌĦ,', 'Ġme', 'ÌĢ', 'Ġikiwut', 'Ġo', 'ÌĢ', 'ja', '.']
 jaban onenikaan̄ ngwan̄ enerieen̄ ge ekigwen juban. juban ore adasi ene òbebene ikikwak uneen̄, mè ikiwut òja.


In [131]:
from transformers import PreTrainedTokenizerFast

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file='data/obolo-bpe-tokenizer.json')
fast_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=16463, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [136]:
ids = fast_tokenizer.encode("mè ekekpulu me egwe mè eririeen̄, inyi ekeche utoon̄ esan̄a me lek udun. awaji okpọkpọ, mè imun̄ ibe ke inu cha îjaan̄.")
fast_tokenizer.decode(ids)

' mè ekekpulu me egwe mè eririeen̄, inyi ekeche utoon̄ esan̄a me lek udun. awaji okpọkpọ, mè imun̄ ibe ke inu cha îjaan̄.'

In [72]:
auto_tk.batch_encode_plus(obolo_data[:5])['input_ids'][0]

[140,
 3,
 23,
 346,
 15719,
 6,
 3,
 9,
 210,
 17815,
 1889,
 3522,
 19,
 77,
 63,
 2,
 29,
 2,
 3,
 51,
 5115,
 3,
 40,
 77,
 63,
 2,
 29,
 2,
 5,
 1]