In [190]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer

# Download configuration from huggingface.co and cache.
# model_id = "facebook/mbart-large-50-many-to-many-mmt"
model_id = "google-t5/t5-base"

# does not download the pretrained weights, just affects configuration
# use AutoModelForSeq2SeqLM.from_pretrained to also download the weights
config = AutoConfig.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_config(config)



In [191]:
# now, need to add a tokenizer and embedding layer to the top of the model
auto_tk = AutoTokenizer.from_pretrained(model_id)



In [197]:
print(auto_tk.vocab_size)
print(list(auto_tk.get_vocab())[:100])

32100
['”', '▁punct', '▁taken', 'elecommunication', '▁cables', '▁helpful', 'cita', 'gasesc', '▁buyer', 'aide', '▁bisher', '▁upgrading', '▁Haftung', '▁crunchy', '▁Colegi', '▁Bollywood', '▁historical', '▁NASA', '▁Minute', 'teamed', '▁peace', '▁Diesel', '▁--', 'gate', '▁zip', '▁zuständig', '▁define', '▁Digi', '▁Diversity', '▁engage', '▁peninsula', 'moni', '▁phone', '▁reflecting', '▁experienta', 'blin', '▁poems', 'zugleich', '▁force', 'brücke', 'Lib', 'cul', 'ministerium', '▁song', '▁themes', '▁suis', 'admi', 'gesagt', 'â', 'OC', '▁birouri', '▁activités', '▁franchi', '▁Cushion', '▁Versand', '▁mittels', '▁strig', '▁diffusion', 'lebt', '▁payment', '▁crashes', '▁Qualcomm', '▁Strange', 'lov', '▁nächste', '▁Januar', '▁bestellen', '▁Sat', '▁aplicat', '▁revolution', 'soluble', '▁legend', 'terribly', '▁fitted', '▁run', '▁everyone', '▁ramp', '▁Fotos', 'absorbed', '▁ignor', '▁Bangalore', '▁Commissioner', 'ani', '▁funnel', '▁Proceedings', 'erweise', 'tech', '▁GREAT', '▁dilemma', 'identifying', '▁("',

In [1]:
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from datasets import load_dataset

# dataset = load_dataset("csv", data_files="data/v3.csv")

# dataset = load_dataset("csv", data_files="data/v3.csv")
# data_splits = dataset['train'].train_test_split(0.1)
# train, test = data_splits['train'], data_splits['test']

In [2]:
# train.to_csv('./data/train.csv')
# test.to_csv('./data/test.csv')
train = load_dataset('csv', data_files='data/train.csv')['train']
test = load_dataset('csv', data_files='data/test.csv')['train']

In [3]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

obolo_data = train['Obolo']

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.train_from_iterator(obolo_data, trainer)

In [213]:
print(tokenizer.get_vocab_size())
unique_words_verse = [set(verse.split()) for verse in obolo_data]
unique_words = set()
for s in unique_words_verse:
    unique_words.update(s)
print(len(unique_words))

15866
30535


In [214]:
encoded_inputs = tokenizer.encode_batch(obolo_data[:5])
for idx, enc in enumerate(encoded_inputs):
    print(obolo_data[idx])
    print(enc.tokens)

“ekene onene ifo emen inyọn̄, mè isibi inu? sà ìre ekene otitiin̄ efet itap me akajit ubọk kan̄ ichit? ekene okat mun̄ me emen ekwut ibọp? sà ìre ekene osun̄ ntutun̄ ere geelek me linyọn̄ me irek kan̄? keke ore erieen̄ kan̄? sà ìre keke ore erieen̄ gwun̄ kan̄? tumu nyi emi nu, mè ire oriọọn̄!
['ĠâĢľ', 'ekene', 'Ġonene', 'Ġifo', 'Ġemen', 'Ġinyo', 'Ì£', 'n', 'ÌĦ,', 'Ġme', 'ÌĢ', 'Ġisibi', 'Ġinu', '?', 'Ġsa', 'ÌĢ', 'Ġi', 'ÌĢ', 're', 'Ġekene', 'Ġotitiin', 'ÌĦ', 'Ġefet', 'Ġitap', 'Ġme', 'Ġakajit', 'Ġubo', 'Ì£', 'k', 'Ġkan', 'ÌĦ', 'Ġichit', '?', 'Ġekene', 'Ġokat', 'Ġmun', 'ÌĦ', 'Ġme', 'Ġemen', 'Ġekwut', 'Ġibo', 'Ì£', 'p', '?', 'Ġsa', 'ÌĢ', 'Ġi', 'ÌĢ', 're', 'Ġekene', 'Ġosun', 'ÌĦ', 'Ġntutun', 'ÌĦ', 'Ġere', 'Ġgeelek', 'Ġme', 'Ġlinyo', 'Ì£', 'n', 'ÌĦ', 'Ġme', 'Ġirek', 'Ġkan', 'ÌĦ?', 'Ġkeke', 'Ġore', 'Ġerieen', 'ÌĦ', 'Ġkan', 'ÌĦ?', 'Ġsa', 'ÌĢ', 'Ġi', 'ÌĢ', 're', 'Ġkeke', 'Ġore', 'Ġerieen', 'ÌĦ', 'Ġgwun', 'ÌĦ', 'Ġkan', 'ÌĦ?', 'Ġtumu', 'Ġnyi', 'Ġemi', 'Ġnu', ',', 'Ġme', 'ÌĢ', 'Ġire',

In [242]:
eng_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

english_data = train['English']

eng_tokenizer.pre_tokenizer = ByteLevel()
eng_tokenizer.decoder = decoders.ByteLevel()
eng_tokenizer.train_from_iterator(english_data, trainer)

In [189]:
print(tokenizer.get_vocab_size())
unique_words_verse = [set(verse.split()) for verse in english_data]
unique_words = set()
for s in unique_words_verse:
    unique_words.update(s)
print(len(unique_words))

15852
28190


In [167]:
encoded_inputs = tokenizer.encode_batch(english_data[:5])
for idx, enc in enumerate(encoded_inputs):
    print(english_data[idx])
    print(enc.tokens)

and he said unto him, we [are] passing from bethlehemjudah toward the side of mount ephraim; from thence [am] i: and i went to bethlehemjudah, but i [am now] going to the house of the lord; and there [is] no man that receiveth me to house.
['Ġand', 'Ġhe', 'Ġsaid', 'Ġunto', 'Ġhim', ',', 'Ġwe', 'Ġ[', 'are', ']', 'Ġpassing', 'Ġfrom', 'Ġbethlehemjudah', 'Ġtoward', 'Ġthe', 'Ġside', 'Ġof', 'Ġmount', 'Ġephraim', ';', 'Ġfrom', 'Ġthence', 'Ġ[', 'am', ']', 'Ġi', ':', 'Ġand', 'Ġi', 'Ġwent', 'Ġto', 'Ġbethlehemjudah', ',', 'Ġbut', 'Ġi', 'Ġ[', 'am', 'Ġnow', ']', 'Ġgoing', 'Ġto', 'Ġthe', 'Ġhouse', 'Ġof', 'Ġthe', 'Ġlord', ';', 'Ġand', 'Ġthere', 'Ġ[', 'is', ']', 'Ġno', 'Ġman', 'Ġthat', 'Ġreceiveth', 'Ġme', 'Ġto', 'Ġhouse', '.']
and ye shall appoint the possession of the city five thousand broad, and five and twenty thousand long, over against the oblation of the holy [portion]: it shall be for the whole house of israel.
['Ġand', 'Ġye', 'Ġshall', 'Ġappoint', 'Ġthe', 'Ġpossession', 'Ġof', 'Ġthe', 'Ġcity'

In [219]:
from transformers import AutoTokenizer
bpe_gpt2 = AutoTokenizer.from_pretrained('gpt2', padding_side='left')

In [220]:
bpe_gpt2.vocab_size

50257

In [221]:
print(list(bpe_gpt2.vocab)[:10])

['Ġshoes', 'Mexico', 'Ġinaction', 'asc', 'Ġshared', 'ĠTrack', 'Ġsubord', 'modern', 'Ġtubes', 'Ïī']


In [244]:
tokens = bpe_gpt2.tokenize('Two fighters in the UFC were striking each other, then one of them ended up taking his opponent down and submitting him.')
print(tokens)
ids = bpe_gpt2.encode('Two fighters in the UFC were striking each other, then one of them ended up taking his opponent down and submitting him.')
bpe_gpt2.decode(ids)

['Two', 'Ġfighters', 'Ġin', 'Ġthe', 'ĠUFC', 'Ġwere', 'Ġstriking', 'Ġeach', 'Ġother', ',', 'Ġthen', 'Ġone', 'Ġof', 'Ġthem', 'Ġended', 'Ġup', 'Ġtaking', 'Ġhis', 'Ġopponent', 'Ġdown', 'Ġand', 'Ġsubmitting', 'Ġhim', '.']


'Two fighters in the UFC were striking each other, then one of them ended up taking his opponent down and submitting him.'

In [245]:
sample_words = 'batting disengage archeologists eyewitnesses photographers'
for word in sample_words.split():
    tokens = bpe_gpt2.tokenize(word)
    print(tokens)
    # ids = bpe_gpt2.encode(word)
    # print(bpe_gpt2.decode(ids))

['bat', 'ting']
['dis', 'eng', 'age']
['ar', 'che', 'ologists']
['ey', 'ewitness', 'es']
['phot', 'ographers']


In [255]:
sample_words = 'batting disengage archeologists eyewitnesses photographers'
for word in sample_words.split():
    tokens = eng_tokenizer.encode(word).tokens
    print(tokens)
print(eng_tokenizer.get_vocab_size(), bpe_gpt2.vocab_size)

['Ġbatt', 'ing']
['Ġdis', 'en', 'g', 'age']
['Ġarch', 'eo', 'log', 'ist', 's']
['Ġeyewitnesses']
['Ġp', 'ho', 'to', 'g', 'raph', 'ers']
18627 50257


In [215]:
# tokenizer.save('data/obolo-bpe-tokenizer.json')
# tokenizer = Tokenizer.from_file('data/obolo-bpe-tokenizer.json')

In [217]:
print(obolo_data[100])
out = tokenizer.encode(obolo_data[100])
print(out.tokens)
print(tokenizer.decode(out.ids))

ìkakifieek owot me lek ofiik mgbọ utọọk, mije, ìkilọlọk ofọnti utọọk iso otutuuk ebi uwu kan̄.
['Ġi', 'ÌĢ', 'kakifieek', 'Ġowot', 'Ġme', 'Ġlek', 'Ġofiik', 'Ġmgbo', 'Ì£', 'Ġuto', 'Ì£', 'o', 'Ì£', 'k', ',', 'Ġmije', ',', 'Ġi', 'ÌĢ', 'kilo', 'Ì£', 'lo', 'Ì£', 'k', 'Ġofo', 'Ì£', 'nti', 'Ġuto', 'Ì£', 'o', 'Ì£', 'k', 'Ġiso', 'Ġotutuuk', 'Ġebi', 'Ġuwu', 'Ġkan', 'ÌĦ.']
 ìkakifieek owot me lek ofiik mgbọ utọọk, mije, ìkilọlọk ofọnti utọọk iso otutuuk ebi uwu kan̄.


In [4]:
from transformers import PreTrainedTokenizerFast

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file='data/obolo-bpe-tokenizer.json')
fast_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=15866, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [54]:
ids = fast_tokenizer(["mè ekekpulu me egwe mè eririeen̄, inyi ekeche utoon̄ esan̄a me lek udun. awaji okpọkpọ, mè imun̄ ibe ke inu cha îjaan̄."], add_special_tokens=True)
print(ids.get('input_ids'))
print(fast_tokenizer.convert_tokens_to_ids('[UNK]'))
fast_tokenizer.batch_decode(ids['input_ids'])

[[56, 55, 3255, 56, 687, 56, 55, 750, 118, 104, 7992, 709, 53, 1158, 53, 16, 56, 95, 425, 10, 156, 526, 50, 152, 191, 56, 55, 470, 53, 108, 87, 109, 177, 49, 81, 587, 149]]
0


[' mè ekekpulu me egwe mè eririeen̄, inyi ekeche utoon̄ esan̄a me lek udun. awaji okpọkpọ, mè imun̄ ibe ke inu cha îjaan̄.']

In [72]:
auto_tk.batch_encode_plus(obolo_data[:5])['input_ids'][0]

[140,
 3,
 23,
 346,
 15719,
 6,
 3,
 9,
 210,
 17815,
 1889,
 3522,
 19,
 77,
 63,
 2,
 29,
 2,
 3,
 51,
 5115,
 3,
 40,
 77,
 63,
 2,
 29,
 2,
 5,
 1]