<a href="https://colab.research.google.com/github/DeepsMoseli/TswanaBert/blob/master/tswanaBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/82/25/89050e69ed53c2a3b7f8c67844b3c8339c1192612ba89a172cf85b298948/transformers-3.0.1-py3-none-any.whl (757kB)
[K     |▍                               | 10kB 28.6MB/s eta 0:00:01[K     |▉                               | 20kB 6.3MB/s eta 0:00:01[K     |█▎                              | 30kB 6.1MB/s eta 0:00:01[K     |█▊                              | 40kB 7.1MB/s eta 0:00:01[K     |██▏                             | 51kB 6.7MB/s eta 0:00:01[K     |██▋                             | 61kB 7.1MB/s eta 0:00:01[K     |███                             | 71kB 8.0MB/s eta 0:00:01[K     |███▌                            | 81kB 7.8MB/s eta 0:00:01[K     |████                            | 92kB 7.8MB/s eta 0:00:01[K     |████▎                           | 102kB 8.1MB/s eta 0:00:01[K     |████▊                           | 112kB 8.1MB/s eta 0:00:01[K     |█████▏                          | 122kB 8.1M

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:

from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./").glob("**/*.txt")]
paths

['drive/My Drive/2017 Applications/datasets/classification/bank/bank-names.txt',
 'drive/My Drive/2017 Applications/datasets/classification/bank/bank-additional/bank-additional-names.txt',
 'drive/My Drive/2017 Applications/datasets/classification/skin colour/Skin_NonSkin.txt',
 'drive/My Drive/2017 Applications/datasets/classification/student/student.txt',
 'drive/My Drive/dissertation/MIT807/latex tems/Dissertate-NYU-LaTeX/fonts/Lato/SIL Open Font License 1.1.txt',
 'drive/My Drive/dissertation/MIT807/Code/crowdAI.txt',
 'drive/My Drive/dissertation/MIT807/backup/changelog.txt',
 'drive/My Drive/dissertation/MIT807/backup/backup/changelog.txt',
 'drive/My Drive/dissertation/MIT807/backup/accuracy_crossValidation/changelog.txt',
 'drive/My Drive/Valuable code/aloc price formula.txt',
 'drive/My Drive/Valuable code/AR EndImportPortfolio.txt',
 'drive/My Drive/Valuable code/Alloc Enhencements.txt',
 'drive/My Drive/Valuable code/testAggregation.txt',
 'drive/My Drive/Valuable code/Colum

## initialize tokenizer and customize training


In [5]:
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=["./drive/My Drive/TswanaBert/large_test_clean.txt"], show_progress=True,vocab_size=25000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer.save_model('./drive/My Drive/TswanaBert')
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

Trained vocab size: 23372


In [6]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./drive/My Drive/TswanaBert/vocab.json",
    "./drive/My Drive/TswanaBert/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)


encoding = tokenizer.encode("gore fa a re merafe yotlhe e nne le.")
print("Encoded String: ", encoding.tokens)

decoded = tokenizer.decode(encoding.ids)
print("Decoded string: {}".format(decoded))

Encoded String:  ['<s>', 'gore', 'Ġfa', 'Ġa', 'Ġre', 'Ġmerafe', 'Ġyotlhe', 'Ġe', 'Ġnne', 'Ġle', '.', '</s>']
Decoded string: <s>gore fa a re merafe yotlhe e nne le.</s>


## Make and Train a language model



In [7]:
!nvidia-smi

Sat Jul  4 03:22:01 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
import torch
torch.cuda.is_available()

True

### Config the model and re-create our tokenizer in transformers


In [9]:
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast

config = RobertaConfig(
    vocab_size = tokenizer.get_vocab_size(),
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./drive/My Drive/TswanaBert/", max_len=512)

 since we are training from scratch, we initialize from config not a checkpoint of saved model


In [10]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config = config)
print("Number of parameters: ", model.num_parameters())

Number of parameters:  62080076


## Make and register training dataset

In [11]:
%%time
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./drive/My Drive/TswanaBert/large_test_clean.txt",
    block_size=128,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

CPU times: user 1.63 s, sys: 28 ms, total: 1.66 s
Wall time: 967 ms


Initialize model

In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./drive/My Drive/TswanaBert",
    overwrite_output_dir=True,
    num_train_epochs=120,
    per_device_train_batch_size=64,
    save_steps=15_000,
    save_total_limit=3,.
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [15]:
%%time
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=120.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=207.0, style=ProgressStyle(description_wi…



CPU times: user 4h 3min 40s, sys: 3h 7min 11s, total: 7h 10min 51s
Wall time: 7h 13min 45s


TrainOutput(global_step=24840, training_loss=3.4094241770521263)

In [25]:
trainer.save_model("./drive/My Drive/TswanaBert")
tokenizer.save_pretrained("./drive/My Drive/TswanaBert")

('./drive/My Drive/TswanaBert/vocab.json',
 './drive/My Drive/TswanaBert/merges.txt',
 './drive/My Drive/TswanaBert/special_tokens_map.json',
 './drive/My Drive/TswanaBert/added_tokens.json')

-----
# Now lets Test

In [19]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./drive/My Drive/TswanaBert",
    tokenizer="./drive/My Drive/TswanaBert"
)

fill_mask("Tona ya mapodisi Bheki Cele a re o <mask> o o oketsegang wa bana ba bonnye ba dingwaga.")



[{'score': 0.07074476778507233,
  'sequence': '<s>Tona ya mapodisi Bheki Cele a re o mokgatlho o o oketsegang wa bana ba bonnye ba dingwaga.</s>',
  'token': 1434,
  'token_str': 'Ġmokgatlho'},
 {'score': 0.06010856851935387,
  'sequence': '<s>Tona ya mapodisi Bheki Cele a re o ne o o oketsegang wa bana ba bonnye ba dingwaga.</s>',
  'token': 313,
  'token_str': 'Ġne'},
 {'score': 0.029703786596655846,
  'sequence': '<s>Tona ya mapodisi Bheki Cele a re o motse o o oketsegang wa bana ba bonnye ba dingwaga.</s>',
  'token': 481,
  'token_str': 'Ġmotse'},
 {'score': 0.029697271063923836,
  'sequence': '<s>Tona ya mapodisi Bheki Cele a re o setse o o oketsegang wa bana ba bonnye ba dingwaga.</s>',
  'token': 634,
  'token_str': 'Ġsetse'},
 {'score': 0.027897300198674202,
  'sequence': '<s>Tona ya mapodisi Bheki Cele a re o santse o o oketsegang wa bana ba bonnye ba dingwaga.</s>',
  'token': 1052,
  'token_str': 'Ġsantse'}]

## save model on huggingface site

In [26]:
from transformers import TFRobertaForMaskedLM

tf_model = TFRobertaForMaskedLM.from_pretrained("./drive/My Drive/TswanaBert", from_pt=True)
tf_model.save_pretrained("./drive/My Drive/TswanaBert")

All PyTorch model weights were used when initializing TFRobertaForMaskedLM.

Some weights or buffers of the PyTorch model TFRobertaForMaskedLM were not initialized from the TF 2.0 model and are newly initialized: ['lm_head.decoder.weight', 'lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Data sources

1. The largest portion of this dataset (10k)  sentences of text, comes from the [Leipzig Corpora Collection](https://wortschatz.uni-leipzig.de/en/download)


2. I Then added SABC news headlines collected by Marivate Vukosi, & Sefara Tshephisho, (2020)  that is generously made available on [zenoodo](http://doi.org/10.5281/zenodo.3668495 ). This added 185 tswana sentences to my corpus. 


3. I went on to add 300 more sentences by scrapping following sites news sites and blogs that mostly based in Botswana. I actively continue to expand the dataset.

* http://setswana.blogspot.com/
* https://omniglot.com/writing/tswana.php
* http://www.dailynews.gov.bw/
* http://www.mmegi.bw/index.php
* https://tsena.co.bw
* http://www.botswana.co.za/Cultural_Issues-travel/botswana-country-guide-en-route.html
* https://www.poemhunter.com/poem/2013-setswana/
https://www.poemhunter.com/poem/ngwana-wa-mosetsana/
 




In [27]:
!transformers-cli login

2020-07-04 11:35:07.694550: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: MoseliMotsoehli
Password: 
Login successful
Your token: uILpqJCPboRgIqtOZcthQNyjnJUTpVVobIurrCjjfTrkEBXuGvmIXCUjPbnokBIFNfffBpdBoSazxNEvYyjBzGeKUURKzwASyXkDOZcKunAiMnMwZKSFKjvJbWtACXCy 

Your token has been saved to /root/.huggingface/token


In [35]:
!transformers-cli upload ./TswanaBert

2020-07-04 12:02:20.220588: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
About to upload file [1m/content/TswanaBert/vocab.json[0m to S3 under filename [1mTswanaBert/vocab.json[0m and namespace [1mMoseliMotsoehli[0m
About to upload file [1m/content/TswanaBert/special_tokens_map.json[0m to S3 under filename [1mTswanaBert/special_tokens_map.json[0m and namespace [1mMoseliMotsoehli[0m
About to upload file [1m/content/TswanaBert/tokenizer_config.json[0m to S3 under filename [1mTswanaBert/tokenizer_config.json[0m and namespace [1mMoseliMotsoehli[0m
About to upload file [1m/content/TswanaBert/pytorch_model.bin[0m to S3 under filename [1mTswanaBert/pytorch_model.bin[0m and namespace [1mMoseliMotsoehli[0m
About to upload file [1m/content/TswanaBert/config.json[0m to S3 under filename [1mTswanaBert/config.json[0m and namespace [1mMoseliMotsoehli[0m
About to upload file [1m/content/TswanaBert/

## Use model from hugging face

In [36]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("MoseliMotsoehli/TswanaBert")
model = AutoModelWithLMHead.from_pretrained("MoseliMotsoehli/TswanaBert")

unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=515.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=378196.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=224777.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=78.0, style=ProgressStyle(description_w…






HBox(children=(FloatProgress(value=0.0, description='Downloading', max=248346529.0, style=ProgressStyle(descri…




In [58]:
num_words = 10
start = "Bodiphatsa"

print("Start: %s"%start)
for k in range(num_words):
  ll = unmasker("%s <mask>"%start)
  start = ll[0]["sequence"].replace("<s>","").replace("</s>","")
  print("k: %s"%start)


Start: Bodiphatsa
k: Bodiphatsa jo
k: Bodiphatsa jo bo
k: Bodiphatsa jo bo tletseng
k: Bodiphatsa jo bo tletseng teng
k: Bodiphatsa jo bo tletseng teng.
k: Bodiphatsa jo bo tletseng teng. 
k: Bodiphatsa jo bo tletseng teng. 
k: Bodiphatsa jo bo tletseng teng. 
k: Bodiphatsa jo bo tletseng teng. 
k: Bodiphatsa jo bo tletseng teng. 
