# BERT: As one of Autoencoding Language Models 

In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 7.4MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 36.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 51.7MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [4]:
!pip install tokenizers



In [5]:
os.chdir("drive/MyDrive/akademi/Packt NLP with Transformers/CH03")

In [6]:
os.listdir()

['CH03.1 As one of Autoencoding Language Models.ipynb',
 'IMDB Dataset.csv',
 'albert.png',
 'CH03.0X Autoencoding Models.ipynb',
 'old files',
 'CH03.0X Tokenizer.ipynb',
 'corpus.txt']

In [8]:
import pandas as pd
imdb_df = pd.read_csv("IMDB Dataset.csv")
reviews = imdb_df.review.to_string(index=None) 
with open("corpus.txt", "w") as f: 
    f.writelines(reviews) 

In [9]:
from tokenizers import BertWordPieceTokenizer
bert_wordpiece_tokenizer = BertWordPieceTokenizer() 
bert_wordpiece_tokenizer.train("corpus.txt") 

In [10]:
bert_wordpiece_tokenizer.get_vocab()

{'##iliar': 2285,
 'germans': 10744,
 'shaky': 11110,
 'nigel': 13408,
 'list': 1340,
 'spy': 5241,
 'veterans': 13210,
 'mitz': 14035,
 'ske': 3446,
 'abs': 584,
 '##vest': 3676,
 'uncle': 5537,
 'seven': 2458,
 'decadent': 17952,
 'catastrop': 15985,
 '56': 13853,
 '##ioklar': 15410,
 'previous': 1388,
 'poppa': 12696,
 'affection': 11317,
 'dane': 8645,
 'beowulf': 9701,
 'hooray': 15505,
 'carey': 10612,
 'kite': 6605,
 'missouri': 10593,
 'four': 1177,
 'videostore': 15523,
 'vacation': 3869,
 'amanda': 8894,
 'milo': 12798,
 'edith': 9071,
 'cabin': 5112,
 'strangeland': 13504,
 'deaths': 16038,
 'entrails': 13705,
 'lind': 3881,
 'braindead': 16470,
 'awful': 664,
 'relate': 7488,
 'gome': 13950,
 '##ited': 1048,
 '##zem': 14365,
 '##ter': 223,
 'archbishop': 16917,
 'chains': 8874,
 '##dle': 1795,
 'zent': 8735,
 '##istical': 12007,
 '##ator': 2912,
 'comedians': 7382,
 'jum': 7814,
 'clouds': 13541,
 'ang': 1423,
 'mus': 796,
 'lar': 1545,
 'lon': 2394,
 'covered': 10741,
 'ka

In [11]:
!mkdir tokenizer
bert_wordpiece_tokenizer.save_model("tokenizer")

['tokenizer/vocab.txt']

In [16]:
tokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

In [17]:
tokenized_sentence = tokenizer.encode("Oh it works just fine")

In [18]:
tokenized_sentence.tokens

['[CLS]', 'oh', 'it', 'works', 'just', 'fine', '[SEP]']

In [19]:
tokenized_sentence = tokenizer.encode("ohoh i thougt it might be workingg well")

In [20]:
from transformers import BertTokenizerFast 
tokenizer = BertTokenizerFast.from_pretrained("tokenizer") 

In [21]:
from transformers import LineByLineTextDataset 
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="corpus.txt", block_size=128) 



In [22]:
from transformers import DataCollatorForLanguageModeling 
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) 

In [23]:
from transformers import TrainingArguments 
training_args = TrainingArguments(output_dir="BERT", overwrite_output_dir=True, num_train_epochs=1, per_device_train_batch_size=128) 

In [24]:
from transformers import BertConfig, BertForMaskedLM 
bert = BertForMaskedLM(BertConfig()) 

In [25]:
from transformers import Trainer 
trainer = Trainer(model=bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 

In [26]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=391, training_loss=5.399562669836956, metrics={'train_runtime': 183.9047, 'train_samples_per_second': 2.126, 'total_flos': 809001393842448.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 1978449920, 'init_mem_gpu_alloc_delta': 439194112, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 16642048, 'train_mem_gpu_alloc_delta': 1324854272, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 7558385664})

In [27]:
trainer.save_model("MyBERT")

In [28]:
from transformers import BertConfig 
BertConfig() 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [29]:
tiny_bert_config = BertConfig(max_position_embeddings=512, hidden_size=128, num_attention_heads=2, num_hidden_layers=2, intermediate_size=512) 
tiny_bert_config 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [30]:
tiny_bert = BertForMaskedLM(tiny_bert_config) 
trainer = Trainer(model=tiny_bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 
trainer.train() 

Step,Training Loss


TrainOutput(global_step=391, training_loss=8.822054577605499, metrics={'train_runtime': 25.8968, 'train_samples_per_second': 15.098, 'total_flos': 32626925464848.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 17671168, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 8192, 'train_mem_gpu_alloc_delta': 53066240, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 3202919424})

In [31]:
from transformers import TFBertModel, BertTokenizerFast 
bert = TFBertModel.from_pretrained("bert-base-uncased") 
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 
bert.layers 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7faa6da9e250>]

In [32]:
tokenized_text = tokenizer.batch_encode_plus(["hello how is it going with you","lets test it"], return_tensors="tf", max_length=256, truncation=True, pad_to_max_length=True) 
bert(tokenized_text) 



TFBaseModelOutputWithPooling([('last_hidden_state',
                               <tf.Tensor: shape=(2, 256, 768), dtype=float32, numpy=
                               array([[[ 1.00471266e-01,  6.77026808e-02, -8.33596289e-02, ...,
                                        -4.93304521e-01,  1.16539374e-01,  2.26647303e-01],
                                       [ 3.23624015e-01,  3.70718539e-01,  6.14685655e-01, ...,
                                        -6.27267420e-01,  3.79082859e-01,  7.05310851e-02],
                                       [ 1.99534193e-01, -8.75509858e-01, -6.47859275e-02, ...,
                                        -1.28073208e-02,  3.07651341e-01, -2.07320880e-02],
                                       ...,
                                       [-6.53303489e-02,  1.19046137e-01,  5.76847017e-01, ...,
                                        -2.95460761e-01,  2.49741450e-02,  1.13964267e-01],
                                       [-2.64715284e-01, -7.863805

In [33]:
from tensorflow import keras 
import tensorflow as tf 
max_length = 256 
tokens = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32) 
masks = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32) 
embedding_layer = bert.layers[0]([tokens,masks])[0][:,0,:] 
dense = tf.keras.layers.Dense(units=2, activation="softmax")(embedding_layer) 
model = keras.Model([tokens,masks],dense) 

In [34]:
tokenized = tokenizer.batch_encode_plus(["hello how is it going with you","hello how is it going with you"], return_tensors="tf", max_length= max_length, truncation=True, pad_to_max_length=True) 



In [35]:
model([tokenized["input_ids"],tokenized["attention_mask"]]) 

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.56051177, 0.43948826],
       [0.56051177, 0.43948826]], dtype=float32)>

In [37]:
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"]) 
model.summary() 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           bert[0][0]                   

In [38]:
model.layers[2].trainable = False 

In [39]:
import pandas as pd 
imdb_df = pd.read_csv("IMDB Dataset.csv") 
reviews = list(imdb_df.review) 
tokenized_reviews = tokenizer.batch_encode_plus(reviews, return_tensors="tf", max_length=max_length, truncation=True, pad_to_max_length=True) 

import numpy as np 
train_split = int(0.8 * len(tokenized_reviews["attention_mask"])) 
train_tokens = tokenized_reviews["input_ids"][:train_split] 
test_tokens = tokenized_reviews["input_ids"][train_split:] 
train_masks = tokenized_reviews["attention_mask"][:train_split] 
test_masks = tokenized_reviews["attention_mask"][train_split:] 
sentiments = list(imdb_df.sentiment) 
labels = np.array([[0,1] if sentiment == "positive" else [1,0] for sentiment in sentiments]) 
train_labels = labels[:train_split] 
test_labels = labels[train_split:] 



In [None]:
model.fit([train_tokens,train_masks],train_labels, epochs=5)