In [None]:
#!pip install datasets

In [None]:
from datasets import load_dataset
from transformers import TFAutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import create_optimizer
import tensorflow as tf
from transformers import DataCollatorForWholeWordMask

In [None]:
import psutil
import os

In [None]:
from pprint import pprint

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive so the checkpoint directories under ./drive/MyDrive are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Hub identifier the tokenizer is loaded from.
model_checkpoint_token = 'FacebookAI/xlm-roberta-base'
# Alternative checkpoint directory, currently disabled:
#model_checkpoint = './drive/MyDrive/Models/osho_masked_lm_fine_tuned_roberta_batch_size_32'
# Google Drive directory the masked-LM weights are loaded from.
model_checkpoint = './drive/MyDrive/Models/osho_masked_lm_fine_tuned_roberta'

In [None]:
# Weights come from the Drive checkpoint; the tokenizer comes from the Hub
# identifier (two different sources, see the configuration cell).
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_token)

All model checkpoint layers were used when initializing TFXLMRobertaForMaskedLM.

All the layers of TFXLMRobertaForMaskedLM were initialized from the model checkpoint at ./drive/MyDrive/Models/osho_masked_lm_fine_tuned_roberta.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForMaskedLM for predictions without further training.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
def print_memory_used_by_process():
  """Print the resident set size (RSS) of the current Python process, in GB."""
  bytes_per_gb = 1024*1024*1024
  # Look up this process by its own PID and read its memory counters.
  rss_bytes = psutil.Process(os.getpid()).memory_info().rss
  print(f'{(rss_bytes/bytes_per_gb)} GB')

In [None]:
# Memory baseline right after loading the model and tokenizer.
print_memory_used_by_process()

2.337085723876953 GB


In [None]:
# Download the discourse corpus from the Hugging Face Hub.
osho_dataset = load_dataset('DhruvDancingBuddha/osho_discourses')

Downloading readme:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1965 [00:00<?, ? examples/s]

In [None]:
# Inspect the raw dataset: splits, column names and row count.
osho_dataset

DatasetDict({
    train: Dataset({
        features: ['char_url', 'topic_name', 'topic_lesson_name', 'topic_lesson_url', 'all_txt'],
        num_rows: 1965
    })
})

In [None]:
def tokenizer_osho(examples):
  """Tokenize the 'all_txt' column of a batch.

  When the module-level tokenizer is a fast tokenizer, a "word_ids" entry is
  added holding, for each sequence in the batch, the result of
  `word_ids(i)` for that sequence.
  """
  encoded = tokenizer(examples['all_txt'])

  # Slow tokenizers cannot provide word ids; return the plain encoding.
  if not tokenizer.is_fast:
    return encoded

  sequence_count = len(encoded['input_ids'])
  encoded["word_ids"] = [encoded.word_ids(idx) for idx in range(sequence_count)]
  return encoded

In [None]:
# Tokenize the train split in batches and drop every original text/metadata column.
# Note: this rebinds osho_dataset from the DatasetDict to the mapped train split,
# so re-running this cell without reloading the dataset will not work.
osho_dataset = osho_dataset['train'].map(tokenizer_osho, batched=True, remove_columns=['all_txt', 'char_url', 'topic_name', 'topic_lesson_name', 'topic_lesson_url'])

Map:   0%|          | 0/1965 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (11762 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Inspect the tokenized dataset (one row per discourse, token-level columns only).
osho_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 1965
})

In [None]:
#pprint(osho_dataset['input_ids'][1], width=200, compact=True)

In [None]:
# Memory after tokenizing the corpus.
print_memory_used_by_process()

4.231006622314453 GB


In [None]:
def chunk_text(examples, chunk_size=128):
  """Concatenate every column of a tokenized batch and cut it into fixed-size chunks.

  Args:
    examples: Mapping from column name to a list of per-example lists.
      Must contain an "input_ids" column.
    chunk_size: Number of items per output chunk. Defaults to 128.

  Returns:
    A dict with the same columns, each holding a list of `chunk_size`-long
    lists, plus a "labels" column that is a shallow copy of the chunked
    "input_ids". Trailing items of the batch that do not fill a whole chunk
    are dropped.

  Raises:
    ValueError: If `chunk_size` is not a positive integer.
  """
  if chunk_size <= 0:
    raise ValueError(f'chunk_size must be positive, got {chunk_size}')

  # Flatten each column in linear time. sum(list_of_lists, []) rebuilds the
  # accumulated list on every addition, which is quadratic in the batch length.
  flattened = {
    key: [item for sequence in examples[key] for item in sequence]
    for key in examples.keys()
  }

  # All columns are expected to have the same flattened length; use the first one.
  total_len = len(flattened[list(examples.keys())[0]])

  # Round down to a whole number of chunks.
  total_len = (total_len // chunk_size) * chunk_size

  results = {
    key: [values[start:start + chunk_size] for start in range(0, total_len, chunk_size)]
    for key, values in flattened.items()
  }

  # Labels start as the unmasked ids; masking is applied later by the data collator.
  results["labels"] = results["input_ids"].copy()

  return results

In [None]:
# Re-pack the tokenized discourses into fixed-length chunks (batched so that
# chunk_text can concatenate several discourses before slicing).
osho_dataset = osho_dataset.map(chunk_text, batched=True)

Map:   0%|          | 0/1965 [00:00<?, ? examples/s]

In [None]:
# Inspect the chunked dataset (one row per chunk, with the added 'labels' column).
osho_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 147817
})

In [None]:
# Collator that masks tokens (probability 0.15) each time a batch is assembled.
# NOTE(review): DataCollatorForWholeWordMask appears to be designed for BERT-style
# WordPiece tokenizers; verify that it produces genuine whole-word masks with the
# XLM-RoBERTa (SentencePiece) tokenizer used here - TODO confirm.
whole_word_data_collator  = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Sizes for a 90/10 train/test split; the test split takes whatever the
# truncated 90% leaves over, so the two sizes always add up to the total.
total_len = len(osho_dataset)
test_len  = total_len - int(0.9 * total_len)
train_len = total_len - test_len

print(f'Total Length is {total_len}\n\nTrain Len is    {train_len}\n\nTest Len is     {test_len}')

Total Length is 147817

Train Len is    133035

Test Len is     14782


In [None]:
# Split with explicit row counts and a fixed seed so the split is reproducible.
# Note: rebinds osho_dataset from a Dataset to a DatasetDict with 'train'/'test'.
osho_dataset = osho_dataset.train_test_split(train_size=train_len,test_size=test_len, seed=42)

In [None]:
# Inspect the resulting train/test splits.
osho_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 133035
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 14782
    })
})

In [None]:
# Memory after chunking and splitting.
print_memory_used_by_process()

4.605098724365234 GB


In [None]:
# Training pipeline: shuffled batches of 32; the collator masks each batch as
# it is assembled.
tf_train_dataset = model.prepare_tf_dataset(
    osho_dataset["train"],
    collate_fn=whole_word_data_collator,
    shuffle=True,
    batch_size=32)

# Evaluation pipeline: do not shuffle the held-out split, so validation runs
# over the same rows in the same order every epoch. With shuffle=False,
# prepare_tf_dataset also keeps the final partial batch (its drop_remainder
# default follows shuffle), so every test row is evaluated.
tf_eval_dataset = model.prepare_tf_dataset(
    osho_dataset["test"],
    collate_fn=whole_word_data_collator,
    shuffle=False,
    batch_size=32)



In [None]:
# Memory after building the tf.data pipelines.
print_memory_used_by_process()

4.638053894042969 GB


In [None]:
# The learning-rate schedule must span the whole training run, not one epoch:
# len(tf_train_dataset) is the number of batches per epoch, and model.fit runs
# for num_epochs epochs. With a one-epoch schedule the learning rate would have
# fully decayed before the second epoch starts.
# Keep this value in sync with the num_epochs passed to model.fit.
num_epochs = 2
steps_per_epoch = len(tf_train_dataset)
num_train_steps = steps_per_epoch * num_epochs
print(f'Number of Train Steps {num_train_steps}')

Number of Train Steps 4157


In [None]:
# Build the optimizer together with its learning-rate schedule: 500 warm-up
# steps up to 2e-5, scheduled over num_train_steps, with weight decay 0.01.
# `schedule` is returned alongside the optimizer but is not used again in this notebook.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=500,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss argument is passed to compile().
# NOTE(review): presumably the model then falls back to its own internal
# masked-LM loss - confirm against the transformers documentation.
model.compile(optimizer=optimizer)

In [None]:
# Switch the global Keras dtype policy to mixed float16.
# NOTE(review): this runs after the model has already been loaded and compiled
# in earlier cells. Keras applies the policy when layers/optimizers are
# constructed, so verify this call actually affects the existing model;
# it may need to move before the model is created - TODO confirm.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Number of passes over the training set handed to model.fit below.
num_epochs = 2

In [None]:
# Early-stopping callback passed to model.fit.
# NOTE(review): patience=3 is larger than the 2 epochs configured in
# num_epochs, so this callback cannot actually stop the run early; lower the
# patience or raise the epoch count if early stopping is intended.
callback = tf.keras.callbacks.EarlyStopping(patience=3)

In [None]:
# 16629/16629 [==============================] - 1425s 86ms/step - loss: 1.5697 - val_loss: 1.4889
# 16629/16629 [==============================] - 1454s 87ms/step - loss: 1.4953 - val_loss: 1.4240

In [None]:
# Fine-tune the model, validating on the held-out split after each epoch;
# the returned History object is kept in `history`.
history = model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=[callback])

Epoch 1/2




Epoch 2/2


In [None]:
# Output directory for the fine-tuned weights.
# Note: this is the same path as model_checkpoint, so saving here overwrites
# the checkpoint this notebook loaded the model from.
model_dir = './drive/MyDrive/Models/osho_masked_lm_fine_tuned_roberta'

In [None]:
from google.colab import drive

In [None]:
# Drive was already mounted near the top of the notebook; this repeat call
# only matters if the session was restarted in between.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Persist the fine-tuned model to Google Drive.
model.save_pretrained(model_dir)

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Interactive Hugging Face Hub login; needed before the push_to_hub calls below.
notebook_login()

In [None]:
# Upload the fine-tuned model to the Hub repository.
model.push_to_hub('DhruvDancingBuddha/osho_discourses_roberta_128')

In [None]:
# Upload the tokenizer to the same Hub repository so the model can be loaded
# together with its tokenizer from one place.
tokenizer.push_to_hub('DhruvDancingBuddha/osho_discourses_roberta_128')