In [4]:
#!pip install datasets

In [5]:
from datasets import Dataset
from pprint import pprint

from transformers import AutoTokenizer
from transformers import TFAutoModelForCausalLM
from transformers import create_optimizer, AdamWeightDecay
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline

import json
import psutil
import datasets

from datasets import load_dataset

import numpy as np

import os

from google.colab import drive

from transformers import create_optimizer
import tensorflow as tf

from huggingface_hub import notebook_login

In [7]:
# Authenticate with the Hugging Face Hub (interactive widget); the later
# push_to_hub calls in this notebook rely on this login.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# Base checkpoint that is fine-tuned below.
model_checkpoint = "distilgpt2"

In [39]:
# Load the TensorFlow causal-LM model and its matching tokenizer from the base checkpoint.
model = TFAutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so real tokens stay at the start of each sequence.
tokenizer.padding_side = "right"

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [40]:
def print_memory_used_by_process():
  """Print the resident set size (RSS) of the current Python process, in whole MB."""
  bytes_per_mb = 1024 * 1024
  current_process = psutil.Process(os.getpid())
  rss_mb = current_process.memory_info().rss / bytes_per_mb
  # Truncate (not round) to an integer number of megabytes for display.
  print(f'{int(rss_mb)} MB')

In [41]:
# Memory baseline: taken right after the model and tokenizer were loaded.
print_memory_used_by_process()

In [42]:
# Mount Google Drive at /content/drive (Colab only); the fine-tuned model is saved there later.
drive.mount('/content/drive')

In [43]:
# Memory after mounting Google Drive.
print_memory_used_by_process()

In [44]:
# Load the discourse corpus from the Hugging Face Hub (returns a DatasetDict with a 'train' split).
osho_dataset = load_dataset('DhruvDancingBuddha/osho_discourses')

In [45]:
# Show the raw dataset summary: splits, column names and row count.
osho_dataset

DatasetDict({
    train: Dataset({
        features: ['char_url', 'topic_name', 'topic_lesson_name', 'topic_lesson_url', 'all_txt'],
        num_rows: 1965
    })
})

In [46]:
# Memory after loading the dataset.
print_memory_used_by_process()

In [47]:
def tokenizer_osho(examples):
  """Tokenize the 'all_txt' column of a batch with the notebook-level `tokenizer`.

  No truncation or padding arguments are passed, so every text is encoded at
  its full length. The tokenizer's output for the batch is returned unchanged.
  """
  texts = examples['all_txt']
  return tokenizer(texts)

In [48]:
# Tokenize the 'train' split in batches and drop every original column, so only
# the tokenizer outputs remain. Note that this rebinds `osho_dataset` from a
# DatasetDict to a single Dataset, so the cell cannot be re-run on its own.
osho_dataset = osho_dataset['train'].map(
    tokenizer_osho,
    batched=True,
    remove_columns=['all_txt', 'char_url', 'topic_name', 'topic_lesson_name', 'topic_lesson_url'],
)

Map:   0%|          | 0/1965 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (11129 > 1024). Running this sequence through the model will result in indexing errors


In [49]:
# Show the tokenized dataset: only the tokenizer's output columns remain.
osho_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1965
})

In [50]:
# Memory after tokenization.
print_memory_used_by_process()

In [51]:
def chunk_text(examples, chunk_size=128):
  """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

  Every column of the batch (a mapping of column name -> list of token lists)
  is flattened into one long list and cut into consecutive pieces of
  `chunk_size` items. The trailing remainder that does not fill a whole chunk
  is discarded. A `labels` column is added as a copy of the chunked
  `input_ids` list.

  Args:
    examples: batch mapping; must contain an 'input_ids' column.
    chunk_size: number of tokens per chunk (defaults to 128, the value the
      notebook originally hard-coded).

  Returns:
    dict with the same columns as `examples`, each a list of `chunk_size`-long
    lists, plus 'labels'.

  Raises:
    ValueError: if `chunk_size` is not a positive integer.
  """
  from itertools import chain  # local import keeps this cell self-contained

  if chunk_size <= 0:
    raise ValueError(f"chunk_size must be positive, got {chunk_size}")

  column_names = list(examples.keys())

  # Flatten each column in linear time. The previous `sum(lists, [])` re-copied
  # the accumulated list on every addition, which is quadratic in batch size.
  flattened = {name: list(chain.from_iterable(examples[name])) for name in column_names}

  # The first column defines the stream length; round it down to a whole
  # number of chunks so the incomplete tail is dropped.
  stream_len = len(flattened[column_names[0]])
  usable_len = (stream_len // chunk_size) * chunk_size

  results = {
    name: [tokens[start:start + chunk_size] for start in range(0, usable_len, chunk_size)]
    for name, tokens in flattened.items()
  }

  # Labels are the inputs themselves (shallow copy of the outer list).
  results["labels"] = results["input_ids"].copy()

  return results

In [52]:
# Re-chunk the tokenized texts into fixed-size training examples. With batched=True,
# chunk_text receives groups of rows, and the tail of each group that does not fill a
# whole chunk is dropped. Rebinding `osho_dataset` makes this cell non-idempotent.
osho_dataset = osho_dataset.map(chunk_text, batched=True)

Map:   0%|          | 0/1965 [00:00<?, ? examples/s]

In [53]:
# Show the chunked dataset: each row is now one fixed-size chunk, with a 'labels' column added.
osho_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 138918
})

In [54]:
# Compute explicit row counts for a 90/10 train/test split.
total_len = len(osho_dataset)
train_len = int(0.9 * total_len)  # int() truncates, so any rounding remainder goes to the test set
test_len  = total_len - train_len  # guarantees train_len + test_len == total_len

print(f'Total Length is {total_len}\n\nTrain Len is    {train_len}\n\nTest Len is     {test_len}')

In [55]:
# Split into train/test using the explicit row counts; the fixed seed makes the split reproducible.
# After this cell `osho_dataset` is a DatasetDict again (keys 'train' and 'test').
osho_dataset = osho_dataset.train_test_split(train_size=train_len,test_size=test_len, seed=42)

In [56]:
# Show the resulting train/test splits and their row counts.
osho_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 125026
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13892
    })
})

In [57]:
# Memory after chunking and splitting the dataset.
print_memory_used_by_process()

In [58]:
# Batch collator for language modeling; mlm=False selects causal (not masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [59]:
# Training batches: reshuffled so each epoch sees the chunks in a different order.
tf_train_dataset = model.prepare_tf_dataset(
    osho_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32)

# Evaluation batches: shuffling adds nothing to validation, and in
# prepare_tf_dataset `drop_remainder` defaults to the value of `shuffle`, so
# shuffle=True would also silently drop the final partial batch. Use
# shuffle=False so every test example is evaluated, in a stable order.
tf_eval_dataset = model.prepare_tf_dataset(
    osho_dataset["test"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32)

In [60]:
# Memory after building the tf.data training and evaluation pipelines.
print_memory_used_by_process()

In [61]:
# Number of batches in one pass over the training data.
num_train_steps = len(tf_train_dataset)
print(f'Number of Train Steps {num_train_steps}')

# Must equal the `epochs` value passed to model.fit (2 in this notebook).
# The learning-rate schedule is sized from num_train_steps_1; with
# num_epochs = 1 the rate is fully decayed after the first epoch and the
# second epoch trains at the schedule's end value.
num_epochs = 2
num_train_steps_1 = num_epochs * num_train_steps
print(f'Number of Train Steps INTO EPOCHS {num_train_steps_1}')

In [62]:
# Build the optimizer and its learning-rate schedule: 1,000 warmup steps up to
# init_lr, with the schedule sized to num_train_steps_1 total steps, plus weight decay.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps_1,
    weight_decay_rate=0.01,
)

# No loss is passed; presumably the model falls back to its built-in
# language-modeling loss computed from `labels` — confirm.
model.compile(optimizer=optimizer)

In [63]:
# Request mixed-precision (float16 compute) training.
# NOTE(review): this runs after `model` was already created and compiled in
# earlier cells. Keras layers normally take their dtype policy at construction,
# so this call may not affect the model being trained — confirm, and if mixed
# precision is wanted, set the policy before the model is loaded.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [64]:
# Fine-tune and keep the Keras History object for inspection.
# NOTE(review): `epochs` here must equal the `num_epochs` used to size the
# learning-rate schedule; a larger value trains the extra epochs at a fully
# decayed learning rate.
history = model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

In [68]:
# Pretty-print the per-epoch metrics recorded by fit.
pprint(history.history)

In [70]:
# Output directory on Google Drive. The path is relative, so it only resolves under the
# mount point ('/content/drive') when the working directory is /content — confirm the cwd.
model_dir = './drive/MyDrive/Models/osho_causal_fine_tuned'

In [71]:
# Save the fine-tuned model to model_dir (only the model is saved here, not the tokenizer).
model.save_pretrained(model_dir)

In [72]:
# Hub repository id that the fine-tuned model and tokenizer are pushed to.
model_create_id = 'DhruvDancingBuddha/osho_discourses_distillgpt2_causal_llm'

In [73]:
# Upload the model weights and the tokenizer files to the same Hub repository
# (requires the Hub login performed at the top of the notebook).
model.push_to_hub(model_create_id)
tokenizer.push_to_hub(model_create_id)

tf_model.h5:   0%|          | 0.00/328M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DhruvDancingBuddha/osho_discourses_distillgpt2_causal_llm/commit/6c923e2ed0f66f9833fa5c763a3b53526653fa62', commit_message='Upload tokenizer', commit_description='', oid='6c923e2ed0f66f9833fa5c763a3b53526653fa62', pr_url=None, pr_revision=None, pr_num=None)

In [1]:
# Scratch cell. Its execution count is 1, so the kernel was restarted before this point;
# the inference cells below need the imports cell to be re-run first.
print('Dhruv')

Dhruv


In [6]:
# Fine-tuned model repository on the Hub (used to load the model for inference).
checkpoint = 'DhruvDancingBuddha/osho_discourses_distillgpt2_causal_llm'
# Base checkpoint (used below to load the tokenizer).
checkpoint_1 = "distilgpt2"

In [7]:
# Reload the fine-tuned weights from the Hub for inference.
model = TFAutoModelForCausalLM.from_pretrained(checkpoint)
# The tokenizer is taken from the base checkpoint rather than the fine-tuned repo.
# NOTE(review): the tokenizer was also pushed to `checkpoint` earlier in this notebook,
# so loading it from there would keep model and tokenizer in one place — confirm intent.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/328M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at DhruvDancingBuddha/osho_discourses_distillgpt2_causal_llm.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
# Prompt used to sample a continuation from the fine-tuned model.
test_sentence = "Mindfullness is all about"

In [12]:
# Encode the prompt as TensorFlow tensors.
tokenized = tokenizer(test_sentence, return_tensors="tf")
# Sample a continuation: top-k sampling over the 300 most likely tokens, no 3-gram
# may repeat, and max_length caps the sequence at 100 tokens. No seed is set, so the
# text differs on every run. pad_token_id is not passed, so the library falls back
# to eos_token_id (see the message in this cell's output).
outputs = model.generate(**tokenized, max_length=100, no_repeat_ngram_size=3, do_sample=True, top_k=300)
# Decode the first (and only) generated sequence back to text.
tokenizer.decode(outputs[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Mindfullness is all about awareness, so don’t jump on anything, but come along. Start smiling.Remember: there is no more of an aura when you are standing on the peak or standing on a hill peak. It cannot be more than that. It will remain just the same.A single moment, that is not so. It still is.Buddha had to stop one moment, then one repetition, then another repetition, and then another on another, and after another'