# Tutorial

## Import the necessary Packages

In [1]:
# used in the previous experiments
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

# for training
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

# for experiment tracking
import wandb


# common packages
import pandas as pd
from pprint import pprint
import json
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# ds_full = load_dataset('bookcorpus', split='all')
# pprint(ds_full)

## Distribution of the length of the samples

In [3]:
# # Calculate the length of each sample (number of words per sample)
# sample_lengths = [len(text.split()) for text in ds_full['text']]
# sample_lengths.sort()

In [4]:
# bins = np.unique(sample_lengths)

In [5]:
# # Plot the distribution
# import random
# plt.figure(figsize=(10, 4))
# plt.hist(random.sample(sample_lengths, k=10**6), bins=bins[0:150])
# plt.xlabel('Sequence length')
# plt.ylabel('Count')
# plt.grid()
# plt.show()

## Tokenization from the previous week

In [6]:
# # hopper_tokenizer = AutoTokenizer.from_pretrained('../week-2/hopper')
# # hopper_tokenizer
# auto_loaded_tokenizer = AutoTokenizer.from_pretrained(
#     "../week-2/awesome_tokenizer", 
#     local_files_only=True
# )

In [7]:
# hopper_tokenizer = AutoTokenizer.from_pretrained(
#     "../week-2/awesome_tokenizer", 
#     local_files_only=True
# )
# hopper_tokenizer

## Apply for batch samples

In [8]:
# bs =4 # batch_size
# model_inputs = hopper_tokenizer(ds_full[0:bs]['text'], padding=True)
# pprint(model_inputs['input_ids'], compact=True)

## Instead of the hopper tokenizer, use gpt2

In [9]:
# Load the pretrained GPT-2 tokenizer and print its settings
# (vocabulary size, maximum length, padding side, special tokens).
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})


> The GPT-2 tokenizer does not define a padding token (see `special_tokens` above), so padding a batch would raise an error in HF. We therefore assign one; because `padding_side` is `right`, the pad tokens are appended after the text.

In [10]:
# GPT-2 ships without a pad token; reuse its end-of-text (EOS) token so batches
# can be padded. Reading it from the tokenizer avoids hard-coding the string.
tokenizer.pad_token = tokenizer.eos_token

## Map Function

### Custom mapping function

> Which takes a batch of samples and returns `input_ids` and `attention_mask` such that the length of `input_ids` is 1024 for all samples.

### Apply Encoding

> Define a mapping function that takes a batch of samples and returns `input_ids` and `attention_mask` such that the length of `input_ids` is 1024 for all samples.

In [11]:
# Re-inspect the tokenizer: `pad_token` should now be listed among the special tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'})

In [12]:
# def cust_func(batch_size):
#   return tokenizer(
#     batch_size['text'],
#     padding = 'max_length',
#     truncation = True,
#     max_length = 1024,
#     return_attention_mask=True
#       )

## Apply the mapping to your dataset

In [13]:
# tokenized_ds = ds_full.map(
#   cust_func,
#   batched=True,
#   remove_columns=ds_full.column_names # removes original text
# )

In [14]:
# tokenized_ds.save_to_disk('data/BC_Chunked')

## Access the saved dataset

In [15]:
# Reload the tokenized dataset that was previously written with `save_to_disk`,
# so the expensive tokenization `map` does not have to be repeated.
ds_chunked = load_from_disk("data/BC_Chunked")

In [16]:
# Show the dataset schema (feature names) and its row count.
print(ds_chunked)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 74004228
})


## Data Loader

In [17]:
# Hold out 0.7% of the rows as a test split; the fixed seed makes the
# split reproducible across runs.
ds_split = ds_chunked.train_test_split(
    test_size=0.007,
    seed=42,
)
ds_split

Loading cached split indices for dataset at /home/sachin/projects/DLP/deep-learning-practices/week-3/data/BC_Chunked/cache-972c6039c69922a1.arrow and /home/sachin/projects/DLP/deep-learning-practices/week-3/data/BC_Chunked/cache-71041fb06f640bba.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 73486198
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 518030
    })
})

In [18]:
# Collator for causal language modeling (mlm=False): it adds a `labels` tensor
# that mirrors `input_ids`, with padded positions set to -100 (see the batch
# printed further down).
# NOTE(review): because the pad token is the same id as EOS, genuine EOS tokens
# would presumably be masked out of the loss as well -- confirm this is acceptable.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [19]:
from torch.utils.data import DataLoader

In [20]:
# Batch the training split four rows at a time; the LM collator turns each
# group of rows into `input_ids` / `attention_mask` / `labels` tensors.
dataloader = DataLoader(
    ds_split["train"],
    batch_size=4,
    collate_fn=data_collator,
)

In [21]:
# Sanity check: pull a single collated batch, pretty-print it, and stop.
for batch in dataloader:
  pprint(batch)
  break  # only the first batch is needed for inspection

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[15506,  2513,   494,  ..., 50256, 50256, 50256],
        [36151,   764, 50256,  ..., 50256, 50256, 50256],
        [ 1820, 22662,   275,  ..., 50256, 50256, 50256],
        [   71,  3020,   837,  ..., 50256, 50256, 50256]]),
 'labels': tensor([[15506,  2513,   494,  ...,  -100,  -100,  -100],
        [36151,   764,  -100,  ...,  -100,  -100,  -100],
        [ 1820, 22662,   275,  ...,  -100,  -100,  -100],
        [   71,  3020,   837,  ...,  -100,  -100,  -100]])}


## Initialize the Model

In [22]:
# Build a default GPT-2 configuration locally. No checkpoint name is passed, so
# nothing is fetched from the HF hub; the printed values are the class defaults.
configuration = GPT2Config()
pprint(configuration)

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.25.1",
  "use_cache": true,
  "vocab_size": 50257
}



## Build the Model

In [23]:
# Instantiate GPT-2 with a language-modeling head from the configuration alone
# (no `from_pretrained`), then display the module tree.
model = GPT2LMHeadModel(config=configuration)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

## Count the number of parameters

In [24]:
# Add up the element count of every parameter tensor and report it in millions.
num_parameters = sum(tensor.numel() for tensor in model.parameters())
print(f'Number of Parameters: {num_parameters/10**6:.2f}M')

Number of Parameters: 124.44M


## Train the Model

In [25]:
# !wandb login

In [26]:
# wandb for experiment tracking.
# The `config` dict is run metadata only: it does not drive training, so it must
# be kept in sync by hand with the values passed to TrainingArguments.
wandb.init(
  project='my-awesome-project',
  config={
    # Matches per_device_train_batch_size; the training log reports a total
    # train batch size of 4 (single device, no gradient accumulation).
    "batch_size":4,
    "dataset":"Bookcorpus-74M",
  },
)

[34m[1mwandb[0m: Currently logged in as: [33m21f2000143[0m ([33m21f2000143-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [27]:
# Hyperparameters for a one-epoch GPT-2 training run, logged to Weights & Biases.
# NOTE(review): the test split has 518,030 rows, so at per_device_eval_batch_size=4
# one evaluation pass is roughly 129.5k batches, and a pass is triggered every
# 500 training steps. Consider a much larger `eval_steps` or a small eval subset.
training_args = TrainingArguments(
    # --- checkpoints ---
    output_dir="./results",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # --- schedule and batch sizes ---
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    # --- evaluation and logging ---
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    report_to="wandb",
    run_name="gpt2-bookcorpus",
)

In [28]:
# Tie together the model, the training arguments, the train/test splits and the
# causal-LM collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_split["train"],
    eval_dataset=ds_split["test"],
)

Using cuda_amp half precision backend


In [None]:
# Start training. With 73,486,198 examples at batch size 4 the log below reports
# 18,371,550 optimization steps for the single epoch, so this is a long-running cell.
trainer.train()

***** Running training *****
  Num examples = 73486198
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 18371550
  Number of trainable parameters = 124439808
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
  Num examples = 73486198
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 18371550
  Number of trainable parameters = 124439808
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
