In [34]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [35]:
MODEL_NAME = 'distilgpt2'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))

loading file https://huggingface.co/distilgpt2/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/distilgpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/distilgpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilgpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6

Embedding(50259, 768)

In [36]:
sentences = pd.read_csv('/kaggle/input/holmes/holmes.csv')['sentence']
sentences.head()


0       ﻿To Sherlock Holmes she is always _the_ woman.
1    I have seldom heard him mention her under any ...
2    In his eyes she eclipses and predominates the ...
3    It was not that he felt any emotion akin to lo...
4    All emotions, and that one particularly, were ...
Name: sentence, dtype: object

In [37]:
max_length = max([len(tokenizer.encode(sentence)) for sentence in sentences])

In [38]:
max_length

125

In [39]:
class HolmesDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [40]:
dataset = HolmesDataset(sentences, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [41]:
train_dataset[0]

(tensor([50257,  1544, 28271,   465, 12450,    13, 50256, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tenso

In [42]:
import gc
gc.collect()

128

In [43]:
training_args = TrainingArguments(output_dir='/Users/brian/Documents/College/NLP', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1, learning_rate=7e-5,
                                  warmup_steps=10, weight_decay=0.05, report_to = 'none')


PyTorch: setting up devices


In [44]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 5463
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5463


Step,Training Loss
100,1.9602
200,0.7601
300,0.7368
400,0.8022
500,0.7398
600,0.8427
700,0.6952
800,0.6421
900,0.6705
1000,0.6743


Saving model checkpoint to /Users/brian/Documents/College/NLP/checkpoint-5000
Configuration saved in /Users/brian/Documents/College/NLP/checkpoint-5000/config.json
Model weights saved in /Users/brian/Documents/College/NLP/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5463, training_loss=0.7113451021136151, metrics={'train_runtime': 212.7394, 'train_samples_per_second': 25.679, 'train_steps_per_second': 25.679, 'total_flos': 174250994688000.0, 'train_loss': 0.7113451021136151, 'epoch': 1.0})

In [45]:
model.save_pretrained("/Users/brian/Documents/College/NLP")

Configuration saved in /Users/brian/Documents/College/NLP/config.json
Model weights saved in /Users/brian/Documents/College/NLP/pytorch_model.bin


In [46]:
tokenizer.save_pretrained("/Users/brian/Documents/College/NLP")

tokenizer config file saved in /Users/brian/Documents/College/NLP/tokenizer_config.json
Special tokens file saved in /Users/brian/Documents/College/NLP/special_tokens_map.json
added tokens file saved in /Users/brian/Documents/College/NLP/added_tokens.json


('/Users/brian/Documents/College/NLP/tokenizer_config.json',
 '/Users/brian/Documents/College/NLP/special_tokens_map.json',
 '/Users/brian/Documents/College/NLP/vocab.json',
 '/Users/brian/Documents/College/NLP/merges.txt',
 '/Users/brian/Documents/College/NLP/added_tokens.json')

In [47]:
tokenizer = GPT2Tokenizer.from_pretrained("/Users/brian/Documents/College/NLP")
model = GPT2LMHeadModel.from_pretrained("/Users/brian/Documents/College/NLP")

loading file /Users/brian/Documents/College/NLP/vocab.json
loading file /Users/brian/Documents/College/NLP/merges.txt
loading file /Users/brian/Documents/College/NLP/added_tokens.json
loading file /Users/brian/Documents/College/NLP/special_tokens_map.json
loading file /Users/brian/Documents/College/NLP/tokenizer_config.json
Adding <|startoftext|> to the vocabulary
Adding <|pad|> to the vocabulary
loading configuration file /Users/brian/Documents/College/NLP/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,


In [49]:
generated = tokenizer("<|startoftext|> He hasn't seen the old man since", return_tensors="pt").input_ids
sample_outputs = model.generate(generated, do_sample=True, top_k=45, max_length=40, top_p=0.7, temperature=2.45, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  He hasn't seen the old man since, no better; no better of you when my son walks.
1:  He hasn't seen the old man since the age of one! He would be as old and as new with a beard and a hat which is a man, when old age dies.
2:  He hasn't seen the old man since a moment ago.
3:  He hasn't seen the old man since, with his glasses at his chin.
4:  He hasn't seen the old man since when his father passed away but now it has gone on forever to give birth—for if he does come from Winchester and visit her every time of the future
5:  He hasn't seen the old man since it arrived to him; for all that I had known of him before I saw him when the latter entered, it would appear to me that there are some
6:  He hasn't seen the old man since I first saw it from where he is?’ He is one of seven or eight times the kind of father in this family that goes out all over
7:  He hasn't seen the old man since.
8:  He hasn't seen the old man since last morning.
9:  He hasn't seen the old man since he left th