In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

In [3]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2TokenizerFast
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline
from pathlib import Path
import os

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
class SonnetData:
    def __init__(self, data_path):
        self.sonnets = []
        self.path = data_path
        for file_name in os.listdir(data_path):
            with open(data_path / file_name, 'rt') as f:
              text = ''.join(f.readlines())
              self.sonnets.append(text)
        
    def tokenize(self, tokenizer):
      self.sonnets = tokenizer(self.sonnets, truncation=True,padding=True)

    def group_texts(self,block_size=16):
      concatenated_examples = {k: sum(self.sonnets[k], []) for k in self.sonnets.keys()}
      total_length = len(concatenated_examples[list(self.sonnets.keys())[0]])
      self.sonnets = {
          k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
          for k, t in concatenated_examples.items()
      }
      self.sonnets['labels'] = self.sonnets['input_ids'].copy()
      if len(self.sonnets['input_ids'][-1])!=len(self.sonnets['input_ids'][0]):
        for x in self.sonnets:
          self.sonnets[x].pop()
    def __len__(self):
        return len(self.sonnets['input_ids'])

    def __getitem__(self, ind):
        return self.sonnets['input_ids'][ind]

In [6]:
def process(data,tokenizer):
  data.tokenize(tokenizer)
  data.group_texts()
  for key in data.sonnets:
    # print(key,data.sonnets[key])
    data.sonnets[key] = torch.tensor(data.sonnets[key])

In [7]:

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


number_of_sonnets = 153
data_path = Path('/content/drive/MyDrive/JB/P-tuning/data/')
train = SonnetData(data_path / 'train')
test = SonnetData(data_path / 'test')
validate = SonnetData(data_path / 'validate')

process(train,tokenizer)
process(test,tokenizer)
process(validate,tokenizer)

In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
)

trainer.train()

***** Running training *****
  Num examples = 1441
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 543


Epoch,Training Loss,Validation Loss
1,No log,4.989789
2,No log,4.892489
3,4.972800,4.882153


***** Running Evaluation *****
  Num examples = 169
  Batch size = 8
***** Running Evaluation *****
  Num examples = 169
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 169
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=543, training_loss=4.945821547815593, metrics={'train_runtime': 38.596, 'train_samples_per_second': 112.007, 'train_steps_per_second': 14.069, 'total_flos': 17649778950144.0, 'train_loss': 4.945821547815593, 'epoch': 3.0})

In [16]:
model.to("cpu")
tuned_model_pipeline = pipeline("text-generation",model=model,tokenizer=tokenizer)
tuned_model_pipeline("How oft, when thou, my music, music play'st Upon that blessed")[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"How oft, when thou, my music, music play'st Upon that blessed truth,\nThe beauty of thy sweet love:\nMy most love, in my love's life, shall be given\nOf the sweet love. So thou art when"

In [None]:
original_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
original_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
original_model_pipeline = pipeline("text-generation",model=original_model,tokenizer=original_tokenizer)


In [15]:
original_model_pipeline("How oft, when thou, my music, music play'st Upon that blessed")[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'How oft, when thou, my music, music play\'st Upon that blessed me, how do we know to that song?"\n\n\n\n\n\nâ€”\n\n\nWe could have made something of it and now it is. There are many'