In [12]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

In [1]:
import pandas as pd

In [13]:
sentences_df = pd.read_fwf("antistereotype_sentences.txt", [(0,1024)])

  sentences_df = pd.read_fwf("antistereotype_sentences.txt", [(0,1024)])


In [14]:
sentences_df.iloc[2]["sentence"]

'The schoolgirl is walking down the street. She does not have to worry about child predators.'

In [18]:
class Sentences(Dataset):  
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2-medium", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.sentences = []

        for row in sentences_df['sentence']:
          self.sentences.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))               
        if truncate:
            self.sentences = self.sentences[:20000]
        self.sentences_count = len(self.sentences)
        
    def __len__(self):
        return self.sentences_count

    def __getitem__(self, item):
        return self.sentences[item]
    
dataset = Sentences(sentences_df["sentence"], truncate=True, gpt2_type="gpt2-medium")      

In [19]:
#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [20]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [21]:
model = train(dataset, model, tokenizer)



Training epoch 0
0


2123it [01:22, 25.60it/s]


Training epoch 1
tensor(1.2730, device='cuda:0', grad_fn=<NllLossBackward0>)


2123it [01:22, 25.80it/s]


Training epoch 2
tensor(0.4523, device='cuda:0', grad_fn=<NllLossBackward0>)


2123it [01:22, 25.81it/s]


Training epoch 3
tensor(0.3204, device='cuda:0', grad_fn=<NllLossBackward0>)


2123it [01:22, 25.80it/s]


Training epoch 4
tensor(0.2242, device='cuda:0', grad_fn=<NllLossBackward0>)


2123it [01:22, 25.80it/s]


In [22]:
model.save_pretrained("fine_tuned")