In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, set_seed, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
import random
import os

# Fix the global random seed through transformers.set_seed so that repeated
# runs of the notebook start from the same RNG state.
set_seed(42)

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"
print(torch_device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


# Demo

## Nucleus sampling + Top-K sampling + num sequences = 5

In [2]:
# Load the stock distilgpt2 tokenizer and the fine-tuned checkpoint.
# The EOS token id is passed as PAD token id to avoid warnings from generate().
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("../output/gpt2/final", pad_token_id=tokenizer.eos_token_id).to(torch_device)

# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# Build the prompt the generation is conditioned on: the comment text followed
# by the reply separator, i.e. the "<comment> <|reply|>" prefix that
# insert_tags produces in the Training section.
sep_token = "<|reply|>"
input_text = "Oh nice! I got around this (kind of) by being a bio major with a minor in neuro research. My school had 2 neruo programs - bio based and psych based. My minor was in psych based neuro but I took my bio electives as bio based neuro courses. Still had to do calc 1, calc 2, chem 1, chem 2, orgo 1, orgo 2, physics 1, and physics 2. But I DIDNT have to take intro to pharma kinetics, inorganic chemistry and a few other higher level chem classes. I did this bc the psych based neuro courses had almost no bio and I love bio. But math and chem are my kryptonite."
model_inputs = tokenizer([" ".join([input_text, sep_token])], return_tensors='pt').to(torch_device)

# Sample with top_k = 50, top_p = 0.95, temperature = 0.8 and num_return_sequences = 5.
# early_stopping is deliberately not passed: it only controls beam search and
# has no effect on pure sampling.
sample_outputs = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
    num_return_sequences=5,
)

# For a decoder-only model generate() returns prompt + continuation, so the
# prompt tokens are sliced off by position. Searching the decoded text for the
# separator instead would pick the wrong segment whenever the prompt itself
# contains "<|reply|>".
prompt_length = model_inputs["input_ids"].shape[1]

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    reply = tokenizer.decode(sample_output[prompt_length:], skip_special_tokens=True)
    # Keep only the first line of the reply and trim surrounding whitespace.
    text = reply.split('\n')[0].strip()
    print(f"{i}: {text}\n")





Output:
----------------------------------------------------------------------------------------------------
0: This is my favorite part of the bio class.  

1: &gt; My minor was in psych based neuro but I took my bio electives as bio based neuro courses.

2: I have no idea what you're talking about. I'm interested to know. 

3: This is what I think! 

4: How did your bio pass?  I also knew that I had to pay a lot of money, even though I was very good at a math course. I know that was a good one to pay



# Training

## Preprocess

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 2    # per-device batch size used for both training and evaluation
MAX_LENGTH = 512  # maximum number of tokens per example handed to the tokenizer
EPOCHS = 2        # number of training epochs

# Marker strings used when building the training texts.
# NOTE(review): bos_token is not referenced by any cell visible in this notebook.
# NOTE(review): PAD and EOS share the same string; DataCollatorForLanguageModeling
# excludes PAD positions from the loss, which presumably also excludes the
# closing EOS of every example - confirm whether a dedicated PAD token is wanted.
bos_token='<|startoftext|>'
eos_token='<|endoftext|>'
pad_token='<|endoftext|>'
sep_token='<|reply|>'

# return_tensors is an argument of the tokenizer *call*, not of from_pretrained,
# so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", eos_token=eos_token, pad_token=pad_token)

# Tokenizer function for later mapping.
def tokenize_function(examples):
    """Tokenize the "text" entry of a batch; intended for ``Dataset.map(batched=True)``.

    Every sequence is truncated or padded to exactly ``MAX_LENGTH`` tokens.
    ``padding=True`` would pad only to the longest sequence of the current map
    batch, so rows coming from different map batches could end up with
    different lengths. Fixed-length rows keep every column (including a copied
    ``labels`` column) stackable when examples are later batched together.

    Args:
        examples: batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)

def insert_tags(pair):
    """Wrap a (comment, reply) pair in the marker strings used for training.

    Args:
        pair: indexable whose items 0 and 1 are the comment and the reply strings.

    Returns:
        A single string laid out as
        ``eos_token comment sep_token reply eos_token``, joined by single spaces.
    """
    comment = pair[0]
    reply = pair[1]
    pieces = [eos_token, comment, sep_token, reply, eos_token]
    return " ".join(pieces)

# Create dataset as a DatasetDict object
DATA_DIR = '../data/raw/'
# Read only CSV files, in sorted order: os.listdir returns entries in arbitrary
# order and may also list non-CSV entries (hidden files, sub-directories),
# which would make the row order irreproducible or crash read_csv.
filenames = sorted(name for name in os.listdir(DATA_DIR) if name.lower().endswith('.csv'))
if not filenames:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR!r}")
# Each file stores its original index in the 'Unnamed: 0' column.
dfs = [pd.read_csv(os.path.join(DATA_DIR, name), index_col='Unnamed: 0') for name in filenames]
df = pd.concat(dfs)
def create_dataset(df, train_percentage=0.9, validation_percentage=0.07):
    """Build shuffled train/validation/test splits of tagged comment-reply texts.

    Args:
        df: DataFrame with 'comment' and 'reply' columns.
        train_percentage: fraction of the texts placed in the 'train' split.
        validation_percentage: fraction placed in the 'validation' split.
            Whatever remains (3% with the defaults) becomes the 'test' split.

    Returns:
        DatasetDict whose splits each hold a single 'text' column. 'train' and
        'validation' are always present; 'test' is added only when non-empty.

    Raises:
        ValueError: if a fraction is negative or the two fractions exceed 1.
    """
    if (
        train_percentage < 0
        or validation_percentage < 0
        or train_percentage + validation_percentage > 1 + 1e-9
    ):
        raise ValueError(
            "train_percentage and validation_percentage must be non-negative "
            "and sum to at most 1"
        )

    # Drop pairs with a missing side; str() would otherwise turn NaN into the
    # literal text "nan" and feed it to the model as a comment or reply.
    pairs = df[['comment', 'reply']].dropna()
    comments = pairs['comment'].apply(str).to_list()
    replies = pairs['reply'].apply(str).to_list()
    texts = [insert_tags(pair) for pair in zip(comments, replies)]

    # Shuffles with the global `random` state, so the order is repeatable only
    # when that state has been seeded beforehand.
    random.shuffle(texts)
    texts_size = len(texts)
    train_end = int(train_percentage * texts_size)
    validation_end = min(train_end + int(validation_percentage * texts_size), texts_size)

    texts_train = texts[:train_end]
    texts_validation = texts[train_end:validation_end]
    texts_test = texts[validation_end:]

    dataset = dict()
    dataset['train'] = Dataset.from_dict({'text': texts_train})
    dataset['validation'] = Dataset.from_dict({'text': texts_validation})
    if texts_test:
        # Held-out remainder that is neither trained nor validated on.
        dataset['test'] = Dataset.from_dict({'text': texts_test})
    datasets = DatasetDict(dataset)
    return datasets

def group_texts(examples):
    """Add a 'labels' entry that refers to the batch's 'input_ids'.

    Args:
        examples: mutable batch mapping containing an 'input_ids' entry.

    Returns:
        The same mapping object, now also carrying 'labels'.
    """
    token_ids = examples['input_ids']
    examples['labels'] = token_ids
    return examples

# Build the raw text splits from the concatenated DataFrame.
dataset = create_dataset(df)

# Encode every split with tokenize_function and drop the raw "text" column.
tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])

# Expose input_ids / attention_mask in "pt" format while still returning the other columns.
tokenized_dataset.set_format(
    "pt",
    columns=['input_ids', 'attention_mask'],
    output_all_columns=True,
)

# Add the "labels" column through group_texts.
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=1)

# Same formatting as above, now including the freshly added labels.
lm_dataset.set_format(
    "pt",
    columns=['input_ids', 'attention_mask', 'labels'],
    output_all_columns=True,
)

# Data Collator pads the inputs for Causal Language Modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Sanity check: turn the labels of the second training example back into text.
sample_labels = lm_dataset['train'][1]['labels']
tokenizer.decode(sample_labels)

## Training

In [None]:
# Start from the pretrained distilgpt2 weights.
model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(torch_device)

training_args = TrainingArguments(
    output_dir="../output/gpt2",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_steps=10000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    data_collator=data_collator,
)

trainer.train()

# Persist the fine-tuned model to the directory the Demo section loads from.
# Nothing else in this notebook writes "../output/gpt2/final", so the Demo cell
# would otherwise fail on a fresh run.
trainer.save_model("../output/gpt2/final")