# Imports

In [1]:
%%capture
!pip install transformers
!pip install datasets
!pip install wandb

In [2]:
# Import required modules from transformers and datasets libraries
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, Trainer
from datasets import load_dataset, Dataset

In [3]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Optional: log both gradients and parameters
%env WANDB_WATCH=all

env: WANDB_WATCH=all


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Dataset, Tokenizer, and base GPT2 model from HF Hub

In [6]:
dataset = load_dataset('aegrif/CIS6930_DAAGR_Empathetic_Dialogues')

Downloading readme:   0%|          | 0.00/750 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/aegrif___parquet/aegrif--CIS6930_DAAGR_Empathetic_Dialogues-3358e2c61020f15c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/84167 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10973 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12077 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/aegrif___parquet/aegrif--CIS6930_DAAGR_Empathetic_Dialogues-3358e2c61020f15c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("aegrif/CIS6930_DAAGR_GPT2_TrainedTokenizer")

Downloading (…)okenizer_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [8]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Variables

In [28]:
# Output directory for the fine-tuned model
output_dir = '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_no_emo' ## specific to Amanda's drive location

In [10]:
# Set the device for training (CUDA for GPU, CPU otherwise)
device = 'cuda'

In [11]:
# Training parameters
number_of_samples_per_epoch = 64636
number_of_epochs = 50
batch_size = 48
warmup_ratio = 0.4
weight_decay = 0.5
steps = 200
learning_rate = 7e-5

max_steps = ((number_of_samples_per_epoch * number_of_epochs) // batch_size)

In [12]:
#Define custom tokens to indicate specific types of data
# These tokens will be inserted into the data as markers to facilitate processing and analysis
u_token = "<|user|>"
b_token = "<|bot|>"

# Process data for training

## Format dataset

In [13]:
# Function to format the dataset according to the desired input format
def format_dataset(example):
    formatted_example = {
        'text': f"{u_token}{example['previous_utterance']} {b_token}{example['utterance']}"
    }
    return formatted_example

In [14]:
formatted_dataset = {}
# Apply the formatting function to all splits of the dataset
for split in dataset.keys():
    formatted_dataset[split] = dataset[split].map(format_dataset)

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

Map:   0%|          | 0/10973 [00:00<?, ? examples/s]

Map:   0%|          | 0/12077 [00:00<?, ? examples/s]

In [15]:
print(formatted_dataset['train']['text'][29704])

<|user|>thank you! i have tried..it is been a very traumatic year in so many ways..and i did not think it could get any worse a few years ago..wrong! i hear you with pain! please try every treatment you can. diet plays a huge role also chiropractor, nuerofeedback, different herbs! i found eating right, exercising, herbs, vitamins, organic food helped with my pain caused by doctors i had tons of neuropathy, pain and fibro like symptoms in addition to the trauma of deaths, abuse etc. i hope you feel better and you inspire me right now that i do not have it so bad!!! <|bot|>that is exactly what i have been going through along with horrible doctors that did nothing for me. i just went high nutrient vegan months ago and am improving my leaps and bounds. i follow dr. joel furhman. you can keep going and your journey is giving you compassion for others that are going through horrible things. take heart and i hope you feel better.


In [16]:
# Define the filtering function
def filter_function(example):
    return example['utterance_idx'] != 1

# Loop through the splits in the dataset and filter them
filtered_dataset = {}
for split in formatted_dataset.keys():
    filtered_dataset[split] = formatted_dataset[split].filter(filter_function)

# Print the number of rows in each filtered split
for split in filtered_dataset:
    print(f"Number of rows in {split}: {len(filtered_dataset[split])}")

Filter:   0%|          | 0/84167 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10973 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12077 [00:00<?, ? examples/s]

Number of rows in train: 64636
Number of rows in test: 8426
Number of rows in validation: 9308


## Tokenize dataset

### Set custom tokens

In [17]:
# Set custom tokens for the tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [18]:
special_tokens = {'additional_special_tokens': [u_token, b_token]}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [19]:
# Function to tokenize the dataset examples
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False)
# Tokenize all splits of the formatted dataset
tokenized_datasets = {split: filtered_dataset[split].map(tokenize_function, batched=True) for split in formatted_dataset.keys()}

Map:   0%|          | 0/64636 [00:00<?, ? examples/s]

Map:   0%|          | 0/8426 [00:00<?, ? examples/s]

Map:   0%|          | 0/9308 [00:00<?, ? examples/s]

In [20]:
print(tokenized_datasets)

{'train': Dataset({
    features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance', 'text', 'input_ids', 'attention_mask'],
    num_rows: 64636
}), 'test': Dataset({
    features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance', 'text', 'input_ids', 'attention_mask'],
    num_rows: 8426
}), 'validation': Dataset({
    features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance', 'text', 'input_ids', 'attention_mask'],
    num_rows: 9308
})}


In [21]:
# Loop over the train, validation, and test splits of the dataset
for split in ['train', 'validation', 'test']:
  # Remove unwanted columns from the split
  tokenized_datasets[split]= tokenized_datasets[split].remove_columns(['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'previous_utterance', 'new_context', 'text'])

In [22]:
print(tokenized_datasets)

{'train': Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 64636
}), 'test': Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 8426
}), 'validation': Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 9308
})}


In [23]:
# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Finetune

In [24]:
import torch
torch.backends.cuda.matmul.allow_tf32 = True

In [25]:
from transformers import EarlyStoppingCallback
early_stop = EarlyStoppingCallback(10)

In [26]:
# Set up the training arguments

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    warmup_ratio=warmup_ratio,
    max_steps=max_steps,
    num_train_epochs=number_of_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps = steps,
    save_strategy="steps",
    save_steps= steps,
    logging_dir="./logs",
    logging_steps=steps,
    load_best_model_at_end=True,
    tf32=True,
    report_to="wandb",
    metric_for_best_model = "eval_loss",
    learning_rate=learning_rate
)

In [27]:
from transformers import EarlyStoppingCallback
early_stop = EarlyStoppingCallback(10)

In [31]:
# Initialize the Trainer with the model, training arguments, datasets, data collator, and optimizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    callbacks=[early_stop],
)
# Train the model
trainer.train()



[34m[1mwandb[0m: Currently logged in as: [33mamandaegriffith[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,87.8241,77.998856
400,25.3221,5.599696
600,5.0378,4.364439
800,3.988,3.727403
1000,3.654,3.414086
1200,3.4943,3.31461
1400,3.4151,3.250769
1600,3.3515,3.210174
1800,3.3067,3.183418
2000,3.2723,3.155888


TrainOutput(global_step=12600, training_loss=4.716546945723277, metrics={'train_runtime': 2139.6802, 'train_samples_per_second': 1510.409, 'train_steps_per_second': 31.467, 'total_flos': 2.5804378418688e+16, 'train_loss': 4.716546945723277, 'epoch': 9.35})

# Preview Output

In [32]:
print(trainer.evaluate())

{'eval_loss': 2.9702579975128174, 'eval_runtime': 5.9104, 'eval_samples_per_second': 1574.857, 'eval_steps_per_second': 32.824, 'epoch': 9.35}


In [34]:
loaded_model = GPT2LMHeadModel.from_pretrained(output_dir + '/checkpoint-10600')

In [35]:
print(dataset['train'][0])

{'conv_id': 'hit:0_conv:1', 'utterance_idx': 1, 'context': 'sentimental', 'prompt': 'i remember going to the fireworks with my best friend. there was a lot of people, but it only felt like us in the world.', 'utterance': 'i remember going to see the fireworks with my best friend. it was the first time we ever spent time alone together. although there was a lot of people, we felt like the only people in the world.', 'new_context': 'disappointed', 'previous_utterance': '<|start|>'}


In [36]:
import textwrap

def build_input_string(dataset, conv_id):
    filtered_data = [x for x in dataset if x['conv_id'] == conv_id]
    filtered_data.sort(key=lambda x: x['utterance_idx'])
    print(filtered_data)

    input_string = ""
    for idx, item in enumerate(filtered_data[:-1]):
        if idx % 2 == 0:
            input_string += f"{u_token}{item['utterance']}"
        else:
            input_string += f"{b_token}{item['utterance']} "

    input_string += b_token
    
    return input_string

conv_id = dataset['validation'][150]['conv_id']
input_string = build_input_string(dataset['validation'], conv_id)


[{'conv_id': 'hit:182_conv:365', 'utterance_idx': 1, 'context': 'nostalgic', 'prompt': 'i am now, so anytime that i go to a place that i did something when i was younger makes me whimsical and causes me to think about the past.', 'utterance': 'i love going to places that i explored and play at as a child. every time i go, i always feel super nostalgic.', 'new_context': 'disappointed', 'previous_utterance': '<|start|>'}, {'conv_id': 'hit:182_conv:365', 'utterance_idx': 2, 'context': 'nostalgic', 'prompt': 'i am now, so anytime that i go to a place that i did something when i was younger makes me whimsical and causes me to think about the past.', 'utterance': 'i am very nostalgic too, that must be a fun time!', 'new_context': 'disappointed', 'previous_utterance': 'i love going to places that i explored and play at as a child. every time i go, i always feel super nostalgic.'}, {'conv_id': 'hit:182_conv:365', 'utterance_idx': 3, 'context': 'nostalgic', 'prompt': 'i am now, so anytime that 

In [37]:
print(input_string)

<|user|>i love going to places that i explored and play at as a child. every time i go, i always feel super nostalgic.<|bot|>i am very nostalgic too, that must be a fun time! <|user|>yes, until you realize that it was years since you played there, and you start feeling old<|bot|>


In [38]:
input_tokens = tokenizer.encode(input_string, return_tensors="pt")
input_tokens = input_tokens.to(device)

# Set the pad token ID
tokenizer.pad_token_id = tokenizer.eos_token_id
print(len(input_tokens))
# Generate the output
output_tokens = model.generate(
    input_tokens,
    max_new_tokens = 50,
    num_return_sequences=1,
    attention_mask=input_tokens.ne(tokenizer.pad_token_id).long(),
    no_repeat_ngram_size = 2,
    temperature = .75,
    do_sample=True,
    early_stopping=True
)

# Decode the output tokens
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=False)

# Wrap the output text at 80 characters
wrapped_text = textwrap.fill(output_text, width=80)

print(wrapped_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1
<|user|>i love going to places that i explored and play at as a child. every
time i go, i always feel super nostalgic.<|bot|>i am very nostalgic too, that
must be a fun time! <|user|>yes, until you realize that it was years since you
played there, and you start feeling old<|bot|>you have to start looking back. i
wish it would work out that way. it really does not. we all have regrets, but we
just try not to stress out. that is what i am feeling. you should try to not
stress things out


In [None]:
print(output_tokens)

In [39]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [40]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [41]:
loaded_model.push_to_hub("aegrif/CIS6930_DAAGR_GPT2_NoEmo")
tokenizer.push_to_hub("aegrif/CIS6930_DAAGR_GPT2_NoEmo")

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/aegrif/CIS6930_DAAGR_GPT2_NoEmo/commit/9459cb36724b34701abbfb3ad35282107df37f0b', commit_message='Upload tokenizer', commit_description='', oid='9459cb36724b34701abbfb3ad35282107df37f0b', pr_url=None, pr_revision=None, pr_num=None)

## Test loading

In [42]:
model_2 = GPT2LMHeadModel.from_pretrained("aegrif/CIS6930_DAAGR_GPT2_NoEmo")

Downloading (…)lve/main/config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [43]:
input_tokens = tokenizer.encode(input_string, return_tensors="pt")
input_tokens = input_tokens.to(device)

# Set the pad token ID
tokenizer.pad_token_id = tokenizer.eos_token_id
print(len(input_tokens))
# Generate the output

model_2.to(device)
output_tokens = model_2.generate(
    input_tokens,
    max_new_tokens = 20,
    num_return_sequences=1,
    attention_mask=input_tokens.ne(tokenizer.pad_token_id).long(),
    no_repeat_ngram_size = 2,
    temperature = 0
)

# Decode the output tokens
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=False)

# Wrap the output text at 80 characters
wrapped_text = textwrap.fill(output_text, width=80)

print(wrapped_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1
<|user|>i love going to places that i explored and play at as a child. every
time i go, i always feel super nostalgic.<|bot|>i am very nostalgic too, that
must be a fun time! <|user|>yes, until you realize that it was years since you
played there, and you start feeling old<|bot|>yeah, it is a bit nostalgic, but i
am sure it will be worth it. i will
