In [1]:
import os
import math
import pandas as pd
import numpy as np
from pprint import pprint
import torch
import matplotlib.pyplot as plt
import accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from pytorch_lightning import seed_everything
import pickle as pkl
from scipy.stats import spearmanr, pearsonr
# os.chdir("/content/drive/MyDrive/iitkgp-mtp-dialog-response-generation")
RANDOM_SEED = 50
seed_everything(RANDOM_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 50


In [2]:
with open("tc_all_prompts.pkl", "rb") as f1:
    tc_all_prompts = pkl.load(f1)

with open("tc_all_labels.pkl", "rb") as f1:
    tc_all_labels = pkl.load(f1)

with open("pc_all_prompts.pkl", "rb") as f1:
    pc_all_prompts = pkl.load(f1)

with open("pc_all_labels.pkl", "rb") as f1:
    pc_all_labels = pkl.load(f1)

pc_all_prompts_labelled = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled = [-1] * len(tc_all_prompts)
pc_all_prompts_labelled_understandable = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled_understandable = [-1] * len(tc_all_prompts)
pc_all_prompts_labelled_natural = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled_natural = [-1] * len(tc_all_prompts)
pc_all_prompts_labelled_context = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled_context = [-1] * len(tc_all_prompts)
pc_all_prompts_labelled_interesting = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled_interesting = [-1] * len(tc_all_prompts)
pc_all_prompts_labelled_facts = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled_facts = [-1] * len(tc_all_prompts)
pc_all_prompts_labelled_overall = [-1] * len(pc_all_prompts)
tc_all_prompts_labelled_overall = [-1] * len(tc_all_prompts)

int2word = {
    "understandable": {0: "no", 1: "yes"},
    "natural": {1: "no", 2: "somewhat", 3: "yes"},
    "context": {1: "no", 2: "somewhat", 3: "yes"},
    "interesting": {1: "dull", 2: "somewhat interesting", 3: "interesting"},
    "facts": {0: "no", 1: "yes"},
    "overall": {1: "very bad", 2: "bad", 3: "neutral", 4: "good", 5: "very good"}
}
change_int_to_word = True

def replace_in_q(q):
    q = q.replace("Interesting (1 - 3)", "Interesting (dull/somewhat interesting/interesting)")\
        .replace("(1 - 3)", "(no/somewhat/yes)")\
        .replace("(1 - 5)", "(very bad/bad/neutral/good/very good)")\
        .replace("(0 - 1)", "(no/yes)")
    return q

for i, each in enumerate(pc_all_prompts):
    prompt_here = pc_all_prompts[i]
    label_1 = round(np.mean(np.array(pc_all_labels[i][0])))
    label_2 = round(np.mean(np.array(pc_all_labels[i][1])))
    label_3 = round(np.mean(np.array(pc_all_labels[i][2])))
    label_4 = round(np.mean(np.array(pc_all_labels[i][3])))
    label_5 = round(np.mean(np.array(pc_all_labels[i][4])))
    label_6 = round(np.mean(np.array(pc_all_labels[i][5])))
    # all_prompts[i] = prefix + each + f"\n1. Understandable: {str(label_1)}\n2. Natural: {str(label_2)}\n3. Maintains Context: {str(label_3)}\n4. Interesting: {str(label_4)}\n5. Overall Quality: {str(label_5)}"
    # pc_all_prompts_labelled[i] = each.lstrip() + f"\n1. Understandable: {str(label_1)}\n2. Natural: {str(label_2)}\n3. Maintains Context: {str(label_3)}\n4. Interesting: {str(label_4)}\n5. Uses Knowledge: {str(label_5)}\n6. Overall Quality: {str(label_6)}"
    before_questions = each.lstrip()[:each.find("\n1.")]
    before_questions_nofacts = before_questions[:before_questions.find("Facts:")] + before_questions[before_questions.find("Generated "):]
    if change_int_to_word:
        pc_all_prompts_labelled[i] = replace_in_q(each.lstrip() + f"""\n1. {str(int2word["understandable"][label_1])}\n2. {str(int2word["natural"][label_2])}\n3. {str(int2word["context"][label_3])}\n4. {str(int2word["interesting"][label_4])}\n5. {str(int2word["facts"][label_5])}\n6. {str(int2word["overall"][label_6])}""")
        pc_all_prompts_labelled_understandable[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n1."):each.find("\n2.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["understandable"][label_1])}""")
        pc_all_prompts_labelled_natural[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n2."):each.find("\n3.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["natural"][label_2])}""")
        pc_all_prompts_labelled_context[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n3."):each.find("\n4.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["context"][label_3])}""")
        pc_all_prompts_labelled_interesting[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n4."):each.find("\n5.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["interesting"][label_4])}""")
        pc_all_prompts_labelled_facts[i] = replace_in_q(before_questions + each.lstrip()[each.find("\n5."):each.find("\n6.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["facts"][label_5])}""")
        pc_all_prompts_labelled_overall[i] = replace_in_q(before_questions + each.lstrip()[each.find("\n6."):each.find("\n\nAnswers")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["overall"][label_6])}""")
    else:
        pc_all_prompts_labelled[i] = each.lstrip() + f"""\n1. {str(label_1)}\n2. {str(label_2)}\n3. {str(label_3)}\n4. {str(label_4)}\n5. {str(label_5)}\n6. {str(label_6)}"""
        pc_all_prompts_labelled_understandable[i] = each.lstrip()[:each.find("\n2.")][:-3] + f"""\n\nAnswers: \n1. {str(label_1)}"""
        pc_all_prompts_labelled_natural[i] = before_questions_nofacts + each.lstrip()[each.find("\n2."):each.find("\n3.")][:-3] + f"""\n\nAnswers: \n1. {str(label_2)}"""
        pc_all_prompts_labelled_context[i] = before_questions_nofacts + each.lstrip()[each.find("\n3."):each.find("\n4.")][:-3] + f"""\n\nAnswers: \n1. {str(label_3)}"""
        pc_all_prompts_labelled_interesting[i] = before_questions_nofacts + each.lstrip()[each.find("\n4."):each.find("\n5.")][:-3] + f"""\n\nAnswers: \n1. {str(label_4)}"""
        pc_all_prompts_labelled_facts[i] = before_questions + each.lstrip()[each.find("\n5."):each.find("\n6.")][:-3] + f"""\n\nAnswers: \n1. {str(label_5)}"""
        pc_all_prompts_labelled_overall[i] = before_questions + each.lstrip()[each.find("\n6."):each.find("\n\nAnswers")][:-3] + f"""\n\nAnswers: \n1. {str(label_6)}"""

for i, each in enumerate(tc_all_prompts):
    prompt_here = tc_all_prompts[i]
    label_1 = round(np.mean(np.array(tc_all_labels[i][0])))
    label_2 = round(np.mean(np.array(tc_all_labels[i][1])))
    label_3 = round(np.mean(np.array(tc_all_labels[i][2])))
    label_4 = round(np.mean(np.array(tc_all_labels[i][3])))
    label_5 = round(np.mean(np.array(tc_all_labels[i][4])))
    label_6 = round(np.mean(np.array(tc_all_labels[i][5])))
    before_questions = each.lstrip()[:each.find("\n1.")]
    before_questions_nofacts = before_questions[:before_questions.find("Facts:")] + before_questions[before_questions.find("Generated "):]
    if change_int_to_word:
        tc_all_prompts_labelled[i] = replace_in_q(each.lstrip() + f"""\n1. {str(int2word["understandable"][label_1])}\n2. {str(int2word["natural"][label_2])}\n3. {str(int2word["context"][label_3])}\n4. {str(int2word["interesting"][label_4])}\n5. {str(int2word["facts"][label_5])}\n6. {str(int2word["overall"][label_6])}""")
        tc_all_prompts_labelled_understandable[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n1."):each.find("\n2.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["understandable"][label_1])}""")
        tc_all_prompts_labelled_natural[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n2."):each.find("\n3.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["natural"][label_2])}""")
        tc_all_prompts_labelled_context[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n3."):each.find("\n4.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["context"][label_3])}""")
        tc_all_prompts_labelled_interesting[i] = replace_in_q(before_questions_nofacts + each.lstrip()[each.find("\n4."):each.find("\n5.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["interesting"][label_4])}""")
        tc_all_prompts_labelled_facts[i] = replace_in_q(before_questions + each.lstrip()[each.find("\n5."):each.find("\n6.")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["facts"][label_5])}""")
        tc_all_prompts_labelled_overall[i] = replace_in_q(before_questions + each.lstrip()[each.find("\n6."):each.find("\n\nAnswers")][:-3] + f"""\n\nAnswers: \n1. {str(int2word["overall"][label_6])}""")
    else:
        tc_all_prompts_labelled[i] = each.lstrip() + f"""\n1. {str(label_1)}\n2. {str(label_2)}\n3. {str(label_3)}\n4. {str(label_4)}\n5. {str(label_5)}\n6. {str(label_6)}"""
        tc_all_prompts_labelled_understandable[i] = each.lstrip()[:each.find("\n2.")][:-3] + f"""\n\nAnswers: \n1. {str(label_1)}"""
        tc_all_prompts_labelled_natural[i] = before_questions_nofacts + each.lstrip()[each.find("\n2."):each.find("\n3.")][:-3] + f"""\n\nAnswers: \n1. {str(label_2)}"""
        tc_all_prompts_labelled_context[i] = before_questions_nofacts + each.lstrip()[each.find("\n3."):each.find("\n4.")][:-3] + f"""\n\nAnswers: \n1. {str(label_3)}"""
        tc_all_prompts_labelled_interesting[i] = before_questions_nofacts + each.lstrip()[each.find("\n4."):each.find("\n5.")][:-3] + f"""\n\nAnswers: \n1. {str(label_4)}"""
        tc_all_prompts_labelled_facts[i] = before_questions + each.lstrip()[each.find("\n5."):each.find("\n6.")][:-3] + f"""\n\nAnswers: \n1. {str(label_5)}"""
        tc_all_prompts_labelled_overall[i] = before_questions + each.lstrip()[each.find("\n6."):each.find("\n\nAnswers")][:-3] + f"""\n\nAnswers: \n1. {str(label_6)}"""

remove_question_number = True
# if remove_question_number:
for i, each in enumerate(pc_all_prompts_labelled_overall):
    pc_all_prompts_labelled_overall[i] = each.replace("Given your answers above, w", "W")
for i, each in enumerate(tc_all_prompts_labelled_overall):
    tc_all_prompts_labelled_overall[i] = each.replace("Given your answers above, w", "W")

for i, each in enumerate(pc_all_prompts_labelled_facts):
    pc_all_prompts_labelled_facts[i] = each.replace("response is conditioned", "response is supposed to be conditioned")
    pc_all_prompts_labelled_facts[i] = each.replace("how well does the", "does the")
for i, each in enumerate(tc_all_prompts_labelled_facts):
    tc_all_prompts_labelled_facts[i] = each.replace("response is conditioned", "response is supposed to be conditioned")
    tc_all_prompts_labelled_facts[i] = each.replace("how well does the", "does the")
        
# for i, each in enumerate(pc_all_prompts):
#     pc_all_prompts_understandable[i] = each[:each.find("Facts")] + each[each.find("Generated response:"):]
# for i, each in enumerate(tc_all_prompts):
#     tc_all_prompts_understandable[i] = each[:each.find("Facts")] + each[each.find("Generated response:"):]

In [3]:
print(pc_all_prompts_labelled[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Facts:
Person 1's statement: i also have a dog walking business.
Person 1's statement: i've three dogs.
Person 1's statement: my father was a door to door salesman.
Person 1's statement: i am in an open polyamorous relationship.
Person 1's statement: i like to watch the olympics.

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Understandable (no/yes): Is the response understandable given the previous context?
2. Natural (no/somewhat/yes): Does the response seem like something that a person would naturally say?
3. Maintains Context (no/somewhat/yes): Does the response serve as a valid continuation of the preceding conversation?
4. Interesting (dull/somewhat interesting/interesting): Is the response dull or interesting?
5. Uses Knowledge (no/yes): Given the

In [4]:
print(pc_all_prompts_labelled_understandable[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Understandable (no/yes): Is the response understandable given the previous context?

Answers: 
1. yes


In [5]:
print(pc_all_prompts_labelled_natural[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Natural (no/somewhat/yes): Does the response seem like something that a person would naturally say?

Answers: 
1. yes


In [6]:
print(pc_all_prompts_labelled_context[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Maintains Context (no/somewhat/yes): Does the response serve as a valid continuation of the preceding conversation?

Answers: 
1. yes


In [7]:
print(pc_all_prompts_labelled_interesting[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Interesting (dull/somewhat interesting/interesting): Is the response dull or interesting?

Answers: 
1. dull


In [8]:
print(pc_all_prompts_labelled_facts[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Facts:
Person 1's statement: i also have a dog walking business.
Person 1's statement: i've three dogs.
Person 1's statement: my father was a door to door salesman.
Person 1's statement: i am in an open polyamorous relationship.
Person 1's statement: i like to watch the olympics.

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Uses Knowledge (no/yes): Given the facts that the response is conditioned on, does the response use the facts?

Answers: 
1. no


In [9]:
print(pc_all_prompts_labelled_overall[0])

Context:
Person 1: lead singer for a band , music teacher
Person 2: wow nice are you really good ?
Person 1: millions of plays on soundcloud
Person 2: really would you share or are you shy

Facts:
Person 1's statement: i also have a dog walking business.
Person 1's statement: i've three dogs.
Person 1's statement: my father was a door to door salesman.
Person 1's statement: i am in an open polyamorous relationship.
Person 1's statement: i like to watch the olympics.

Generated response: 
Person 1: ha ha i'm so shy

Questions about the generated response:
1. Overall Quality (very bad/bad/neutral/good/very good): What is your overall impression of the quality of the generated response?

Answers: 
1. good


In [10]:
import torch
from transformers import GPT2Tokenizer, GPTNeoForCausalLM, GPTNeoModel, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import pandas as pd
from torch.utils.data import Dataset, random_split, Subset
import random
from torch import cuda
import os
import numpy as np

In [11]:
# model_checkpoint = "gpt2-medium"
model_checkpoint = "gpt2-large"
# model_checkpoint = "bigscience/bloom-560m"
# model_checkpoint = "bigscience/bloom-1b1"

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint,
                            bos_token="<|startoftext|>",
                            eos_token="<|endoftext|>",
                            pad_token="<|pad|>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
model = GPT2LMHeadModel.from_pretrained(model_checkpoint).to(device)
# model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)
# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").cuda()
# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

In [13]:
tc_lines = []
for a, b, c, d, e, f in zip(tc_all_prompts_labelled_understandable, tc_all_prompts_labelled_natural, tc_all_prompts_labelled_context, tc_all_prompts_labelled_interesting, tc_all_prompts_labelled_facts, tc_all_prompts_labelled_overall):
    tc_lines.append(a)
    tc_lines.append(b)
    tc_lines.append(c)
    tc_lines.append(d)
    tc_lines.append(e)
    tc_lines.append(f)

pc_lines = []
for a, b, c, d, e, f in zip(pc_all_prompts_labelled_understandable, pc_all_prompts_labelled_natural, pc_all_prompts_labelled_context, pc_all_prompts_labelled_interesting, pc_all_prompts_labelled_facts, pc_all_prompts_labelled_overall):
    pc_lines.append(a)
    pc_lines.append(b)
    pc_lines.append(c)
    pc_lines.append(d)
    pc_lines.append(e)
    pc_lines.append(f)

max_length = 512
tc_descriptions = [description for description in tc_lines if len(tokenizer.encode(description)) < max_length-2]
pc_descriptions = [description for description in pc_lines if len(tokenizer.encode(description)) < max_length-2]
pc_max_length = max([len(tokenizer.encode(description)) for description in pc_descriptions])
tc_max_length = max([len(tokenizer.encode(description)) for description in tc_descriptions])
print(pc_max_length)
print(len(pc_descriptions))
print(tc_max_length)
print(len(tc_descriptions))

258
1800
453
2112


In [14]:
class PromptsDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer("<|startoftext|>" 
                                        + txt +    
                                        "<|endoftext|>",
                                        truncation=True,
                                        max_length=max_length, 
                                        padding='max_length')
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

tc_dataset = PromptsDataset(tc_descriptions, tokenizer, max_length) 
pc_dataset = PromptsDataset(pc_descriptions, tokenizer, max_length)

In [15]:
# tc_train_size = int(0.7 * len(tc_dataset))
# pc_train_size = int(0.7 * len(pc_dataset))

tc_train_size = 1512
pc_train_size = 1260

tc_val_size = 288
pc_val_size = 270

tc_test_size = 312 
pc_test_size = 270 

tc_train_dataset = Subset(tc_dataset, range(tc_train_size))
pc_train_dataset = Subset(pc_dataset, range(pc_train_size))

tc_val_dataset = Subset(tc_dataset, range(tc_train_size, tc_train_size + tc_val_size))
pc_val_dataset = Subset(pc_dataset, range(pc_train_size, pc_train_size + pc_val_size))

tc_test_dataset = Subset(tc_dataset, range(tc_train_size + tc_val_size, tc_train_size + tc_val_size + tc_test_size))
pc_test_dataset = Subset(pc_dataset, range(pc_train_size + pc_val_size, pc_train_size + pc_val_size + pc_test_size))

In [16]:
int2word = {
    "understandable": {0: "no", 1: "yes"},
    "natural": {1: "no", 2: "somewhat", 3: "yes"},
    "context": {1: "no", 2: "somewhat", 3: "yes"},
    "interesting": {1: "dull", 2: "somewhat interesting", 3: "interesting"},
    "facts": {0: "no", 1: "yes"},
    "overall": {1: "very bad", 2: "bad", 3: "neutral", 4: "good", 5: "very good"}
}

label2int = {
    "understandable": {"no": 0, "yes": 1},
    "natural" : {"no": 1, "somewhat": 2, "yes": 3},
    "context": {"no": 1, "somewhat": 2, "yes": 3},
    "interesting": {"dull": 1, "somewhat interesting": 2, "interesting": 3},
    "facts": {"no": 0, "yes": 1},
    "overall": {"very bad": 1, "bad": 2, "neutral": 3, "good": 4, "very good": 5},
}

tc_pc_concat_testdataset = torch.utils.data.ConcatDataset([tc_test_dataset, pc_test_dataset])
test_lines = []
test_labels = []
for i, each in enumerate(tc_pc_concat_testdataset):
    datapoint = tokenizer.decode(tc_pc_concat_testdataset[i][0], skip_special_tokens=True)
    unlabelled_here = ".".join(datapoint.split(".")[:-1]) + ". "
    test_lines.append(unlabelled_here)
    if "Is the response dull or interesting" in unlabelled_here:
        label_int = label2int["interesting"][datapoint.replace(unlabelled_here, "")]
    elif "overall impression of the quality" in unlabelled_here:
        label_int = label2int["overall"][datapoint.replace(unlabelled_here, "")]
    elif "facts that the response is conditioned" in unlabelled_here:
        label_int = label2int["facts"][datapoint.replace(unlabelled_here, "")]
    elif "understandable given the previous context" in unlabelled_here:
        label_int = label2int["understandable"][datapoint.replace(unlabelled_here, "")]
    elif "valid continuation of the preceding conversation" in unlabelled_here:
        label_int = label2int["context"][datapoint.replace(unlabelled_here, "")]
    elif "something that a person would naturally say" in unlabelled_here:
        label_int = label2int["natural"][datapoint.replace(unlabelled_here, "")]
    test_labels.append(label_int)

In [17]:
tc_pc_concat_traindataset = torch.utils.data.ConcatDataset([tc_train_dataset, pc_train_dataset])
tc_pc_concat_valdataset = torch.utils.data.ConcatDataset([tc_val_dataset, tc_val_dataset])

In [20]:
lr=2e-5
bs=10
ne=10
training_args = TrainingArguments(output_dir=f'gpt2large-epoch10-singlequestions-try2-pctc-lr{lr}-bs{bs}-ne{ne}',
                                  overwrite_output_dir=False,
                                  num_train_epochs=ne,
                                  learning_rate=lr,
                                  save_strategy='epoch',
                                  evaluation_strategy='epoch',
                                  logging_strategy="epoch",
                                  logging_first_step=True,
                                #   logging_steps=200,
                                #   save_steps =1000,
#                                   per_device_train_batch_size=1,
                                  auto_find_batch_size=True,
                                  gradient_accumulation_steps=bs,
                                #   warmup_steps=500,
                                  weight_decay=0.01,  
                                  load_best_model_at_end=True,
                                  fp16=False,
                                  )
                                #   logging_dir='./logs')

In [22]:
trainer = Trainer(model=model, args=training_args,  
                  train_dataset=tc_pc_concat_traindataset,
                  eval_dataset=tc_pc_concat_valdataset, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: 
              {'input_ids': torch.stack([f[0] for f in data]),       
               'attention_mask': torch.stack([f[1] for f in data]),
               'labels': torch.stack([f[0] for f in data])}
                 )

In [None]:
# Start training process!
trainer.train()

***** Running training *****
  Num examples = 2772
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 10
  Total optimization steps = 340


Epoch,Training Loss,Validation Loss


***** Running training *****
  Num examples = 2772
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 10
  Total optimization steps = 690


Epoch,Training Loss,Validation Loss


In [None]:
####### EVAL #######

In [89]:
model_checkpoint = f'gpt2large-singlequestions-pctc-lr1e-05-bs10-ne3/checkpoint-819/'
model = GPT2LMHeadModel.from_pretrained(model_checkpoint).to(device)
# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").cuda()
# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
# model.eval()

loading configuration file gpt2large-singlequestions-pctc-lr1e-05-bs10-ne3/checkpoint-819/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-large",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "pad_token_id": 50258,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_

In [None]:
input_ids = torch.tensor([tokenizer(test_lines[0]).input_ids]).to(device)
print(input_ids)
print(tokenizer.decode(input_ids[0], skip_special_tokens=True))

In [16]:
predicted_q1 = []
predicted_q2 = []
predicted_q3 = []
predicted_q4 = []
predicted_q5 = []
predicted_q6 = []
labels_q1 = []
labels_q2 = []
labels_q3 = []
labels_q4 = []
labels_q5 = []
labels_q6 = []
# test_labels

label2int = {
    "understandable": {"no": 0, "yes": 1},
    "natural" : {"no": 1, "somewhat": 2, "yes": 3},
    "context": {"no": 1, "somewhat": 2, "yes": 3},
    "interesting": {"dull": 1, "somewhat interesting": 2, "interesting": 3},
    "facts": {"no": 0, "yes": 1},
    "overall": {"very bad": 1, "bad": 2, "neutral": 3, "good": 4, "very good": 5},
}
for i, each in enumerate(test_lines):
    print(i)
    tokenized = tokenizer(each)
    input_ids = torch.tensor([tokenized.input_ids]).to(device)
    unlabelled_here = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    if len(input_ids[0]) < 1000:
        generation = model.generate(input_ids, do_sample=False, max_new_tokens=100, num_beams=5)
        ## do_sample=False enables greedy decoding
        predicted_label = tokenizer.batch_decode(generation, skip_special_tokens=True)[0].replace(each, "").lstrip().rstrip()
        if "understandable given the previous context" in unlabelled_here:
            label_int = label2int["understandable"][predicted_label]
            predicted_q1.append(label_int)
            labels_q1.append(test_labels[i])
        elif "something that a person would naturally say" in unlabelled_here:
            label_int = label2int["natural"][predicted_label]
            predicted_q2.append(label_int)
            labels_q2.append(test_labels[i])
        elif "valid continuation of the preceding conversation" in unlabelled_here:
            label_int = label2int["context"][predicted_label]
            predicted_q3.append(label_int)
            labels_q3.append(test_labels[i])
        elif "Is the response dull or interesting" in unlabelled_here:
            label_int = label2int["interesting"][predicted_label]
            predicted_q4.append(label_int)
            labels_q4.append(test_labels[i])
        elif "facts that the response is conditioned" in unlabelled_here:
            label_int = label2int["facts"][predicted_label]
            predicted_q5.append(label_int)
            labels_q5.append(test_labels[i])
        elif "overall impression of the quality" in unlabelled_here:
            label_int = label2int["overall"][predicted_label]
            predicted_q6.append(label_int)
            labels_q6.append(test_labels[i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [75]:
print(len(predicted_q1))
print(len(predicted_q2))
print(len(predicted_q3))
print(len(predicted_q4))
print(len(predicted_q5))
print(len(predicted_q6))

98
83
91
112
101
102


In [99]:
print(len(tc_pc_concat_traindataset))
print(len(tc_pc_concat_traindataset))

2738


In [18]:
overall_labels = 
for i in range(len(tc_pc_concat_traindataset)):
    label_here = tokenizer.decode(tc_pc_concat_traindataset[i][0], skip_special_tokens=True).split()[-1]
    if label_here in overall_labels:
        print(label_here)

neutral
no
good
interesting
yes
yes
yes
no
yes
no
no
yes
no
bad
bad
yes
yes
somewhat
yes
interesting
yes
neutral
interesting
no
good
interesting
neutral
no
yes
good
yes
neutral
no
yes
good
yes
yes
somewhat
yes
yes
yes
yes
yes
yes
yes
yes
somewhat
somewhat
no
no
somewhat
bad
somewhat
bad
no
yes
interesting
interesting
yes
yes
bad
yes
bad
somewhat
yes
no
interesting
somewhat
somewhat
interesting
no
yes
yes
no
yes
yes
dull
somewhat
yes
no
somewhat
no
somewhat
no
good
yes
yes
bad
somewhat
yes
no
yes
no
somewhat
no
somewhat
yes
no
no
good
bad
yes
yes
yes
somewhat
no
yes
yes
no
yes
dull
yes
bad
bad
no
bad
yes
good
interesting
no
neutral
yes
yes
bad
yes
dull
interesting
yes
somewhat
somewhat
yes
interesting
yes
good
no
yes
somewhat
yes
bad
somewhat
good
no
yes
somewhat
interesting
yes
yes
yes
yes
somewhat
interesting
interesting
interesting
no
yes
somewhat
yes
yes
somewhat
yes
good
no
good
yes
dull
good
neutral
no
yes
yes
yes
somewhat
yes
no
interesting
interesting
yes
yes
yes
somewhat
somewh

KeyboardInterrupt: 

In [76]:
print(labels_q1, "\n")
print(labels_q2, "\n")
print(labels_q3, "\n")
print(labels_q4, "\n")
print(labels_q5, "\n")
print(labels_q6, "\n")

[1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

[3, 3, 2, 3, 1, 2, 2, 1, 3, 3, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 3, 1, 3, 2, 2, 2, 2, 3, 2, 2, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 3, 1, 2, 2, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 3, 3, 2, 3, 3, 3, 3, 2, 3, 2, 3, 3] 

[3, 2, 1, 2, 3, 2, 2, 2, 3, 2, 3, 2, 2, 2, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 2, 2, 1, 3, 3, 2, 3, 3, 2, 3, 2, 2, 3, 2, 3, 2, 2, 1, 3, 1, 3, 1, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 2, 1, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3] 

[2, 3, 3, 2, 3, 3, 1, 3, 2, 3, 3, 3, 3, 3, 1, 2, 2, 2, 1, 3, 2, 3, 1, 3, 3, 2, 3, 2, 2, 3, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 1, 2, 3, 3, 2, 1, 

In [100]:
## epoch 4
print(predicted_q1, "\n")
print(predicted_q2, "\n")
print(predicted_q3, "\n")
print(predicted_q4, "\n")
print(predicted_q5, "\n")
print(predicted_q6, "\n")

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 

In [17]:
## epochs=10, lr=2e-5, bs=10
print(predicted_q1, "\n")
print(predicted_q2, "\n")
print(predicted_q3, "\n")
print(predicted_q4, "\n")
print(predicted_q5, "\n")
print(predicted_q6, "\n")

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

[3, 2, 3, 3, 3, 3, 2, 1, 3, 2, 1, 1, 3, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 2, 2, 3, 3, 3, 1, 3, 2, 3, 3, 3, 2, 2, 3, 1, 2, 1, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[3, 1, 3, 1, 2, 2, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 2, 1, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2, 3, 2, 3, 3, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 

In [72]:
## epoch 3
print(predicted_q1, "\n")
print(predicted_q2, "\n")
print(predicted_q3, "\n")
print(predicted_q4, "\n")
print(predicted_q5, "\n")
print(predicted_q6, "\n")

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

[3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 1, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 1, 1, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 

In [77]:
## epoch 2
print(predicted_q1, "\n")
print(predicted_q2, "\n")
print(predicted_q3, "\n")
print(predicted_q4, "\n")
print(predicted_q5, "\n")
print(predicted_q6, "\n")

[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

[3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 1, 2, 3, 2, 2, 1, 1, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[3, 1, 1, 3, 1, 3, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [88]:
## epoch 1
print(predicted_q1, "\n")
print(predicted_q2, "\n")
print(predicted_q3, "\n")
print(predicted_q4, "\n")
print(predicted_q5, "\n")
print(predicted_q6, "\n")

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 

In [94]:
def get_pearson_spearman_corr(preds_q1, avg_q1, preds_q2, avg_q2, preds_q3, avg_q3, preds_q4, avg_q4, preds_q5, avg_q5, preds_q6, avg_q6):
    print("PEARSON CORRELATION:")
    print("Q1:")
    rho, p = pearsonr(preds_q1, avg_q1)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q2:")
    rho, p = pearsonr(preds_q2, avg_q2)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q3:")
    rho, p = pearsonr(preds_q3, avg_q3)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q4:")
    rho, p = pearsonr(preds_q4, avg_q4)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q5:")
    rho, p = pearsonr(preds_q5, avg_q5)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q6:")
    rho, p = pearsonr(preds_q6, avg_q6)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")
    
    print("SPEARMAN CORRELATION:")
    print("Q1:")
    rho, p = spearmanr(preds_q1, avg_q1)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q2:")
    rho, p = spearmanr(preds_q2, avg_q2)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q3:")
    rho, p = spearmanr(preds_q3, avg_q3)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q4:")
    rho, p = spearmanr(preds_q4, avg_q4)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q5:")
    rho, p = spearmanr(preds_q5, avg_q5)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

    print("Q6:")
    rho, p = spearmanr(preds_q6, avg_q6)
    print("Rho = ", rho)
    print("p = ", p)
    print("\n")

In [95]:
preds_q1 = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
avg_q1 = labels_q1
preds_q2 = [3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
avg_q2 = labels_q2
preds_q3 = [3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 1, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 1, 1, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
avg_q3 = labels_q3
preds_q4 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] 
avg_q4 = labels_q4
preds_q5 = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0] 
avg_q5 = labels_q5
preds_q6 = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] 
avg_q6 = labels_q6

get_pearson_spearman_corr(preds_q1, avg_q1, preds_q2, avg_q2, preds_q3, avg_q3, preds_q4, avg_q4, preds_q5, avg_q5, preds_q6, avg_q6)

PEARSON CORRELATION:
Q1:
Rho =  -0.05462842470431956
p =  0.5931655956007832


Q2:
Rho =  0.006505118409748642
p =  0.953456882222216


Q3:
Rho =  0.12035278408675669
p =  0.25580698387456907


Q4:
Rho =  0.06593054455671633
p =  0.4897743833181122


Q5:
Rho =  0.059057722312233885
p =  0.5574405295094135


Q6:
Rho =  nan
p =  nan


SPEARMAN CORRELATION:
Q1:
Rho =  -0.054628424704319715
p =  0.593165595600782


Q2:
Rho =  0.0242753059693034
p =  0.8275564053321658


Q3:
Rho =  0.1317974756233523
p =  0.21301119507458474


Q4:
Rho =  0.07367642766150788
p =  0.4401020765307063


Q5:
Rho =  0.059057722312233885
p =  0.5574405295094149


Q6:
Rho =  nan
p =  nan






In [81]:
for i in range(10):
    print(i)
    print(test_lines[i])
    print("\n\n")

0
Context:
Person 1: hello, how are you this evening? do you like sports? 
Person 2:  i m great, and yeah i do like to watch sports, basketball, hockey, soccer and even swimming, what about you? 

Generated response: 
Person 1: i do not but i am more into swimming myself. i don't like sports, but i do know there are some really boring swimming competitions

Questions about the generated response:
1. Interesting (dull/somewhat interesting/interesting): Is the response dull or interesting?

Answers: 
1. 



1
Context:
Person 1:  right? i wonder if they have cricket cards lol? i think they have stickers for soccer. 
Person 2:  some guy found $ 3 million dollars of baseball cards from the 1900s. i wonder what he did with the money. 
Person 1:  yeah. that's an amazing find. i heard they had been hidden since the 40s. 
Person 2:  thanks, grandpa! i bet grandpa wishes he had cashed them in before he cashed out. 

Generated response: 
Person 1: well, but time probably added to the value, no? i

In [85]:
line = test_lines[9]
tokenized = tokenizer(line)
input_ids = torch.tensor([tokenized.input_ids]).to(device)
if len(input_ids[0]) < 1000:
    generation = model.generate(input_ids, do_sample=False, max_new_tokens=100, num_beams=5)
    print(tokenizer.batch_decode(generation, skip_special_tokens=True)[0].replace(line, ""))

 good


In [None]:
model_checkpoint = f'gpt2-tc-medium-lm-lr{lr}-bs{bs}-ne{ne}/checkpoint-46'
model = GPT2LMHeadModel.from_pretrained(model_checkpoint).to(device)
# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").cuda()
# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

loading configuration file gpt2-tc-medium-lm-lr1e-05-bs10-ne4/checkpoint-46/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "pad_token_id": 50258,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "to

In [None]:
each = """Context:
Person 1:  the guy is a machine! but he needs a better stage name. maybe president banana? 
Person 2:  i do not think the president of zimbabwe would be happy about that 
Person 1:  he could be the artist formerly known as president banana? i don't really care as long as i can listen to him on the radio! i love two things on my radio : indonesian pop music and electromagnetic storms from jupiter. those are my go to listening pleasures! 
Person 2:  yeah that is so cool that if you turn your radio to am, you may capture jupiter's storms 

Facts:
according to canadian law, all radios are required to have at least 40 % of the music played be canadian.

Generated response: 
Person 1: that might be banned in canada. the law says that all stations must have 40 % of the music played be canadian!

Questions about the generated response:
1. Understandable (0 - 1): Is the response understandable given the previous context?
2. Natural (1 - 3): Does the response seem like something that a person would naturally say?
3. Maintains Context (1 - 3): Does the response serve as a valid continuation of the preceding conversation?
4. Interesting (1 - 3): Is the response dull or interesting?
5. Uses Knowledge (0 - 1): Given the facts that the response is conditioned on, how well does the response use the facts?
6. Overall Quality (1 - 5): Given your answers above, what is your overall impression of the quality of the generated response?

Answers:
"""
tokenized = tokenizer(each)
input_ids = torch.tensor([tokenized.input_ids]).to(device)
if len(input_ids[0]) < 1000:
    generation = model.generate(input_ids, do_sample=False, max_new_tokens=100, num_beams=5)
    print(tokenizer.batch_decode(generation, skip_special_tokens=True)[0])

Context:
Person 1:  the guy is a machine! but he needs a better stage name. maybe president banana? 
Person 2:  i do not think the president of zimbabwe would be happy about that 
Person 1:  he could be the artist formerly known as president banana? i don't really care as long as i can listen to him on the radio! i love two things on my radio : indonesian pop music and electromagnetic storms from jupiter. those are my go to listening pleasures! 
Person 2:  yeah that is so cool that if you turn your radio to am, you may capture jupiter's storms 

Generated response: 
Person 1: that might be banned in canada. the law says that all stations must have 40 % of the music played be canadian!

Questions about the generated response:
1. Understandable (0 - 1): Is the response understandable given the previous context?

Answers:
1. 1


In [None]:
tokenizer.decode(tc_train_dataset[0][0], skip_special_tokens=True)[-200:]

'does the response use the facts?\n6. Overall Quality (1 - 5): Given your answers above, what is your overall impression of the quality of the generated response?\n\nAnswers:\n1. 1\n2. 2\n3. 2\n4. 3\n5. 1\n6. 4'

In [None]:
tc_unlabelled_test[0][-200:]

'e is conditioned on, how well does the response use the facts?\n6. Overall Quality (1 - 5): Given your answers above, what is your overall impression of the quality of the generated response?\n\nAnswers:'

In [None]:
tc_test_dataset = tc_unlabelled_test
for i, each in enumerate(tc_test_dataset):
    # print(each)
    tokenized = tokenizer(each + "\n")
    input_ids = torch.tensor([tokenized.input_ids]).to(device)
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    if len(input_ids[0]) < 1000:
        generation = model.generate(input_ids, do_sample=False, max_new_tokens=100, num_beams=3, temperature=0.1)
        print(f"\nnum = {i}")
        print(tokenizer.batch_decode(generation, skip_special_tokens=True)[0].replace(input_text, ""))

In [None]:
#### UNDERSTANDABLE ####
tc_test_dataset = tc_unlabelled_test
for i, each in enumerate(tc_test_dataset):
    # print(each)
    tokenized = tokenizer(each + "\n")
    input_ids = torch.tensor([tokenized.input_ids]).to(device)
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    if len(input_ids[0]) < 1000:
        generation = model.generate(input_ids, do_sample=False, max_new_tokens=100, num_beams=3, temperature=0.1)
        print(f"\nnum = {i}")
        print(tokenizer.batch_decode(generation, skip_special_tokens=True)[0].replace(input_text, ""))


num = 0
1. 1

num = 1
1. 1

num = 2
1. 1

num = 3
1. 1

num = 4
1. 1

num = 5
1. 1

num = 6
1. 1

num = 7
1. 1

num = 8
1. 1

num = 9
1. 1

num = 10
1. Understand (0 - 1): Is the response understandable given the previous context?

num = 11
1. 1

num = 12
1. 1

num = 13
1. Understand (0 - 1): Is the response understandable given the previous?

num = 14
1. 1

num = 15
1. 1

num = 16
1. 1

num = 17
1. 1

num = 18
1. 1

num = 19
1. 1

num = 20
1. 1

num = 21
1. 1

num = 22
1. 1

num = 23
1. 1

num = 24
1. 1

num = 25
1. 1

num = 26
1. 1

num = 27
1. 1

num = 28
1. 1

num = 29
1. Understand (0 - 1): Is the response understandable given the previous?

num = 30
1. 1

num = 31
1. 1

num = 32
1. Understand (0 - 1): Is the response understandable given the previous context?

2 (1 - 3): Does the response seem like something?
 (1 - 3)

3. Natural ( 1 - 3): Does the response seem like something?
 (1 1 - 3)
 (1 - 3)

num = 33
1. 1

num = 34
1. 1

num = 35
1. 1

num = 36
1. 1

num = 37
1. 1

num = 

In [None]:
str1 = "hey there"
tokenized = tokenizer(str1)
input_ids = torch.tensor([tokenized.input_ids]).to(device)
generation = model.generate(input_ids, do_sample=False, max_new_tokens=100, num_beams=5)
print(tokenizer.batch_decode(generation, skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


hey theres


In [None]:
input_ids = torch.tensor([test_dataset[0][0].tolist()]).to(device)
if len(input_ids[0]) < 1000:
    generation = model.generate(input_ids, do_sample=False, max_new_tokens=24, num_beams=5)
    print(tokenizer.batch_decode(generation, skip_special_tokens=True)[0])

In [None]:
print(tokenizer.decode(test_dataset[0][0]))

<|startoftext|> Human Evaluation of Chatbot Outputs:

Annotation Instructions: 
You will be given a conversation between two individuals. You will then be given a potential chatbot-generated response for the next turn in the conversation. Your task is to rate the response on several metrics. The response for one metric should not influence other metrics. For example, if a response is not understandable or has grammatical errors - you should try to ignore this when considering whether it maintains context or if it is interesting.
The following are the metrics and corresponding rating scales that each response is required to be rated on:
Understandable (0 - 1): Is the response understandable in the context of the history? A score of 0 (no) means that the response is difficult to understand. You do not know what the person is trying to say. A score of 1 (yes) means that the response is understandable. You know what the person is trying to say.
Natural (1 - 3): Is the response naturally wr

In [None]:
for each in test_dataset:
    input_ids = torch.tensor([each[0].tolist()]).to('cuda')
    if len(input_ids[0]) < 1000:
        outputs = model.generate(input_ids, do_sample=False, max_new_tokens=24, num_beams=5)
        print(str(tokenizer.decode(input_ids[0])))
        model_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        print(model_response)
        print("\n")
    else:
        print(len(input_ids[0]))

In [None]:
tokenized['input_ids']

In [None]:
model.config.pad_token_id = tokenizer.pad_token_id
# prompt = "hey there"
# tokenized = tokenizer(prompt, return_tensors="pt")
# input_ids = tokenized['input_ids'].to('cuda')

outputs = model.generate(torch.tensor([test_dataset[0][0].tolist()]).to('cuda'), do_sample=False, max_new_tokens=100, num_beams=5)
print("Prompt: " + str(tokenizer.decode(input_ids[0])))
model_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# feedback_starts = model_response.find("Feedback: ")
print("Model Response: " + model_response)
# print()