In [83]:
import evaluate as ev
import re
import semEval as sem
import torch
import random
from transformers import BertModel
from transformers import AutoTokenizer
from tqdm import tqdm_notebook as tqdm

# This is so that you don't have to restart the kernel everytime you edit hmm.py
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Read the training data with the JSON helper from the `evaluate` module (imported as `ev`).
data = ev.get_json_data('data/Subtask_1_1_train.json')
# Convert the list of dictionaries into one large dictionary keyed by emotion_utterance_ID,
# so a specific emotion utterance can be looked up directly (one level less nesting).
dicts = ev.convert_list_to_dict(data, main_key = 'emotion_utterance_ID')
# Display a single record as a sanity check (only the last expression of a cell is shown).
dicts['dia1utt3']

{'emotion_utterance_ID': 'dia1utt3',
 'emotion': 'surprise',
 'conversation': [{'utterance_ID': 1,
   'text': 'Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked .',
   'speaker': 'Chandler',
   'emotion': 'neutral'},
  {'utterance_ID': 2,
   'text': 'Oh , yeah . Had that dream .',
   'speaker': 'All',
   'emotion': 'neutral'},
  {'utterance_ID': 3,
   'text': 'Then I look down , and I realize there is a phone ... there .',
   'speaker': 'Chandler',
   'emotion': 'surprise'},
  {'utterance_ID': 4,
   'text': 'Instead of ... ?',
   'speaker': 'Joey',
   'emotion': 'surprise'},
  {'utterance_ID': 5,
   'text': 'That is right .',
   'speaker': 'Chandler',
   'emotion': 'anger'},
  {'utterance_ID': 6,
   'text': 'Never had that dream .',
   'speaker': 'Joey',
   'emotion': 'neutral'},
  {'utterance_ID': 7,
   'text': 'No .',
   'speaker': 'Phoebe',
   'emotion': 'neutral'},
  {'utterance_ID': 8,
   'text': 'All of a sudde

In [85]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
# Select the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook runs unchanged on either kind of machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

False


In [41]:
# Build one example per (candidate utterance, target utterance) pair:
#   {'pos_cause': <candidate text>, 'target': <target text>, 'label': 0 or 1}
nli_data_pairs = []
for d in data:
    # The target utterance is read from the conversation list at (target id - 1).
    convo_id, target_utt_id = sem.get_target_conv_utt_ids(d['emotion_utterance_ID'])
    target_text = d['conversation'][int(target_utt_id) - 1]['text']
    cause_spans_ids = sem.get_cause_span_ids(d['cause_spans'])
    # Pair every utterance of the conversation with the target; the label is 1
    # exactly when the utterance's id is among the cause-span ids.
    nli_data_pairs.extend(
        {
            'pos_cause': utterance['text'],
            'target': target_text,
            'label': int(utterance['utterance_ID'] in cause_spans_ids),
        }
        for utterance in d['conversation']
    )

In [None]:
# Create the train / validation / test splits.
# Shuffle a *copy* so `nli_data_pairs` keeps its original order: with the in-place
# shuffle, re-running this cell reshuffled an already-shuffled list and silently
# produced a different split each time.
# NOTE(review): pairs are shuffled individually, so pairs built from the same
# conversation can end up in different splits — confirm this is intended.
random.seed(42)
shuffled_pairs = list(nli_data_pairs)
random.shuffle(shuffled_pairs)
train, val, test = sem.get_train_val_test(shuffled_pairs)

In [56]:
# For each split print its share of all pairs, followed by
# sem.count_labels(split) divided by the split size.
total_pairs = len(nli_data_pairs)
for split in (train, val, test):
    print(len(split) / total_pairs, sem.count_labels(split) / len(split))
# The same label ratio computed over every pair.
print(sem.count_labels(nli_data_pairs) / total_pairs)

0.5999933315550814 0.1038436602760026
0.19999777718502712 0.10575159766601834
0.20000889125989152 0.09707712825072239
0.1028718769449631


In [57]:
# Smoke-test the batch tokenizer on two premise / hypothesis pairs.
tokenizer = sem.BatchTokenizer()
premises = ["this is the first premise", "This is the second premise"]
hypotheses = ["This is first hypothesis", "This is the second hypothesis"]
x = tokenizer(premises, hypotheses)
print(x)
# Decode the ids back to text to see how each pair was joined and padded.
tokenizer.hf_tokenizer.batch_decode(x["input_ids"])

Downloading config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  2023,  2003,  1996,  2034, 18458,   102,  2023,  2003,  2034,
         10744,   102,     0],
        [  101,  2023,  2003,  1996,  2117, 18458,   102,  2023,  2003,  1996,
          2117, 10744,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


['[CLS] this is the first premise [SEP] this is first hypothesis [SEP] [PAD]',
 '[CLS] this is the second premise [SEP] this is the second hypothesis [SEP]']

# We can batch the train, validation, and test data, and then run it through the tokenizer

In [65]:
# Run sem.generate_pairwise_input over each split and unpack the three values it
# returns into the matching causes / targets / labels names.
(
    (train_causes, train_targets, train_labels),
    (validation_causes, validation_targets, validation_labels),
    (test_causes, test_targets, test_labels),
) = [sem.generate_pairwise_input(split_pairs) for split_pairs in (train, val, test)]

In [69]:
# Batch and tokenize the training causes and targets.
batch_size = 16

# The tokenizer wraps a HuggingFace tokenizer, so tokenizing and encoding
# happen in a single call.
tokenizer = sem.BatchTokenizer()
# First materialise the chunks produced by sem.chunk_multi ...
train_input_batches = list(sem.chunk_multi(train_causes, train_targets, batch_size))
# ... then tokenize + encode each chunk, passing its elements as positional arguments.
train_input_batches = [tokenizer(*cause_target_chunk) for cause_target_chunk in train_input_batches]

In [72]:
# Batch the labels with the same batch size as the inputs, then encode each batch.
# `chunk` is referenced through the `sem` module like `chunk_multi` and
# `encode_labels`; the bare name is never imported in this notebook.
# NOTE(review): assumes `chunk` is defined in semEval next to `chunk_multi` — confirm.
train_label_batches = [b for b in sem.chunk(train_labels, batch_size)]
train_label_batches = [sem.encode_labels(batch) for batch in train_label_batches]

In [78]:
class NLIClassifier(torch.nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The slice at position 0 of BERT's last hidden state (the [CLS] token) is
    projected to ``hidden_size``, passed through a ReLU, projected to
    ``output_size`` and normalised with a log-softmax over dimension 1.
    """

    def __init__(self, output_size: int, hidden_size: int, model_name='prajjwal1/bert-small'):
        """Load the pretrained encoder and create the classification head.

        Args:
            output_size (int): Number of output units of the final linear layer.
            hidden_size (int): Width of the hidden layer between BERT and the
                final linear layer.
            model_name (str): Name passed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size

        # BERT replaces a plain embedding layer as the encoder.
        self.bert = BertModel.from_pretrained(model_name)

        # OPTIONAL: updating every BERT parameter can be slow and memory hungry.
        # Uncomment the two lines below to freeze BERT so that only the
        # classification head is trained (revisit the learning rate if you do).
        # for param in self.bert.parameters():
        #     param.requires_grad = False

        self.bert_hidden_dimension = self.bert.config.hidden_size

        # Head: BERT hidden dimension -> hidden_size -> ReLU -> output_size.
        self.hidden_layer = torch.nn.Linear(
            in_features=self.bert_hidden_dimension,
            out_features=self.hidden_size,
        )
        self.relu = torch.nn.ReLU()
        self.classifier = torch.nn.Linear(
            in_features=self.hidden_size,
            out_features=self.output_size,
        )
        self.log_softmax = torch.nn.LogSoftmax(dim=1)

    def encode_text(self, symbols):
        """Run BERT on a tokenized batch and return the [CLS] representation.

        Args:
            symbols (Dict): The Dict of token specifications provided by the
                HuggingFace tokenizer; it is unpacked as keyword arguments
                into the BERT model.

        Returns:
            torch.Tensor: ``last_hidden_state`` sliced at index 0 along
            dimension 1, i.e. the first token's embedding for every sequence.
        """
        # BERT already contextualises every token, so no recurrent layer is
        # needed; the first position summarises the whole sequence.
        bert_output = self.bert(**symbols)
        cls_embedding = bert_output.last_hidden_state[:, 0]
        return cls_embedding

    def forward(self, symbols):
        """Compute log-probabilities for a tokenized batch.

        Args:
            symbols (Dict): The Dict of token specifications provided by the
                HuggingFace tokenizer.

        Returns:
            torch.Tensor: Log-softmax (over dimension 1) of the classifier
            output for each sequence in the batch.
        """
        cls_embedding = self.encode_text(symbols)
        hidden = self.relu(self.hidden_layer(cls_embedding))
        return self.log_softmax(self.classifier(hidden))

In [80]:
# For making predictions at test time
def predict(model, sents):
    """Return the predicted class index for every example in a batch.

    The batch is moved to the device the model's parameters live on, the model
    is put in evaluation mode (so dropout is disabled) and run without
    gradient tracking; its previous train/eval mode is restored afterwards.

    Args:
        model (torch.nn.Module): Model whose output has the class scores along
            dimension 1.
        sents: Batch of inputs exposing ``.to(device)`` (for example a tensor
            or the encoding returned by a HuggingFace tokenizer).

    Returns:
        list: One predicted class index (numpy integer) per example. A batch
        containing a single example yields a one-element list.
    """
    # Follow the model's own device instead of relying on a notebook global.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        # Parameter-free model: nothing dictates a device, stay on the CPU.
        model_device = torch.device("cpu")

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(sents.to(model_device))
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    # argmax over dim 1 already returns one index per row; the previous extra
    # .squeeze() turned a single-example batch into a 0-d tensor, which cannot
    # be converted to a list.
    return list(torch.argmax(logits, dim=1).cpu().numpy())

In [81]:
def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_sents,
    dev_labels,
    optimizer,
    model,
):
    """Train ``model`` with NLL loss and report the dev macro-F1 after every epoch.

    Args:
        num_epochs (int): Number of passes over the training batches.
        train_features: Sequence of input batches; each must expose ``.to(device)``.
        train_labels: Sequence of label tensors, aligned with ``train_features``.
        dev_sents: Sequence of development input batches (must support ``len``).
        dev_labels: Sequence of development label tensors, aligned with ``dev_sents``.
        optimizer: Optimizer already constructed over the model's parameters.
        model (torch.nn.Module): Model returning log-probabilities.

    Returns:
        The same ``model`` object after training. It is left in evaluation
        mode, because the last step of every epoch is the dev evaluation.
    """
    print("Training...")
    loss_func = torch.nn.NLLLoss()
    # Train on whatever device the model was moved to; hard-coding 'cuda:0'
    # crashes on machines without a GPU.
    device = next(model.parameters()).device
    batches = list(zip(train_features, train_labels))
    for i in range(num_epochs):
        # Dropout etc. must be active while training (the dev pass below
        # switches the model to eval mode).
        model.train()
        # Visit the batches in a fresh order every epoch.
        random.shuffle(batches)
        losses = []
        for features, labels in tqdm(batches):

            # Clear the gradients accumulated by the previous step
            optimizer.zero_grad()
            preds = model(features.to(device)).squeeze(1)
            loss = loss_func(preds, labels.to(device))
            # Backpropagate the loss through our model
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        # Guard against an empty training set instead of dividing by zero.
        mean_loss = sum(losses) / len(losses) if losses else float("nan")
        print(f"epoch {i}, loss: {mean_loss}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        # Evaluate deterministically and without building autograd graphs.
        model.eval()
        with torch.no_grad():
            for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
                pred = predict(model, sents)

                all_preds.extend(pred)
                all_labels.extend(list(labels.cpu().numpy()))

        # NOTE(review): `macro_f1` is not defined or imported in the visible part
        # of this notebook — confirm which module provides it.
        dev_f1 = macro_f1(all_preds, all_labels, possible_labels=list(set(all_labels)))

        print(f"Dev F1 {dev_f1}")

    # Return the trained model
    return model

In [88]:
# You can increase epochs if need be
epochs = 20

# Learning rate and hidden size of the classification head.
# All BERT weights are being fine-tuned, so the learning rate has to be small:
# the previous value of 0.01 is orders of magnitude above the 2e-5 .. 5e-5 range
# normally used for BERT fine-tuning and wipes out the pretrained weights.
LR = 2e-5
hidden_size = 10

# One output unit per distinct training label.
possible_labels = set(train_labels)
model = NLIClassifier(output_size=len(possible_labels), hidden_size=hidden_size)
# Move the model to the device selected earlier in the notebook.
model.to(device)
# AdamW over every model parameter with the learning rate above.
optimizer = torch.optim.AdamW(model.parameters(), LR)
batch_tokenizer = sem.BatchTokenizer()

# Batch the validation causes/targets, then tokenize + encode each batch.
# The helpers are referenced through `sem`, as in the training-data cells; the
# bare names are never imported in this notebook.
# NOTE(review): assumes `chunk` lives in semEval next to `chunk_multi` — confirm.
validation_input_batches = [b for b in sem.chunk_multi(validation_causes, validation_targets, batch_size)]
validation_input_batches = [batch_tokenizer(*batch) for batch in validation_input_batches]

# Batch and encode the validation labels the same way.
validation_batch_labels = [b for b in sem.chunk(validation_labels, batch_size)]
validation_batch_labels = [sem.encode_labels(batch) for batch in validation_batch_labels]

# training_loop returns the model it was given; binding the result keeps the
# cell from echoing the full module repr as its output.
model = training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    validation_input_batches,
    validation_batch_labels,
    optimizer,
    model,
)

Training...


NameError: name 'tqdm' is not defined

In [4]:
# For the first ten emotion utterances, print every (target, cause-span) pair
# with the cause span expressed as positions inside its utterance text.
dia_utts = list(dicts.keys())

for dia_utt in dia_utts[:10]:
    entry = dicts[dia_utt]
    # Conversation id and target utterance id encoded in the key, e.g. 'dia1utt3'.
    convo_id, target_utt_id = sem.get_target_conv_utt_ids(dia_utt)

    print('conversation_id: {}'.format(convo_id))

    # Index the conversation by utterance_ID once per entry; it does not depend
    # on the span, so there is no need to rebuild it inside the inner loop.
    conversation = ev.convert_list_to_dict(entry['conversation'], main_key = 'utterance_ID')

    for span in entry['cause_spans']:
        # A span looks like '<utterance id>_<span text>'. Split on the first
        # underscore only, so span text that itself contains '_' stays intact.
        utterance_ID, span_text = span.split('_', 1)

        # Full text of the utterance the span belongs to.
        text = conversation[int(utterance_ID)]['text']
        # Two positions of the span within the utterance text; the second one
        # is reduced by one when it is formatted below.
        beg_end_idx = ev.get_span_position(span_text, text)
        cause_pair = [
            target_utt_id + '_' + entry['emotion'],
            '{}_{}_{}'.format(utterance_ID, beg_end_idx[0], beg_end_idx[1] - 1),
        ]

        print(cause_pair)
    print()

conversation_id: 1
['3_surprise', '1_21_27']
['3_surprise', '3_0_14']

conversation_id: 1
['4_surprise', '1_21_27']
['4_surprise', '3_0_14']
['4_surprise', '4_0_2']

conversation_id: 1
['5_anger', '1_21_27']
['5_anger', '3_0_14']
['5_anger', '4_0_2']

conversation_id: 2
['1_sadness', '1_0_6']

conversation_id: 2
['3_surprise', '3_0_2']

conversation_id: 3
['3_sadness', '3_2_20']

conversation_id: 3
['6_surprise', '4_14_20']
['6_surprise', '5_2_14']
['6_surprise', '6_0_1']

conversation_id: 3
['7_joy', '5_2_14']
['7_joy', '6_0_1']
['7_joy', '7_0_4']

conversation_id: 3
['9_surprise', '5_2_10']
['9_surprise', '8_4_9']
['9_surprise', '9_0_5']

conversation_id: 5
['1_joy', '3_0_10']



In [None]:
# 1. Init our model

# 2. Create the optimizer

# 3. Run the training loop

In [None]:
# Let's work out how the predictions are supposed to look
# [conv_id, emo_utt_id, cau_utt_id, span_start_id, span_end_id, emotion_category]





In [15]:
! pip install -U sentence-transformers --user

Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting torchvision (from sentence-transformers)
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/13/24/23cdf7e7dc33e5c01588c315f8424d31afa9edb05a80168f3d44f7178ff7/torchvision-0.16.1-cp311-cp311-win_amd64.whl.metadata
  Using cached torchvision-0.16.1-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Using cached torchvision-0.16.1-cp311-cp311-win_amd64.whl (1.1 MB)
Installing collected packages: torchvision, sentence-transformers
Successfully installed sentence-transformers-2.2.2 torchvision-0.16.1



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from sentence_transformers import SentenceTransformer

In [17]:
# Load a pretrained sentence-embedding model by name.
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the
# NLIClassifier instance — after running this cell that classifier is no longer
# reachable through `model`. Consider a distinct name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Sentences we want to encode. Example:
sentence = ['This framework generates embeddings for each input sentence']

# Sentences are encoded by calling model.encode()
embedding = model.encode(sentence)

Downloading .gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
No module named 'torch.utils.checkpoint'

In [None]:
# Display the value returned by model.encode(sentence).
embedding

In [22]:
# Working out the required prediction format: give every emotion name an
# integer index equal to its position in the list below.
emotion_idx = {
    emotion: idx
    for idx, emotion in enumerate(
        ['neutral', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
    )
}
emotion_idx

{'neutral': 0,
 'anger': 1,
 'disgust': 2,
 'fear': 3,
 'joy': 4,
 'sadness': 5,
 'surprise': 6}

In [28]:
# Peek at the first five keys of `dicts`, then at the first five raw records.
first_five_ids = list(dicts.keys())[:5]
print(first_five_ids)
data[:5]

['dia1utt3', 'dia1utt4', 'dia1utt5', 'dia2utt1', 'dia2utt3']


[{'emotion_utterance_ID': 'dia1utt3',
  'emotion': 'surprise',
  'conversation': [{'utterance_ID': 1,
    'text': 'Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked .',
    'speaker': 'Chandler',
    'emotion': 'neutral'},
   {'utterance_ID': 2,
    'text': 'Oh , yeah . Had that dream .',
    'speaker': 'All',
    'emotion': 'neutral'},
   {'utterance_ID': 3,
    'text': 'Then I look down , and I realize there is a phone ... there .',
    'speaker': 'Chandler',
    'emotion': 'surprise'},
   {'utterance_ID': 4,
    'text': 'Instead of ... ?',
    'speaker': 'Joey',
    'emotion': 'surprise'},
   {'utterance_ID': 5,
    'text': 'That is right .',
    'speaker': 'Chandler',
    'emotion': 'anger'},
   {'utterance_ID': 6,
    'text': 'Never had that dream .',
    'speaker': 'Joey',
    'emotion': 'neutral'},
   {'utterance_ID': 7,
    'text': 'No .',
    'speaker': 'Phoebe',
    'emotion': 'neutral'},
   {'utterance_ID'