# NLP Standard Project:

- Students: **Matteo Belletti**, **Alessandro Pasi**, **Stricescu Razvan Ciprian**.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

## Data loading and preprocessing:

In [2]:
# open json in project_data_MELD folder
import json
with open('project_data_MELD/MELD_train_efr.json') as f:
    data = json.load(f)
print(f"Number of samples: {len(data)}")
print(f"Example of a sample: {data[0]}")

Number of samples: 4000
Example of a sample: {'episode': 'utterance_0', 'speakers': ['Chandler', 'The Interviewer', 'Chandler', 'The Interviewer', 'Chandler'], 'emotions': ['neutral', 'neutral', 'neutral', 'neutral', 'surprise'], 'utterances': ["also I was the point person on my company's transition from the KL-5 to GR-6 system.", "You must've had your hands full.", 'That I did. That I did.', "So let's talk a little bit about your duties.", 'My duties?  All right.'], 'triggers': [0.0, 0.0, 0.0, 1.0, 0.0]}


In [3]:
# Convert data to pandas dataframe
import pandas as pd
df = pd.DataFrame(data)
# drop episode and speakers columns
df = df.drop(columns=['episode', 'speakers'])
print(f"Dataframe shape: {df.shape}")
print(f"Dataframe columns: {df.columns}")

Dataframe shape: (4000, 3)
Dataframe columns: Index(['emotions', 'utterances', 'triggers'], dtype='object')


Changing nan values to zeros in order to avoid errors.

In [4]:
# Chech for NaN values on triggers column
nan_counter = 0
for row in df["triggers"]:
    for elem in row:
        if elem != 1 and elem != 0:
            nan_counter += 1

print(f"Number of NaN values changed: {nan_counter}")

Number of NaN values changed: 9


In [5]:
df["triggers"] = df["triggers"].apply(lambda x: [0.0 if elem != 1 and elem != 0 else elem for elem in x])

In [6]:
# I split the dataframe as 80% train, 10% test, 10% validation
from sklearn.model_selection import train_test_split
random_state = 42

train, test = train_test_split(df, test_size=0.2, random_state=random_state)
test, val = train_test_split(test, test_size=0.5, random_state=random_state)

len(train), len(test), len(val)

(3200, 400, 400)

In [21]:
train

Unnamed: 0,emotions,utterances,triggers
3994,"[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
423,"[surprise, fear]","[And y'know what, I just realised, in the last...","[1.0, 1.0]"
2991,"[fear, neutral, joy]","[Oh! Hey, Mr. Treeger., : What are you doing?,...","[1.0, 0.0, 1.0]"
1221,"[neutral, neutral, neutral, surprise, sadness,...","[Okay to come in?, Yeah, come on, eat, whateve...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
506,"[neutral, neutral, neutral, joy]","[Ok, bye. Well, Monica's not coming, it's jus...","[1.0, 0.0, 0.0, 0.0]"
...,...,...,...
1130,"[neutral, sadness, fear, anger, neutral]","[Listen, Robert's gonna be here any second so,...","[0.0, 0.0, 0.0, 1.0, 0.0]"
1294,"[neutral, neutral, neutral, neutral, neutral, ...","[Triskaidekaphobia., The fear of, No! No, fear...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
860,"[joy, neutral, neutral]","[Yes! Yes! Please, just give it to me!, Yeah, ...","[0.0, 0.0, 0.0]"
3507,"[neutral, sadness, fear, joy, sadness, disgust...","[Okay., Okay, we have to talk. I'm just gettin...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ..."


In [28]:
# I find the max length of utterances in the train set
max_len = 0
for row in df["triggers"]:
    if len(row) > max_len:
        max_len = len(row)
print(f"Max length of utterances: {max_len}")

Max length of utterances: 24


In [7]:
# I import bert-base-uncased
from transformers import BertTokenizer, BertModel
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# I tokenize the sentences
# TODO do I need input_ids? I think I don't need them
def tokenize_sentences(sentences, tokenizer):
    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sent,                      
                            add_special_tokens = True, 
                            # + 2 for [CLS] and [SEP] tokens
                            max_length = max_len + 2,           
                            padding='max_length',
                            truncation = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',     
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

# I tokenize the sentences
train_sentences = train["utterances"].values
train_labels = train["triggers"].values
train_input_ids, train_attention_masks = tokenize_sentences(train_sentences, tokenizer)
 
test_sentences = test["utterances"].values
test_labels = test["triggers"].values
test_input_ids, test_attention_masks = tokenize_sentences(test_sentences, tokenizer)

val_sentences = val["utterances"].values
val_labels = val["triggers"].values
val_input_ids, val_attention_masks = tokenize_sentences(val_sentences, tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
print(train_input_ids.dtype, train_attention_masks.dtype, train_labels.dtype)

torch.int64 torch.int64 object


In [10]:
print(type(train_labels), type(train_input_ids), type(train_attention_masks))

<class 'numpy.ndarray'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [38]:
# I prepare labels for the dataloaders

def prepare_and_pad_labels(labels):
    # Convert the list of lists into a list of tensors
    labels = [torch.tensor(l) for l in labels]
    # Pad the list of tensors to max length
    labels = torch.nn.utils.rnn.pad_sequence(labels, 
                                             batch_first=True, 
                                             padding_value=0.0)
    return labels

prepare_and_pad_labels(train_labels)[3], train_labels[3], train['triggers'][3]

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.]),
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])

In [39]:
# Create the dataloaders
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataset = TensorDataset(train_input_ids, train_attention_masks, prepare_and_pad_labels(train_labels))
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, prepare_and_pad_labels(test_labels))
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, prepare_and_pad_labels(val_labels))
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [42]:
# I import the model
from transformers import BertForSequenceClassification, AdamW
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    output_attentions = False,
    output_hidden_states = False,
)

# I create the optimizer
optimizer = AdamW(model.parameters(),
                    lr = 2e-5,
                    eps = 1e-8
                    )

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# we freeze the BERT embedding layer weights and fine-tune the classifier heads on top
for param in model.bert.parameters():
    param.requires_grad = False

# I create the loss function
from torch.nn import BCEWithLogitsLoss
loss_function = BCEWithLogitsLoss()
 
# I create the training loop
from tqdm import tqdm
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, train_dataloader, val_dataloader=val_dataloader, epochs=4, evaluation=False):
    model.to(device)
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        print('-' * 10)
        running_loss = 0.0
        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        print(f"Training loss: {epoch_loss}")
        if evaluation == True:
            val_loss, val_f1 = evaluate(model, val_dataloader)
            print(f"Validation loss: {val_loss}")
            print(f"Validation F1: {val_f1}")
    print("Training complete!")

In [18]:
# # I create the scheduler
# from transformers import get_linear_schedule_with_warmup
# epochs = 4
# total_steps = len(train_dataloader) * epochs
# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)

In [19]:
# # I create the accuracy function
# import numpy as np
# from sklearn.metrics import f1_score

# def flat_accuracy(preds, labels):
#     pred_flat = np.argmax(preds, axis=2).flatten()
#     labels_flat = labels.flatten()
#     return np.sum(pred_flat == labels_flat) / len(labels_flat)

# # I create the f1_score function
# def flat_f1_score(preds, labels):
#     pred_flat = np.argmax(preds, axis=2).flatten()
#     labels_flat = labels.flatten()
#     return f1_score(labels_flat, pred_flat)

In [14]:
# # I create the training loop
# import random
# import numpy as np
# seed_val = 42
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# print(device)

cpu


In [20]:
# for step, batch in enumerate(train_dataloader):
#     if step % 40 == 0 and not step == 0:
#         print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
#     b_input_ids = batch[0].to(device)
#     b_input_mask = batch[1].to(device)
#     b_labels = batch[2].to(device)
#     print(b_input_ids.shape, b_input_mask.shape, b_labels.shape)

torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([32, 24])
torch.Size([32, 64]) torch.Size([32, 64]) torch.Size([3

In [15]:
# import time
# import datetime

# def format_time(elapsed):
#     elapsed_rounded = int(round((elapsed)))
#     return str(datetime.timedelta(seconds=elapsed_rounded))

# for epoch_i in range(0, epochs):
#     print("")
#     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
#     print('Training...')
#     total_train_loss = 0
#     model.train()
#     for step, batch in enumerate(train_dataloader):
#         if step % 40 == 0 and not step == 0:
#             print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)
#         model.zero_grad()
#         outputs = model(b_input_ids, 
#                         token_type_ids=None, 
#                         attention_mask=b_input_mask, 
#                         labels=b_labels)
#         loss = outputs[0]
#         total_train_loss += loss.item()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         scheduler.step()
#     avg_train_loss = total_train_loss / len(train_dataloader)            
#     print("")
#     print("  Average training loss: {0:.2f}".format(avg_train_loss))
#     print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
#     print("")
#     print("Running Validation...")
#     t0 = time.time()
#     model.eval()
#     total_eval_accuracy = 0
#     total_eval_f1 = 0
#     nb_eval_steps = 0
#     for batch in val_dataloader:
#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)
#         with torch.no_grad():        
#             outputs = model(b_input_ids, 
#                             token_type_ids=None, 
#                             attention_mask=b_input_mask)
#         logits = outputs[0]
#         logits = logits.detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()
#         total_eval_accuracy += flat_accuracy(logits, label_ids)
#         total_eval_f1 += flat_f1_score(logits, label_ids)
#         nb_eval_steps += 1
#     avg_val_accuracy = total_eval_accuracy / nb_eval_steps
#     print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
#     avg_val_f1 = total_eval_f1 / nb_eval_steps
#     print("  F1-Score: {0:.2f}".format(avg_val_f1))


Training...


ValueError: Target size (torch.Size([32, 24])) must be the same as input size (torch.Size([32, 2]))