# **Preparation**

In [None]:
# Mount the user's Google Drive at /content/gdrive (Colab-only API).
from google.colab import drive 
drive.mount('/content/gdrive') 

Mounted at /content/gdrive


In [None]:
# Dataset folder inside the mounted Drive. The path is anchored at the
# '/content/gdrive' mount point so file access keeps working even if the
# notebook's working directory changes (e.g. after a `%cd`).
DATA_PATH = '/content/gdrive/My Drive/Colab Notebooks/2020_NLP/friends/'
import sys
# Also put the folder on the module search path.
sys.path.append(DATA_PATH)

# **Tutorials**

##### **Settings**

In [None]:
# Install the `transformers` package (source of BertModel / BertTokenizer used below).
# NOTE(review): no version is pinned, so a re-run may pull a different release — consider pinning.
!pip install transformers --quiet

[K     |████████████████████████████████| 1.5MB 12.1MB/s 
[K     |████████████████████████████████| 2.9MB 50.0MB/s 
[K     |████████████████████████████████| 890kB 47.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import pandas as pd
from transformers import BertModel, BertTokenizer

In [None]:
# Name of the pretrained checkpoint loaded via `from_pretrained` throughout the notebook.
pretrained_weights = 'bert-large-uncased'
# Module-level tokenizer instance for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




# **Emotion Recognition**

##### **Dataset**

In [None]:
import json

# One container per split. The three lists inside a split are filled in
# lock-step, so entry k of each list describes the same utterance.
data = {'train': {'speaker': [], 'utterance': [], 'emotion': []},
        'dev': {'speaker': [], 'utterance': [], 'emotion': []},
        'test': {'speaker': [], 'utterance': [], 'emotion': []}}

for dtype in ['train', 'dev', 'test']:
  # `with` guarantees the file handle is closed once parsing is done;
  # the encoding is explicit because JSON text is UTF-8 regardless of locale.
  with open(DATA_PATH + 'friends_' + dtype + '.json', encoding='utf-8') as json_file:
    dialogs = json.load(json_file)
  # Each file holds a list of dialogs; each dialog is a list of utterance records.
  for dialog in dialogs:
    for line in dialog:
      data[dtype]['speaker'].append(line['speaker'])
      data[dtype]['utterance'].append(line['utterance'])
      data[dtype]['emotion'].append(line['emotion'])

In [None]:
# Read the CSV of utterances to be labelled, then print its (rows, columns)
# shape followed by the first two rows as a quick sanity check.
test_data = pd.read_csv(DATA_PATH + "en_data.csv", sep=',')
print(test_data.shape, test_data.head(2), sep='\n')

(1623, 5)
   id  i_dialog  i_utterance speaker                      utterance
0   0         0            0  Phoebe  Alright, whadyou do with him?
1   1         0            1  Monica              Oh! You're awake!


In [None]:
# Emotion label -> class index, built from the labels seen in the training split.
# The labels are sorted before numbering: iterating a bare set of strings can
# yield a different order in every interpreter session (string hashes are
# randomized), which would silently change what each classifier output index
# means between runs and break any reuse of trained weights.
e2i_dict = {emo: i for i, emo in enumerate(sorted(set(data['train']['emotion'])))}
# Inverse map: class index -> emotion label, used to decode predictions.
i2e_dict = {i: e for e, i in e2i_dict.items()}

##### **Model**

In [None]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class Model(nn.Module):
  """BERT encoder followed by a linear emotion classifier on the first token.

  Reads two notebook globals at construction time: `pretrained_weights`
  (checkpoint name) and `e2i_dict` (its size is the number of output classes).
  """

  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    # Take the classifier's input width from the checkpoint config instead of
    # hard-coding 1024, so changing `pretrained_weights` to a different BERT
    # size does not break the layer shapes.
    self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, len(e2i_dict))

  def forward(self, utterance):
    """Compute emotion logits for a single utterance.

    Args:
      utterance: one raw text string.

    Returns:
      Tensor of shape (1, len(e2i_dict)) holding unnormalized class scores.
    """
    tokens = self.bert_tokenizer.tokenize(utterance)
    # Reserve two positions for [CLS]/[SEP]; longer inputs are truncated because
    # the encoder has no position embeddings beyond its configured maximum.
    max_tokens = self.bert_model.config.max_position_embeddings - 2
    tokens = ['[CLS]'] + tokens[:max_tokens] + ['[SEP]']
    # Use the tokenizer owned by this module (not a notebook-level global) so
    # the vocabulary always matches the encoder loaded in __init__.
    ids = [self.bert_tokenizer.convert_tokens_to_ids(tokens)]
    # Create the input on whatever device the module's weights live on.
    device = next(self.parameters()).device
    input_tensor = torch.tensor(ids, device=device)

    hidden_tensor = self.bert_model(input_tensor)[0]  # first output: per-token hidden states
    hidden_tensor = hidden_tensor[:, 0, :]  # keep only position 0, the [CLS] token
    logit = self.linear(hidden_tensor)
    return logit

##### **Evaluation Metrics**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(true_list, pred_list):
  """Print per-class precision and recall, then the micro-averaged F1.

  Args:
    true_list: gold label for each sample.
    pred_list: predicted label for each sample, aligned with `true_list`.
  """
  class_precision = precision_score(true_list, pred_list, average=None)
  class_recall = recall_score(true_list, pred_list, average=None)
  overall_f1 = f1_score(true_list, pred_list, average='micro')

  # Per-class scores are shown as lists of 5-decimal strings, one entry per class.
  formatted_precision = ['%.5f' % score for score in class_precision]
  formatted_recall = ['%.5f' % score for score in class_recall]

  print('precision:\t', formatted_precision)
  print('recall:\t\t', formatted_recall)
  print('micro_f1: %.5f' % overall_f1)

##### **Hyper-parameters**

In [None]:
# Step size handed to the Adam optimizer in the training cell.
learning_rate = 1e-5
# Number of passes over the training split.
n_epoch = 1

##### **Training**

In [None]:
import os
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from tqdm.notebook import tqdm

model = Model()
model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)

  # Training pass: one utterance per optimizer step (effective batch size 1).
  model.train()
  for i_batch in tqdm(range(len(data['train']['utterance']))):
    logit = model(data['train']['utterance'][i_batch])
    # Build the target on the same device as the logits.
    target = torch.tensor([e2i_dict[data['train']['emotion'][i_batch]]], device=logit.device)
    loss = criterion(logit, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  # Validation pass on the dev split.
  model.eval()
  pred_list, true_list = [], []
  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time.
  with torch.no_grad():
    for i_batch in tqdm(range(len(data['dev']['utterance']))):
      logit = model(data['dev']['utterance'][i_batch])
      _, max_idx = torch.max(logit, dim=-1)
      pred_list += max_idx.tolist()
      true_list += [e2i_dict[data['dev']['emotion'][i_batch]]]
  # evaluate() expects (true, pred); passing them the other way round would
  # swap the printed precision and recall values.
  evaluate(true_list, pred_list)

i_epoch: 0


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.03271', '0.00000', '0.07317', '0.37097', '0.93686', '0.47682', '0.00000', '0.02353']
recall:		 ['0.24138', '0.00000', '0.81818', '0.37097', '0.48729', '0.55385', '0.00000', '1.00000']
micro_f1: 0.48642


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Predict an emotion for every utterance in the unlabeled CSV and collect
# [id, predicted_emotion] rows for the submission file.
final_result = []
model.eval()
# Inference only: disable autograd so no computation graph is kept per sample.
with torch.no_grad():
  for utterance_id, utterance in tqdm(zip(test_data['id'], test_data['utterance']),
                                      total=len(test_data)):
    logit = model(utterance)
    _, max_idx = torch.max(logit, dim=-1)
    # The batch holds one sample, so .item() yields the predicted class index.
    final_result.append([utterance_id, i2e_dict[max_idx.item()]])

final_result[:10]

In [None]:
# Turn the [id, emotion] rows into a two-column frame and write it, without the
# index column, to 'sample_eng.csv' under DATA_PATH.
rdf = pd.DataFrame(final_result, columns =['Id', 'Expected'])
rdf.to_csv(DATA_PATH + 'sample_eng.csv', index=False)