<a href="https://colab.research.google.com/github/DongHyunLee1224/lab/blob/main/FriendsEmotionAnalysis/Friends_emotion_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparation**

In [27]:
from google.colab import drive 
drive.mount('/content/gdrive') 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [28]:
DATA_PATH = 'gdrive/MyDrive/EmotionAnalysis/Data/'
import sys
sys.path.append(DATA_PATH)

# **Tutorials**

##### **Settings**

In [29]:
!pip install transformers --quiet # package installer for python

In [30]:
import torch
import pandas as pd
from transformers import BertModel, BertTokenizer

In [31]:
pretrained_weights = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

# **Emotion Recognition**

##### **Dataset**

In [32]:
import json

data = {'train': {'speaker': [], 'utterance': [], 'emotion': []},
        'dev': {'speaker': [], 'utterance': [], 'emotion': []},
        'test': {'speaker': [], 'utterance': [], 'emotion': []}}

for dtype in ['train', 'dev', 'test']:
  for dialog in json.loads(open(DATA_PATH + 'friends_' + dtype + '.json').read()):
    for line in dialog:
      data[dtype]['speaker'].append(line['speaker'])
      data[dtype]['utterance'].append(line['utterance'])
      data[dtype]['emotion'].append(line['emotion'])

In [33]:
verification_data = pd.read_csv(DATA_PATH + "en_data.csv", sep=',')
print(verification_data.shape)
print(verification_data[:5])

(1623, 5)
   id  ...                                          utterance
0   0  ...                      Alright, whadyou do with him?
1   1  ...                                  Oh! You're awake!
2   2  ...  Then you gotta come clean with Ma! This is not...
3   3  ...                                  Yeah, but this is
4   4  ...          I don't wanna hear it! Now go to my room!

[5 rows x 5 columns]


In [34]:
e2i_dict = dict((emo, i) for i, emo in enumerate(set(data['train']['emotion'])))
i2e_dict = {i: e for e, i in e2i_dict.items()}

In [35]:
print(e2i_dict)

{'neutral': 0, 'surprise': 1, 'non-neutral': 2, 'joy': 3, 'disgust': 4, 'fear': 5, 'sadness': 6, 'anger': 7}


In [36]:
print(i2e_dict)

{0: 'neutral', 1: 'surprise', 2: 'non-neutral', 3: 'joy', 4: 'disgust', 5: 'fear', 6: 'sadness', 7: 'anger'}


In [37]:
import collections
eMotion = collections.Counter()
eMotion.update(data['train']['emotion']) 
print(eMotion)
print(len(eMotion.keys()) ) # emotion : 8개 

eMotion.update(data['test']['emotion']) 
eMotion.update(data['dev']['emotion']) 
print(eMotion) 
# dev : Counter({'neutral': 491, 'non-neutral': 214, 'surprise': 151, 'joy': 123, 'anger': 85, 'sadness': 62, 'fear': 29, 'disgust': 23})
# train Counter({'neutral': 4752, 'non-neutral': 2017, 'joy': 1283, 'surprise': 1220, 'anger': 513, 'sadness': 351, 'disgust': 240, 'fear': 185})
# test  Counter({'neutral': 1287, 'non-neutral': 541, 'joy': 304, 'surprise': 286, 'anger': 161, 'sadness': 85, 'disgust': 68, 'fear': 32})
# sum : Counter({'neutral': 6530, 'non-neutral': 2772, 'joy': 1710, 'surprise': 1657, 'anger': 759, 'sadness': 498, 'disgust': 331, 'fear': 246})

print(len(eMotion.keys()) ) # emtion : 8개 

Counter({'neutral': 4752, 'non-neutral': 2017, 'joy': 1283, 'surprise': 1220, 'anger': 513, 'sadness': 351, 'disgust': 240, 'fear': 185})
8
Counter({'neutral': 6530, 'non-neutral': 2772, 'joy': 1710, 'surprise': 1657, 'anger': 759, 'sadness': 498, 'disgust': 331, 'fear': 246})
8


In [38]:
uTterance = collections.Counter()
uTterance.update(data['train']['utterance']) 
print(len(uTterance.keys()) ) # utterance : 9,291개  

uTterance.update(data['test']['utterance']) 
uTterance.update(data['dev']['utterance']) 
print(uTterance) 

print(len(uTterance.keys()) ) # utterance : 12,645개  

9291
12645


In [39]:
sPeaker = collections.Counter()
sPeaker.update(data['train']['speaker']) 
print(len(sPeaker.keys()) )  # speaker : 264명

sPeaker.update(data['test']['speaker']) 
sPeaker.update(data['dev']['speaker']) 
print(sPeaker) 

print(len(sPeaker.keys()) ) # speaker : 309명

264
Counter({'Ross': 2166, 'Joey': 2163, 'Rachel': 2095, 'Phoebe': 1926, 'Monica': 1888, 'Chandler': 1861, 'Janice': 97, 'Carol': 63, 'Emily': 61, 'Tag': 60, 'All': 55, 'Mona': 48, 'Doug': 46, 'Frank': 43, 'Pete': 40, 'Joanna': 39, 'Mark': 38, 'Susan': 38, 'Richard': 33, 'Paul': 32, 'Woman': 30, 'David': 29, 'Gunther': 29, 'Mr. Treeger': 28, 'Danny': 28, 'Mr. Geller': 28, 'Phoebe Sr': 26, 'Mrs. Geller': 25, 'Julie': 25, 'Earl': 25, 'Elizabeth': 24, 'Director': 21, 'Eric': 21, 'Man': 21, 'Mike': 21, 'Dr. Green': 20, 'Gary': 19, 'Barry': 19, 'Guy': 19, 'Lydia': 18, 'Chip': 18, 'Kate': 16, 'Dina': 16, 'Nurse': 15, 'Jill': 15, 'Chloe': 15, 'Dana': 15, 'Charlie': 14, 'Alice': 14, 'Bonnie': 14, 'Leslie': 14, 'Janine': 14, 'Steve': 14, 'The Casting Director': 13, 'Mr. Tribbiani': 13, 'Dr. Long': 13, 'Joshua': 13, 'Kim': 13, 'Cassie': 13, 'Kristen': 13, 'Tour Guide': 13, 'Cliff': 12, 'Policeman': 12, 'Julio': 12, 'Katie': 12, 'Kathy': 12, 'Ben': 12, 'Russell': 12, 'Bob': 11, 'Duncan': 11, 'Mis

##### **Model**

In [40]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    self.linear = torch.nn.Linear(1024, len(e2i_dict))
    self.pre_utterance = ''
    self.b_pre_utterance = ''

  def forward(self, utterance):
    previous_tokens = self.bert_tokenizer.tokenize(self.pre_utterance)
    before_previous_tokens = self.bert_tokenizer.tokenize(self.b_pre_utterance)
    tokens = self.bert_tokenizer.tokenize(utterance)
    tokens = ['[CLS]'] + tokens + ['[SEP]'] + previous_tokens + ['[SEP]'] + before_previous_tokens + ['[SEP]']# (len)
    ids = [tokenizer.convert_tokens_to_ids(tokens)] # (bat=1, len)
    input_tensor = torch.tensor(ids).cuda()

    hidden_tensor = self.bert_model(input_tensor)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)

    self.b_pre_utterance = self.pre_utterance
    self.pre_utterance = utterance

    logit = self.linear(hidden_tensor)
    return logit

##### **Evaluation Metrics**

In [41]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(true_list, pred_list):
  precision = precision_score(true_list, pred_list, average=None)
  recall = recall_score(true_list, pred_list, average=None)
  micro_f1 = f1_score(true_list, pred_list, average='micro')
  print('precision:\t', ['%.4f' % v for v in precision])
  print('recall:\t\t', ['%.4f' % v for v in recall])
  print('micro_f1: %.6f' % micro_f1)

##### **Hyper-parameters**

In [42]:
pretrained_weights = 'bert-large-uncased'
learning_rate = 1e-6        
n_epoch = 3  

In [43]:
print(range(len(data['train']['speaker'])))
print(range(len(data['test']['speaker'])))
print(range(len(data['dev']['speaker'])))

range(0, 10561)
range(0, 2764)
range(0, 1178)


##### **Training**

In [44]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from tqdm import tqdm_notebook

model = Model()
model.cuda()
weights = [1/1220, 1/185, 1/1283, 1/240, 1/4752, 1/513, 1/2017, 1/351]
class_weights = torch.FloatTensor(weights).cuda()
criterion = torch.nn.CrossEntropyLoss(weight=class_weights) 
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)
  model.train()

  for i_batch in tqdm_notebook(range(len(data['train']['utterance']))):
    logit = model('[' + data['train']['speaker'][i_batch] + '] ' + data['train']['utterance'][i_batch])
    target = torch.tensor([e2i_dict[data['train']['emotion'][i_batch]]]).cuda()
    loss = criterion(logit, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  
  
  model.eval()
  pred_list, true_list = [], []
  for i_batch_d in range(len(data['dev']['utterance'])):
    logit = model('[' + data['dev']['speaker'][i_batch_d] + '] ' + data['dev']['utterance'][i_batch_d])
    _, max_idx = torch.max(logit, dim=-1)
    pred_list += max_idx.tolist()
    true_list += [e2i_dict[data['dev']['emotion'][i_batch_d]]]
  evaluate(true_list, pred_list) # print results

i_epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))


precision:	 ['0.6515', '0.5506', '0.3204', '0.5030', '0.0000', '0.0000', '0.7500', '0.3621']
recall:		 ['0.8758', '0.6490', '0.1542', '0.6829', '0.0000', '0.0000', '0.1452', '0.2471']
micro_f1: 0.573005
i_epoch: 1


  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))


precision:	 ['0.7107', '0.6074', '0.3175', '0.5471', '0.3333', '0.0000', '0.6053', '0.4375']
recall:		 ['0.8656', '0.6556', '0.1869', '0.7561', '0.0435', '0.0000', '0.3710', '0.4118']
micro_f1: 0.607810
i_epoch: 2


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))


precision:	 ['0.7216', '0.6149', '0.2609', '0.6027', '0.4615', '0.0000', '0.5556', '0.4409']
recall:		 ['0.8554', '0.6556', '0.1682', '0.7154', '0.2609', '0.0000', '0.4032', '0.4824']
micro_f1: 0.606961


In [48]:
model.eval()
pred_list, true_list = [], []
for i_batch_d in tqdm_notebook(range(len(data['test']['utterance']))):
  logit = model('[' + data['test']['speaker'][i_batch_d] + '] ' + data['test']['utterance'][i_batch_d])
  _, max_idx = torch.max(logit, dim=-1)
  pred_list += max_idx.tolist()
  true_list += [e2i_dict[data['test']['emotion'][i_batch_d]]]
evaluate(true_list, pred_list) # print results

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2764.0), HTML(value='')))


precision:	 ['0.7724', '0.5556', '0.4437', '0.5585', '0.3824', '0.0000', '0.4409', '0.4101']
recall:		 ['0.8361', '0.6119', '0.2403', '0.7697', '0.1912', '0.0000', '0.4824', '0.5528']
micro_f1: 0.636035


  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
verification_result = []
model.eval()
for i_batch in tqdm_notebook(range(len(verification_data['utterance']))):
  id = verification_data['id'][i_batch]
  logit = model(verification_data['speaker'][i_batch] + '] ' + verification_data['utterance'][i_batch])
  _, max_idx = torch.max(logit, dim=-1)
  max_idx = int(max_idx.cpu().numpy())
  
  verification_result.append([id , i2e_dict[max_idx]])

verification_result[:10]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=1623.0), HTML(value='')))




[[0, 'neutral'],
 [1, 'surprise'],
 [2, 'anger'],
 [3, 'neutral'],
 [4, 'anger'],
 [5, 'non-neutral'],
 [6, 'neutral'],
 [7, 'neutral'],
 [8, 'surprise'],
 [9, 'neutral']]

In [46]:
rdf = pd.DataFrame(verification_result, columns =['Id', 'Expected'])
rdf.to_csv(DATA_PATH + '2019516018_LEEDONGHYUN_ENG.csv', index=False)