# Import Requirements

In [1]:
import os
import random
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer

from utils import make_batch_roberta
from model import ERC_model
from ERC_dataset import MELD_loader



Downloading vocab.json: 100%|██████████| 878k/878k [00:01<00:00, 766kB/s]
Downloading merges.txt: 100%|██████████| 446k/446k [00:00<00:00, 553kB/s]
Downloading config.json: 100%|██████████| 482/482 [00:00<00:00, 482kB/s]
Downloading vocab.json: 100%|██████████| 0.99M/0.99M [00:01<00:00, 838kB/s]
Downloading merges.txt: 100%|██████████| 446k/446k [00:00<00:00, 593kB/s]
Downloading config.json: 100%|██████████| 666/666 [00:00<00:00, 335kB/s]


# Fix Seed

In [2]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed
    # of the already-running interpreter cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it
    # must be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False
seed_everything(2022)  # fix the seed

# CoMPM Prediction

In [None]:
def compm_inference():
    """Run the CoMPM model over the MELD test file and collect its predictions.

    Builds a batch-size-1 loader over './dataset/MELD_test.txt', restores the
    weights stored in './checkpoint/model.bin' and classifies every sample on
    the GPU.

    Returns:
        tuple[list[str], list[float]]: the predicted emotion name of every
        sample and the softmax probability of that prediction, in loader
        order.
    """
    # Class index -> emotion name.
    # NOTE(review): assumed to follow the order of test_dataset.labelList — confirm.
    class_labels = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

    test_dataset = MELD_loader('./dataset/MELD_test.txt', 'emotion')
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4, collate_fn=make_batch_roberta)

    clsNum = len(test_dataset.labelList)
    model = ERC_model('roberta-large', clsNum, False, False, 'pretrained')
    # Restore onto the CPU first so the checkpoint loads regardless of the
    # device it was saved from; the model is moved to the GPU right after.
    model.load_state_dict(torch.load('./checkpoint/model.bin', map_location='cpu'))
    model = model.cuda()
    model.eval()

    pred_emotion = []
    prob_outputs = []
    with torch.no_grad():
        for data in tqdm(test_dataloader):
            # The labels in the batch are not needed for inference, so they
            # are neither kept nor copied to the GPU.
            batch_input_tokens, _, batch_speaker_tokens = data
            batch_input_tokens = batch_input_tokens.cuda()

            pred_logits = model(batch_input_tokens, batch_speaker_tokens) # (1, clsNum)
            pred_label = pred_logits.argmax(1).item()
            pred_emotion.append(class_labels[pred_label])

            # Highest softmax value, i.e. the probability of the predicted class.
            probs = torch.softmax(pred_logits.detach().cpu(), dim=-1)
            prob_outputs.append(probs.max().item())

    return pred_emotion, prob_outputs

In [None]:
pred_emotion, prob_outputs = compm_inference()

# Index to Emotion Dictionary

In [3]:
# Emotion names listed in class-index order; id2emotion maps each index to its name.
emotions = [
    "neutral",
    "joy",
    "surprise",
    "anger",
    "sadness",
    "disgust",
    "fear",
]
id2emotion = dict(enumerate(emotions))

# Load EmoBERTa Large and Base Models

In [4]:
# Large checkpoint: tokenizer and classifier, put in eval mode and moved to the GPU.
tokenizer_large = AutoTokenizer.from_pretrained("tae898/emoberta-large")
model_large = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-large").eval()
model_large = model_large.to('cuda')

# Base checkpoint, prepared the same way.
tokenizer_base = AutoTokenizer.from_pretrained("tae898/emoberta-base")
model_base = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-base").eval()
model_base.to('cuda')  # last expression of the cell, so the model repr is displayed

Downloading tokenizer_config.json: 100%|██████████| 408/408 [00:00<00:00, 409kB/s]
Downloading tokenizer.json: 100%|██████████| 2.01M/2.01M [00:01<00:00, 1.18MB/s]
Downloading config.json: 100%|██████████| 1.02k/1.02k [00:00<00:00, 521kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.32G/1.32G [00:57<00:00, 24.8MB/s]
Downloading tokenizer_config.json: 100%|██████████| 407/407 [00:00<00:00, 201kB/s]
Downloading tokenizer.json: 100%|██████████| 2.01M/2.01M [00:01<00:00, 1.32MB/s]
Downloading config.json: 100%|██████████| 1.00k/1.00k [00:00<00:00, 1.03MB/s]
Downloading pytorch_model.bin: 100%|██████████| 476M/476M [00:22<00:00, 21.8MB/s]


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

# Custom Dataset

In [5]:
class CustomDataset(Dataset):
    """Dataset that yields the raw utterance text of each row of a DataFrame."""

    def __init__(self, data, tokenizers, mode = "train"):
        """Store the frame and the accompanying settings.

        Args:
            data: table with at least the 'Utterance' and 'Speaker' columns.
            tokenizers: tokenizer kept on the instance; not used by
                ``__getitem__``.
            mode: free-form tag stored on the instance (defaults to "train").
        """
        self.dataset = data
        self.tokenizer = tokenizers
        self.speaker = data['Speaker']
        self.mode = mode

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the utterance text stored at position ``idx``."""
        # Positional lookup: label-based `[idx]` fails or returns the wrong
        # row when the frame's index is not 0..n-1 (e.g. after filtering).
        text = self.dataset['Utterance'].iloc[idx]
        return text

# Load Test Dataset

In [6]:
# Read the competition test file and wrap it so that test[i] yields the i-th utterance.
test = CustomDataset(
    pd.read_csv('./open/test.csv'),
    tokenizer_large,
    mode="test",
)

# EmoBERTa Prediction & Ensemble with CoMPM

In [7]:
# Below this probability a model's top prediction is treated as unreliable.
LOW_CONFIDENCE = 0.5


def _predict_emotion(text, tokenizer, model):
    """Classify one utterance and return its top emotion with its probability.

    Args:
        text: utterance handed to ``tokenizer``.
        tokenizer: tokenizer paired with ``model``.
        model: sequence-classification model whose output exposes "logits".

    Returns:
        tuple[str, float]: the emotion name (looked up in ``id2emotion``) and
        its softmax probability.
    """
    tokens = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**tokens)["logits"]
    probs = torch.softmax(logits.cpu(), dim=1).squeeze(0).tolist()
    # On ties, max() keeps the lowest index.
    best_idx = max(range(len(probs)), key=probs.__getitem__)
    return id2emotion[best_idx], probs[best_idx]


# Ensemble rule per utterance:
#   1. both EmoBERTa models agree            -> take their shared label;
#   2. they disagree and both are unsure     -> fall back to the CoMPM label;
#   3. otherwise                             -> take the more confident model.
# `pred_emotion` comes from the compm_inference() cell, which must have run.
# NOTE(review): assumes pred_emotion[i] refers to the same utterance as test[i] — confirm.
preds = []
for i in tqdm(range(len(test))):
    text = test[i]
    pred_large = _predict_emotion(text, tokenizer_large, model_large)
    pred_base = _predict_emotion(text, tokenizer_base, model_base)

    if pred_large[0] == pred_base[0]:
        preds.append(pred_large[0])
    elif pred_large[1] < LOW_CONFIDENCE and pred_base[1] < LOW_CONFIDENCE:
        preds.append(pred_emotion[i])
    elif pred_large[1] >= pred_base[1]:
        preds.append(pred_large[0])
    else:
        preds.append(pred_base[0])

100%|██████████| 2610/2610 [01:44<00:00, 25.06it/s]


# Submit

In [None]:
# Put the ensemble predictions in the sample submission's Target column and save the result.
submit = pd.read_csv('./open/sample_submission.csv').assign(Target=preds)
submit.to_csv('./open/submit_final.csv', index=False)
