In [None]:
# Mount Google Drive inside the Colab VM at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ln -s /content/drive/MyDrive/ /gdrive

In [None]:
!mkdir data
!cp /gdrive/ETRI/result_0407.csv data/

In [None]:
# Copy the pickled train/dev/test splits and the KEMDy19 archive from Drive
# into the current working directory.
!cp /gdrive/ETRI/path_data/path_train.pkl ./
!cp /gdrive/ETRI/path_data/path_dev.pkl ./
!cp /gdrive/ETRI/path_data/path_test.pkl ./
!cp /gdrive/ETRI/KEMDy19.zip ./

In [None]:
!unzip KEMDy19.zip

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M011.wav  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M012.csv  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M012.txt  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M012.wav  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M013.csv  
 extracting: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M013.txt  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M013.wav  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M014.csv  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M014.txt  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M014.wav  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M015.csv  
  inflating: KEMDy19/wav/Session17/Sess17_script04/Sess17_script04_M015.txt  
  inflating: K

In [None]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 16.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 69.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 80.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 k

In [None]:
import os
# CUDA_LAUNCH_BLOCKING=1 was set here and has been removed: it forces every
# CUDA kernel launch to run synchronously, which is a debugging aid and slows
# training. Set it (before CUDA is initialised) only while chasing a CUDA error.
import sys
import warnings
import re

import numpy as np
import pandas as pd

import datasets
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader

import seaborn as sns

warnings.filterwarnings(action='ignore')

In [None]:
# Hugging Face model id used for both the tokenizer and the classifier.
PRETRAINED_ROBERTA = 'klue/roberta-large'
# Every utterance is padded/truncated to exactly this many tokens.
# NOTE(review): 249 is unexplained here — presumably derived from the longest
# tokenized utterance in the corpus; confirm.
TEXT_MAX_LENGTH = 249
# Size of the classification head (number of distinct emotion labels).
NUM_LABELS = 7

In [None]:
class MultiModalDataset(Dataset):
    """Emotion-classification dataset backed by a pickled pandas DataFrame.

    The pickle must contain a ``text`` column (utterance transcript) and an
    ``emotion`` column (label). Rows containing any missing value are dropped.
    Each item is the tokenizer output for one normalised utterance, with the
    label added under the key ``labels``.

    Despite the class name, only the text column is used.
    """

    def __init__(self,
                 data_path: str,
                 pretrained_roberta: str,
                 text_max_length: int,
                 num_labels: int):
        """
        Args:
            data_path: Path to the pickled DataFrame for this split.
            pretrained_roberta: Hugging Face model id / path for the tokenizer.
            text_max_length: Fixed token length every utterance is padded or
                truncated to.
            num_labels: Number of classes; only used if ``text_model`` is
                accessed.
        """
        super(MultiModalDataset, self).__init__()
        self.data_path = data_path
        self.pretrained_roberta = pretrained_roberta
        self.num_labels = num_labels
        self.text_tokenizer = AutoTokenizer.from_pretrained(pretrained_roberta, use_fast=True)
        # The dataset never runs the classifier itself, so the model is no
        # longer loaded eagerly (that kept one full copy of the network in
        # memory per split). It is built on first access of `text_model`.
        self._text_model = None
        self.text_max_length = text_max_length
        self.text, self.labels = self.load_data(self.data_path)

    @property
    def text_model(self):
        """Sequence-classification model, loaded lazily on first access."""
        model = getattr(self, "_text_model", None)
        if model is None:
            model = AutoModelForSequenceClassification.from_pretrained(
                self.pretrained_roberta,
                num_labels=self.num_labels,
                output_hidden_states=False)
            self._text_model = model
        return model

    def __len__(self):
        """Number of examples remaining after dropping incomplete rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized utterance at ``idx`` plus its label tensor."""
        normalized_text = self.normalize_string(self.text[idx])
        text_input = self.tokenize_text(normalized_text)
        # The tokenizer already returns tensors with a leading batch axis of
        # size 1; drop only that axis. Re-wrapping with torch.tensor() would
        # copy the data, and a bare squeeze() would also drop any other
        # size-1 axis.
        text_data = {k: v.squeeze(0) for k, v in text_input.items()}
        text_data['labels'] = torch.tensor(self.labels[idx])
        return text_data

    def tokenize_text(self, text):
        """Tokenize one string, padded/truncated to ``text_max_length``."""
        tokenized_text = self.text_tokenizer(text,
                                             max_length=self.text_max_length,
                                             padding="max_length",
                                             truncation=True,
                                             return_tensors="pt")
        return tokenized_text

    @staticmethod
    def normalize_string(text):
        """Normalise whitespace and strip unsupported characters.

        Every whitespace character becomes a plain space, then each run of
        characters other than ASCII letters, Hangul syllables (가-힣), Hangul
        consonant jamo (ㄱ-ㅎ), digits and ``. ! ?`` is collapsed into a single
        space. Non-string input is converted with ``str()`` first.
        """
        text = re.sub(r"[\s]", r" ", str(text))
        text = re.sub(r"[^a-zA-Z가-힣ㄱ-ㅎ0-9.!?]+", r" ", text)
        return text

    @staticmethod
    def load_data(data_path: str):
        """Load the pickle and return ``(texts, labels)`` as parallel lists.

        Rows with a missing value in any column are dropped before the
        ``text`` and ``emotion`` columns are extracted.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        data = pd.read_pickle(data_path)
        data = data.dropna()
        text = list(data['text'])
        labels = list(data['emotion'])
        return text, labels

In [None]:
# Build the train / dev / test datasets with identical settings;
# only the pickle path differs between the three splits.
data_train, data_val, data_test = (
    MultiModalDataset(data_path=split_path,
                      pretrained_roberta=PRETRAINED_ROBERTA,
                      text_max_length=TEXT_MAX_LENGTH,
                      num_labels=NUM_LABELS)
    for split_path in ('path_train.pkl', 'path_dev.pkl', 'path_test.pkl')
)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'class

In [None]:
# Stand-alone tokenizer instance that is handed to the Trainer below.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_ROBERTA, use_fast=True)

In [None]:
# Run a full garbage-collection pass before training; the number shown as the
# cell output is gc.collect()'s return value.
import gc
gc.collect()

903

In [None]:
# Load the GLUE "mnli" metric object.
# NOTE(review): this task is 7-class emotion classification, not MNLI; the
# GLUE configuration appears to be used only because it reports accuracy —
# confirm, and consider a plain accuracy computation instead.
metric = load_metric("glue", "mnli")

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Accuracy is computed directly with numpy instead of going through a
    globally loaded GLUE/MNLI metric object, so the function is
    self-contained and needs no metric script.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one row of class scores per example and ``labels`` holds the gold
            class index per example.

    Returns:
        ``{"accuracy": <fraction of examples whose arg-max score matches the
        label>}``.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)
    accuracy = float(np.mean(predicted_classes == np.asarray(labels)))
    return {"accuracy": accuracy}

In [None]:
def model_init():
    """Return a freshly initialised sequence classifier for the Trainer.

    The encoder weights come from ``PRETRAINED_ROBERTA`` and the
    classification head is sized to ``NUM_LABELS``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_ROBERTA,
        num_labels=NUM_LABELS,
        output_hidden_states=False,
    )
    return classifier

In [None]:
# Name of the evaluation metric used to pick the best checkpoint.
metric_name = "accuracy"

args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="saved",
    seed=42,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-05,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest validation accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    greater_is_better=True,
)

In [None]:
# The Trainer builds the model itself by calling model_init, trains on the
# train split and evaluates on the dev split. The tokenizer passed here is
# saved alongside every checkpoint.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file https://huggingface.co/klue/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/571e05a2160c18c93365862223c4dae92bbd1b41464a4bd5f372ad703dba6097.ae5b7f8d8a28a3ff0b1560b4d08c6c3bd80f627288eee2024e02959dd60380d0
Model config RobertaConfig {
  "_name_or_path": "klue/roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_ep

In [None]:
# Fine-tune for the configured number of epochs; evaluation and checkpointing
# happen at the end of each epoch as set in `args`.
trainer.train()

loading configuration file https://huggingface.co/klue/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/571e05a2160c18c93365862223c4dae92bbd1b41464a4bd5f372ad703dba6097.ae5b7f8d8a28a3ff0b1560b4d08c6c3bd80f627288eee2024e02959dd60380d0
Model config RobertaConfig {
  "_name_or_path": "klue/roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_ep

Epoch,Training Loss,Validation Loss,Accuracy
1,1.1868,0.865234,0.691606
2,0.6972,0.848999,0.715937
3,0.5741,0.934236,0.719586
4,0.4152,1.057345,0.717153
5,0.3083,1.187518,0.723236


***** Running Evaluation *****
  Num examples = 1644
  Batch size = 8
Saving model checkpoint to saved/checkpoint-822
Configuration saved in saved/checkpoint-822/config.json
Model weights saved in saved/checkpoint-822/pytorch_model.bin
tokenizer config file saved in saved/checkpoint-822/tokenizer_config.json
Special tokens file saved in saved/checkpoint-822/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1644
  Batch size = 8
Saving model checkpoint to saved/checkpoint-1644
Configuration saved in saved/checkpoint-1644/config.json
Model weights saved in saved/checkpoint-1644/pytorch_model.bin
tokenizer config file saved in saved/checkpoint-1644/tokenizer_config.json
Special tokens file saved in saved/checkpoint-1644/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1644
  Batch size = 8
Saving model checkpoint to saved/checkpoint-2466
Configuration saved in saved/checkpoint-2466/config.json
Model weights saved in saved/checkpoint-2466/pytorc

TrainOutput(global_step=4110, training_loss=0.6033595082823667, metrics={'train_runtime': 6335.087, 'train_samples_per_second': 5.189, 'train_steps_per_second': 0.649, 'total_flos': 1.490000493593475e+16, 'train_loss': 0.6033595082823667, 'epoch': 5.0})

In [None]:
# Score the dev split with the model the Trainer holds after training
# (`args` sets load_best_model_at_end=True).
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1644
  Batch size = 8


{'epoch': 5.0,
 'eval_accuracy': 0.7232360097323601,
 'eval_loss': 1.1875184774398804,
 'eval_runtime': 90.6653,
 'eval_samples_per_second': 18.133,
 'eval_steps_per_second': 2.272}

In [None]:
# Run inference on the held-out test split; the result bundles the raw class
# scores, the gold label ids and the test metrics.
pred = trainer.predict(data_test)

***** Running Prediction *****
  Num examples = 2055
  Batch size = 8


In [None]:
# Inspect the full PredictionOutput (scores, label ids, metrics).
pred

PredictionOutput(predictions=array([[ 4.2237697 ,  5.0857825 , -2.2794783 , ..., -2.0746925 ,
        -1.7009395 , -2.3078198 ],
       [ 3.4805222 , -1.2829148 , -2.4359221 , ..., -3.7235756 ,
        -1.5269098 ,  0.32847974],
       [ 1.067754  , -0.9923486 , -0.8863773 , ..., -1.2146715 ,
         6.3959737 , -2.8342047 ],
       ...,
       [ 7.21782   , -0.30942732, -2.0715668 , ..., -2.5429811 ,
        -0.75715685, -1.0485512 ],
       [-1.102238  , -1.5349534 , -1.0054612 , ..., -0.93647987,
        -1.2439367 , -1.1644173 ],
       [ 2.628472  ,  0.79952085,  0.6529304 , ..., -2.173507  ,
        -0.7308053 , -1.4088047 ]], dtype=float32), label_ids=array([1, 0, 5, ..., 0, 3, 2]), metrics={'test_loss': 1.121650218963623, 'test_accuracy': 0.7250608272506083, 'test_runtime': 113.615, 'test_samples_per_second': 18.087, 'test_steps_per_second': 2.262})

In [None]:
# Keep only the raw class-score array; it is saved and arg-maxed below.
predictions = pred.predictions
predictions

array([[ 4.2237697 ,  5.0857825 , -2.2794783 , ..., -2.0746925 ,
        -1.7009395 , -2.3078198 ],
       [ 3.4805222 , -1.2829148 , -2.4359221 , ..., -3.7235756 ,
        -1.5269098 ,  0.32847974],
       [ 1.067754  , -0.9923486 , -0.8863773 , ..., -1.2146715 ,
         6.3959737 , -2.8342047 ],
       ...,
       [ 7.21782   , -0.30942732, -2.0715668 , ..., -2.5429811 ,
        -0.75715685, -1.0485512 ],
       [-1.102238  , -1.5349534 , -1.0054612 , ..., -0.93647987,
        -1.2439367 , -1.1644173 ],
       [ 2.628472  ,  0.79952085,  0.6529304 , ..., -2.173507  ,
        -0.7308053 , -1.4088047 ]], dtype=float32)

In [None]:
# Persist the raw test-set class scores to Drive as a .npy file.
np.save("/gdrive/ETRI/npy/pred_roberta_large_whole_1.npy", predictions)

In [None]:
# Reload the test split as a DataFrame so predictions can be placed next to
# the gold labels.
test = pd.read_pickle('path_test.pkl')

In [None]:
# Drop incomplete rows exactly as MultiModalDataset.load_data does, so the
# remaining rows line up one-to-one with the order of the predictions.
test = test.dropna()

In [None]:
# Sanity check: this row count should equal the number of predicted examples.
len(test['emotion'])

2055

In [None]:
# Predicted class = index of the highest score in each row; store it next to
# the gold label for inspection.
y_pred = predictions.argmax(axis=1)
test['pred'] = y_pred

In [None]:
# Eyeball a few rows of gold (`emotion`) vs. predicted (`pred`) labels.
test.head()

Unnamed: 0,total_path,text,emotion,pred
4039,./KEMDy19/wav/Session08/Sess08_impro02/Sess08_...,l/ 내가 아주 아주 아주 듬뿍 발라줄게 그 약 가져와봐. (어딨냐)\n,1,1
7262,./KEMDy19/wav/Session15/Sess15_script01/Sess15...,n/ 뭐야 나 말 안해.\n,0,3
529,./KEMDy19/wav/Session02/Sess02_script02/Sess02...,b/ u/ 미안해. 우리 아버지는 음 음 음 3년동안 폐암으로 병원에 계시다가 돌아...,5,5
4541,./KEMDy19/wav/Session09/Sess09_impro01/Sess09_...,아 그니까 그 김 부장이 또 나한테 막 뭐라했단 말이야 왜 맨날 나한테는 뭐라고만...,3,3
2924,./KEMDy19/wav/Session06/Sess06_impro01/Sess06_...,아이 나 참 u/ 아이 나 피곤해. 어? 그 나 내일 u/ 일찍 자야돼. 나 내일 ...,0,0


In [None]:
# Recompute test accuracy (in percent) from the DataFrame as a cross-check
# against the Trainer's reported test_accuracy.
labels = test.emotion.tolist()
preds = test.pred.tolist()
count = sum(1 for gold, guess in zip(labels, preds) if gold == guess)
acc = count / len(labels) * 100
acc

72.50608272506082

In [None]:
!cp -r ./saved/checkpoint-4110 /gdrive/ETRI/backup/roberta_large

참고문헌  
transformers 공식문서 How to fine-tune a model on text classification
https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb  

