## Import

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m93.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [3]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle
  Downloading setproctitle-1.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.18.0-py2.py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 KB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?2

In [4]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os
import wandb

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import AutoModel
from transformers import AutoTokenizer
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [6]:
wandb.init(
    # set the wandb project where this run will be logged
    project="deeptextlab",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": 1e-6,
    "architecture": "klue/roberta-base",
    "dataset": "ETRI",
    "epochs": 10,
    "batch_size" : 4
    }
)


[34m[1mwandb[0m: Currently logged in as: [33m02hyewon26[0m ([33mdeeptextlab[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [8]:
CFG = {
    'EPOCHS': 10,
    'LEARNING_RATE':1e-6,
    'BATCH_SIZE': 4,
    'SEED':41
}

## Fixed RandomSeed

In [9]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Load

In [22]:
train = pd.read_csv('./ETRI/utterance_text/train.csv')

In [23]:
val = pd.read_csv('./ETRI/utterance_text/val.csv')

In [24]:
test = pd.read_csv('./ETRI/utterance_text/test.csv')

## Label encoding

In [25]:
le = LabelEncoder()
le.fit(train['Solo_Label'])

train['Solo_Label'] = le.transform(train['Solo_Label'])
test['Solo_Label'] = le.transform(test['Solo_Label'])
val['Solo_Label'] = le.transform(val['Solo_Label'])

## Tokenizer Define

In [28]:
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

## CustomDataset

In [29]:
class CustomDataset(Dataset):
  
    def __init__(self, data):
        self.dataset = data
        self.tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
        
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        text = self.dataset['Utterance'][idx]
        inputs = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]
    
        
        y = self.dataset['Solo_Label'][idx]
        return input_ids, attention_mask, y

In [30]:
train = CustomDataset(train)
valid = CustomDataset(val)
test = CustomDataset(test)

train_dataloader = torch.utils.data.DataLoader(train, batch_size= CFG['BATCH_SIZE'], shuffle=True)
val_dataloader = torch.utils.data.DataLoader(valid, batch_size= CFG['BATCH_SIZE'], shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test, batch_size= CFG['BATCH_SIZE'], shuffle=False)

## Model Define

In [31]:
class BaseModel(nn.Module):

    def __init__(self, dropout=0.5, num_classes=len(le.classes_)):

        super(BaseModel, self).__init__()

        self.bert = AutoModel.from_pretrained('klue/roberta-base')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

## Train

In [32]:
def train(model, optimizer, train_loader, val_loader, test_loader, device, save_path):

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    best_score = 0
    best_model = "None"
    for epoch_num in range(CFG["EPOCHS"]):

        model.train()
        train_loss = []
        for input_ids, token_type_ids, attention_mask, train_label in tqdm(train_loader):
            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)     
    
            batch_loss = criterion(output, train_label.long()) 
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            optimizer.step()

        val_loss, val_score = validation(model, criterion, val_loader, device)
        test_loss, test_score = validation(model, criterion, test_loader, device)

        print(f'Epoch [{epoch_num}], Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}], Test Loss : [{np.mean(test_loss) :.5f}] Test F1 Score : [{test_score:.5f}]')#
        wandb.log({"Epoch": epoch_num, "val F1 Score": val_score, "test F1 Score": test_score})#

        if best_score < val_score:
            # 모델 저장
            torch.save(model.state_dict(), save_path)
            best_model = model
            best_score = val_score
        
    return best_model                         

In [33]:
def competition_metric(true, pred):
    return f1_score(true, pred, average="weighted")

def validation(model, criterion, test_loader, device):
    model.eval()

    val_loss = []
    model_preds = []
    true_labels = []  
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, val_label in tqdm(test_loader):
            val_label = val_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)
    
            batch_loss = criterion(output, val_label.long()) 
            val_loss.append(batch_loss.item())      
            
            model_preds += output.argmax(1).detach().cpu().numpy().tolist()
            true_labels += val_label.detach().cpu().numpy().tolist()
        val_f1 = competition_metric(true_labels, model_preds)
    return val_loss, val_f1    

## Run!!

In [34]:
model = BaseModel()
model.eval()
torch.no_grad()
torch.cuda.empty_cache()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
save_path = '/content/drive/MyDrive/DeepTextLab/Contest_ETRI_multimodal/PRACTICE_CODE/text/hyewon/model.bin'

infer_model = train(model, optimizer, train_dataloader, val_dataloader, test_dataloader, device, save_path)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [0], Val Loss : [0.65250] Val F1 Score : [0.72701], Test Loss : [0.59487] Test F1 Score : [0.76574]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [1], Val Loss : [0.64710] Val F1 Score : [0.74346], Test Loss : [0.56747] Test F1 Score : [0.77930]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [2], Val Loss : [0.63187] Val F1 Score : [0.75849], Test Loss : [0.55738] Test F1 Score : [0.79064]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [3], Val Loss : [0.65007] Val F1 Score : [0.75623], Test Loss : [0.55633] Test F1 Score : [0.78765]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [4], Val Loss : [0.65037] Val F1 Score : [0.75476], Test Loss : [0.54814] Test F1 Score : [0.79311]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [5], Val Loss : [0.65587] Val F1 Score : [0.76112], Test Loss : [0.55493] Test F1 Score : [0.79784]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [6], Val Loss : [0.66020] Val F1 Score : [0.76334], Test Loss : [0.57194] Test F1 Score : [0.79215]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [7], Val Loss : [0.66304] Val F1 Score : [0.75893], Test Loss : [0.56309] Test F1 Score : [0.80100]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [8], Val Loss : [0.66176] Val F1 Score : [0.76715], Test Loss : [0.57108] Test F1 Score : [0.79316]


  0%|          | 0/2351 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/697 [00:00<?, ?it/s]

Epoch [9], Val Loss : [0.69544] Val F1 Score : [0.76225], Test Loss : [0.57989] Test F1 Score : [0.79717]
