In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from tqdm.notebook import tqdm
import torch.nn.functional as F
import GPUtil
import random

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [3]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [4]:
train_data = pd.read_csv('./resource/ratings_train.txt', sep='\t').dropna(axis=0).sample(frac=0.2)
test_data = pd.read_csv('./resource/ratings_test.txt', sep='\t').dropna(axis=0).sample(frac=0.2)

In [5]:
train_data = train_data.drop(['id'], axis=1)
test_data = test_data.drop(['id'], axis=1)
test_data[:5]

Unnamed: 0,document,label
47724,주인공에게 다가온 최대의 반전.. 간만에 좋은영화..,1
47319,YTN뉴스에 감독나와서 이야기 나누던데 샬림에게 빨간벽돌로지은 집을 선물했다는군요,1
3931,마지막에 데이가 지은 표정을 잊을수가 없다.. 내인생 최고의영화ㅠㅠ,1
7327,하...마지막에 프란 떄문에 울뻔 했음...진짜..ㅠㅠㅠㅠㅠㅠ 성우분 연기 대박!!...,1
32182,재밌는데요?ㅋㅋㅋㅋㅋ,1


In [6]:
# max_len = 64
# batch_size = 64
# warmup_ratio = 0.1
# num_epochs = 5
# max_grad_norm = 1
# log_interval = 200
# learning_rate =  5e-5

In [7]:
class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 0]
        label = self.df.iloc[idx, 1]
        return text, label

In [8]:
# data_list = []
# for ques, label in zip(data['발화문'], data['1번 감정'])  :
#     data = []   
#     data.append(ques)
#     data.append(str(label))
#     data_list.append(data)
# data_list[:5]

In [9]:
#data_train, data_test = train_test_split(data_list, test_size=0.2, shuffle=True, random_state=34)

In [10]:
train_dataset = MyDataset(train_data)
test_dataset = MyDataset(test_data)

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True, num_workers=0)

In [12]:
#train_data = ["[CLS] " + str(sentence) + " [SEP]" for sentence in train_data]
#print(tokenizer.encode(t, add_special_tokens=True, max_length=512, pad_to_max_length=True) for t in test_data)

# for text, label in train_dataloader:
#     #en = [tokenizer.encode(t, add_special_tokens=True, max_length=512, pad_to_max_length=True) for t in text]
#     print(en)
#     de = [tokenizer.decode(t) for t in en]
#     print(de)

In [13]:
device = torch.device("cuda:0")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [14]:
optimizer = Adam(model.parameters(), lr=1e-5)
itr = 1
p_itr = 1000
epochs = 2
total_loss = 0
total_len = 0
total_correct = 0
MAX_LEN=128

In [15]:
#print('Tokenized: ', tokenizer.tokenize(data['발화문'][0]))
#  print(train_dataset.__getitem__(2))
# #for text, label in tqdm(enumerate(train_loader), total=len(train_loader)):
# for text,label in train_loader:
#      print(text)
#      print(label)

In [16]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in range(epochs):
    GPUtil.showUtilization()
    model.train()
    for batch_id, (text, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        model.zero_grad()
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True) for t in text]
        #padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        sample = torch.tensor(encoded_list)
        labels = torch.tensor(label)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        #loss, logits = outputs
        loss = outputs[0]
        logits = outputs[1]


        pred = torch.argmax(F.softmax(logits, dim=1), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0
    #torch.save(model.state_dict(), './model.pt')
        itr+=1 
     
    model.eval()
    total_len = 0
    total_correct = 0
    for batch_id, (text, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=512, pad_to_max_length=True) for t in text]
        sample = torch.tensor(encoded_list)
        labels = torch.tensor(label)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        with torch.no_grad():    
            outputs = model(sample, labels=labels)
        
        logits = outputs[1]
        pred = torch.argmax(F.softmax(logits, dim=1), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
    print("epoch {} test acc {}".format(epoch+1, total_correct/total_len))
    total_len = 0
    total_correct = 0
    
        
        

| ID | GPU | MEM |
------------------
|  0 | 29% | 13% |


  0%|          | 0/7500 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  labels = torch.tensor(label)
  labels = torch.tensor(label)


[Epoch 1/2] Iteration 1000 -> Train Loss: 0.6929, Accuracy: 0.524
[Epoch 1/2] Iteration 2000 -> Train Loss: 0.6112, Accuracy: 0.659
[Epoch 1/2] Iteration 3000 -> Train Loss: 0.4939, Accuracy: 0.756
[Epoch 1/2] Iteration 4000 -> Train Loss: 0.4563, Accuracy: 0.781
[Epoch 1/2] Iteration 5000 -> Train Loss: 0.4385, Accuracy: 0.790
[Epoch 1/2] Iteration 6000 -> Train Loss: 0.4299, Accuracy: 0.790
[Epoch 1/2] Iteration 7000 -> Train Loss: 0.4148, Accuracy: 0.809


  0%|          | 0/2500 [00:00<?, ?it/s]

  labels = torch.tensor(label)
  labels = torch.tensor(label)


epoch 1 test acc 0.8192819281928193
| ID | GPU | MEM |
------------------
|  0 | 83% | 51% |


  0%|          | 0/7500 [00:00<?, ?it/s]

[Epoch 2/2] Iteration 8000 -> Train Loss: 0.3910, Accuracy: 0.839
[Epoch 2/2] Iteration 9000 -> Train Loss: 0.3490, Accuracy: 0.848
[Epoch 2/2] Iteration 10000 -> Train Loss: 0.3595, Accuracy: 0.843
[Epoch 2/2] Iteration 11000 -> Train Loss: 0.3556, Accuracy: 0.846
[Epoch 2/2] Iteration 12000 -> Train Loss: 0.3433, Accuracy: 0.850
[Epoch 2/2] Iteration 13000 -> Train Loss: 0.3611, Accuracy: 0.839
[Epoch 2/2] Iteration 14000 -> Train Loss: 0.3546, Accuracy: 0.844
[Epoch 2/2] Iteration 15000 -> Train Loss: 0.3427, Accuracy: 0.846


  0%|          | 0/2500 [00:00<?, ?it/s]

epoch 2 test acc 0.830983098309831


In [17]:
torch.cuda.empty_cache()