# **Settings**

GPU 및 RAM 확인

In [None]:
#할당된 GPU 확인 (Colab)
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Fri Dec 25 05:10:18 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


# **Training**

패키지 및 라이브러리, 데이터셋 다운로드

In [None]:
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install -q transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

[K     |████████████████████████████████| 1.5MB 6.0MB/s 
[K     |████████████████████████████████| 2.9MB 36.0MB/s 
[K     |████████████████████████████████| 890kB 43.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
--2020-12-25 05:10:26--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt’


2020-12-25 05:10:26 (37.1 MB/s) - ‘ratings_train.txt’ saved [14628807/14628807]

--2020-12-25 05:10:26--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubuserconte

In [None]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
import time
import datetime

In [None]:
# Preview the first lines of the train and test files
!head ratings_train.txt
!head ratings_test.txt

id	document	label
9976970	아 더빙.. 진짜 짜증나네요 목소리	0
3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1
10265843	너무재밓었다그래서보는것을추천한다	0
9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0
6483659	사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다	1
5403919	막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.	0
7797314	원작의 긴장감을 제대로 살려내지못했다.	0
9443947	별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단 낫겟다 납치.감금만반복반복..이드라마는 가족도없다 연기못하는사람만모엿네	0
7156791	액션이 없는데도 재미 있는 몇안되는 영화	1
id	document	label
6270596	굳 ㅋ	1
9274899	GDNTOPCLASSINTHECLUB	0
8544678	뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아	0
6825595	지루하지는 않은데 완전 막장임... 돈주고 보기에는....	0
6723715	3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??	0
7898805	음악이 주가 된, 최고의 음악영화	1
6315043	진정한 쓰레기	0
6097171	마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다	0
8932678	갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한cg남무 아 그립다 동사서독같은 영화가 이건 3류아류작이다	0


In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# notebook still runs (slowly) on a runtime without an accelerator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class NSMCDataset(Dataset):
  """Map-style dataset over a tab-separated NSMC ratings file.

  Rows containing NaN and rows whose ``document`` text repeats an earlier row are
  dropped at load time. Each item is tokenized on access and padded/truncated to
  ``max_length`` tokens.

  Args:
    csv_file: Path to a tab-separated file with ``id``, ``document`` and ``label`` columns.
    model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
    max_length: Fixed token length every item is padded or truncated to.
  """

  def __init__(self, csv_file, model_name="monologg/koelectra-base-v3-discriminator", max_length=256):
    # Some rows contain NaN values; drop them, then drop duplicated review texts.
    self.dataset = (
        pd.read_csv(csv_file, sep='\t')
        .dropna(axis=0)
        .drop_duplicates(subset=['document'])
    )
    self.max_length = max_length
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Print summary statistics of the loaded frame as a quick sanity check.
    print(self.dataset.describe())

  def __len__(self):
    """Return the number of rows left after cleaning."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
    # Columns 1 and 2 are the review text and its label (column 0 is the id).
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    # Coerce to a plain int so that batching yields integer class targets.
    y = int(row[1])

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        add_special_tokens=True
        )

    # The tokenizer returns a batch of one; strip that leading dimension.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
# Build the train and test datasets from the downloaded NSMC files.
# Each constructor call also prints a describe() summary of the loaded frame.
train_dataset = NSMCDataset("ratings_train.txt")
test_dataset = NSMCDataset("ratings_test.txt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [None]:
# Load the pretrained KoELECTRA checkpoint into a sequence-classification model,
# then move it to the selected device.
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator"
)
model = model.to(device)

# Try a single forward pass (uncomment to run)
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [None]:
# Inspect the model's layers
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [None]:
# Training hyperparameters: mini-batch size for the training loader and
# number of passes over the training set.
batch_size = 32
epochs = 3

In [None]:
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the defaults transformers.AdamW used (1e-6 and 0.0),
# because torch's own defaults differ (1e-8 and 1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the test set is read in a fixed order.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Helper for displaying elapsed time
def format_time(elapsed):
    """Format a duration in seconds as the string form of a ``datetime.timedelta``.

    The value is rounded to whole seconds first, so the result looks like
    ``h:mm:ss`` (with a leading day count once the duration reaches 24 hours).
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [None]:
# Per-epoch history: summed training loss and training accuracy (plain Python floats).
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  t0 = time.time()
  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    # The first element of the model output holds the classification logits.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    predicted = y_pred.argmax(dim=1)
    # .item() keeps the counter a Python int instead of a tensor living on the GPU.
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      # Note: the value printed as "Batch Loss" is the running sum of the per-batch
      # losses over the epoch so far, not the loss of the latest batch.
      print("Batch Loss:", total_loss, "Accuracy:", correct / total)

  # Guard against an empty loader so the division cannot fail.
  epoch_accuracy = correct / total if total else 0.0
  losses.append(total_loss)
  accuracies.append(epoch_accuracy)
  print("Train Loss:", total_loss, "Accuracy:", epoch_accuracy)
  print("Training epoch took: {:}".format(format_time(time.time() - t0)))

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))



Batch Loss: 62.707629323005676 Accuracy: tensor(0.6712, device='cuda:0')
Batch Loss: 102.42956794798374 Accuracy: tensor(0.7522, device='cuda:0')
Batch Loss: 138.82058583199978 Accuracy: tensor(0.7828, device='cuda:0')
Batch Loss: 171.33546549081802 Accuracy: tensor(0.8030, device='cuda:0')
Batch Loss: 201.30534455180168 Accuracy: tensor(0.8174, device='cuda:0')
Batch Loss: 231.95525649935007 Accuracy: tensor(0.8266, device='cuda:0')
Batch Loss: 261.9850127995014 Accuracy: tensor(0.8336, device='cuda:0')
Batch Loss: 291.8494745492935 Accuracy: tensor(0.8382, device='cuda:0')
Batch Loss: 321.20702780783176 Accuracy: tensor(0.8428, device='cuda:0')
Batch Loss: 348.9495624601841 Accuracy: tensor(0.8471, device='cuda:0')
Batch Loss: 376.9748760461807 Accuracy: tensor(0.8499, device='cuda:0')
Batch Loss: 405.10897704958916 Accuracy: tensor(0.8527, device='cuda:0')
Batch Loss: 432.7145830988884 Accuracy: tensor(0.8553, device='cuda:0')
Batch Loss: 462.9473852366209 Accuracy: tensor(0.8562, d

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 20.88129461929202 Accuracy: tensor(0.9141, device='cuda:0')
Batch Loss: 40.68382894620299 Accuracy: tensor(0.9183, device='cuda:0')
Batch Loss: 60.479398377239704 Accuracy: tensor(0.9193, device='cuda:0')
Batch Loss: 78.11578075401485 Accuracy: tensor(0.9226, device='cuda:0')
Batch Loss: 97.90653230436146 Accuracy: tensor(0.9221, device='cuda:0')
Batch Loss: 119.14256873168051 Accuracy: tensor(0.9213, device='cuda:0')
Batch Loss: 141.42611720599234 Accuracy: tensor(0.9199, device='cuda:0')
Batch Loss: 161.42861601524055 Accuracy: tensor(0.9201, device='cuda:0')
Batch Loss: 182.9353379253298 Accuracy: tensor(0.9192, device='cuda:0')
Batch Loss: 204.09273461066186 Accuracy: tensor(0.9180, device='cuda:0')
Batch Loss: 225.33639425598085 Accuracy: tensor(0.9178, device='cuda:0')
Batch Loss: 246.07489034347236 Accuracy: tensor(0.9176, device='cuda:0')
Batch Loss: 268.8679468240589 Accuracy: tensor(0.9167, device='cuda:0')
Batch Loss: 289.53505449555814 Accuracy: tensor(0.9167, d

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 14.807505467906594 Accuracy: tensor(0.9437, device='cuda:0')
Batch Loss: 29.642429370433092 Accuracy: tensor(0.9447, device='cuda:0')
Batch Loss: 45.93778792396188 Accuracy: tensor(0.9435, device='cuda:0')
Batch Loss: 61.57102555409074 Accuracy: tensor(0.9433, device='cuda:0')
Batch Loss: 77.29186184890568 Accuracy: tensor(0.9429, device='cuda:0')
Batch Loss: 93.31204211339355 Accuracy: tensor(0.9420, device='cuda:0')
Batch Loss: 108.92109078168869 Accuracy: tensor(0.9418, device='cuda:0')
Batch Loss: 125.33305867761374 Accuracy: tensor(0.9411, device='cuda:0')
Batch Loss: 142.2926924675703 Accuracy: tensor(0.9405, device='cuda:0')
Batch Loss: 156.83353400975466 Accuracy: tensor(0.9407, device='cuda:0')
Batch Loss: 171.38969629257917 Accuracy: tensor(0.9413, device='cuda:0')
Batch Loss: 188.13396718725562 Accuracy: tensor(0.9410, device='cuda:0')
Batch Loss: 205.26246642693877 Accuracy: tensor(0.9405, device='cuda:0')
Batch Loss: 220.49350722692907 Accuracy: tensor(0.9407, 

In [None]:
# Show the per-epoch training history. A bare `losses, accuracies` expression is only
# displayed when it is the last line of a cell, so print it explicitly.
print("Losses:", losses)
print("Accuracies:", accuracies)

# Persist the fine-tuned weights to disk.
torch.save(model.state_dict(), "model_kor_koelectra_v3_base.pt")

# Offer the checkpoint as a browser download; google.colab only exists on Colab.
try:
  from google.colab import files
except ImportError:
  print("google.colab is not available; the checkpoint was saved locally only.")
else:
  files.download('model_kor_koelectra_v3_base.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
t0 = time.time()
model.eval()

test_correct = 0
test_total = 0

# Gradients are not needed for evaluation; disabling autograd avoids building the
# graph for every batch, which saves memory and time.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    # The first element of the model output holds the classification logits.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    predicted = y_pred.argmax(dim=1)
    # .item() keeps the counter a Python int instead of a tensor living on the GPU.
    test_correct += (predicted == y_batch).sum().item()
    test_total += len(y_batch)

# Guard against an empty loader so the division cannot fail.
test_accuracy = test_correct / test_total if test_total else 0.0
print("Accuracy:", test_accuracy)
print("Test took: {:}".format(format_time(time.time() - t0)))

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.9112, device='cuda:0')
Test took: 0:06:43
