
# PyTorch + HuggingFace
## KoElectra Model
박장원님의 KoELECTRA-base-v3 (discriminator) 사용<br>
https://monologg.kr/2020/05/02/koelectra-part1/<br>
https://github.com/monologg/KoELECTRA

## Dataset
네이버 영화 리뷰 데이터셋<br>
https://github.com/e9t/nsmc

## References
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249

## 주의사항
꼭 GPU로 해주세요 - 1epoch 당 약 20분 소요

In [149]:
# `display` and `HTML` are part of the public `IPython.display` API;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container { width: 90% !important; }</style>"))

In [150]:
# Install HuggingFace transformers (this cell does not download any dataset).
!pip install transformers




In [151]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
import gc
from sklearn.model_selection import KFold
import numpy as np
import os

In [152]:
# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('./drive/MyDrive/4_party_project')

In [153]:
# Prefer the GPU; fall back to the CPU so the notebook still runs (slowly)
# on machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint name; it is prefixed with "monologg/" when loading the tokenizer
# and the model, and reused in the saved-weights file name.
tok_model = 'koelectra-base-v3-discriminator'

# Dataset 만들어서 불러오기 

In [154]:
# Naver movie, hotel and shopping reviews.
class NHSDataset(Dataset):
    """Review dataset that tokenizes each sample on access.

    The source file is read as tab-separated text; rows with missing values
    and rows whose ``text`` is duplicated are removed.
    """

    def __init__(self, csv_file, max_length=256):
        """Load and clean the review file and create the tokenizer.

        Args:
            csv_file: Path to a tab-separated file containing a ``text``
                column. ``__getitem__`` reads the first two columns as
                (text, label) -- assumes that column order; confirm against
                the data files.
            max_length: Number of tokens every sample is truncated/padded to.
        """
        # Drop rows containing NaN values.
        self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
        # Drop duplicated reviews.
        self.dataset.drop_duplicates(subset=['text'], inplace=True)
        self.tokenizer = AutoTokenizer.from_pretrained("monologg/" + tok_model)
        self.max_length = max_length
        print(self.dataset.describe())

    def __len__(self):
        """Return the number of rows left after cleaning."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label, text)`` for row ``idx``."""
        text, y = self.dataset.iloc[idx, 0:2].values

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            add_special_tokens=True,
        )
        # Index 0 strips the leading dimension of the tokenizer output.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]
        return input_ids, attention_mask, y, text

In [155]:
# Build the train and test datasets with the NHSDataset class.
train_dataset = NHSDataset('data/sentiment_test_dataset/hotel_shopping_train.txt')
test_dataset = NHSDataset('data/sentiment_test_dataset/hotel_test.txt')

# Release unused cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

               label
count  103524.000000
mean        0.500367
std         0.500002
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
              label
count  51782.000000
mean       0.500908
std        0.500004
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000


# Create Model

In [156]:
# Load the pretrained checkpoint into a sequence-classification model and move it to `device`.
model = ElectraForSequenceClassification.from_pretrained("monologg/" + tok_model).to(device)

# Smoke test: run a single sample through the model.
# NHSDataset.__getitem__ returns four values, so all four are unpacked.
# input_ids, attention_mask, y, text = train_dataset[0]
# model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

RuntimeError: CUDA out of memory. Tried to allocate 104.00 MiB (GPU 0; 8.00 GiB total capacity; 6.05 GiB already allocated; 69.51 MiB free; 6.53 GiB reserved in total by PyTorch)

# 저장되어 있는 Model 돌려보기

In [None]:

# model.load_state_dict(torch.load("원하는모델.pt"))
# input_ids, attention_mask, y, text = train_dataset[0]
# model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))


In [None]:
# Display the model's layers.
model

# Learn

In [None]:
# Force a garbage-collection pass (presumably to free memory before training -- confirm).
import gc
gc.collect()

In [None]:
# Training hyperparameters: number of passes over the training set and mini-batch size.
epochs = 10
batch_size = 16

In [None]:
# NOTE(review): `transformers.AdamW` is deprecated in favour of
# `torch.optim.AdamW`; the defaults differ (e.g. weight decay), so switching
# needs a deliberate choice -- confirm before changing.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Shuffle only the training data. Evaluation gains nothing from shuffling,
# and a fixed order keeps per-sample results reproducible.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Use the shared `batch_size` instead of repeating the literal 16.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Per-epoch history: summed batch loss and training accuracy.
losses = []
accuracies = []

for i in tqdm(range(epochs)):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0
    model.train()
    for input_ids_batch, attention_masks_batch, y_batch, org_text in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        # Element 0 of the model output holds the classification logits.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # Accumulate metrics for EVERY batch. These statements must stay
        # inside the batch loop; otherwise only the last batch of the epoch
        # is counted and the progress print below never fires.
        total_loss += loss.item()
        predicted = y_pred.argmax(dim=1)
        # `.item()` keeps the counters as plain Python numbers.
        correct += (predicted == y_batch).sum().item()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct / total)

    # Guard against an empty loader so the division cannot fail.
    epoch_accuracy = correct / total if total else 0.0
    losses.append(total_loss)
    accuracies.append(epoch_accuracy)
    print("Train Loss:", total_loss, "Accuracy:", epoch_accuracy)

# 테스트 데이터셋 정확도 및 강한(긍정, 부정) 확률 확인


In [None]:
# Evaluate on the test set: switch off dropout etc. and count correct predictions.
model.eval()

test_correct = 0
test_total = 0

# Inference only: disable autograd so no graph is built for the forward passes.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch, original_text in tqdm(test_loader):
        y_batch = y_batch.to(device)
        # Element 0 of the model output holds the classification logits.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        # Class probabilities for the whole batch at once; dim=1 is the axis
        # the original `torch.max(y_pred, 1)` reduced over. The explicit `dim`
        # avoids the implicit-dimension deprecation warning.
        pred_tensor = F.softmax(y_pred, dim=1)
        predicted = pred_tensor.argmax(dim=1)
        test_correct += (predicted == y_batch).sum().item()
        test_total += len(y_batch)

# Guard against an empty loader so the division cannot fail.
print("Accuracy:", test_correct / test_total if test_total else 0.0)

In [None]:
# Save the trained weights (state dict only) under a name derived from the checkpoint.
checkpoint_path = f"MODEL_{tok_model}__2.pt"
torch.save(model.state_dict(), checkpoint_path)