In [1]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 9.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 45.9MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 56.7MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torchtext
from google.colab import drive
from transformers import AdamW,get_linear_schedule_with_warmup,get_constant_schedule_with_warmup,BertTokenizer,BertForSequenceClassification,BertModel
import time
import datetime
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,WeightedRandomSampler
drive.mount('/content/gdrive')
os.chdir('./gdrive/My Drive/기상청')

Mounted at /content/gdrive


In [3]:
# data load
train_data=pd.read_csv('./train_data.csv',header=0)
test_data=pd.read_csv('./test_data.csv',header=0)
print(train_data.head())

                                               total  피해
0  신임 장관에게 바란다. 열을 추적, 탐지해 폭염 정보를 한 눈에 보여주는 열화상카메...   0
1  주간날씨와 농사. 지난주 농사날씨 기온은 #.#로 평년보다 #.# 높았으며, 강수량...   0
2  수문장, 폭염에도 흐트러지지 않아. 서울 낮 최고기온이 #도까지 치솟는 등 폭염이 ...   0
3  국내산 조기, 몸통 두툼하고 길이가 짧대요. 차례상 비용 #.#만#원선 고사리도라지...   0
4  . 뉴스 # 입력 #.#.#  수정 #.#.#  댓글 # 좋아요 스크랩하기 공유하기...   0


# 문장별 전처리 
각 문장의 앞마다 [CLS]를 붙이고
각 문장의 끝에는 [SEP]

In [4]:
train_document=['[CLS]'+i+'[SEP]' for i in train_data.total]
test_document=['[CLS]'+i+'[SEP]' for i in test_data.total]

# Tokenizing
hugging face에 있는 multilingual cased 활용 - 감사합니다 선배님들  
kobert는 단어의 수가 너무 적어서 nsmc data 대부분이 [UNK]로 나오게 된다

In [5]:
tokenizer=BertTokenizer.from_pretrained('monologg/kobert', do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=77779.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=51.0, style=ProgressStyle(description_w…




In [6]:
print(tokenizer.vocab_size)

8002


In [7]:
tokenized_train=[tokenizer.tokenize(s) for s in train_document]
tokenized_test=[tokenizer.tokenize(s) for s in test_document]

In [8]:
print(tokenized_train)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# 패딩


In [9]:
# max len으로 자르고, 모자란 부분은 패딩으로 채움
train_ids = torch.tensor([tokenizer.encode(x,add_special_tokens=False,truncation=True,padding='max_length',max_length=512) for x in tokenized_train])
test_ids = torch.tensor([tokenizer.encode(x,add_special_tokens=False,truncation=True,padding='max_length',max_length=512) for x in tokenized_test])

# 어텐션 마스크
additive 방식으로 진행하기에, padding인 부에는 0, 아닌 부분에는 1로

In [10]:
attention_masks_train=(torch.tensor(train_ids).eq(1)==0).long()
attention_masks_test=(torch.tensor(test_ids).eq(1)==0).long()

  """Entry point for launching an IPython kernel.
  


# Label

In [11]:
train_label=torch.tensor(train_data['피해'].tolist())
test_label=torch.tensor(test_data['피해'].tolist())

# TensorDataset으로 묶기

# WeightedRandomSampler 활용
피해 관련과 피해 비관련의 비율이 너무 상이하기에 (15%,85%) Weighted Random Sampler로 균형적으로 Sampling 작업 실행

In [12]:
weight=[1/85,1/15]
samples_weight=torch.FloatTensor(np.array([weight[t] for t in train_label]))
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [27]:
test_data

<torch.utils.data.dataset.TensorDataset at 0x7f689ac4b2b0>

In [13]:
# 배치 사이즈
batch_size = 2
# train
train_data = TensorDataset(train_ids, attention_masks_train, train_label)
train_dataloader = DataLoader(train_data, batch_size=batch_size,drop_last=True,sampler=sampler)
# test
test_data = TensorDataset(test_ids, attention_masks_test, test_label)
test_dataloader = DataLoader(test_data, batch_size=batch_size,drop_last=True)

# BERT for classification

In [21]:
# 분류를 위한 BERT 모델 생성
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [22]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 1e-6, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 20

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 학습률을 조금씩 감소시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [17]:
import time,datetime
def Eplased(dt):
    d=int(round(dt))
    return str(datetime.timedelta(seconds=d))

In [18]:
from sklearn.metrics import classification_report

In [None]:
# 그래디언트 초기화
# device 설정
device='cuda:0'
COST=[]
# 에폭만큼 반복
for epoch in range(1, epochs+1):
    # ========================================
    #               Training
    # ========================================
    
    # 시작시간
    t0=time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = Eplased(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        optimizer.zero_grad()
        
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행
                      
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]
        COST.append(loss.item())
        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        
    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.5f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(Eplased(time.time() - t0)))   
    print("")
    print("Training complete!")
    
    if epoch%5==0:
        print("")
        print("Check Scores...")

        #시작 시간 설정
        t0 = time.time()

        # 평가모드로 변경
        model.eval()

        # 변수 초기화
        
        Predicted=[]
        Actual=[]

        # 데이터로더에서 배치만큼 반복하여 가져옴
        for batch in train_dataloader:
            # 배치를 GPU에 넣음
            batch = tuple(t.to(device) for t in batch)
            
            # 배치에서 데이터 추출
            b_input_ids, b_input_mask, b_labels = batch
            
            # 그래디언트 계산 안함
            with torch.no_grad():     
                # Forward 수행
                outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask)
            
            # 로스 구함
            logits = outputs[0]

            # CPU로 데이터 이동
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            
            # 출력 로짓과 라벨을 비교하여 정확도 계산
            pred_flat = np.argmax(logits, axis=1).flatten().tolist()
            labels_flat = label_ids.flatten().tolist()
            Predicted.extend(pred_flat)
            Actual.extend(labels_flat)
        
        print('Scores')
        print(classification_report(Actual,Predicted))
        print('//////')
    
        

In [35]:
# Train loss 계산
print("")
print("Check Train Scores...")

#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화

Predicted=[]
Actual=[]

# 데이터로더에서 배치만큼 반복하여 가져옴
for batch in train_dataloader:
    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    pred_flat = np.argmax(logits, axis=1).flatten().tolist()
    labels_flat = label_ids.flatten().tolist()
    Predicted.extend(pred_flat)
    Actual.extend(labels_flat)

print('Scores')
print(classification_report(Actual,Predicted))
print('//////')


Check Train Scores...
Scores
              precision    recall  f1-score   support

           0       0.99      0.87      0.92      3536
           1       0.89      0.99      0.94      3706

    accuracy                           0.93      7242
   macro avg       0.94      0.93      0.93      7242
weighted avg       0.94      0.93      0.93      7242

//////


In [30]:
model.eval()

        # 변수 초기화
        
Predicted=[]
Actual=[]

# 데이터로더에서 배치만큼 반복하여 가져옴
for batch in test_dataloader:
    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    pred_flat = np.argmax(logits, axis=1).flatten().tolist()
    labels_flat = label_ids.flatten().tolist()
    Predicted.extend(pred_flat)
    Actual.extend(labels_flat)

print('Scores')
print(classification_report(Actual,Predicted))
print('//////')

Scores
              precision    recall  f1-score   support

           0       0.94      0.81      0.87       672
           1       0.43      0.74      0.55       132

    accuracy                           0.80       804
   macro avg       0.69      0.78      0.71       804
weighted avg       0.86      0.80      0.82       804

//////


In [34]:
torch.save(model.state_dict(),'./[기상청][koBERT]')

In [None]:
class LabelSmoothLoss(nn.Module):
    
    def __init__(self, smoothing=0.0):
        super(LabelSmoothLoss, self).__init__()
        self.smoothing = smoothing
    
    def forward(self, input, target):
        log_prob = F.log_softmax(input, dim=-1)
        weight = input.new_ones(input.size()) * \
            self.smoothing / (input.size(-1) - 1.)
        weight.scatter_(-1, target.unsqueeze(-1), (1. - self.smoothing))
        loss = (-weight * log_prob).sum(dim=-1).mean()
        return loss