In [1]:
! pip install transformers
! pip install konlpy
! pip install kss

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 6.2MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 25.5MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 60.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torchtext
from google.colab import drive
from konlpy.tag import Okt
from torch.autograd import Variable, grad
from transformers import AdamW,get_linear_schedule_with_warmup,get_constant_schedule_with_warmup
import time
import datetime
import pickle
import kss
from sklearn.metrics import confusion_matrix,classification_report
drive.mount('/content/gdrive')
os.chdir('./gdrive/My Drive/기상청')

Mounted at /content/gdrive


In [210]:
# pretrained된 w2v model 불러오기
vectors=torchtext.vocab.Vectors('wv_128',cache='./')

In [211]:
class Config(dict): 
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

In [212]:
config=Config({'embedding_dim':128,'hidden_dim':256,'seq_len':512,'batch_size':32,'dense_dim':32,'dropout':0.3,'num_layers':1,'padding_idx':1})

In [213]:
# 1 preprocessing
# torchtext를 활용하면 한방에(tokenize,wordembedding,padding ...)
# SOS,EOS,padding,fix_len : 50
import torchtext
# Field 정의 ~ 빅데이터 시스템 및 계산에서 공부한 column과 비슷
# Text <- padding 하지 말아라
Text=torchtext.data.Field(sequential=True,use_vocab=True,batch_first=True,tokenize=lambda i : okt.morphs(i,stem=True),lower=False,fix_length=config.seq_len,init_token='<SOS>',eos_token='<EOS>',pad_token='<PAD>',unk_token='<UNK>')
Target=torchtext.data.Field(sequential=False,use_vocab=False,batch_first=True,is_target=True)

# Data 정의
Train_data=torchtext.data.TabularDataset('./train_data.csv',format='csv',fields=[('total',Text),('피해',Target)],
                                           skip_header=True)
Test_data=torchtext.data.TabularDataset('./test_data.csv',format='csv',fields=[('total',Text),('피해',Target)],
                                           skip_header=True)

In [214]:
# load embeddings using torchtext
Text.build_vocab(Train_data,min_freq=3,vectors=vectors)

In [215]:
# data loader
train_loader=torchtext.data.Iterator(Train_data,batch_size=config.batch_size)
test_loader=torchtext.data.Iterator(Test_data,batch_size=config.batch_size)

In [216]:
class lstm_model(nn.Module):
    def __init__(self,config,vectors):
        super().__init__()
        self.config=config
        self.embedding=nn.Embedding.from_pretrained(vectors,freeze=False,padding_idx=self.config.padding_idx)
        self.lstm=nn.LSTM(self.config.embedding_dim,self.config.hidden_dim,bidirectional=True,batch_first=True)
        self.classifier=nn.Sequential(nn.Linear(2*self.config.hidden_dim,self.config.dense_dim),nn.ReLU(),nn.Linear(self.config.dense_dim,2))
        # classifier 부분의 마지막 단은, 어차피 cross entropy를 활용할 것이기 때문에 굳이 sigmoid를 취하지 않아도 된다.
        self.dropout=nn.Dropout(self.config.dropout)
    def forward(self,input):
        '''
        input shape : batch size, seq_len 
        '''
        embed=self.embedding(input) # batch size, seq_len, embedding_dim
        embed=self.dropout(embed)
        input,(hidden,cell)=self.lstm(embed) # input shape  :batch size, seq len, hidden dim
        input=input[:,-1,:] # batch size, 2*hidden dim
        pred=self.classifier(input) # pred shape : batch size, 2
        return pred

In [226]:
# train
import time
device = 'cuda:0'
model = lstm_model(config,Text.vocab.vectors)
model.to(device)
# Criterion
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor([0.1,0.9]).cuda())

# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 1e-4, # 학습률
                  eps = 1e-8, # 0으로 나누는 것을 막아준다.
                  weight_decay=0.3
                )
# 에폭수
epochs = 100

In [222]:
# 시간 표시 함수
import datetime
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [227]:
start_time=time.time()
for epoch in range(1,epochs+1):
    model.train()
    total_loss=0
    total_acc=0
    for batch in train_loader:
        x=batch.total.to(device)
        y=batch.피해.to(device)
        optimizer.zero_grad()
        pred = model(x)
        loss=criterion(pred,y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1) 
        optimizer.step()   
        total_loss+=loss.item()
        acc=(pred.argmax(-1)==y).float().mean()
        total_acc+=acc
    if epoch%10==0:
        print("EPOCH : %d"%epoch)
        print("| TRAIN_LOSS : %.4f | TRAIN_ACC : %.4f | Eplased time : %s"\
            %(total_loss/len(train_loader),total_acc/len(train_loader),format_time(time.time()-start_time)))
        

            

EPOCH : 10
| TRAIN_LOSS : 0.3491 | TRAIN_ACC : 0.9021 | Eplased time : 0:01:52
EPOCH : 20
| TRAIN_LOSS : 0.3163 | TRAIN_ACC : 0.9211 | Eplased time : 0:03:44
EPOCH : 30
| TRAIN_LOSS : 0.2596 | TRAIN_ACC : 0.9316 | Eplased time : 0:05:35
EPOCH : 40
| TRAIN_LOSS : 0.2500 | TRAIN_ACC : 0.9316 | Eplased time : 0:07:25
EPOCH : 50
| TRAIN_LOSS : 0.2874 | TRAIN_ACC : 0.9222 | Eplased time : 0:09:15
EPOCH : 60
| TRAIN_LOSS : 0.2881 | TRAIN_ACC : 0.9157 | Eplased time : 0:11:05
EPOCH : 70
| TRAIN_LOSS : 0.2935 | TRAIN_ACC : 0.9286 | Eplased time : 0:12:54
EPOCH : 80
| TRAIN_LOSS : 0.3155 | TRAIN_ACC : 0.9193 | Eplased time : 0:14:44
EPOCH : 90
| TRAIN_LOSS : 0.3371 | TRAIN_ACC : 0.9127 | Eplased time : 0:16:34
EPOCH : 100
| TRAIN_LOSS : 0.3202 | TRAIN_ACC : 0.9074 | Eplased time : 0:18:22


## TRAIN

In [228]:
model.eval()
total_loss=0
total_acc=0
Pred=[]
TruE=[]
for batch in train_loader:
    x=batch.total.to(device)
    y=batch.피해.to(device)
    pred = model(x)
    loss=criterion(pred,y)
    total_loss+=loss.item()
    acc=(pred.argmax(-1)==y).float().mean()
    total_acc+=acc
    Pred.extend(pred.argmax(-1).cpu().tolist())
    TruE.extend(y.cpu().tolist())

print("| TRAIN_LOSS : %.4f | TRAIN_ACC : %.4f |"\
    %(total_loss/len(train_loader),total_acc/len(train_loader)))   
print(classification_report(TruE,Pred))     

model.eval()
total_loss=0
total_acc=0
Pred=[]
TruE=[]
for batch in test_loader:
    x=batch.total.to(device)
    y=batch.피해.to(device)
    pred = model(x)
    loss=criterion(pred,y)
    total_loss+=loss.item()
    acc=(pred.argmax(-1)==y).float().mean()
    total_acc+=acc
    Pred.extend(pred.argmax(-1).cpu().tolist())
    TruE.extend(y.cpu().tolist())

print("| TEST_LOSS : %.4f | TEST_ACC : %.4f |"\
    %(total_loss/len(test_loader),total_acc/len(test_loader)))          
print(classification_report(TruE,Pred))

| TRAIN_LOSS : 0.3025 | TRAIN_ACC : 0.9138 |
              precision    recall  f1-score   support

           0       0.97      0.92      0.95      6074
           1       0.69      0.86      0.76      1169

    accuracy                           0.91      7243
   macro avg       0.83      0.89      0.86      7243
weighted avg       0.93      0.91      0.92      7243

| TEST_LOSS : 0.9694 | TEST_ACC : 0.8493 |
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       673
           1       0.54      0.58      0.56       132

    accuracy                           0.85       805
   macro avg       0.73      0.74      0.74       805
weighted avg       0.86      0.85      0.85       805



In [229]:
torch.save(model.stat6e_dict(),'./model_w2v_lstm_2')