In [1]:
import torch
print(torch.__version__)

1.4.0


In [2]:
!pip install torchtext==0.4.0

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import torchtext
import numpy as np

In [4]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable

## 하이퍼 파라미터 셋팅

In [5]:
batch_size= 128
num_epochs= 10

word_vec_size= 256 #==embedding size
dropout_p=0.3

hidden_size= 512
num_layer= 4

learning_rate= 0.001

In [6]:
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. SMS train, test dataset 가져오기

In [7]:
from data_loader import DataLoader

In [8]:
loaders= DataLoader(
    train_fn='data/sms.maxlen.uniq.shuf.train.tsv',
    batch_size=batch_size,
    valid_ratio= .2,
    device=-1,
    max_vocab= 999999,
    min_freq=5,
)

In [9]:
test_loaders= DataLoader(
    train_fn='data/sms.maxlen.uniq.shuf.test.tsv',
    batch_size=batch_size,
    valid_ratio= .01,
    device=-1,
    max_vocab= 999999,
    min_freq=5,
)

## 2. 대략적인 데이터 형태

In [10]:
print("|train| = ", len(loaders.train_loader.dataset),
     '|valid| = ', len(loaders.valid_loader.dataset))
vocab_size= len(loaders.text.vocab)
num_classes= len(loaders.label.vocab)
print('|vocab| =', vocab_size, '|classes| =', num_classes)

|train| =  3723 |valid| =  931
|vocab| = 1541 |classes| = 2


## 3. 데이터 로드 함수

In [11]:
n=3
for i, data in enumerate(loaders.train_loader):
    labels= data.label
    texts= data.text
    
    if i>n:
        break
    print('[%d]' %i)
    print('한 번에 로드되는 데이터 크기: ', len(labels))
    
    for j in range(n):
#         print(labels[j]) #형태 변환이 어떻게 되는지 궁금해서 출력해봄 -> tensor(0) # 0은 label
        label= labels[j].numpy() #tensor -> numpy로 변환
        text= texts[j].numpy()
        print('label: ', label)
        print('text: ', text.shape)

[0]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (12,)
label:  0
text:  (12,)
label:  0
text:  (12,)
[1]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (5,)
label:  0
text:  (5,)
label:  0
text:  (5,)
[2]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (54,)
label:  0
text:  (54,)
label:  0
text:  (54,)
[3]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (31,)
label:  1
text:  (31,)
label:  1
text:  (31,)


## 4. 모델 선언

In [12]:
class RNN(nn.Module):
    def __init__(self,
                input_size, #vocab_size
                word_vec_size, #word embbeding vector 차원
                hidden_size, #bidirectional LSTM의 hidden state & cell state의 size
                n_classes,
                num_layers=4, #쌓을 레이어 수
                dropout_p= 0.3
                ):
        super(RNN, self).__init__()
        
        self.input_size= input_size
        self.word_vec_size= word_vec_size
        self.hidden_size= hidden_size
        self.n_classes= n_classes
        self.num_layer= num_layer
        self.dropout_p= dropout_p
        
        #입력차원(vocab_size), 출력차원(word_vec_size)
        self.emb= nn.Embedding(input_size, word_vec_size) #부터
        
        self.lstm= nn.LSTM(input_size= word_vec_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          dropout= dropout_p,
                          batch_first=True,
                          bidirectional= True)
        self.fc= nn.Linear(hidden_size*2, num_classes)
        self.activation= nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        # x: (batch_size, length)
        x= self.emb(x)
        
        # x:(batch_size, length, word_vec_size)
        x, _= self.lstm(x) # x: output, _: 마지막 time step의 hidden state& cell state
        
        # x: (batch_size, length, hidden_size*2)
        # x[:,-1]: (batch_size, 1, hidden_size*2)
        out= self.activation( self.fc(x[:,-1]) ) #마지막 time step
        # self.fc(x[:,-1]): (batch_size, num_classes)
        
        return out

In [13]:
model= RNN(input_size= vocab_size,
          word_vec_size=word_vec_size,
          hidden_size=hidden_size,
          n_classes=num_classes,
          num_layers=num_layer,
          dropout_p=dropout_p)

In [14]:
def ComputeAccr(dloader, imodel):
    correct=0
    total=0
    
    model.eval()
    for i, data in enumerate(dloader):
        texts= data.text.to(device)
        labels= data.label.to(device)
        
        output= model(texts) #(batch_size, num_classes)
        _, output_index= torch.max(output, 1) #(batch_size, 1)
        
        total += labels.size(0)
        correct += (output_index == labels).sum().float()
        
    model.train()
    return (100*correct/total).numpy() #tensor->numpy

In [15]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 13.53


## 5. loss, optimizer

In [16]:
loss_func= nn.NLLLoss()
optimizer= torch.optim.Adam(model.parameters(), lr= learning_rate)

## 6. 학습

In [18]:
total_step= len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts= data.text.to(device)
        labels= data.label.to(device)

        print('[%d]'%i)

        # Forward prop
        outputs= model(texts)
        loss= loss_func(outputs, labels)

        # Backward prop & opimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) %10==0:
            print('Epoch [{}/{}], step [{}/{}], Loss:{:.4f}, Accr:{:.2f}'
                  .format(epoch+1, num_epochs, i+1, total_step, loss.item(), ComputeAccr(loaders.valid_loader, model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], step [10/30], Loss:0.1620, Accr:86.47
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], step [20/30], Loss:0.4294, Accr:86.25
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [1/10], step [30/30], Loss:0.0864, Accr:86.47
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [2/10], step [10/30], Loss:0.3523, Accr:86.47
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [2/10], step [20/30], Loss:0.1788, Accr:86.47
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [2/10], step [30/30], Loss:0.5501, Accr:86.47
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [3/10], step [10/30], Loss:0.1924, Accr:86.47
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [3/10], step [20/30], Loss:0.1640, Accr:86.47
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [3/10], step [30/30], Loss:0.0991, Accr:86.47
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [4/10], step [10/30], Loss:0.2277, Accr:86.47
[10]
[11]
[12]
[13]


In [19]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 92.80


In [20]:
netname= './nets/rnn_weight.pkl'
torch.save(model, netname, )

  "type " + obj.__name__ + ". It won't be checked "
