In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets

In [2]:
BATCH_SIZE = 64
lr = 0.001
EPOCHS = 10
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("다음 기기로 학습합니다:", DEVICE)

다음 기기로 학습합니다: cuda


In [3]:
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.Field(sequential=False, batch_first=True)

In [4]:
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

In [34]:
print(' '.join(vars(trainset.examples[0])['text']))

a winters day, 28th december 1986, two bored 14 year olds hire a movie. "hmmmm, police story, looks interesting", "who is this jackie chan?", "never heard of him". two hours later after watching the film, in a daze, we wanted to know more. 16 years later (and severely out of pocket from collecting jc movies!) the film still grabs me like no other. ok, maybe i have a soft spot for it as it was my "first" (cannonball run doesn't count!!) jc movie, but it is an excellent movie. it has all the classic jc elements, action, humour, action, heart and action! some comments say it's dated, it was made in 1985, of course it's dated! but then so must jaws, casablanca, singin' in the rain and the godfather!!!!!! without movies like police story where would hollywood action be today? ps set standards, many a scene has been stolen for use in other movies. to really fully appreciate it you must see it in widescreen, you miss so much of the movie otherwise (yes, he really does fall off the bus going r

In [35]:
print(vars(trainset.examples[0])['text'])

['a', 'winters', 'day,', '28th', 'december', '1986,', 'two', 'bored', '14', 'year', 'olds', 'hire', 'a', 'movie.', '"hmmmm,', 'police', 'story,', 'looks', 'interesting",', '"who', 'is', 'this', 'jackie', 'chan?",', '"never', 'heard', 'of', 'him".', 'two', 'hours', 'later', 'after', 'watching', 'the', 'film,', 'in', 'a', 'daze,', 'we', 'wanted', 'to', 'know', 'more.', '16', 'years', 'later', '(and', 'severely', 'out', 'of', 'pocket', 'from', 'collecting', 'jc', 'movies!)', 'the', 'film', 'still', 'grabs', 'me', 'like', 'no', 'other.', 'ok,', 'maybe', 'i', 'have', 'a', 'soft', 'spot', 'for', 'it', 'as', 'it', 'was', 'my', '"first"', '(cannonball', 'run', "doesn't", 'count!!)', 'jc', 'movie,', 'but', 'it', 'is', 'an', 'excellent', 'movie.', 'it', 'has', 'all', 'the', 'classic', 'jc', 'elements,', 'action,', 'humour,', 'action,', 'heart', 'and', 'action!', 'some', 'comments', 'say', "it's", 'dated,', 'it', 'was', 'made', 'in', '1985,', 'of', 'course', "it's", 'dated!', 'but', 'then', 'so',

In [5]:
TEXT.build_vocab(trainset, min_freq=5)
LABEL.build_vocab(trainset)

In [6]:
trainset, valset = trainset.split(split_ratio=0.8)
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (trainset, valset, testset),
    batch_size=BATCH_SIZE,
    shuffle=True,
    repeat=False)

In [7]:
vocab_size = len(TEXT.vocab)
n_classes = 2

In [8]:
print("[학습셋]: {} [검증셋]: {} [테스트셋]: {} [단어셋]: {} [클래스]: {}".format(len(trainset), len(valset), len(testset), vocab_size, n_classes))

[학습셋]: 20000 [검증셋]: 5000 [테스트셋]: 25000 [단어셋]: 46159 [클래스]: 2


In [9]:
class BasicGRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(BasicGRU, self).__init__()
        print("Building Basic GRU model...")
        self.n_layers = n_layers
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.gru(x, h_0)  # [i, b, h]
        h_t = x[:,-1,:]
        self.dropout(h_t)
        logit = self.out(h_t)  # [b, h] -> [b, o]
        return logit
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [10]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1)  # 레이블 값을 0과 1로 변환
        optimizer.zero_grad()

        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

In [11]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1) # 레이블 값을 0과 1로 변환
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [12]:
model = BasicGRU(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Building Basic GRU model...


In [13]:
best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("[이폭: %d] 검증 오차:%5.2f | 검증 정확도:%5.2f" % (e, val_loss, val_accuracy))
    
    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss

[이폭: 1] 검증 오차: 0.69 | 검증 정확도:51.00
[이폭: 2] 검증 오차: 0.69 | 검증 정확도:50.00
[이폭: 3] 검증 오차: 0.70 | 검증 정확도:51.00
[이폭: 4] 검증 오차: 0.68 | 검증 정확도:53.00
[이폭: 5] 검증 오차: 0.37 | 검증 정확도:84.00
[이폭: 6] 검증 오차: 0.32 | 검증 정확도:86.00
[이폭: 7] 검증 오차: 0.34 | 검증 정확도:87.00
[이폭: 8] 검증 오차: 0.37 | 검증 정확도:86.00
[이폭: 9] 검증 오차: 0.42 | 검증 정확도:86.00
[이폭: 10] 검증 오차: 0.40 | 검증 정확도:87.00


In [14]:
model.load_state_dict(torch.load('./snapshot/txtclassification.pt'))
test_loss, test_acc = evaluate(model, test_iter)
print('테스트 오차: %5.2f | 테스트 정확도: %5.2f' % (test_loss, test_acc))

테스트 오차:  0.33 | 테스트 정확도: 85.00
