<a href="https://colab.research.google.com/github/6ma6X/nn4nlp-code/blob/master/BOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
from torch.autograd import Variable

In [None]:
class BoW(torch.nn.Module):
    """Bag-of-words classifier: sum of per-word score vectors plus a bias.

    Every word owns an ``ntags``-dimensional embedding that is used directly
    as that word's vector of tag scores; there is no hidden layer.
    """

    def __init__(self, nwords, ntags):
        """Create the embedding table and the bias.

        Args:
            nwords: number of rows in the embedding table (vocabulary size).
            ntags: number of tags; also the dimension of each embedding.
        """
        super(BoW, self).__init__()

        # The bias must be an nn.Parameter: only registered parameters are
        # returned by ``parameters()`` (and therefore updated by the
        # optimizer), moved by ``.cuda()`` / ``.to()`` and stored in
        # ``state_dict()``.  A plain tensor attribute gets none of that.
        # It starts at zero, one entry per tag.
        self.bias = nn.Parameter(torch.zeros(ntags))

        # nwords embeddings, each of dimension ntags.
        self.embedding = nn.Embedding(nwords, ntags)

        # initialize the weights with xavier uniform (Glorot, X. & Bengio, Y. (2010))
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, words):
        """Score a single sentence.

        Args:
            words: tensor of word indices; this notebook passes a 1-D
                LongTensor holding the indices of one sentence.

        Returns:
            Tensor of tag scores; shape ``(1, ntags)`` for a 1-D ``words``.
        """
        emb = self.embedding(words)
        # BoW simply sums the embeddings over the sentence and adds the bias.
        out = torch.sum(emb, dim=0) + self.bias  # size(out) = N
        # Reshape to a 1 x N row vector.
        return out.view(1, -1)

In [None]:
from collections import defaultdict
import time
import random
import numpy as np

In [None]:
# Functions to read in the corpus
# w2i / t2i hand out the next unused integer id the first time a word / tag
# is looked up (the default factory returns the current size of the dict).
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]  # first lookup on the empty table, so "<unk>" gets id 0


def read_dataset(filename):
    """Yield one ``(word_ids, tag_id)`` pair per non-empty line of ``filename``.

    Each line is expected to look like ``tag ||| word word ...``.  The line is
    lower-cased and stripped, the words are split on single spaces, and words
    and tag are converted to integer ids through the module-level ``w2i`` and
    ``t2i`` tables (looked up at call time, so rebinding ``w2i`` between calls
    changes how unseen words are handled).

    Raises:
        ValueError: if a non-empty line does not contain exactly one
            ``" ||| "`` separator.
    """
    with open(filename, "r") as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # Blank line (e.g. a trailing newline at the end of the file):
                # there is nothing to parse, so skip it instead of failing on
                # the tuple unpacking below.
                continue
            tag, words = line.split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])

# Read in the data
train = list(read_dataset("train.txt"))
# Take the vocabulary size now, while w2i holds exactly "<unk>" plus the
# training words.  Once w2i is rebound below, looking up an unseen word still
# inserts that word as a new key (with value UNK), so len(w2i) measured after
# reading the dev set would also count dev-only words and allocate embedding
# rows that no index ever refers to.
nwords = len(w2i)
# From here on, unseen words map to UNK instead of receiving a fresh id.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("test.txt"))
ntags = len(t2i)

In [None]:
!head -n 10 train.txt

3 ||| The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
4 ||| The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth .
3 ||| Singer\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece .
2 ||| You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold .
3 ||| Yet the act is still charming here .
4 ||| Whether or not you 're enlightened by any of Derrida 's lectures on `` the other '' and `` the self , '' Derrida is an undeniably fascinating and playful fellow .
4 ||| Just the labour involved in creating 

In [None]:
train[:3] # list of (word indices, rating (tag)) tuples

[([1,
   2,
   3,
   4,
   5,
   6,
   1,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   9,
   17,
   5,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33],
  0),
 ([1,
   34,
   35,
   36,
   37,
   11,
   1,
   38,
   37,
   1,
   39,
   13,
   40,
   3,
   41,
   42,
   15,
   19,
   43,
   37,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   9,
   52,
   53,
   37,
   54,
   55,
   9,
   56,
   33],
  1),
 ([57,
   58,
   59,
   60,
   19,
   61,
   37,
   62,
   63,
   19,
   64,
   65,
   66,
   26,
   19,
   64,
   67,
   68,
   69,
   5,
   1,
   70,
   63,
   71,
   1,
   72,
   73,
   74,
   75,
   1,
   76,
   26,
   77,
   26,
   78,
   37,
   1,
   79,
   33],
  0)]

In [None]:
w2i["destined"] # mapping from word to index

4

In [None]:
nwords

18648

In [None]:
ntags

5

In [None]:
# Initialize the model

model = BoW(nwords, ntags)
# Move the model to the GPU (when one is available) *before* building the
# optimizer: the torch.optim documentation asks for parameters to live in
# their final location at the time the optimizer is constructed.
if torch.cuda.is_available():
    model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters()) # optimization method

In [None]:
# Tensor type used for index tensors (word ids and tags).  LongTensor holds
# 64-bit integers (FloatTensor, by contrast, holds 32-bit floats); the
# torch.cuda variant is picked when CUDA is available.
use_cuda = torch.cuda.is_available()
type = torch.cuda.LongTensor if use_cuda else torch.LongTensor

# Move the model to the GPU as well when CUDA is available.
if use_cuda:
    model.cuda()

In [None]:
use_cuda

True

In [None]:
# Build index tensors directly on the device the model's parameters live on,
# rather than creating them on the CPU and converting them afterwards.
device = next(model.parameters()).device

for ITER in range(100):
    # Perform training: one optimizer step per sentence, in random order.
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for words, tag in train:
        words = torch.tensor(words, dtype=torch.long, device=device)
        # CrossEntropyLoss expects a batch of targets, hence the 1-element list.
        tag = torch.tensor([tag], dtype=torch.long, device=device)
        scores = model(words)
        loss = criterion(scores, tag)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (
                ITER, train_loss/len(train), time.time()-start))
    # Perform testing
    test_correct = 0.0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every dev sentence.
    with torch.no_grad():
        for words, tag in dev:
            words = torch.tensor(words, dtype=torch.long, device=device)
            scores = model(words)[0].cpu().numpy()
            predict = np.argmax(scores)
            if predict == tag:
                test_correct += 1
    print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))

iter 0: train loss/sent=0.1049, time=6.25s
iter 0: test acc=0.3814
iter 1: train loss/sent=0.0998, time=6.46s
iter 1: test acc=0.3828
iter 2: train loss/sent=0.0958, time=6.45s
iter 2: test acc=0.3828
iter 3: train loss/sent=0.0913, time=6.67s
iter 3: test acc=0.3765
iter 4: train loss/sent=0.0880, time=6.69s
iter 4: test acc=0.3796
iter 5: train loss/sent=0.0837, time=6.30s
iter 5: test acc=0.3792
iter 6: train loss/sent=0.0806, time=6.50s
iter 6: test acc=0.3796
iter 7: train loss/sent=0.0772, time=6.87s
iter 7: test acc=0.3756
iter 8: train loss/sent=0.0742, time=6.68s
iter 8: test acc=0.3738
iter 9: train loss/sent=0.0710, time=6.86s
iter 9: test acc=0.3715
iter 10: train loss/sent=0.0680, time=6.49s
iter 10: test acc=0.3760
iter 11: train loss/sent=0.0654, time=6.36s
iter 11: test acc=0.3697
iter 12: train loss/sent=0.0628, time=6.35s
iter 12: test acc=0.3742
iter 13: train loss/sent=0.0603, time=6.54s
iter 13: test acc=0.3724
iter 14: train loss/sent=0.0582, time=6.74s
iter 14: t

KeyboardInterrupt: ignored