In [None]:
# Show the NVIDIA GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1SfrBnDt7-PrFL8zjfVap-FOPoUS6dqcT" -O data.zip

In [None]:
# Mount Google Drive at /gdrive (Colab only) so files stored there can be read.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# List the tutorial data directory on the mounted Drive.
!ls /gdrive/MyDrive/tutorial_nlp/data

In [None]:
!apt-get -q -y install swig 
!apt-get install mecab
!apt-get install libmecab-dev
!apt-get install mecab-ipadic-utf8
!pip install mecab-python3==0.996.5
!pip install unidic-lite

In [None]:
from collections import defaultdict
import time

import MeCab
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Fix the random seeds so that parameter initialisation (and therefore training)
# is repeatable after "Restart & Run All".
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Create a MeCab tagger with the '-Ochasen' output option and parse a sample sentence.
# Despite its name, `node` holds the value returned by tagger.parse (printed in the next cell).
tagger = MeCab.Tagger('-Ochasen')
node = tagger.parse('坊主が屏風に上手に坊主の絵を描いた')

In [None]:
# Show the raw parser output for the sample sentence.
print(node)

In [None]:
def tokenize(sentence):
    """Split a sentence into tokens with the module-level MeCab ``tagger``.

    The parser output is read line by line; the first tab-separated field of
    each line is taken as the token, and reading stops at the end-of-sentence
    marker line.

    :param sentence: str, raw text to tokenize
    :return tokenized_sentence: list of str, tokens in order of appearance
    """
    tokenized_sentence = []
    for line in tagger.parse(sentence).split('\n'):
        # Only a line that is exactly 'EOS' ends the sentence. A real token whose
        # surface happens to be "EOS" is followed by tab-separated fields and must
        # not cut the sentence short.
        if line == 'EOS':
            break
        tokenized_sentence.append(line.split('\t')[0])
    return tokenized_sentence

In [None]:
# Tokenize the sample sentence; the cell displays the resulting token list.
tokenize('坊主が屏風に上手に坊主の絵を描いた')

In [None]:
# Copy the tutorial data from the mounted Drive into ./data in the working directory.
!mkdir -p data
!cp -r /gdrive/MyDrive/tutorial_nlp/data/* ./data

In [None]:
# List the local copy of the data directory.
!ls data

In [None]:
# Peek at the first five lines of the corpus file.
!head -5 ./data/kokoro.txt

In [None]:
def load_data(path, encoding='utf-8'):
    """Read a text file and tokenize it line by line.

    Each line is stripped of surrounding whitespace and passed to ``tokenize``.
    Blank lines are kept and yield empty token lists, so the result has one
    entry per line of the file.

    :param path: str, path of the text file
    :param encoding: str, text encoding of the file. Given explicitly so that
        decoding does not depend on the platform's locale default.
    :return text: list of list of str, one token list per line
    """
    with open(path, 'r', encoding=encoding) as f:
        return [tokenize(line.strip()) for line in f]

In [None]:
# Load the corpus and tokenize every line.
text = load_data('./data/kokoro.txt')

In [None]:
# Show the token list of the first line.
print(text[0])

In [None]:
# Special tokens and the ids reserved for them:
# padding is id 0, unknown words are id 1.
PAD_TOKEN, UNK_TOKEN = '<PAD>', '<UNK>'
PAD, UNK = 0, 1
# Minimum corpus frequency passed to Vocab.build_vocab.
MIN_COUNT = 1

# Initial word -> id mapping containing only the special tokens.
word2id = {PAD_TOKEN: PAD, UNK_TOKEN: UNK}

In [None]:
class Vocab(object):
    """Word <-> id mapping that can be extended from a tokenized corpus.

    Attributes:
        word2id: dict mapping word -> id.
        id2word: dict mapping id -> word (inverse of ``word2id``).
        raw_vocab: dict mapping word -> corpus count, set by ``build_vocab``.
    """

    def __init__(self, word2id=None):
        """
        :param word2id: dict or None, initial word -> id mapping (e.g. the
            special tokens). A dict passed in is used as is, not copied, so
            ``build_vocab`` extends the caller's dict. ``None`` starts from a
            fresh empty mapping.
        """
        # A literal {} default would be one dict shared by every Vocab() instance.
        self.word2id = {} if word2id is None else word2id
        self.id2word = {id: word for word, id in self.word2id.items()}

    def build_vocab(self, sentences, min_count=1):
        """Add the words of a corpus to the vocabulary.

        Words are added in order of decreasing frequency; each new word gets
        ``len(self.word2id)`` as its id. Words already present keep their id.

        :param sentences: iterable of iterables of str, the tokenized corpus
        :param min_count: int, words occurring fewer times than this are skipped
        """
        # count words in the corpus
        word_counter = defaultdict(int)
        for sentence in sentences:
            for word in sentence:
                word_counter[word] += 1

        # add to vocabs the word whose count >= min_count
        words = sorted(word_counter.items(), key=lambda x: x[1], reverse=True)
        for word, count in words:
            if count < min_count:
                # Sorted by descending count: every remaining word is too rare as well.
                break
            if word in self.word2id:
                # Already known: writing id2word here would bind a fresh id to an
                # existing word and desynchronise the two mappings.
                continue
            _id = len(self.word2id)
            self.word2id[word] = _id
            self.id2word[_id] = word

        # Corpus counts of the vocabulary words that occur in this corpus.
        self.raw_vocab = {w: word_counter[w] for w in self.word2id if w in word_counter}

In [None]:
# Start from the special-token mapping and add the corpus words;
# words seen fewer than MIN_COUNT times are left out.
vocab = Vocab(word2id)
vocab.build_vocab(text, min_count=MIN_COUNT)

In [None]:
# Vocabulary size, including the special tokens.
print(len(vocab.word2id))

In [None]:
def sentence_to_ids(vocab, sen):
    """Convert a tokenized sentence into a list of word ids.

    :param vocab: object with a ``word2id`` dict (word -> id)
    :param sen: iterable of str, the tokens of one sentence
    :return ids: list, the id of every token; tokens missing from
        ``vocab.word2id`` are mapped to ``UNK``
    """
    lookup = vocab.word2id
    ids = []
    for token in sen:
        ids.append(lookup.get(token, UNK))
    return ids

In [None]:
# Convert every tokenized sentence of the corpus into its list of word ids.
id_text = [sentence_to_ids(vocab, sen) for sen in text]

In [None]:
# Compare the first sentence as tokens and as ids.
print(text[0])
print(id_text[0])

In [None]:
def pad_seq(seq, max_length):
    """Pad a sequence of word ids with ``PAD`` up to ``max_length``.

    A list argument is extended in place and the same object is returned.
    A sequence that is already ``max_length`` or longer comes back unchanged.

    :param seq: list of int, word indices
    :param max_length: int, target length (the longest sequence in the batch)
    :return seq: list of int, the padded word indices
    """
    n_missing = max_length - len(seq)
    # A non-positive count multiplies to an empty list, so nothing is appended.
    seq += [PAD] * n_missing
    return seq

### CBOW

In [None]:
# Mini-batch size.
batch_size = 64
# Number of mini-batches to train on.
n_batches = 500
# Number of entries in the vocabulary (special tokens included).
vocab_size = len(vocab.word2id)
# Dimension of the word embedding vectors.
embedding_size = 300

In [None]:
class CBOWDataLoader(object):
    """Iterator producing CBOW mini-batches (context words -> centre word).

    The corpus is walked word by word, sentence by sentence. The position is
    kept on the instance, so stopping early and iterating again resumes where
    the previous loop stopped. When the corpus is exhausted the loader rewinds
    to the beginning and raises ``StopIteration``; words collected for a batch
    that could not be filled are dropped.
    """

    def __init__(self, text, batch_size, window=3):
        """
        :param text: list of list of int, the corpus with words converted to ids
        :param batch_size: int, size of a mini-batch (must be >= 1)
        :param window: int, maximum distance between a context word and the target word
        :raises ValueError: if batch_size is smaller than 1
        """
        if batch_size < 1:
            # Otherwise __next__ would return empty batches forever.
            raise ValueError('batch_size must be >= 1')
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.s_pointer = 0  # index of the sentence being scanned
        self.max_s_pointer = len(self.text)
        self.w_pointer = 0  # index of the target word inside that sentence

    def __iter__(self):
        return self

    def __next__(self):
        """
        :return batch_X: torch.Tensor(dtype=torch.long), (batch_size, window*2), context word ids padded with PAD
        :return batch_Y: torch.Tensor(dtype=torch.long), (batch_size,), target word ids
        :raises StopIteration: when the corpus ends before a full batch is collected
        """
        batch_X, batch_Y = [], []

        while len(batch_X) < self.batch_size:
            # Every sentence has been scanned: rewind for the next pass and stop.
            if self.s_pointer >= self.max_s_pointer:
                self.s_pointer = 0
                self.w_pointer = 0
                raise StopIteration

            # Sentence currently being scanned
            sen = self.text[self.s_pointer]
            # Sentence finished (or empty, e.g. from a blank line): move to the
            # start of the next one instead of indexing past its end.
            if self.w_pointer >= len(sen):
                self.s_pointer += 1
                self.w_pointer = 0
                continue

            # Word to predict
            word_Y = sen[self.w_pointer]
            # Context words: up to `window` words on each side of the target
            words_X = sen[max(0, self.w_pointer - self.window):self.w_pointer] + sen[self.w_pointer + 1:self.w_pointer + 1 + self.window]
            words_X = pad_seq(words_X, self.window * 2)
            batch_X.append(words_X)
            batch_Y.append(word_Y)
            self.w_pointer += 1

        # The data must be torch.Tensor; dtype and device are given explicitly.
        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)

        return batch_X, batch_Y

In [None]:
class CBOW(nn.Module):
    """CBOW model: scores every vocabulary word from the summed context embeddings."""

    def __init__(self, vocab_size, embedding_size):
        """
        :param vocab_size: int, number of words in the vocabulary
        :param embedding_size: int, dimension of the word embedding vectors
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # Fully connected output layer without bias
        self.linear = nn.Linear(embedding_size, vocab_size, bias=False)

    def forward(self, batch_X, batch_Y):
        """
        :param batch_X: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :param batch_Y: torch.Tensor(dtype=torch.long), (batch_size,)
        :return loss: torch.Tensor(dtype=torch.float), the CBOW loss
        """
        # (batch_size, window*2, 1): 1.0 at real words, 0.0 at padded positions
        keep = (batch_X != PAD).float().unsqueeze(-1)
        # Embed, zero out the padded positions, then sum over the context:
        # (batch_size, window*2, embedding_size) -> (batch_size, embedding_size)
        context = (self.embedding(batch_X) * keep).sum(dim=1)
        # (batch_size, vocab_size)
        logits = self.linear(context)
        # Negative log-likelihood of the target words under the softmax
        return F.nll_loss(F.log_softmax(logits, dim=-1), batch_Y)

### CBOW Training

In [None]:
# Model on the selected device, Adam with its default settings, and a data
# loader over the id-encoded corpus using the default window size.
cbow = CBOW(vocab_size, embedding_size).to(device)
optimizer_cbow = optim.Adam(cbow.parameters())
dataloader_cbow = CBOWDataLoader(id_text, batch_size)

In [None]:
def compute_loss(model, inputs, optimizer, is_train=True):
    """Compute the loss for one batch and, when training, apply one update.

    The model is switched to train mode if ``is_train`` is True and to
    evaluation mode otherwise. Gradient tracking follows the same flag, so an
    evaluation pass does not build an autograd graph.

    :param model: the model; ``model(*inputs)`` must return a scalar loss tensor
    :param inputs: sequence of the positional arguments passed to the model
    :param optimizer: optimizer updating the model parameters (only used when is_train is True)
    :param is_train: bool, whether to train the model on this batch
    :return: float, the loss value of the batch
    """
    model.train(is_train)
    with torch.set_grad_enabled(is_train):
        loss = model(*inputs)
    if is_train:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()

In [None]:
# Train CBOW: one optimisation step per mini-batch, a progress line every
# 100 batches, stopping once n_batches batches have been processed.
start_at = time.time()

for batch_id, (batch_X, batch_Y) in enumerate(dataloader_cbow):
    n_done = batch_id + 1  # number of batches processed so far
    loss = compute_loss(cbow, [batch_X, batch_Y], optimizer_cbow, is_train=True)
    if n_done % 100 == 0:
        print(f'Time {time.time() - start_at:.2f} [sec] Loss {loss:.4f}')
    if n_done >= n_batches:
        break

print('Training finished in {:.2f} [sec]'.format(time.time() - start_at))

### Skipgram

### Word Similarity

In [None]:
def compute_word_similarity(embedding_path, word, n):
    """Return the words most similar to ``word`` with their cosine similarity.

    The word is looked up in the module-level ``vocab``; row ``i`` of the
    loaded embedding matrix is taken as the vector of the word with id ``i``.

    :param embedding_path: str, path of the saved embedding-layer parameters
    :param word: str, the query word
    :param n: int, number of similar words to return
    :return out: str, the top-n similar words formatted as ``word(similarity)``, comma separated
    :raises KeyError: if ``word`` is not in the vocabulary
    """
    # torch.load unpickles the file: only load embedding files you trust.
    # NOTE(review): if these files hold pickled NumPy arrays, PyTorch releases
    # whose torch.load defaults to weights_only=True may reject them -- TODO
    # confirm how the .pth files were saved.
    embedding = torch.load(embedding_path, map_location='cpu')
    if isinstance(embedding, torch.Tensor):
        # Accept a saved tensor as well as a saved NumPy array.
        embedding = embedding.detach().cpu().numpy()
    embedding = np.asarray(embedding)

    word_id = vocab.word2id[word]

    # Scale every word vector to unit length (all-zero rows are left as they are)
    norm = np.linalg.norm(embedding, ord=2, axis=1, keepdims=True)
    norm = np.where(norm == 0, 1, norm)
    embedding = embedding / norm
    e = embedding[word_id]

    # Cosine similarity between the query vector and every word vector
    cos_sim = np.dot(embedding, e.reshape(-1, 1)).reshape(-1, )
    ranked = np.argsort(cos_sim, axis=-1)[::-1]
    # Drop the query word by id rather than assuming it is ranked first:
    # with ties (or a zero vector) another word can take the top position.
    most_sim = ranked[ranked != word_id][:n]
    most_sim_words = [vocab.id2word[idx] for idx in most_sim]
    top_cos_sim = cos_sim[most_sim]
    out = ', '.join([w + f'({v:.4f})' for w, v in zip(most_sim_words, top_cos_sim)])
    return out

In [None]:
# After training on only 500 batches
models = ["cbow", "sg", "sgns"]
for model in models:
    similar = compute_word_similarity("./data/" + model + "_embedding.pth", "私", 5)
    print(model+"\t:", similar)

In [None]:
# After training for 1 epoch
models = ["cbow", "sg", "sgns"]
for model in models:
    similar = compute_word_similarity("./data/" + model + "_embedding_1E.pth", "私", 5)
    print(model+"\t:", similar)

In [None]:
# After training for 3 epochs
models = ["cbow", "sg", "sgns"]
for model in models:
    similar = compute_word_similarity("./data/" + model + "_embedding_3E.pth", "私", 5)
    print(model+"\t:", similar)