# 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [268]:
import pandas as pd
import re
from collections import Counter
from tqdm import tqdm
tqdm.pandas()


def tokenize(doc):
    tokens = doc.split(' ')
    return tokens


def normalize(doc):
    doc = re.sub(r"[',.]", '', doc)   # 記号を削除
    doc = re.sub(r" {2,}", ' ', doc)  # 2回以上続くスペースを削除
    doc = re.sub(r" *?$", '', doc)    # 行頭と行末のスペースを削除
    doc = re.sub(r"^ *?", '', doc)
    doc = doc.lower()                 # 小文字に統一
    return doc


def token2id(token):
    if token in token2id_dic:
        return token2id_dic[token]
    else:
        return 0


columns = ('category', 'title')

train = pd.read_csv('../../data/NewsAggregatorDataset/train.txt',
                    names=columns, sep='\t')


docs = [normalize(doc) for doc in train.title.values.tolist()]
tokens = [tokenize(doc) for doc in docs]
tokens = sum(tokens, [])  # flat list
counter = Counter(tokens)

token2id_dic = {}
vocab_size = len(counter)
for index, (token, freq) in enumerate(counter.most_common(), 1):
    if freq < 2:
        token2id_dic[token] = 0
    else:
        token2id_dic[token] = index


In [272]:
token2id('the')

3

# 81. RNNによる予測
ID番号で表現された単語列x=(x1,x2,…,xT)がある．ただし，Tは単語列の長さ，xt∈ℝVは単語のID番号のone-hot表記である（Vは単語の総数である）．再帰型ニューラルネットワーク（RNN: Recurrent Neural Network）を用い，単語列xからカテゴリyを予測するモデルとして，次式を実装せよ...

In [50]:
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch

def preprocessor(doc):
    doc = normalize(doc)    
    tokens = tokenize(doc)
    return tokens


def tokens2ids(tokens):
    tokens = [token2id(token) for token in tokens]
    return torch.tensor(tokens, dtype=torch.int64)

In [259]:
dw = 300
dh = 50
L = 4

class RNN(nn.Module):
    def __init__(self, data_size, hidden_size, output_size, vocab_size):
        super(RNN, self).__init__()
        self.emb = torch.nn.Embedding(vocab_size, data_size)
        self.rnn = torch.nn.RNN(dw, dh, nonlinearity='relu')
        self.liner = nn.Linear(hidden_size, output_size)


    def forward(self, data, last_hidden):           # data: (max_len)
        data = self.emb(data)                       # data: (max_length, dw)
        y, hidden = self.rnn(data, last_hidden)     # y: (max_len, dh), hidden: (max_len, dh)
        y = y[:,-1,:]
        y = self.liner(y)
        y = torch.softmax(y, dim=1)
        return y, hidden


In [274]:
train['tokens'] = train.title.apply(preprocessor)
X_train = train.tokens.apply(tokens2ids)
X_train[0]

# tensor([   8,    0, 2416, 1604, 2143,    5, 1605,    4,  745])

tensor([   8,    0, 2416, 1604, 2143,    5, 1605,    4,  745])

In [275]:
max_len = train.tokens.apply(len).max()
model = RNN(dw, dh, L, vocab_size)

inputs = pad_sequence(X_train, batch_first=True)
h0 = torch.zeros(1, 121, dh, dtype=torch.float32)

outputs, hidden = model(inputs, h0)
print(outputs.size())
print(hidden.size())

'''
torch.Size([10672, 4])
torch.Size([1, 121, 50])
'''

torch.Size([10672, 4])
torch.Size([1, 121, 50])


'\ntorch.Size([10672, 4])\ntorch.Size([1, 121, 50])\n'

# 82. 確率的勾配降下法による学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [338]:
columns = ('category', 'title')

train = pd.read_csv('../../data/NewsAggregatorDataset/train.txt',
                    names=columns, sep='\t')
test = pd.read_csv('../../data/NewsAggregatorDataset/test.txt',
                   names=columns, sep='\t')

train['tokens'] = train.title.apply(preprocessor)
test['tokens'] = test.title.apply(preprocessor)

X_train = train.tokens.apply(tokens2ids)
X_test = test.tokens.apply(tokens2ids)

label2int = {'b': 0, 't': 1, 'e': 2, 'm': 3}
Y_train = train.category.map(label2int)
Y_test = test.category.map(label2int)
Y_train = torch.tensor(Y_train).long()
Y_test = torch.tensor(Y_test).long()

max_len = train.tokens.apply(len).max()
dataset_size = len(train)

In [340]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import numpy as np

def accuracy(pred, label):
    pred = np.argmax(pred.data.numpy(), axis=1) # 行ごとに最大値のインデックスを取得する．
    label = label.data.numpy()
    return (pred == label).mean()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNN(dw, dh, L, vocab_size)
criterion = nn.CrossEntropyLoss()  # クロスエントロピー損失関数
optimizer = optim.SGD(model.parameters(), lr=0.01)  # 確率的勾配降下法

X_train = pad_sequence(X_train, batch_first=True)
ds = TensorDataset(X_train, Y_train)
loader = DataLoader(ds, batch_size=1, shuffle=True)

model   = model.to(device)

for epoch in range(10):
    hidden = torch.zeros(1, 121, dh, dtype=torch.float32)
    n_correct = 0
    total_loss = 0
    for inputs, label in tqdm(loader):
        inputs = inputs.to(device)
        label = label.to(device)
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() # パラメータを更新

        total_loss += loss.data 
        outputs = np.argmax(outputs.data.numpy(), axis=1)
        label = label.data.numpy()
        hidden = hidden.detach()
        if outputs == label:
            n_correct += 1
        
    print('epoch: %d loss: %f accuracy: %f' % (epoch, loss, n_correct/dataset_size))


print('Finished Training')

'''
100%|██████████| 10672/10672 [02:33<00:00, 69.55it/s]
  0%|          | 9/10672 [00:00<02:10, 81.85it/s]epoch: 0 loss: 1.295777 accuracy: 0.416698
100%|██████████| 10672/10672 [02:31<00:00, 70.39it/s]
  0%|          | 7/10672 [00:00<02:38, 67.40it/s]epoch: 1 loss: 1.220215 accuracy: 0.407234
100%|██████████| 10672/10672 [02:17<00:00, 77.80it/s]
  0%|          | 8/10672 [00:00<02:21, 75.47it/s]epoch: 2 loss: 1.665452 accuracy: 0.411544
100%|██████████| 10672/10672 [02:12<00:00, 80.81it/s]
  0%|          | 8/10672 [00:00<02:21, 75.41it/s]epoch: 3 loss: 1.081051 accuracy: 0.410795
100%|██████████| 10672/10672 [02:38<00:00, 67.43it/s]
  0%|          | 8/10672 [00:00<02:18, 77.02it/s]epoch: 4 loss: 1.035693 accuracy: 0.412950
100%|██████████| 10672/10672 [02:38<00:00, 67.30it/s]
  0%|          | 3/10672 [00:00<06:55, 25.66it/s]epoch: 5 loss: 1.218706 accuracy: 0.416229
100%|██████████| 10672/10672 [02:19<00:00, 76.62it/s]
  0%|          | 7/10672 [00:00<02:37, 67.82it/s]epoch: 6 loss: 1.135605 accuracy: 0.408452
100%|██████████| 10672/10672 [02:02<00:00, 86.95it/s]
  0%|          | 9/10672 [00:00<02:08, 82.73it/s]epoch: 7 loss: 1.205219 accuracy: 0.413137
100%|██████████| 10672/10672 [02:12<00:00, 80.55it/s]
  0%|          | 8/10672 [00:00<02:21, 75.44it/s]epoch: 8 loss: 1.667904 accuracy: 0.414543
100%|██████████| 10672/10672 [02:12<00:00, 80.39it/s]epoch: 9 loss: 1.123815 accuracy: 0.413981
Finished Training
'''

100%|██████████| 10672/10672 [02:33<00:00, 69.55it/s]
  0%|          | 9/10672 [00:00<02:10, 81.85it/s]epoch: 0 loss: 1.295777 accuracy: 0.416698
100%|██████████| 10672/10672 [02:31<00:00, 70.39it/s]
  0%|          | 7/10672 [00:00<02:38, 67.40it/s]epoch: 1 loss: 1.220215 accuracy: 0.407234
100%|██████████| 10672/10672 [02:17<00:00, 77.80it/s]
  0%|          | 8/10672 [00:00<02:21, 75.47it/s]epoch: 2 loss: 1.665452 accuracy: 0.411544
100%|██████████| 10672/10672 [02:12<00:00, 80.81it/s]
  0%|          | 8/10672 [00:00<02:21, 75.41it/s]epoch: 3 loss: 1.081051 accuracy: 0.410795
100%|██████████| 10672/10672 [02:38<00:00, 67.43it/s]
  0%|          | 8/10672 [00:00<02:18, 77.02it/s]epoch: 4 loss: 1.035693 accuracy: 0.412950
100%|██████████| 10672/10672 [02:38<00:00, 67.30it/s]
  0%|          | 3/10672 [00:00<06:55, 25.66it/s]epoch: 5 loss: 1.218706 accuracy: 0.416229
100%|██████████| 10672/10672 [02:19<00:00, 76.62it/s]
  0%|          | 7/10672 [00:00<02:37, 67.82it/s]epoch: 6 loss: 1.1356

# 83. ミニバッチ化・GPU上での学習
問題82のコードを改変し，B事例ごとに損失・勾配を計算して学習を行えるようにせよ（Bの値は適当に選べ）．また，GPU上で学習を実行せよ．

In [342]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNN(dw, dh, L, vocab_size)
criterion = nn.CrossEntropyLoss()  # クロスエントロピー損失関数
optimizer = optim.SGD(model.parameters(), lr=0.01)  # 確率的勾配降下法

X_train = pad_sequence(X_train, batch_first=True)
ds = TensorDataset(X_train, Y_train)
loader = DataLoader(ds, batch_size=1024, shuffle=True)

model   = model.to(device)

for epoch in range(10):
    hidden = torch.zeros(1, 121, dh, dtype=torch.float32)
    n_correct = 0
    total_loss = 0
    for inputs, label in tqdm(loader):
        inputs = inputs.to(device)
        label = label.to(device)
        outputs, hidden = model(inputs, hidden)
        hidden = hidden.detach()
        loss = criterion(outputs, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() # パラメータを更新

        total_loss += loss.data 
        outputs = np.argmax(outputs.data.numpy(), axis=1)
        label = label.data.numpy()
        if outputs == label:
            n_correct += 1
        
    print('epoch: %d loss: %f accuracy: %f' % (epoch, loss, n_correct/dataset_size))


print('Finished Training')


0%|          | 0/11 [00:00<?, ?it/s]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

# 84. 単語ベクトルの導入
事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)を初期化し，学習せよ．

In [None]:
from gensim.models import KeyedVectors


googlenews = KeyedVectors.load_word2vec_format(
    '../../data/GoogleNews-vectors-negative300.bin', binary=True)

class RNN(nn.Module):
    def __init__(self, data_size, hidden_size, output_size, vocab_size):
        super(RNN, self).__init__()
        self.rnn = torch.nn.RNN(dw, dh, nonlinearity='relu')
        self.liner = nn.Linear(hidden_size, output_size)


    def forward(self, data, last_hidden):           # data: (max_len, dw)
        y, hidden = self.rnn(data, last_hidden)     # y: (max_len, dh), hidden: (max_len, dh)
        y = y[:,-1,:]
        y = self.liner(y)
        y = torch.softmax(y, dim=1)
        return y, hidden


def tokens2vec(tokens):
    vec = []]
    for token in tokens:
        if token in googlenews:
            vec.append(googlenews[token])
        else:
            vec.append([0]*dw)
    return vec
            

X_train = train.tokens.apply(tokens2vec)

X_train = torch.tensor(X_train, dtype=torch.float32)

Y_train = torch.tensor(Y_train).long()


dataset_size = len(train)
max_len = train.tokens.apply(len).max()

In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNN(dw, dh, L, vocab_size)
criterion = nn.CrossEntropyLoss()  # クロスエントロピー損失関数
optimizer = optim.SGD(model.parameters(), lr=0.01)  # 確率的勾配降下法

ds = TensorDataset(X_train, Y_train)
loader = DataLoader(ds, batch_size=8, shuffle=True)

model   = model.to(device)

for epoch in range(10):
    hidden = torch.zeros(1, 121, dh, dtype=torch.float32)
    n_correct = 0
    total_loss = 0
    for inputs, label in tqdm(loader):
        inputs = inputs.to(device)
        label = label.to(device)
        
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() # パラメータを更新
        hidden = hidden.detach()
        
        total_loss += loss.data 
        outputs = np.argmax(outputs.data.numpy(), axis=1)
        label = label.data.numpy()
        if outputs == label:
            n_correct += 1
        
    print('epoch: %d loss: %f accuracy: %f' % (epoch, loss, correct_count/dataset_size))


print('Finished Training')


# 85. 双方向RNN・多層化
順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．

In [None]:

class BidirectionalRNN(nn.Module):
    def __init__(self, data_size, hidden_size, output_size, vocab_size):
        super(RNN, self).__init__()
        self.rnn = torch.nn.RNN(dw, dh, nonlinearity='relu', bidirectional=True)
        self.liner = nn.Linear(hidden_size, output_size)


    def forward(self, data, last_hidden):           # data: (max_len, dw)
        y, hidden = self.rnn(data, last_hidden)     # y: (max_len, dh), hidden: (max_len, dh)
        y = y[:,-1,:]
        y = self.liner(y)
        y = torch.softmax(y, dim=1)
        return y, hidden


def tokens2vec(tokens):
    vec = []]
    for token in tokens:
        if token in googlenews:
            vec.append(googlenews[token])
        else:
            vec.append([0]*dw)
    return vec
            


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BidirectionalRNN(dw, dh, L, vocab_size)
criterion = nn.CrossEntropyLoss()  # クロスエントロピー損失関数
optimizer = optim.SGD(model.parameters(), lr=0.01)  # 確率的勾配降下法

ds = TensorDataset(X_train, Y_train)
loader = DataLoader(ds, batch_size=8, shuffle=True)

model   = model.to(device)

for epoch in range(10):
    hidden = torch.zeros(1, 121, dh, dtype=torch.float32)
    n_correct = 0
    total_loss = 0
    for inputs, label in tqdm(loader):
        inputs = inputs.to(device)
        label = label.to(device)
        
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() # パラメータを更新
        hidden = hidden.detach()
        
        total_loss += loss.data 
        outputs = np.argmax(outputs.data.numpy(), axis=1)
        label = label.data.numpy()
        if outputs == label:
            n_correct += 1
        
    print('epoch: %d loss: %f accuracy: %f' % (epoch, loss, correct_count/dataset_size))


print('Finished Training')


# 86. 畳み込みニューラルネットワーク (CNN)
ID番号で表現された単語列x=(x1,x2,…,xT)がある．ただし，Tは単語列の長さ，xt∈ℝVは単語のID番号のone-hot表記である（Vは単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列xからカテゴリyを予測するモデルを実装せよ

# 87. 確率的勾配降下法によるCNNの学習Permalink
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

# 88. パラメータチューニング
問題85や問題87のコードを改変し，ニューラルネットワークの形状やハイパーパラメータを調整しながら，高性能なカテゴリ分類器を構築せよ．

# 89. 事前学習済み言語モデルからの転移学習
事前学習済み言語モデル（例えばBERTなど）を出発点として，ニュース記事見出しをカテゴリに分類するモデルを構築せよ．