In [27]:
import time
import math
import random
from typing import List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
import pickle
import urllib
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from bpemb import BPEmb

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandr.khvorov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def timeSince(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Args:
        since: A timestamp in seconds, comparable with ``time.time()``.

    Returns:
        A string such as ``'2m 5s'``; both parts are rendered with ``%d``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [2]:
def tokenize_to_words(text: str):
    """Return the word tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens

def tokenize_to_sents(text: str):
    """Return the sentences that NLTK's ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return sentences

def read_nips(path: str, documents_limit=None) -> List[List[str]]:
    """Load the papers CSV and return its text as tokenised sentences.

    Args:
        path: Path to a gzip-compressed, comma-separated file that has a
            ``paper_text`` column.
        documents_limit: If not ``None``, only the first ``documents_limit``
            rows of the file are processed.

    Returns:
        A flat list with one list of word tokens per sentence, in document
        order.
    """
    df = pd.read_csv(path, compression='gzip', sep=',')
    # ``np.str`` was a plain alias of the builtin ``str`` and has been removed
    # from NumPy; the builtin performs the same conversion.
    docs = df['paper_text'].values.astype(str)
    if documents_limit is not None:
        docs = docs[:documents_limit]

    sents = []
    for doc in docs:
        sents.extend(tokenize_to_words(s) for s in tokenize_to_sents(doc))
    return sents

In [3]:
# Tokenised sentences from the first 10 documents of the NIPS papers file.
data = read_nips("../resources/datasets/nips-papers.csv.gz", documents_limit=10)

In [5]:
# Sanity check: show the first tokenised sentence and the number of sentences loaded.
print(data[0])
print(len(data))

['767', 'SELF-ORGANIZATION', 'OF', 'ASSOCIATIVE', 'DATABASE', 'AND', 'ITS', 'APPLICATIONS', 'Hisashi', 'Suzuki', 'and', 'Suguru', 'Arimoto', 'Osaka', 'University', ',', 'Toyonaka', ',', 'Osaka', '560', ',', 'Japan', 'ABSTRACT', 'An', 'efficient', 'method', 'of', 'self-organizing', 'associative', 'databases', 'is', 'proposed', 'together', 'with', 'applications', 'to', 'robot', 'eyesight', 'systems', '.']
2135


In [6]:
# Pretrained English BPEmb model (dim=50, vs=10000); wrapped by MyBPE below.
bpemb_en = BPEmb(lang="en", dim=50, vs=10000)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [77]:
class MyBPE:
    """BPEmb wrapper that adds start- and end-of-sequence tokens.

    The wrapped model must expose ``vectors`` (a 2-D array with one row per
    subword) and the methods ``encode``, ``encode_ids`` and ``embed``.

    Two extra rows are appended to the embedding matrix: ``SOS`` at index
    ``SOS_IND`` and ``EOF`` at index ``EOF_IND``. Each extra embedding is the
    mean of 100 rows drawn at random (with replacement) from the wrapped
    model's vectors.

    All three encoding methods accept either a string or a (possibly nested)
    sequence of strings; sequences are encoded element by element and
    concatenated. ``start=True`` puts the SOS marker first and ``finish=True``
    puts the EOF marker last, consistently across ``encode``, ``encode_ids``
    and ``embed``.
    """

    def __init__(self, bpemb):
        self.bpemb = bpemb
        self.SOS = "SOS"
        self.EOF = "EOF"

        base_size = bpemb.vectors.shape[0]
        self.SOS_EMB = np.mean(bpemb.vectors[np.random.choice(base_size, 100)], axis=0)
        self.EOF_EMB = np.mean(bpemb.vectors[np.random.choice(base_size, 100)], axis=0)
        self.SOS_IND = base_size
        self.EOF_IND = base_size + 1

        # Build the extended matrix from the model passed in, not from a
        # notebook-level global, so the wrapper works for any BPEmb instance.
        self.vectors = np.vstack((
            bpemb.vectors,
            self.SOS_EMB.reshape(1, -1),
            self.EOF_EMB.reshape(1, -1),
        ))
        self.size = self.vectors.shape[0]

    def __len__(self):
        """Number of rows in the extended embedding matrix (incl. SOS/EOF)."""
        return self.vectors.shape[0]

    def dim(self):
        """Embedding dimensionality."""
        return self.vectors.shape[1]

    def encode(self, s, start=False, finish=False):
        """Return the subword strings for ``s``, optionally wrapped in SOS/EOF."""
        if isinstance(s, str):
            res = list(self.bpemb.encode(s))
        else:
            res = []
            for item in s:
                res.extend(self.encode(item))
        if start:
            res.insert(0, self.SOS)
        if finish:
            res.append(self.EOF)
        return res

    def encode_ids(self, s, start=False, finish=False):
        """Return the subword ids for ``s``, optionally wrapped in SOS/EOF ids."""
        if isinstance(s, str):
            res = list(self.bpemb.encode_ids(s))
        else:
            res = []
            for item in s:
                res.extend(self.encode_ids(item))
        if start:
            res.insert(0, self.SOS_IND)
        if finish:
            res.append(self.EOF_IND)
        return res

    def embed(self, s, start=False, finish=False):
        """Return one embedding row per subword of ``s``.

        The row order matches ``encode_ids`` called with the same arguments:
        the SOS row (if requested) comes first and the EOF row last.
        """
        if isinstance(s, str):
            res = self.bpemb.embed(s)
        else:
            # Stack once instead of re-allocating on every element; an empty
            # sequence yields an empty (0, dim) matrix rather than an error.
            parts = [self.embed(item) for item in s]
            if parts:
                res = np.vstack(parts)
            else:
                res = np.empty((0, self.dim()), dtype=self.vectors.dtype)
        if start:
            res = np.vstack((self.SOS_EMB.reshape(1, -1), res))
        if finish:
            res = np.vstack((res, self.EOF_EMB.reshape(1, -1)))
        return res

In [78]:
# Wrap the pretrained model so the vocabulary also contains the SOS/EOF entries.
bpemb = MyBPE(bpemb_en)

In [75]:
def inputTensor(line, bpemb):
    """Embed ``line`` (with the start marker requested) as a model input.

    Calls ``bpemb.embed(line, start=True)`` and reshapes the resulting
    ``(rows, width)`` matrix to a ``(rows, 1, width)`` tensor that shares
    memory with the NumPy array.
    """
    vectors = bpemb.embed(line, start=True)
    rows, width = vectors.shape
    return torch.from_numpy(vectors).view(rows, 1, width)

def targetTensor(line, bpemb):
    """Return the ids of ``line`` (with the end marker requested) as a LongTensor."""
    ids = bpemb.encode_ids(line, finish=True)
    return torch.LongTensor(ids)

def randomTrainingExample(bpemb):
    """Pick a random sentence from ``data`` and return its (input, target) tensors."""
    index = random.randint(0, len(data) - 1)
    sentence = data[index]
    return inputTensor(sentence, bpemb), targetTensor(sentence, bpemb)

In [64]:
class RNN(nn.Module):
    """Single-step recurrent cell that maps an embedding to token log-probabilities.

    ``bpemb`` only needs to provide ``dim()`` (embedding width) and
    ``__len__`` (vocabulary size); both are used to size the linear layers.
    """

    def __init__(self, bpemb, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        emb_dim = bpemb.dim()
        vocab_size = len(bpemb)
        self.i2h = nn.Linear(emb_dim + hidden_size, hidden_size)
        self.i2o = nn.Linear(emb_dim + hidden_size, vocab_size)
        self.o2o = nn.Linear(hidden_size + vocab_size, vocab_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step.

        ``input`` and ``hidden`` are concatenated along dim 1. Returns
        ``(log_probs, new_hidden)``; the new hidden state is the raw output of
        ``i2h`` with no non-linearity applied.
        """
        joined = torch.cat((input, hidden), 1)
        next_hidden = self.i2h(joined)
        scores = self.o2o(torch.cat((next_hidden, self.i2o(joined)), 1))
        return self.softmax(self.dropout(scores)), next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)

In [35]:
# Negative log-likelihood loss, applied per time step in train().
criterion = nn.NLLLoss()

# Step size of the manual gradient-descent update performed in train().
learning_rate = 0.0005

def train(input_line_tensor, target_line_tensor):
    """Run one gradient-descent step of the global ``rnn`` on a single sequence.

    Args:
        input_line_tensor: Tensor whose first dimension indexes time steps;
            ``input_line_tensor[i]`` is fed to ``rnn`` at step ``i``.
        target_line_tensor: 1-D tensor of target ids, one per time step. It is
            not modified.

    Returns:
        ``(output, loss)`` where ``output`` is the model output of the last
        time step and ``loss`` is the summed per-step loss divided by the
        number of time steps.

    Uses the module-level ``rnn``, ``criterion`` and ``learning_rate``.
    """
    # Out-of-place unsqueeze: the in-place variant changed the caller's tensor.
    targets = target_line_tensor.unsqueeze(-1)
    hidden = rnn.initHidden()

    rnn.zero_grad()

    loss = 0
    for i in range(input_line_tensor.size(0)):
        output, hidden = rnn(input_line_tensor[i], hidden)
        loss += criterion(output, targets[i])

    loss.backward()

    for p in rnn.parameters():
        # A parameter that did not contribute to the loss has no gradient.
        if p.grad is None:
            continue
        # add_(Number, Tensor) is a deprecated overload; pass alpha by keyword.
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item() / input_line_tensor.size(0)

In [80]:
# Create the model used by train() and sample(). Without this assignment the
# notebook only runs when `rnn` is left over from an earlier kernel session.
# Re-running this cell re-initialises the weights.
rnn = RNN(bpemb, 128)

In [None]:
# Training-loop settings.
n_iters = 500
print_every = 50
plot_every = 25
all_losses = []  # mean loss of each window of `plot_every` iterations
total_loss = 0 # Reset every plot_every iters

start = time.time()

# The counter is named `iteration` so the builtin `iter` is not shadowed in the
# notebook namespace for every cell that runs afterwards.
for iteration in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample(bpemb))
    total_loss += loss

    if iteration % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iteration, iteration / n_iters * 100, loss))

    if iteration % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

7m 23s (50 0%) 4.4132
12m 55s (100 1%) 4.1190
17m 50s (150 1%) 2.5947
23m 27s (200 2%) 0.0436
28m 24s (250 2%) 3.8095


In [112]:
max_length = 20

def sample(bpemb, start_text='Hello'):
    """Greedily continue ``start_text`` with the global ``rnn``.

    The model is first primed on ``inputTensor(start_text, bpemb)``. The most
    likely token is then emitted and fed back, for at most ``max_length + 1``
    generated tokens, stopping early as soon as the end marker
    (``bpemb.EOF_IND``) is predicted.

    Args:
        bpemb: Vocabulary wrapper providing ``vectors``, ``SOS_IND``,
            ``EOF_IND`` and the wrapped model as ``bpemb.bpemb``.
        start_text: Prompt to continue.

    Returns:
        ``start_text`` followed directly (no separator is inserted) by the
        decoded continuation.
    """
    # Dropout must be off while sampling; torch.no_grad() alone does not do
    # that. The previous train/eval mode is restored afterwards.
    was_training = rnn.training
    rnn.eval()
    try:
        with torch.no_grad():
            primer = inputTensor(start_text, bpemb)
            hidden = rnn.initHidden()

            for step in range(primer.size(0)):
                token_dist, hidden = rnn(primer[step], hidden)

            out_inds = []
            for _ in range(max_length + 1):
                token_ind = token_dist.topk(1)[1][0][0].item()
                # Checked for every prediction, including the very first one.
                if token_ind == bpemb.EOF_IND:
                    break
                out_inds.append(token_ind)
                token_emb = torch.from_numpy(bpemb.vectors[token_ind]).view(1, -1)
                token_dist, hidden = rnn(token_emb, hidden)
    finally:
        rnn.train(was_training)

    # The start marker's id lies outside the wrapped model's vocabulary, so it
    # is dropped before decoding.
    decodable = [ind for ind in out_inds if ind != bpemb.SOS_IND]
    return start_text + bpemb.bpemb.decode_ids(decodable)

In [116]:
# Greedy continuation of a short prompt using the current state of `rnn`.
print(sample(bpemb, start_text="In this"))

In thisb b b b b b b b l l l l l l l l l l l lin


In [9]:
class EncoderRNN(nn.Module):
    """GRU encoder over pretrained subword embeddings.

    Args:
        bpemb: Object whose ``vectors`` attribute is the 2-D embedding matrix
            (one row per token id). The embedding layer is created with
            ``nn.Embedding.from_pretrained`` defaults.
        hidden_size: Size of the GRU hidden state.
    """

    def __init__(self, bpemb, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(bpemb.vectors))
        # The GRU consumes embedding vectors, so its input size is the
        # embedding width, which need not equal ``hidden_size``.
        self.gru = nn.GRU(self.embedding.embedding_dim, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token id; returns ``(output, hidden)`` from the GRU."""
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zero hidden state of shape ``(1, 1, hidden_size)``.

        The tensor is created on the same device as the module's embedding
        weights, so no notebook-level ``device`` variable is required.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)
    
class DecoderRNN(nn.Module):
    """GRU decoder over pretrained subword embeddings.

    Args:
        bpemb: Object whose ``vectors`` attribute is the 2-D embedding matrix
            (one row per token id); its row count is also the output
            vocabulary size.
        hidden_size: Size of the GRU hidden state.
    """

    def __init__(self, bpemb, hidden_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(bpemb.vectors))
        # The GRU consumes embedding vectors, so its input size is the
        # embedding width, which need not equal ``hidden_size``.
        self.gru = nn.GRU(self.embedding.embedding_dim, hidden_size)
        self.out = nn.Linear(hidden_size, bpemb.vectors.shape[0])
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Returns ``(log_probs, hidden)`` where ``log_probs`` has one column per
        row of the embedding matrix.
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return a zero hidden state of shape ``(1, 1, hidden_size)``.

        The tensor is created on the same device as the module's output layer,
        so no notebook-level ``device`` variable is required.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.out.weight.device)
    
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-length buffer of encoder outputs.

    Args:
        hidden_size: Width of the embeddings and of the GRU hidden state.
        output_size: Number of output tokens (also the embedding table size).
        dropout_p: Dropout probability applied to the embedded input.
        max_length: Number of attention slots; ``forward`` multiplies the
            ``(1, max_length)`` attention weights with ``encoder_outputs``, so
            that tensor must have ``max_length`` rows of width ``hidden_size``.

    NOTE(review): the default ``MAX_LENGTH`` is read when the class is defined
    and must exist in the notebook namespace at that point.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step; returns ``(log_probs, hidden, attn_weights)``."""
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights come from the current input embedding and the
        # previous hidden state, normalised over the ``max_length`` slots.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return a zero hidden state of shape ``(1, 1, hidden_size)``.

        The tensor is created on the same device as the module's embedding
        weights, so no notebook-level ``device`` variable is required.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)


In [None]:
# Probability with which train() feeds the ground-truth target token (rather
# than the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step of an encoder/decoder pair on one example.

    The encoder is run token by token and its outputs are stored in a
    ``(max_length, encoder.hidden_size)`` buffer. The decoder is then called
    as ``decoder(input, hidden, encoder_outputs)`` and must return three
    values. With probability ``teacher_forcing_ratio`` the target token is fed
    back as the next decoder input; otherwise the decoder's own top prediction
    is used and decoding stops once it equals ``EOS_token``.

    Returns:
        The summed loss divided by the target length.

    NOTE(review): this definition reuses the name of the earlier two-argument
    ``train`` in this notebook and replaces it in the kernel once executed. It
    also relies on ``device``, ``SOS_token``, ``EOS_token`` and ``MAX_LENGTH``
    being defined at notebook level.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Encode the input, keeping every step's output for the attention buffer.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for src_pos in range(input_length):
        step_output, encoder_hidden = encoder(input_tensor[src_pos], encoder_hidden)
        encoder_outputs[src_pos] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for tgt_pos in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[tgt_pos])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[tgt_pos]
            continue

        # Free running: feed the model's own best guess, detached from history.
        _, best = decoder_output.topk(1)
        decoder_input = best.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [10]:
# NOTE(review): `NGramLanguageModeler` and `CONTEXT_SIZE` are not defined in any
# cell visible in this notebook; the recorded run of this cell ended with
# "NameError: name 'CONTEXT_SIZE' is not defined". Define both before this cell
# or remove it.
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(context_size=CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.01)

NameError: name 'CONTEXT_SIZE' is not defined