In [None]:
# !pip install 'sru[cuda]<2.1.9'

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

import time
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from sru import SRU, SRUCell

In [2]:
import nltk
nltk.download('treebank')
nltk.download('universal_tagset')

tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')
print("Number of Tagged Sentences ",len(tagged_sentence))
tagged_words=[tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged words", len(tagged_words))
vocab=set([word for word,tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))
tags=set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  3914
Total Number of Tagged words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus  12


In [3]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(tagged_sentence,test_size=0.2,random_state=1234)
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  3131
Number of Sentences in Testing Data  783


In [4]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

word_to_ix = {}

for word in vocab:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)

tag_to_ix = {}
for tag in tags:
    if tag not in tag_to_ix:
        tag_to_ix[tag] = len(tag_to_ix)

assert len(tag_to_ix) == len(tags)

In [5]:
class Tagger(nn.Module):

    def __init__(self, rnn, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(Tagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.sru = rnn
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size) # -1 to not count padding tag
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        sru_out, _ = self.sru(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(sru_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [6]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 300

In [7]:
sru_module = SRU(EMBEDDING_DIM, HIDDEN_DIM,
                          num_layers = 2,          # number of stacking RNN layers
                          dropout = 0.0,           # dropout applied between RNN layers
                          bidirectional = False,   # bidirectional RNN
                          layer_norm = False,      # apply layer normalization on the output of each layer
                          highway_bias = 0,        # initial bias of highway gate (<= 0)
                          rescale = True,          # whether to use scaling correction
                        )

lstm_module = nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=2,)

In [8]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

cuda


In [17]:
def calculate_score(model, dataset, scorer = accuracy_score):
  model.eval()

  with torch.no_grad():
    sum = 0
    for line in dataset:
      d = dict(line)   
      sentence = d.keys()
      tags = d.values()

      targets = prepare_sequence(tags, tag_to_ix).to(device)
      inputs = prepare_sequence(sentence, word_to_ix).to(device)
      
      tag_scores = model(inputs)
      y_pred = tag_scores.argmax(axis=1)
      sum += scorer(targets.cpu(), y_pred.cpu())

    return sum/len(dataset)  

In [21]:
import time
import warnings

def train(model, loss_function = nn.NLLLoss(), optimizer = optim.SGD,  n_epoch = 3):
    warnings.filterwarnings(action='once')
    start = time.time()
    
    optimizer = optimizer(model.parameters(), lr=0.1)
    
    for epoch in range(n_epoch):
        model.train()
        for line in train_set:
            d = dict(line)        # это надо убрать в какой-нибудь даталоадер
            sentence = d.keys()
            tags = d.values()

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = prepare_sequence(sentence, word_to_ix).to(device)
            targets = prepare_sequence(tags, tag_to_ix).to(device)

            # Step 3. Run our forward pass.
            tag_scores = model.forward(sentence=sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

        print(f"Epoch: {epoch}     Acc_train: {calculate_score(model, train_set)},  Acc_test: {calculate_score(model, test_set)}")    

    print("Elapsed time: ", time.time() - start)

In [25]:
sru = Tagger(sru_module, EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix)).to(device)
lstm = Tagger( lstm_module, EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix)).to(device)

In [26]:
train(sru, n_epoch = 10)



Epoch: 0     Acc_train: 0.9134083972665801,  Acc_test: 0.8717780134319727
Epoch: 1     Acc_train: 0.9573804354602311,  Acc_test: 0.8897232979056512
Epoch: 2     Acc_train: 0.9758656068017106,  Acc_test: 0.8995575013109696
Epoch: 3     Acc_train: 0.9866326241295851,  Acc_test: 0.9060705340588753
Epoch: 4     Acc_train: 0.9918186801916291,  Acc_test: 0.9088534869039906
Epoch: 5     Acc_train: 0.9957561515431828,  Acc_test: 0.9093400871456797
Epoch: 6     Acc_train: 0.9971287358465315,  Acc_test: 0.9112057387567097
Epoch: 7     Acc_train: 0.9984518165759537,  Acc_test: 0.9147247650698923
Epoch: 8     Acc_train: 0.998668492839349,  Acc_test: 0.915627818146878
Epoch: 9     Acc_train: 0.9987867921671787,  Acc_test: 0.9153728949561035
Elapsed time:  58.180185317993164


In [27]:
train(lstm, n_epoch = 10)

Epoch: 0     Acc_train: 0.8704674722212549,  Acc_test: 0.8475579741142379
Epoch: 1     Acc_train: 0.9299420555680269,  Acc_test: 0.8864265099331329
Epoch: 2     Acc_train: 0.9585335842306272,  Acc_test: 0.8988487330081532
Epoch: 3     Acc_train: 0.9726328193997663,  Acc_test: 0.9047405195299528
Epoch: 4     Acc_train: 0.9828264538087873,  Acc_test: 0.9073311489434541
Epoch: 5     Acc_train: 0.988841143521125,  Acc_test: 0.9057767908506523
Epoch: 6     Acc_train: 0.9918790984642692,  Acc_test: 0.9087998466421776
Epoch: 7     Acc_train: 0.994661851691341,  Acc_test: 0.9099361746200852
Epoch: 8     Acc_train: 0.9965485282465362,  Acc_test: 0.9124675602534866
Epoch: 9     Acc_train: 0.9972847208330112,  Acc_test: 0.9125284522705993
Elapsed time:  101.28886771202087


### Дальше идёт пока не законченная часть, где будет реализована загрузка через генератор батчами 

In [None]:
from torch.utils import data

class MyDataset(data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, data):
        'Initialization'
        self.list_data = data
        
  def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_data)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        line = self.list_data[index]

        d = dict(line)       
        words = d.keys()
        tags = d.values()
        sentence_in = prepare_sequence(words, word_to_ix)#.to(device)
        targets = prepare_sequence(tags, tag_to_ix)#.to(device)
        
        return sentence_in, targets

In [46]:
training_set = MyDataset(train_set)
testing_set = MyDataset(test_set)

In [134]:
from torch.nn.utils import rnn

class PadSequence:
    def __call__(self, batch):
#         max_length = max(map(len, lines))
    
#         sorted_batch = sorted(batch, key=lambda x: x[0].shape[0], reverse=True)
        sequences = [x[0] for x in batch]
        tags = [x[1] for x in batch]

        sequences_padded = rnn.pad_sequence(sequences, batch_first = True, padding_value=-1)
        tags_padded = rnn.pad_sequence(tags, batch_first = True, padding_value=-1)
        # Also need to store the length of each sequence
        # This is later needed in order to unpad the sequences
#         lengths = torch.LongTensor([len(x) for x in sequences])
        # Don't forget to grab the labels of the *sorted* batch
        
#         new_batch = [(sequences_padded[i], batch[i][1]) for i in range(len(batch))]
        
        return sequences_padded, tags_padded

In [135]:
params = {'batch_size': 3,
          'shuffle': False,
          'num_workers': 6,
         'collate_fn':PadSequence()}

train_loader = data.DataLoader(training_set, **params)

test_loader = data.DataLoader(testing_set, **params)

In [117]:
next(enumerate(training_set))

(0,
 (tensor([ 6889,  1164, 11590,  2456,  5513,   133, 11821, 11912,  7286,  7245,
          11936,  6605,  3909,  7824,  9054,  2356,  3469,  9873,  3763,  9058,
          10547, 11321,  8374]),
  tensor([10, 11, 11, 11,  3, 11,  2, 10,  8, 11,  7,  0,  2,  4,  9,  1,  1,  6,
           2, 10, 11,  0,  7])))

In [139]:
batch_id, (sentences, tags) = next(enumerate(train_loader))

In [141]:
sentences.T.shape

torch.Size([38, 3])

In [144]:
import time
import warnings

def train_generator(model, train_loader, test_loader=None, loss_function = nn.NLLLoss(), n_epoch = 3):
    warnings.filterwarnings(action='once')
    start = time.time()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    
    for epoch in range(n_epoch):
        model.train()
        for batch_id, (sentences, tags) in enumerate(train_loader):
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.
            sentence_in = sentences.to(device)
            targets = tags.to(device)

            # Step 3. Run our forward pass.
            tag_scores = model.forward(sentence=sentence_in)
            
            print(tag_scores.shape)
            print(targets.shape)

            assert False
            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()
            
        print(f"Epoch: {epoch}/{n_epoch}")
#         print(f"Epoch: {epoch}     Acc_train: {calculate_score(model, train_set)},  Acc_test: {calculate_score(model, test_set)}")    

    print("Elapsed time: ", time.time() - start)

In [145]:
train_generator(sru, train_loader)

RuntimeError: cuda runtime error (710) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:26

In [None]:
from catalyst.dl import SupervisedRunner

model = sru
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
loaders = {"train": train_loader, "valid": test_loader}
logdir = "./logs/sru"

# model runner
runner = SupervisedRunner()

# model training
runner.train(
    model=model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    logdir=logdir,
    num_epochs=10,
    verbose=True
)