In [6]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from torch.optim import Adam

In [7]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertForMaskedLM.from_pretrained('bert-base-uncased')

#text = ("After Abraham Lincoln won the November 1860 presidential "
#        "election on an anti-slavery platform, an initial seven "
#        "slave states declared their secession from the country "
#        "to form the Confederacy. War broke out in April 1861 "
#        "when secessionist forces attacked Fort Sumter in South "
#        "Carolina, just over a month after Lincoln's "
#        "inauguration.")

In [8]:
#inputs = tokenizer(text, return_tensors='pt')


In [9]:
#inputs.keys()


In [10]:
#inputs


In [11]:
#inputs['labels'] = inputs.input_ids.detach().clone()

In [12]:
#inputs

In [13]:
# create random array of floats in equal dimension to input_ids
#rand = torch.rand(inputs.input_ids.shape)
# where the random array is less than 0.15, we set true
#mask_arr = rand < 0.15
#mask_arr

In [14]:
#(inputs.input_ids != 101) * (inputs.input_ids != 102)

In [15]:
#mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102)
#mask_arr

In [16]:
# create selection from mask_arr
#selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
#selection

In [17]:
# apply selection index to inputs.input_ids, adding MASK tokens
#inputs.input_ids[0, selection] = 103

In [18]:
#inputs

## UTILS

In [123]:
"""
Utility functions for torch.
"""

import click, ast, torch
import matplotlib.pyplot as plt

### torch specific functions
def get_optimizer(name, parameters, lr, l2=0):
    if name == 'sgd':
        return torch.optim.SGD(parameters, lr=lr, weight_decay=l2)
    elif name == 'adam':
        return torch.optim.Adam(parameters, weight_decay=l2) # use default lr
    elif name == 'adamax':
        return torch.optim.Adamax(parameters, weight_decay=l2) # use default lr
    elif name == 'adadelta':
        return torch.optim.Adadelta(parameters, lr=lr, weight_decay=l2)
    else:
        raise Exception("Unsupported optimizer: {}".format(name))


def visualize_performance(performance, repo_name, show=False):
    plt.figure(figsize=(15, 10))

    # accuracy
    ax = plt.subplot(3, 2, 1)
    ax.set_title('Accuracy')
    ax.plot(performance['acc_test'], label='test')
    ax.plot(performance['acc_train'], label='train')

    ax.set_ylim([0.5, 1])
    ax.set_ylabel('%')
    ax.set_xlabel('epoch')
    plt.legend()

    # precision
    ax = plt.subplot(3, 2, 2)
    ax.set_title('Precision')
    ax.plot(performance['prec_test'], label='test')
    ax.plot(performance['prec_train'], label='train')

    ax.set_ylim([.5, 1])
    ax.set_ylabel('%')
    ax.set_xlabel('epoch')
    plt.legend()

    # recall
    ax = plt.subplot(3, 2, 3)
    ax.set_title('Recall')
    ax.plot(performance['recall_test'], label='test')
    ax.plot(performance['recall_train'], label='train')

    ax.set_ylim([.5, 1])
    ax.set_ylabel('%')
    ax.set_xlabel('epoch')
    plt.legend()

    # f1
    ax = plt.subplot(3, 2, 4)
    ax.set_title('F1')
    ax.plot(performance['f1_test'], label='test')
    ax.plot(performance['f1_train'], label='train')

    ax.set_ylim([.5, 1])
    ax.set_ylabel('%')
    ax.set_xlabel('epoch')
    plt.legend()

    # loss
    ax = plt.subplot(3, 1, 3)
    ax.set_title('loss')
    ax.plot(performance['loss'])

    ax.set_ylim([0, 0.5])
    ax.set_ylabel('loss')
    ax.set_xlabel('')

    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.35,
                        wspace=0.35)

    if not show:
        plt.savefig(repo_name + 'performance_vis.png')
    else:
        plt.show()

    plt.close()


class PythonLiteralOption(click.Option):
    def type_cast_value(self, ctx, value):
        # Either we denote a range, or a list with precise samples:
        # 1) Range:
        if 'range' in value:
            idx_open = value.find('(')
            idx_close = value.find(')')
            # Get the range input
            range_input = value[idx_open + 1:idx_close].split(',')
            # Make it to numbers:
            range_input = [int(num) for num in range_input]
            try :
                return list(range(*range_input))
            except:
                raise click.BadParameter(value)
        else:
            try:
                return ast.literal_eval(value)
            except:
                raise click.BadParameter(value)


## MODEL


In [124]:

import torch.nn as nn
import torch.nn.functional as F


# For general use
class GCNsimple(nn.Module):
    """ It's a simple version of a GCN module operated on dependency graphs."""
    def __init__(self,  in_dim, out_dim, use_cuda = False, bias=False):

        super().__init__()

        self.use_cuda = use_cuda
        self.out_dim = out_dim
        self.in_dim = in_dim

        self.W = nn.Linear(self.in_dim, self.out_dim, bias=bias)

        # self.weight = self.W.weight
        # self.bias = self.W.bias

    def forward(self, x, lamb, relu=True):
        # Push the data through gcn.
        Lx = lamb.bmm(x)
        LxW = self.W(Lx)
        if relu:
            gLxW = F.relu(LxW)
        else:
            gLxW = LxW

        return gLxW


class GCNModel(nn.Module):
    """ A module that represents the multi-layer GCN model.
    """
    def __init__(self, num_of_layer, input_dimension=1, hidden_dimension=1, bias=False):
        super().__init__()
        self.num_of_layer = num_of_layer
        self.hidden_dimension = hidden_dimension
        self.input_dimension = input_dimension
        self.all_layer = nn.ModuleList()

        if num_of_layer >0:
            # first layer.
            self.all_layer.append(GCNsimple(input_dimension,
                                            hidden_dimension,
                                            bias=bias))
            for _ in range(num_of_layer - 1):
                self.all_layer.append(GCNsimple(hidden_dimension, hidden_dimension))

    def forward(self, x, lamb):
        for i in range(self.num_of_layer):
            x = self.all_layer[i](x, lamb)

        return x

    def parameters(self):
        return self.all_layer.parameters()

    def cuda(self):
        self.all_layer.cuda()


# from VGAE https://github.com/DaehanKim/vgae_pytorch

In [670]:
class VGAE(nn.Module):
	def __init__(self):
		super(VGAE,self).__init__()
		self.base_gcn = GraphConvSparse(input_dim, hidden1_dim, activation=F.relu)
		self.base2_gcn = GraphConvSparse(hidden1_dim, hidden2_dim, activation=F.relu)
		self.gcn_mean = GraphConvSparse(hidden2_dim, hidden3_dim, activation=None)
		self.gcn_logstddev = GraphConvSparse(hidden2_dim, hidden3_dim, activation=None)
		self.dec_1 = GraphConvSparse(hidden3_dim, hidden2_dim, activation=F.relu)
		self.dec_2 = GraphConvSparse(hidden2_dim, hidden1_dim, activation=F.relu)
		self.dec_3 = GraphConvSparse(hidden1_dim, output_dim, activation=F.tanh)
		self.kl = 0
        
        
	def encode(self, X,adj):
		hidden = self.base_gcn(X,adj)
		hidden = self.base2_gcn(hidden,adj)
		self.mean = self.gcn_mean(hidden,adj)
		self.logstd = self.gcn_logstddev(hidden,adj)
		gaussian_noise = torch.randn(X.size(0), hidden3_dim)
		sampled_z = gaussian_noise*torch.exp(self.logstd) + self.mean
		self.kl = (self.logstd ** 2 + self.mean ** 2 - torch.log(self.logstd) - 0.5).sum()
		print(self.kl)
		return sampled_z
    
	def decode(self,Z,adj):
		A_pred = self.dec_1(Z,adj)
		A_pred = self.dec_2(A_pred,adj)
		A_pred = self.dec_3(A_pred,adj)
		return A_pred

	def forward(self, X,adj):
		Z = self.encode(X,adj)
		A_pred = self.decode(Z,adj)
		return A_pred

class GraphConvSparse(nn.Module):
	def __init__(self, input_dim, output_dim, activation = F.relu, **kwargs):
		super(GraphConvSparse, self).__init__(**kwargs)
		self.weight = glorot_init(input_dim, output_dim) 
		self.activation = activation

	def forward(self, inputs, adj):
		x = inputs
		x = torch.mm(x,self.weight)
		x = torch.mm(adj, x)
		if self.activation:
			outputs = self.activation(x)
			return outputs
		return x




def glorot_init(input_dim, output_dim):
	init_range = np.sqrt(6.0/(input_dim + output_dim))
	initial = torch.rand(input_dim, output_dim)*2*init_range - init_range
	return nn.Parameter(initial)


class GAE(nn.Module):
	def __init__(self,input_dim, hidden1_dim,hidden2_dim):
		super(GAE,self).__init__()
		self.base_gcn = GraphConvSparse(input_dim, hidden1_dim)
		self.gcn_mean = GraphConvSparse(hidden1_dim, hidden2_dim, activation=lambda x:x)

	def encode(self, X):
		hidden = self.base_gcn(X)
		z = self.mean = self.gcn_mean(hidden)
		return z

	def forward(self, X):
		Z = self.encode(X)
		A_pred = dot_product_decode(Z)
		return A_pred

In [671]:
input_dim = 450
hidden1_dim = 256
hidden2_dim = 128
hidden3_dim = 100
output_dim = 450
use_feature = True
num_epoch = 20
learning_rate = 0.001
batch_size = 32

In [672]:
import torch
import torch.nn as nn
from torch.autograd import Variable


import json


def unpack_batch(batch, cuda):
    if cuda:
        inputs = [Variable(b.cuda()) for b in batch[:10]]
        labels = Variable(batch[10].cuda())
    else:
        inputs = [Variable(b) for b in batch[:10]]
        labels = Variable(batch[10])

    # To have the possibility to pass custom adjacency matrix and words vectors to the prediction.
    inputs += [None, None]  # It will be referenced as [cust_adj, cust_words]
    tokens = batch[0]
    head = batch[5]
    subj_pos = batch[6]
    obj_pos = batch[7]
    lens = batch[1].eq(0).long().sum(1).squeeze()

    return inputs, labels, tokens, head, subj_pos, obj_pos, lens


# A helping function.
def make_eyes(lambs, lens):
    eyes = torch.zeros(lambs.shape)
    for i, le in enumerate(lens):
        eyes[i,:le, :le] = torch.eye(le)
    return eyes


class MLM:
    def __init__(self, cfg, cuda=False):
        # First check if we load the model:
        if cfg['load_model']:
            assert 'repo_name' in cfg, 'We need a file name to load the model.'
            # Get all hyperparameter:
            cfg.update(json.load(open(cfg['repo_name'] + 'config.json', 'r')))
            # Set cuda:
            cfg['cuda'] = cuda
            self.cuda = cuda
            checkpoint = torch.load(cfg['repo_name'] + 'model_params.pt', map_location=torch.device('cpu'))

            # just get the vocab dimensions:
            cfg['vocab_len'] = checkpoint['embedding']['weight'].shape[0]
            cfg['pos_vocab_len'] = checkpoint['pos_embedding']['weight'].shape[0]
            cfg['lemma_vocab_len'] = checkpoint['lemma_embedding']['weight'].shape[0]
            # don't need the checkpoints anymmore
            del checkpoint

            # Vocab initialisation:
            self.vocab = Vocab()
            self.pos_vocab = Vocab()
            self.lemma_vocab = Vocab()

            # make random model initialization:
            self.random_model_init_(cfg)

            # load pretrainded weights:
            self.load(cfg['repo_name'], cuda=cfg['cuda'])

        else:
            # Set cuda:
            cfg['cuda'] = cuda
            self.cuda = cuda

            # We need the vocabs given in cfg:
            self.vocab = cfg['vocab']
            self.pos_vocab = cfg['pos_vocab']
            self.lemma_vocab = cfg['lemma_vocab']

            # set up necessary vocab lens:
            cfg['vocab_len'] = len(self.vocab)
            cfg['pos_vocab_len'] = len(self.pos_vocab)
            cfg['lemma_vocab_len'] = len(self.lemma_vocab)

            # init parameter:
            self.random_model_init_(cfg)

            if self.vocab.word_embed is None: self.vocab.init_word_embed(cfg)
            self.embedding = self.vocab.word_embed


        self.model_type = cfg['model_type']
        self.new_model = True

        if self.cuda:
            self.mp_model.cuda()
            #self.output_layer.cuda()
            self.embedding.cuda()
            #self.first_layer.cuda()
            #self.last_layer.cuda()
            self.pos_embedding.cuda()
            self.word_embedding.cuda()
            self.lemma_embedding.cuda()

        # Init optimizer:
        #params = list(self.model.parameters()) \
        #         + list(self.pos_embedding.parameters()) \
        #         + list(self.word_embedding.parameters()) \
        #         + list(self.lemma_embedding.parameters())
                # + list(self.first_layer.parameters()) \
                # + list(self.last_layer.parameters()) \
                # + list(self.output_layer.parameters()) \

        #self.optim = get_optimizer(cfg['optimizer'],
        #                                         params,
        #                                         cfg['lr'])

        self.loss = nn.MSELoss()

    def random_model_init_(self, cfg):
        assert all(x in cfg for x in ['vocab_len', 'pos_vocab_len',
                                      'lemma_vocab_len']), 'Please indicate the dimensions of the vocabularies.'
        
        # Embedding layer:
        self.embedding = nn.Embedding(cfg['vocab_len'], cfg['input_dimension'])

        # Trainable word embedding
        self.word_embedding = nn.Embedding(cfg['vocab_len'], cfg['word_emb_dim'])
        with torch.no_grad(): self.word_embedding.weight[0] = 0. # set '<PAD>' to zero
        self.word_embedding.weight = nn.parameter.Parameter(self.word_embedding.weight, requires_grad=True) 
        # Make it to leaf variable again

        # Pos embedding layer
        self.pos_embedding = nn.Embedding(cfg['pos_vocab_len'], cfg['pos_emb_dim'])
        with torch.no_grad(): self.pos_embedding.weight[0] = 0. # set '<PAD>' to zero
        self.pos_embedding.weight = nn.parameter.Parameter(self.pos_embedding.weight, requires_grad=True)

        # lemma embedding laayer
        self.lemma_embedding = nn.Embedding(cfg['lemma_vocab_len'], cfg['lemma_emb_dim'])
        with torch.no_grad(): self.lemma_embedding.weight[0] = 0. # set '<PAD>' to zero
        self.lemma_embedding.weight = nn.parameter.Parameter(self.lemma_embedding.weight, requires_grad=True)
        
        # Add a first layer if wanted:
        
        #self.first_layer = GCNModel(num_of_layer=1,
        #                    input_dimension=cfg['input_dimension'] + cfg['pos_emb_dim'] + cfg[
         #                       'word_emb_dim'] + cfg['lemma_emb_dim'],
         #                   hidden_dimension=cfg['hidden_dimension'],
         #                   bias=cfg['bias'])
        print(cfg['input_dimension'] + cfg['pos_emb_dim'] + 
                                 cfg['word_emb_dim'] + cfg['lemma_emb_dim'])
        self.model = VGAE()
        self.optim = Adam(self.model.parameters(), lr=learning_rate)

        # Set up the model:
        #self.mp_model = GCNModel(num_of_layer=cfg['num_of_layer'],
        #                         input_dimension=cfg['hidden_dimension'],
        #                         hidden_dimension=cfg['hidden_dimension'],
        #                         bias=cfg['bias']
         #                        )

        # Last layer
        #self.last_layer = GCNModel(num_of_layer=1,
        #                           input_dimension=cfg['hidden_dimension'],
         #                          hidden_dimension=cfg['hidden_dimension'],
          #                         bias=cfg['bias'])
        # Output layer:

        #self.output_layer = nn.Linear(cfg['hidden_dimension'],
        #                              cfg['num_of_classes'],
        #                              bias=cfg['bias'])

    def update(self, batch):
        # Free the optimizer:
        #self.optim.zero_grad()
        # Unwrap batch
        batch_loss = []
        
        
        lambs, poss, texts, labels, lens, _, lemmas = batch[:7]
        lambs = lambs.to(torch.int64)
            # Adjacency in the first matrix are identity matrices:
            #eyes = make_eyes(lambs, lens)

            # Set on cuda:
#             if self.cuda:
#                 lambs = lambs.cuda()
#                 texts = texts.cuda()
#                 labels = labels.cuda()
#                 eyes = eyes.cuda()
#                 poss = poss.cuda()
#                 lemmas = lemmas.cuda()
        #print(np.shape(poss))
            # Propagate through the the model
            # Embedding layers
        
            
        const_word_vec = self.embedding(texts)
        word_vec = self.word_embedding(texts)
        pos_vec = self.pos_embedding(poss)
        lemma_vec = self.lemma_embedding(lemmas)



        #print(np.shape(texts),np.shape(poss),np.shape(lemmas))
        #x = torch.cat([texts, poss, lemmas], dim=0).transpose(0,1)

       # print(texts[:5],poss[:5],lemmas[:5],x[:5])
        x = torch.cat([word_vec, pos_vec, const_word_vec, lemma_vec], dim=2)
        #print(np.shape(x))
        #x = x[0]
        x = x.float()
        #print(x[0])
        lambs = lambs.float()
        #print(np.shape(lambs))

        #print(np.shape(x))
        #print(np.shape(lambs))
        const_word_vec_lab = self.embedding(labels)
        word_vec_lab = self.word_embedding(labels)
        pos_vec_lab = self.pos_embedding(poss)

        lemma_ve_lab = self.lemma_embedding(lemmas)
        labels = torch.cat([word_vec_lab, pos_vec_lab, const_word_vec_lab, lemma_ve_lab], dim=2)

        #labels = labels[0]
        #print(labels)

        norm = lambs.shape[0] * lambs.shape[0] / float((lambs.shape[0] * lambs.shape[0] - lambs.sum()) * 2)
        
        #print(np.shape(x),np.shape(lambs))
        for sample in range(len(batch[0])):
            #try:
            A_pred = self.model(x[sample],lambs[sample])
            #print(np.shape(A_pred))
            #print(np.shape(labels))
            self.optim.zero_grad()
            #make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")

            #pos_weight = float(lambs.shape[0] * lambs.shape[0] - lambs.sum()) / lambs.sum()
            #weight_tensor = torch.ones(weight_mask.size(0)) 
            #weight_tensor[weight_mask] = pos_weight


            #for i in A_pred[0]:
            #    print(i)
            #print(labels)

            loss = log_lik = norm*F.cross_entropy(A_pred,labels[sample])
            print('loss : ', log_lik)

            #kl loss from gpt
            #kl_divergence = torch.sum(1 + self.model.logstd - self.model.mean.pow(2) - self.model.logstd.exp())

            #loss from vgae paper
            #kl_divergence = 0.5/ A_pred.size(0) * (1 + 2*self.model.logstd - self.model.mean**2 - torch.exp(self.model.logstd)**2).sum(1).mean()
            loss += self.model.kl
            loss.backward(retain_graph=True)
            self.optim.step()
            batch_loss.append(float(loss.detach()))
            #print('avg_loss = ',sum(batch_loss) / len(batch_loss))
            print('KLD : ',self.model.kl)
            #print ('lgstd, meansq, logstdexp', self.model.logstd, self.model.mean.pow(2), self.model.logstd.exp())
            #except:
            #    print('weird thing happening')
            #    pass
           
            
        #print('loss : ', loss)
        #loss_mse = nn.MSELoss()
        #output_mse = loss_mse(A_pred.view(-1), labels.view(-1))
        #output_mse.backward()
        #self.optim.step()
        # last layer:
        #x = self.last_layer(x, eyes)

        # Mean pooling: We expect xTs is of shape (batch_size, seq_len, hidden_dim)
        #x = x.sum(1) / x.shape[1]

        # Propagate through the last layer:
        #x = self.output_layer(x)

        # x.shape should be (batch_size, num_of_classes).
        # label.shape should be (batch_size).
        
        #loss_val = self.loss(x, labels.to(torch.float32))

        # Do the backward step:
        #loss_val.backward()

        # And the step 
        #self.optim.step()

        #print('loss_mse : ', output_mse)
        return batch_loss 

    def predict(self, batch, debug=False, custom_vect_input=None):
        lambs, poss, texts, labels, lens, _, lemmas = batch[:7]

        # Adjacency in the first matrix are identity matrices:
        eyes = make_eyes(lambs, lens)

        # Set on cuda
        if self.cuda:
            lambs = lambs.cuda()
            texts = texts.cuda()
            labels = labels.cuda()
            eyes = eyes.cuda()
            poss = poss.cuda()
            lemmas = lemmas.cuda()
        if custom_vect_input is None:
            # Embedding layers
            const_word_vec = self.embedding(texts)
            word_vec = self.word_embedding(texts)
            pos_vec = self.pos_embedding(poss)
            lemma_vec = self.lemma_embedding(lemmas)

            x = torch.cat([word_vec, pos_vec, const_word_vec, lemma_vec], dim=2)
        else:
            x = custom_vect_input

        # first layer:
        # Adjacency matrix is the identity matrix.
        x = self.first_layer(x, eyes)

        # MPNN model
        x = self.mp_model(x, lambs)

        # last layer:
        x = self.last_layer(x, eyes)

        # Mean pooling: We expect xTs is of shape (batch_size, seq_len, hidden_dim)
        x = x.sum(1) / x.shape[1]

        # Propagate through the last layer:
        logits = self.output_layer(x)

        return logits

    def load(self, filename, cuda=False):
        if cuda:
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        try:
            checkpoint = torch.load(filename + 'model_params.pt', map_location=device)
        except BaseException:
            print("Cannot load model from {}".format(filename + 'model_params.pt'))
            exit()

        self.embedding.load_state_dict(checkpoint['embedding'])
        self.pos_embedding.load_state_dict(checkpoint['pos_embedding'])
        self.word_embedding.load_state_dict(checkpoint['word_embedding'])
        self.lemma_embedding.load_state_dict(checkpoint['lemma_embedding'])
        self.mp_model.load_state_dict(checkpoint['mp_model'])
        #self.output_layer.load_state_dict(checkpoint['output_layer'])
        #self.last_layer.load_state_dict(checkpoint['last_layer'])
        #self.first_layer.load_state_dict(checkpoint['first_layer'])

        self.vocab.load(filename + 'vocab.p')
        self.pos_vocab.load(filename + 'pos_vocab.p')
        self.lemma_vocab.load(filename + 'lemma_vocab.p')

        self.vocab.word_embed = self.embedding
        self.pos_vocab.word_embed = self.pos_embedding

    def save(self, filename):
        params = {
            'embedding': self.embedding.state_dict(),
            'pos_embedding': self.pos_embedding.state_dict(),
            'mp_model': self.mp_model.state_dict(),
            #'output_layer': self.output_layer.state_dict(),
            #'last_layer': self.last_layer.state_dict(),
            #'first_layer': self.first_layer.state_dict(),
            'word_embedding': self.word_embedding.state_dict(),
            'lemma_embedding': self.lemma_embedding.state_dict()
        }
        try:
            torch.save(params, filename + 'model_params.pt')
            self.vocab.save(filename + 'vocab.p')
            self.pos_vocab.save(filename + 'pos_vocab.p')
            self.lemma_vocab.save(filename + 'lemma_vocab.p')

        except BaseException:
            print("[Warning: Saving failed... continuing anyway.]")


In [674]:
mp_trainer = MLM(cfg, cuda=cfg['cuda'])

450


In [675]:
epochs = cfg['epochs']
for epoch in tqdm(range(cfg['epochs'] + 1)):
    if epoch > int(epochs*0.8):
        for g in mp_trainer.optim.param_groups:
            g['lr'] = g['lr']/10


    if epoch == range(cfg['epochs']):
        # It's the last epoch, don't train again.
        break

    print('Train epoch {}:'.format(epoch))
    epoch_loss = []
    for i, batch in enumerate(train_loader):
        loss = mp_trainer.update(batch)
        epoch_loss += loss
        if i % 10 == 0:
            print('avg_loss = ',sum(epoch_loss) / len(epoch_loss))
    #performance['loss'].append(np.mean(np.array(epoch_loss)))

  0%|                                                                                           | 0/51 [00:00<?, ?it/s]

Train epoch 0:
tensor(inf, grad_fn=<SumBackward0>)
loss :  tensor(2.0474, grad_fn=<MulBackward0>)
KLD :  tensor(inf, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>)
tensor(nan, grad_fn=<SumBackward0>)
loss :  tensor(nan, grad_fn=<MulBackward0>)
KLD :  tensor(nan, grad_fn=<SumBackward0>

  0%|                                                                                           | 0/51 [00:02<?, ?it/s]


KeyboardInterrupt: 

## DATA

In [25]:
import os
import pickle
import torch

class Vocab(object):

    def __init__(self, filename='', load=False, threshold=5):
        if load:
            assert os.path.exists(filename), "Vocab file does not exist at " + filename

            self.id2word, self.word2id = self.load(filename)
            self.size = len(self.id2word)
            self.threshold = threshold
            self.wordCounter = None
        else:
            self.id2word, self.word2id = {}, {}
            self.size = 0
            self.threshold = threshold
            # We always add some custom tokens into the vocabulary.
            self.add_words(
                {'<PAD>': float('inf'), '<UNK>': float('inf'),'<MSK>' : 103})
        self.word_embed = None

    def add_words(self, counterOfTokens):
        for item, value in counterOfTokens.items():
            if value >= self.threshold:
                if item not in self.word2id:
                    # add it to the vocab
                    self.word2id[item] = self.size
                    self.id2word[self.size] = item
                    self.size += 1

    def load(self, filename):
        with open(filename, 'rb') as infile:
            id2word = pickle.load(infile)
            word2id = {word:id for id, word in id2word.items()}
            self.id2word, self.word2id = id2word, word2id
            self.size = len(self.id2word)

        return id2word, word2id

    def save(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
           
        with open(filename, 'wb') as outfile:
            pickle.dump(self.id2word, outfile)

    def __len__(self):
        return self.size


    def init_word_embed(self, cfg, cache_dir='datasets/.word_vectors_cache'):
        if cfg['word_vectors'] == 'Word2Vec':
            from torchnlp.word_to_vector import FastText
            all_word_vector = FastText(language=cfg['language'], cache=cache_dir, aligned=True)
        else:
            raise NotImplementedError('No word_vectors found which are called {}.'.format(cfg['word_vectors']))

        # The the vectors only correspond to lower character words:
        all_words = [word.lower() for word in list(self.word2id.keys())]
        weights = all_word_vector[all_words]
        
        word_embed = torch.nn.Embedding(*weights.shape, _weight=weights)
        #if cfg['device'] == 'cuda':
        #    word_embed.cuda()

        self.word_embed = word_embed
        self.embed_size = weights.shape[1]

    def words2vecs(self, words: list):
        if not self.word_embed:
            raise AttributeError("The word embeddings aren't initialized yet.")
        else:
            vecs = self.word_embed(torch.tensor(self.map(words), requires_grad=False))
        return vecs

    def one_hot_ids2vecs(self, ids):
        vecs = self.word_embed(ids)
        return vecs

    def map(self, token_list):
        """
        Map a list of tokens to their ids.
        """
        return [self.word2id[w] if w in self.word2id else self.word2id['<UNK>'] for w in token_list]

    def unmap(self, idx_list):
        """
        Unmap ids back to tokens.
        """
        return [self.id2word[idx] for idx in idx_list]
    
def get_pos_vocab():
    """
    Function to set up a part of speech vocabulary handcrafed.
    """
    pos_id2word = {0: '<PAD>', 1: '<UNK>', 2: 'DET', 3: 'PROPN', 4: 'VERB', 5: 'PART', 6: 'ADJ', 7: 'PUNCT', 8: 'CCONJ',
                   9: 'ADP', 10: 'PRON', 11: 'NOUN', 12: 'ADV', 13: 'INTJ', 14: 'NUM', 15: 'X', 16: 'SYM'}
    pos_word2id = {word: id for id, word in pos_id2word.items()}
    pos_vocab = Vocab()
    pos_vocab.id2word = pos_id2word
    pos_vocab.word2id = pos_word2id
    pos_vocab.size = len(pos_vocab.id2word)
    
    return pos_vocab


In [590]:
# Code to load sst data
from torch.utils.data import Dataset
import torch
import numpy as np


class SSTData(Dataset):
    def __init__(self,
                 sst_data,
                 vocab,
                 nlp,
                 lemma_vocab,
                 pos_vocab = None,
                 self_loop=True):

        self.lemma_vocab = lemma_vocab
        self.self_loop = self_loop
        
        
        #sst_data = [sample for sample in sst_data if sample['label'] != 'neutral']
        self.sst_data = sst_data
        #self.sentiment_vocab = {'negative': 0, 'positive': 1}
        
        # Add sentencizer in the nlp if not already in it:
        if "sentencizer" not in nlp.pipe_names:
            # sentencizer = nlp.create_pipe("sentencizer")
            nlp.add_pipe('sentencizer', first=True)
        self.nlp = nlp

        self.vocab = vocab
        if pos_vocab is None:
            self.pos_vocab = get_pos_vocab()
        else:
            self.pos_vocab = pos_vocab

    def __getitem__(self, idx):
        txt = self.sst_data[idx]
        if len(txt) <64:
            txt = txt + ' PAD'*(64-len(txt))
            doc = self.nlp(txt)
        elif len(txt) > 64:
            txt = txt[:65]
            
        doc = self.nlp(txt)

        token_ids = self.vocab.map([token.text for token in doc])
        # create random array of floats in equal dimension to input_ids
        rand = torch.rand(np.shape(vocab.map([token.text for token in doc])))
        # where the random array is less than 0.15, we set true
        mask_arr = rand < 0.15
        # create selection from mask_arr
        selection = torch.flatten((mask_arr).nonzero()).tolist()
        for i in selection: 
            token_ids[i] = 103
            break
            
            
        
        # make lambda
        adj, root_id = doc_to_adj(doc, directed=False, self_loop=self.self_loop)

        lamb = adj

        # normalize
        denom = lamb.sum(1)
        lamb /= denom

        # make text indices
        
        
        # make pos ids
        pos_ids = self.pos_vocab.map([token.pos_ for token in doc])
        for i in selection: 
            pos_ids[i] = 1
            break

        # make lemma ids
        lemma_ids = self.lemma_vocab.map([token.lemma_ for token in doc])
        for i in selection: 
            lemma_ids[i] = 103
            break
        # make label
        label = self.vocab.map([token.text for token in doc])

        return token_ids, pos_ids, lamb, label, root_id, lemma_ids

    
    def data_check(self):
        for idx,sent in enumerate(self.sst_data):
            if len(sent) < 10:
                self.sst_data.remove(sent)
            
    def __len__(self):
        return len(self.sst_data)



In [591]:

def collate_fn_sentim(batch):
    lens = []
    for sample in batch:
        text, pos, lamb, label, root_id, lemma = sample[:6]
        lens.append(lamb.shape[1])

    max_len = max(lens)
    lambs = []
    texts = []
    poss = []
    lemmas = []
    labels = []
    root_ids = []
    for sample in batch:
        text, pos, lamb, label, root_id, lemma = sample[:6]
        # Big lamb
        lamb_ = torch.zeros(1, max_len, max_len)
        lamb_[0, :lamb.shape[0], :lamb.shape[1]] = lamb
        lambs.append(lamb_)

        # Big text
        text_ = torch.zeros(1, max_len, dtype=torch.long)
        text_[0, :len(text)] = torch.tensor(text)
        texts.append(text_)

        # Big pos
        pos_ = torch.zeros(1, max_len, dtype=torch.long)
        pos_[0, :len(pos)] = torch.tensor(pos)
        poss.append(pos_)

        # Big lemma
        lemma_ = torch.zeros(1, max_len, dtype=torch.long)
        lemma_[0, :len(lemma)] = torch.tensor(lemma)
        lemmas.append(lemma_)

        # Big label:
        label_ = torch.zeros(1, max_len, dtype=torch.long)
        label_[0, :len(label)] = torch.tensor(label)
        labels.append(label_)

        # Big root_id:
        root_id_ = torch.ones(1) * root_id
        root_ids.append(root_id_.long())

    lambs = torch.cat(lambs, dim=0)
    texts = torch.cat(texts, dim=0)
    poss = torch.cat(poss, dim=0)
    labels = torch.cat(labels, dim=0)
    root_ids = torch.cat(root_ids, dim=0)
    lemmas = torch.cat(lemmas, dim=0)

    return lambs, poss, texts, labels, lens, root_ids, lemmas


In [592]:


def doc_to_adj(sent, directed=True, self_loop=False):
    # Sent should be a spacy document. Can also be longer than a sentence.
    sent_len = len(sent)
    root_id = 1
    ret = torch.zeros(sent_len, sent_len, dtype=torch.float32)

    for token in sent:
        for child in token.children:
            if child.i >= sent_len:
                #print('Something goes wrong here.')
                print(child.i, sent_len, sent, token.i, token)
                pass
            ret[token.i, child.i] = 1
        if token.dep_ == 'ROOT':
            root_id = token.i

    if not directed:
        ret = ret + ret.transpose(0, 1)

    if self_loop:
        for i in range(sent_len):
            ret[i, i] = 1
        
    return ret, root_id



In [593]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

from torchnlp.datasets import smt_dataset
from torchtext.datasets import WikiText103
import click

import spacy

import json
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter


In [594]:
cfg = {"repo_name": "saved_models/sst_model1+/", 
       "epochs": 50, "optimizer": "adam", 
       "cuda": False, 
       "lr": 0.0002, 
       "num_of_layer": 3, 
       "hidden_dimension": 10, 
       "batch_size": 1,
       "word_vectors": "Word2Vec",
       "bias": False, 
       "pos_emb_dim": 30,
       "model_type": "gcn",
       "data_amount": 1.0, 
       "data_set": "sst", 
       "word_emb_dim": 70,
       "lemma_emb_dim": 50, 
       "trainerfilename": "saved_models/sst_model1+/",
       "logfilename": "saved_models/sst_model1+/log.txt",
       "num_of_classes": 64,
       "language": "en",
       "normalize_lamb": True,
       "laplacian": False,
       "input_dimension": 300,
       'load_model' : False}

In [595]:
from pathlib import Path
import re
train_data = Path('datasets/wikitext-103/wiki.train.tokens').read_text(encoding='utf-8')
val_data = Path('datasets/wikitext-103/wiki.valid.tokens').read_text(encoding='utf-8')
test_data = Path('datasets/wikitext-103/wiki.test.tokens').read_text(encoding='utf-8')

In [596]:
heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

In [597]:
# Split out train headings and articles
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

# Split out validation headings and articles
val_split = re.split(heading_pattern, val_data)
val_headings = [x[7:-7] for x in val_split[1::2]]
val_articles = [x for x in val_split[2::2]]

# Split out test headings and articles
test_split = re.split(heading_pattern, test_data)
test_headings = [x[7:-7] for x in test_split[1::2]]
test_articles = [x for x in test_split[2::2]]



In [598]:
train_data = [i.split('. ') for i in train_articles]
val_data = [i.split('. ') for i in val_articles]
test_data = [i.split('. ') for i in test_articles]

In [599]:
test_data = [item for sublist in test_data for item in sublist]
val_data = [item for sublist in val_data for item in sublist]
train_data = [item for sublist in train_data for item in sublist]

In [600]:
data_amount = .005

print('Loading data...')
nlp = spacy.load('en_core_web_sm')
cw = Counter()
cl = Counter()



train_set, test_set = train_data, val_data

# only use the the percentage of data we want:
train_set = train_set[:int(len(train_set) * data_amount)]
test_set = test_set[:int(len(test_set) * data_amount)]




Loading data...


In [None]:

# Count words:
for sample in train_set + test_set: cw += Counter([token.text for token in nlp(sample)])
# Count lemma:
for sample in train_set + test_set: cl += Counter([token.lemma_ for token in nlp(sample)])

vocab = Vocab()
lemma_vocab = Vocab()


# prepare vocab
vocab.add_words(cw)
#cfg['input_dimension'] = 300

lemma_vocab.add_words(cl)
pos_vocab = get_pos_vocab()

# Save the parameter:
#with open(repo_name + 'config.json', 'w') as fp:
#    json.dump(cfg.get_as_dict(), fp)

loss = nn.CrossEntropyLoss()

In [601]:
print('Prepare data ...')


train_data = SSTData(
    train_set,
    vocab,
    nlp,
    lemma_vocab,
)
test_data = SSTData(
    test_set,
    vocab,
    nlp,
    lemma_vocab,
)

cfg['vocab'] = vocab
cfg['pos_vocab'] = lemma_vocab
cfg['lemma_vocab'] = lemma_vocab


Prepare data ...


In [602]:
train_data.__len__()

19438

In [603]:
train_data.data_check()

In [604]:
train_data.__len__()

19043

In [605]:
train_set

['The Tower Building of the Little Rock Arsenal , also known as U.S',
 'Arsenal Building , is a building located in MacArthur Park in downtown Little Rock , Arkansas ',
 "Built in 1840 , it was part of Little Rock 's first military installation ",
 'Since its decommissioning , The Tower Building has housed two museums ',
 'It was home to the Arkansas Museum of Natural History and Antiquities from 1942 to 1997 and the MacArthur Museum of Arkansas Military History since 2001 ',
 'It has also been the headquarters of the Little Rock Æsthetic Club since 1894 ',
 '\n The building receives its name from its distinct octagonal tower ',
 'Besides being the last remaining structure of the original Little Rock Arsenal and one of the oldest buildings in central Arkansas , it was also the birthplace of General Douglas MacArthur , who became the supreme commander of US forces in the South Pacific during World War II ',
 'It was also the starting place of the Camden Expedition ',
 'In 2011 it was na

In [606]:
collate_fn = collate_fn_sentim

train_loader = DataLoader(train_data,
                          batch_size=32,
                          collate_fn=collate_fn)

test_loader = DataLoader(test_data,
                         batch_size=32,
                         collate_fn=collate_fn)


In [618]:
for i, batch in enumerate(train_loader):
    print(len(batch[0]))

32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
32
3

## Train

In [70]:
def test_performance(data_loader, mp_trainer, num_of_classes = 64):
    # Evaluation
    f1_micro = 0
    prec_micro = 0
    recall_micro = 0
    accuracy = 0.
    for batch in data_loader:
        lambs, poss, texts, labels, lens = batch[:5]
        logits = mp_trainer.predict(batch)

        predicted_classes = torch.argmax(logits, dim=1)
        if num_of_classes == 2:
            average = 'binary'
        else:
            average = 'micro'
        f1_micro += f1_score(labels.cpu(), predicted_classes.cpu(), average= average)
        prec_micro += precision_score(labels.cpu(), predicted_classes.cpu(), average= average)
        recall_micro += recall_score(labels.cpu(), predicted_classes.cpu(), average= average)
        accuracy += accuracy_score(labels.cpu(), predicted_classes.cpu())

    f1_micro /= len(data_loader)
    prec_micro /= len(data_loader)
    recall_micro /= len(data_loader)
    accuracy /= len(data_loader)

    return f1_micro, prec_micro, recall_micro, accuracy


In [71]:
repo_name = 'saved_models/sst_model1+/'
cfg['trainerfilename'] = repo_name
cfg['logfilename'] = repo_name + 'log.txt'

# Task specific settings
cfg['num_of_classes'] = 64
cfg['language'] = 'en'

# model specific settings
cfg['normalize_lamb'] = True
cfg['laplacian'] = False


In [72]:
with open(cfg['logfilename'], 'a+') as logfile:
    logfile.write(
        '{} training samples set into {} batches, {} test samples set into {} batches\n'.format(len(train_data),
                                                                                                len(train_loader),
                                                                                                len(test_data),
                                                                                                len(test_loader)))
    logfile.write('Start training.\n')

all_losses = []
performance = {'acc_test': [], 'prec_test': [], 'f1_test': [], 'recall_test': [],
               'acc_train': [], 'prec_train': [], 'f1_train': [], 'recall_train': [],
               'loss': []}
print('Start training')
best_accuracy = 0.


Start training


In [73]:
mp_trainer = MLM(cfg, cuda=cfg['cuda'])

450


In [32]:
epochs = cfg['epochs']
for epoch in tqdm(range(cfg['epochs'] + 1)):
    if epoch > int(epochs*0.8):
        for g in mp_trainer.optim.param_groups:
            g['lr'] = g['lr']/10

#     f1_macro_test, prec_macro_test, recall_macro_test, accuracy_test = test_performance(test_loader,
#                                                                                         mp_trainer,
#                                                                                         num_of_classes = cfg['output_dimesion'])
#     f1_macro_train, prec_macro_train, recall_macro_train, accuracy_train = test_performance(train_loader,
#                                                                                             mp_trainer,
#                                                                                             num_of_classes = cfg['output_dimesion'])
#     # Save vals for test:
#     performance['acc_test'].append(accuracy_test)
#     performance['prec_test'].append(prec_macro_test)
#     performance['f1_test'].append(f1_macro_test)
#     performance['recall_test'].append(recall_macro_test)

#     # Save vals for train:
#     performance['acc_train'].append(accuracy_train)
#     performance['prec_train'].append(prec_macro_train)
#     performance['f1_train'].append(f1_macro_train)
#     performance['recall_train'].append(recall_macro_train)

#     with open(cfg['logfilename'], 'a+') as logfile:
#         logfile.write('On test we have accuracy = {}, f1 = {}, prec = {}, recall = {}\n'.format(accuracy_test,
#                                                                                                 f1_macro_test,
#                                                                                                 prec_macro_test,
#                                                                                                 recall_macro_test))
#         logfile.write('On train we have accuracy = {}, f1 = {}, prec = {}, recall = {}\n'.format(accuracy_train,
#                                                                                                 f1_macro_train,
#                                                                                                 prec_macro_train,
#                                                                                                 recall_macro_train))

#     if accuracy_test > best_accuracy:
#         mp_trainer.save(cfg['trainerfilename'])
#         with open(cfg['logfilename'], 'a+') as logfile:
#             logfile.write(
#                 'New best model with accuracy = {} saved at {}\n'.format(accuracy_test, cfg['trainerfilename']))
#         best_accuracy = accuracy_test
#     else:
#         with open(cfg['logfilename'], 'a+') as logfile:
#             logfile.write('Best model has accuracy = {}\n'.format(best_accuracy))

#     # save current performance.
#     json.dump(performance, open(repo_name + 'performance.json', 'w'))
#     # plot performance
#     visualize_performance(performance, repo_name)

    if epoch == range(cfg['epochs']):
        # It's the last epoch, don't train again.
        break

    print('Train epoch {}:'.format(epoch))
    epoch_loss = []
    for i, batch in tqdm(enumerate(train_loader)):
        loss = mp_trainer.update(batch)
        epoch_loss.append(float(loss.detach()))
        print(loss)
        if i % 50 == 0:
            with open(cfg['logfilename'], 'a+') as logfile:
                logfile.write('sample {} of {} in epoch {} of {}.\n'.format(i,
                                                                            len(train_loader),
                                                                            epoch,
                                                                            cfg['epochs']))

    #performance['loss'].append(np.mean(np.array(epoch_loss)))


  0%|                                                                                           | 0/51 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
  0%|                                                                                           | 0/51 [00:00<?, ?it/s]


Train epoch 0:


TypeError: forward() takes 2 positional arguments but 3 were given

In [None]:
10,64,450

## MISC

In [211]:
for batch in test_loader:
    #print(np.shape(batch))
    print(np.shape(batch[0]))
    #print(batch[0][3][2])
    

torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])
torch.Size([20, 64, 64])


In [22]:
txt = "election on an anti-slavery platform, an initial seven slave states declared their secession from the country to form the Confederacy. War broke out in April 1861 "
        
doc = nlp(txt)

In [23]:
doc

election on an anti-slavery platform, an initial seven slave states declared their secession from the country to form the Confederacy. War broke out in April 1861 

In [24]:
(64-len(doc))

34

In [25]:
if len(doc) <64:
    txt = txt + ' PAD'*(63-len(doc))
    doc = nlp(txt)
elif len(doc) > 64:
    doc = doc[:64]

In [26]:
vocab.map([token.text for token in doc])

[9114,
 114,
 117,
 2873,
 30,
 7398,
 1,
 28,
 117,
 7540,
 6888,
 7323,
 16878,
 301,
 191,
 1,
 455,
 8,
 5943,
 6,
 3110,
 8,
 1,
 37,
 4693,
 6097,
 220,
 125,
 14842,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [27]:
[token.text for token in doc]

['election',
 'on',
 'an',
 'anti',
 '-',
 'slavery',
 'platform',
 ',',
 'an',
 'initial',
 'seven',
 'slave',
 'states',
 'declared',
 'their',
 'secession',
 'from',
 'the',
 'country',
 'to',
 'form',
 'the',
 'Confederacy',
 '.',
 'War',
 'broke',
 'out',
 'in',
 'April',
 '1861',
 ' ',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [28]:
[token.pos_ for token in doc]

['NOUN',
 'ADP',
 'DET',
 'ADJ',
 'ADJ',
 'ADJ',
 'NOUN',
 'PUNCT',
 'DET',
 'ADJ',
 'NUM',
 'NOUN',
 'NOUN',
 'VERB',
 'PRON',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'PART',
 'VERB',
 'DET',
 'PROPN',
 'PUNCT',
 'NOUN',
 'VERB',
 'ADP',
 'ADP',
 'PROPN',
 'NUM',
 'SPACE',
 'PROPN',
 'PROPN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'NOUN',
 'PROPN',
 'PROPN',
 'PROPN']

In [29]:
[token.lemma_ for token in doc]

['election',
 'on',
 'an',
 'anti',
 '-',
 'slavery',
 'platform',
 ',',
 'an',
 'initial',
 'seven',
 'slave',
 'state',
 'declare',
 'their',
 'secession',
 'from',
 'the',
 'country',
 'to',
 'form',
 'the',
 'Confederacy',
 '.',
 'war',
 'break',
 'out',
 'in',
 'April',
 '1861',
 ' ',
 'PAD',
 'PAD',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'PAD',
 'PAD',
 'PAD']

In [30]:
adj, root_id = doc_to_adj(doc, directed=False, self_loop=True)

In [31]:
adj

tensor([[1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 1.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.]])

In [32]:
root_id

25

# data preprocess

In [None]:
labels = vocab.map([token.text for token in doc])

In [None]:
inputs = vocab.map([token.text for token in doc])

In [None]:
# create random array of floats in equal dimension to input_ids
rand = torch.rand(np.shape(vocab.map([token.text for token in doc])))
# where the random array is less than 0.15, we set true
mask_arr = rand < 0.15
mask_arr

In [None]:
# create selection from mask_arr
selection = torch.flatten((mask_arr).nonzero()).tolist()
selection

In [None]:
for i in selection: 
    inputs[i] = 103

In [None]:
inputs