In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import nltk
import string
import os
import torch
from collections import Counter
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from gensim.models import KeyedVectors

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Carrega a BERT

In [37]:
#bert_tokenizer = AutoTokenizer.from_pretrained('/root/.cache/torch/transformers/neuralmind-bert-large-portuguese-cased/')
#bert_model = AutoModel.from_pretrained('/root/.cache/torch/transformers/neuralmind-bert-large-portuguese-cased/')

bert_tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased')
bert_model = AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased')

Carrega os dados de entrada

In [17]:
pwd = %pwd
pwd = os.path.join(os.path.dirname(pwd), 'data')
print(pwd)

D:\Mestrado\2020-02\Deep Learning\relation-extraction-deep-learning\data


In [18]:
data = pd.read_csv(os.path.join(pwd, 'input/data_full.csv'), sep=';').replace({np.NaN: None})
data.columns = ['sentenca','entidade1','entidade1_tipo','relacao','entidade2','entidade2_tipo']

In [19]:
data['class'] = data['relacao'].apply(lambda x: 0 if x is None else 1)

In [20]:
data.head()

Unnamed: 0,sentenca,entidade1,entidade1_tipo,relacao,entidade2,entidade2_tipo,class
0,A Kroton anunciou uma parceria com o Cubo Itaú...,Kroton,ORG,anunciou uma parceria com,Cubo Itaú,ORG,1
1,O evento Summit AgriHub reuniu nesta quinta-fe...,Summit AgriHub,ORG,reuniu em,Cuiabá,PLC,1
2,O evento Summit AgriHub reuniu nesta quinta-fe...,Summit AgriHub,ORG,,Mato Grosso,PLC,0
3,O evento Summit AgriHub reuniu nesta quinta-fe...,Summit AgriHub,ORG,,Brasil,PLC,0
4,"Ousuário consegue comparar prazos , condições ...",Bradesco,ORG,,Banco do Brasil,ORG,0


Essa sera nossa estrategia de encoding para propagar esses dados pela rede neural

Verifica em quantos registros existem relacoes

In [21]:
Counter(data['class'])

Counter({1: 463, 0: 446})

Vamos pegar o tamanho maximo das sentencas

In [22]:
max_len = data['sentenca'].apply(lambda x: len(nltk.word_tokenize(x)))

In [23]:
pd.Series(max_len).describe()

count    909.000000
mean      40.207921
std       15.261946
min        8.000000
25%       30.000000
50%       39.000000
75%       48.000000
max      128.000000
Name: sentenca, dtype: float64

In [24]:
max_len = max(max_len)

In [25]:
max_len

128

In [26]:
sample_txt = 'A Kroton anunciou uma parceria com o Cubo Itaú , que dará origem à vertical “Cubo Education” , um espaço dedicado ao fomento de edtechs – tecnologia aplicada à educação .'
print(f"BERT tokens: {bert_tokenizer.tokenize(sample_txt)}")
print(f"NLTK tokens: {nltk.word_tokenize(sample_txt)}")

BERT tokens: ['a', 'k', '##ro', '##ton', 'anunciou', 'uma', 'parceria', 'com', 'o', 'cub', '##o', 'it', '##au', ',', 'que', 'dar', '##a', 'origem', 'a', 'vertical', '“', 'cub', '##o', 'educa', '##tion', '”', ',', 'um', 'espa', '##co', 'dedicado', 'ao', 'fome', '##nto', 'de', 'ed', '##tec', '##h', '##s', '–', 'tecnologia', 'aplicada', 'a', 'educa', '##ca', '##o', '.']
NLTK tokens: ['A', 'Kroton', 'anunciou', 'uma', 'parceria', 'com', 'o', 'Cubo', 'Itaú', ',', 'que', 'dará', 'origem', 'à', 'vertical', '“', 'Cubo', 'Education', '”', ',', 'um', 'espaço', 'dedicado', 'ao', 'fomento', 'de', 'edtechs', '–', 'tecnologia', 'aplicada', 'à', 'educação', '.']


In [28]:
tokens = nltk.word_tokenize(sample_txt)
token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
token_ids = token_ids + [0]*(max_len-len(token_ids))

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

we = bert_tokenizer.encode_plus(
  sample_txt,
  max_length=max_len,
  add_special_tokens=False, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=False,
)
print(f'WE: {we}')

 Sentence: A Kroton anunciou uma parceria com o Cubo Itaú , que dará origem à vertical “Cubo Education” , um espaço dedicado ao fomento de edtechs – tecnologia aplicada à educação .
   Tokens: ['A', 'Kroton', 'anunciou', 'uma', 'parceria', 'com', 'o', 'Cubo', 'Itaú', ',', 'que', 'dará', 'origem', 'à', 'vertical', '“', 'Cubo', 'Education', '”', ',', 'um', 'espaço', 'dedicado', 'ao', 'fomento', 'de', 'edtechs', '–', 'tecnologia', 'aplicada', 'à', 'educação', '.']
Token IDs: [177, 100, 3127, 230, 4495, 170, 146, 100, 100, 117, 179, 100, 2008, 353, 14357, 1112, 100, 100, 22354, 117, 222, 2363, 8055, 320, 100, 125, 100, 1379, 4277, 11107, 353, 3478, 119, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
WE: {'input_ids': [123, 1396, 157, 897, 3127, 230, 4495, 

Com o exemplo acima podemos ver que o nltk.word_tokenize parece ser mais preciso que o bert_tokenizer ao quebrar as palavras

### Processamento dos dados

Vamos entao passar todas as frases para lista de tokens_id <br>
Uma maneira legal de fazer isso é usando namedtuples, assim conseguimos colocar varios dados dentro de uma mesma tupla e depois fazer o parse deles

In [29]:
from collections import namedtuple

In [33]:
Row = namedtuple("Row", ["sentenca","entidade1","entidade2","relacao"])
corpus = data.apply(
    lambda x: Row(
        sentenca=x['sentenca'],
        entidade1=x['entidade1'],
        entidade2=x['entidade2'],
        relacao= None if x['relacao'] is None else x['relacao']
    )
    , axis=1
)

### Treinamento

Podemos fazer com que a rede tenha apenas um ou mais modelos internos. Por exemplo, um modelo para verificar se existe relacao e outro para extrair essa relacao

É possivel receber modelos por parametros dentro das classes de nn.Module. Assim, incorporamos o modelo recebido as configuracao da rede que estamos propondo. <br>
Abaixo é possivel ver que incorporamos a BERT nos modelos ModeloVerificaRelacao e ModeloExtraiRelacao. Alem disso, incorporamos esses dois modelos tambem em ModeloDaniel

Deste modo, temos as seguintes possibilidades:<br>
    - Os modelos ModeloVerificaRelacao e ModeloExtraiRelacao podem propagar suas entradas pela rede da BERT<br>
    - Treinar apenas o ModeloDaniel e fazer com que ele propague o erro para ModeloVerificaRelacao, ModeloExtraiRelacao e BERT (Isso é considerado um fine-tuning da BERT; O desafio sera propagar as saidas de ModeloVerificaRelacao para ModeloExtraiRelacao)<br>
    - Treinar ModeloVerificaRelacao e ModeloExtraiRelacao separados e depois adicionalos ao ModeloDaniel apenas para unificacao (Essa abordagem é mais didatica; O desafio sera elaborar um metodo de treinamento consistente que permita apenas um treinamento, caso contrario deverao existir varios em separado e apenas um local para unificar suas chamadas)<br>

In [34]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [35]:
class DeepRelationIdentifier(nn.Module):
    def __init__(self, embedding_model, max_sentence_length, features=3, embedding_size=1024, output_size=1, n_hidden=1, drop_prob=0.1):
        super().__init__()
        self._embedding_model = embedding_model
        self._max_len = max_sentence_length
        
        self._drop_prob = drop_prob
        self._n_hidden = n_hidden
        
        # Avoid tuning BERT to this problem
        self._embedding_model.requires_grad = False
        
        self.fc = nn.Sequential(
            nn.Linear(embedding_size * features, self._n_hidden),
            nn.ReLU(),
            nn.Linear(self._n_hidden, self._n_hidden),
            nn.ReLU(),
            nn.Dropout(self._drop_prob),
            nn.Linear(self._n_hidden, output_size),
            nn.Sigmoid()
        )
        
    def _pad(self, x):
        return x + [0]*(self._max_len-len(x))   
    
    def parse_sentence_to_ids(self, sentence, pad=True):
        sentence = bert_tokenizer.convert_tokens_to_ids(nltk.word_tokenize(sentence))
        return self._pad(x=sentence) if pad else sentence
    
    def to_embeddings(self, x, is_token_ids=True):
        if is_token_ids:
            return self._embedding_model(torch.tensor(x).unsqueeze(0))[0]
        else:
            return self._embedding_model(torch.tensor(self.parse_sentence_to_ids(sentence=x, pad=True)).unsqueeze(0))[0]

    def prepare_sentence(self, sentence:str, entity1:str, entity2:str):
        features = (self.to_embeddings(sentence, is_token_ids=False) for f in [sentence, entity1, entity2])
        return torch.cat(tuple(f.squeeze().sum(axis=0) for f in features), dim=0)
                    
    def forward(self, inputs):
        ''' Forward pass through the network'''
        ## TODO: put x through the fully-connected layer
        out = self.fc(inputs)
        
        # return the final output and the hidden state
        return out

Com as definicoes acima, vamos instanciar o nosso modelo

In [38]:
ri_model = DeepRelationIdentifier(
    embedding_model=bert_model,
    max_sentence_length=max_len,
    features=3,
    embedding_size=1024,
    output_size=1,
    n_hidden=1,
    drop_prob=0.1
)

Vamos imprimir o schema do modelo

In [39]:
print(ri_model)

DeepRelationIdentifier(
  (_embedding_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), 

)


Agora que temos as namedtuples, precisamos converter as sentencas e entidades em um vetor para a nossa rede neural

In [64]:
row = 0
dataset = [
    (
        #print('Sentence [' + str(row) + ']')
        
        ri_model.prepare_sentence(sentence=x.sentenca, entity1=x.entidade1, entity2=x.entidade2).cpu().detach().numpy(), 
        0 if x.relacao is None else 1
        
        #row += 1
    ) for x in corpus[:1000]
]

Agora que temos o dataset, entao é so correr para o abraco e dividir o conjunto entre treino e teste

In [65]:
test_ration = 0.2
test_idx = int(len(dataset)*(1-test_ration))
trainset, testset = dataset[:test_idx], dataset[test_idx:]

In [66]:
print(f"Trainset size: {len(trainset)}")
print(f"Testset size: {len(testset)}")

Trainset size: 727
Testset size: 182


Agora vamos definir o nosso metodo de treinamento

In [67]:
def train(net, data, epochs=10, batch_size=10, lr=0.001, clip=5, val_frac=0.1, print_every=10, earlyStopping=False, max_epochs_no_improve=2):
    '''
        Training a network 
    '''
    net.train()
    
    opt = torch.optim.SGD(net.parameters(), lr=lr)
    criterion = nn.BCELoss()
    
    # create training and validation data
    valid_idx = int(len(data)*(1-val_frac))
    data, valid_data = data[:valid_idx], data[valid_idx:]
    
    # create Tensor datasets
    train_data = TensorDataset(torch.from_numpy(np.array([x[0] for x in data])), torch.from_numpy(np.array([x[1] for x in data])))
    valid_data = TensorDataset(torch.from_numpy(np.array([x[0] for x in valid_data])), torch.from_numpy(np.array([x[1] for x in valid_data])))

    # shuffling and batching data
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
    
    if(torch.cuda.is_available()):  
        net.cuda()
    
    epoch_no_improve = 0
    last_epoch_loss = None
    
    for e in range(epochs):
        counter = 0

        losses = []
        for inputs, targets in train_loader:
            counter += 1

            if(torch.cuda.is_available()):
                inputs, targets = inputs.cuda(), targets.cuda()

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output = net(inputs)
            # calculate the loss and perform backprop
            loss = criterion(output.float(), targets.float())
               
            loss.backward()
            opt.step()

            losses.append(loss.item())
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_losses = []
                net.eval()
                
                for inputs, targets in valid_loader:

                    if(torch.cuda.is_available()):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output = net(inputs)
                    val_loss = criterion(output.float(), targets.float())

                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data

                _loss = loss.item()
                _val_loss = 1 if len(val_losses) == 0 else np.nanmean(val_losses)
                
                print('{0}{1}{2}{3}'.format(
                    "Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter),
                    "Loss: {:.4f}...".format(_loss),
                    "Val Loss: {:.4f}".format(_val_loss)
                    )
                )
                print("loss:{:0.4f}".format(_loss))
                print("val_loss:{:0.4f}".format(_val_loss))


        # Early stopping strategy
        current_epoch_loss = 1 if len(losses) == 0 else np.nanmean(losses)
        if last_epoch_loss is None:
            last_epoch_loss = current_epoch_loss
        elif last_epoch_loss < current_epoch_loss:
            epoch_no_improve += 1
        else:
            last_epoch_loss = current_epoch_loss
            epoch_no_improve = 0

        if earlyStopping:
            if epoch_no_improve >= max_epochs_no_improve:
                print('[EarlyStopping] Reached max epochs without improvement')
                print("early_stopping:1")
                break
            else:
                continue

Pronto, temos tudo preparado para iniciar o treinamento

In [68]:
train(
    net=ri_model,
    data=trainset,
    epochs=2,
    batch_size=10,
    lr=0.001,
    clip=5,
    val_frac=0.1, 
    print_every=10,
    earlyStopping=False,
    max_epochs_no_improve=2
)

Epoch: 1/2...Step: 10...Loss: 1.1107...Val Loss: 0.5839
loss:1.1107
val_loss:0.5839
Epoch: 1/2...Step: 20...Loss: 1.1229...Val Loss: 0.5837
loss:1.1229
val_loss:0.5837
Epoch: 1/2...Step: 30...Loss: 1.1997...Val Loss: 0.5835
loss:1.1997
val_loss:0.5835
Epoch: 1/2...Step: 40...Loss: 1.0013...Val Loss: 0.6182
loss:1.0013
val_loss:0.6182
Epoch: 1/2...Step: 50...Loss: 0.7471...Val Loss: 0.5831
loss:0.7471
val_loss:0.5831
Epoch: 1/2...Step: 60...Loss: 0.7034...Val Loss: 0.5828
loss:0.7034
val_loss:0.5828


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 2/2...Step: 10...Loss: 1.3557...Val Loss: 0.5826
loss:1.3557
val_loss:0.5826
Epoch: 2/2...Step: 20...Loss: 0.9906...Val Loss: 0.6507
loss:0.9906
val_loss:0.6507
Epoch: 2/2...Step: 30...Loss: 0.8688...Val Loss: 0.5822
loss:0.8688
val_loss:0.5822
Epoch: 2/2...Step: 40...Loss: 0.6164...Val Loss: 0.5820
loss:0.6164
val_loss:0.5820
Epoch: 2/2...Step: 50...Loss: 0.8659...Val Loss: 0.6493
loss:0.8659
val_loss:0.6493
Epoch: 2/2...Step: 60...Loss: 1.0933...Val Loss: 0.5482
loss:1.0933
val_loss:0.5482


### Metricas

In [69]:
import sklearn.metrics as metrics

In [70]:
def evaluate_binary_classification(y_true, y_pred, pos_label=1):
    
    def metric_auc(y_true, y_pred, pos_label=True):
        try:
            fpr, tpr, _threshold = metrics.roc_curve(
                y_true,
                y_pred,
                pos_label=pos_label
            )
            return metrics.auc(fpr, tpr)
        except:
            return None

    dct = {
        'ACCURACY': metrics.accuracy_score(y_true, y_pred),
        'F1': metrics.f1_score(y_true, y_pred, average='weighted', pos_label=pos_label),
        'F1_MICRO': metrics.f1_score(y_true, y_pred, average='micro', pos_label=pos_label),
        'F1_MACRO': metrics.f1_score(y_true, y_pred, average='macro', pos_label=pos_label),
        'PRECISION': metrics.precision_score(y_true, y_pred, average='binary', pos_label=pos_label),
        'RECALL': metrics.recall_score(y_true, y_pred, average='binary', pos_label=pos_label),
        'AUC': metric_auc(y_true, y_pred),
        'KAPPA': metrics.cohen_kappa_score(y_true, y_pred)
    }
    return {k: 0.0 if not v else v for k, v in dct.items()}

Para usar como entrada em uma MLP simples, temos de converter de lista de tensores para lista de listas

In [71]:
X_test = [torch.from_numpy(np.array(x[0])) for x in testset]
y_test = [x[1] for x in testset]

In [72]:
y_pred = [1 if ri_model(x) > 0.5 else 0 for x in X_test]

In [73]:
evaluate_binary_classification(y_test, y_pred)

{'ACCURACY': 0.6758241758241759,
 'F1': 0.5450909745991714,
 'F1_MICRO': 0.6758241758241759,
 'F1_MACRO': 0.4032786885245902,
 'PRECISION': 0.6758241758241759,
 'RECALL': 1.0,
 'AUC': 0.5,
 'KAPPA': 0.0}