In [81]:
import pickle
import random
import json

import torch
from torch import nn, LongTensor, Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

from transformers import BertModel, BertTokenizer

In [15]:
# Show the directory used as cache_dir when loading pretrained checkpoints below.
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
print(str(PYTORCH_PRETRAINED_BERT_CACHE))

/home/tin/.cache/huggingface/hub


In [16]:
# Relation label names for the binary task; the list length sets the classifier's output size.
# Presumably the list index is the label id (0 = no relation, 1 = relation) — confirm against the data.
re_labels = ["rel:None", "rel:True"]

In [17]:
# Load the pretrained BERT encoder and its matching tokenizer from the shared cache directory.
model_name = 'bert-base-uncased'
bert = BertModel.from_pretrained(model_name, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE))
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Pieces of the classification head. The head works on a vector of size
# 2 * hidden_size because the subject and object representations are concatenated.
num_labels = len(re_labels)

dropout = nn.Dropout(bert.config.hidden_dropout_prob)
layer_norm = nn.LayerNorm(bert.config.hidden_size * 2)
# NOTE: the name is spelled `classifer`; later cells refer to it under this spelling.
classifer = nn.Linear(bert.config.hidden_size * 2, num_labels)

In [19]:
# Load the previously saved feature list from disk.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('../data/data.pkl', 'rb') as f:
    features = pickle.load(f)

In [20]:
# Split the features into validation / test / train partitions.
# The shuffle uses a dedicated seeded RNG on a *copy* of `features`, so the
# split is reproducible and re-running this cell always gives the same result
# (an in-place, unseeded shuffle would change the split on every execution).
VAL_FRACTION = 0.15
TEST_FRACTION = 0.15
SPLIT_SEED = 42

shuffled_features = list(features)
random.Random(SPLIT_SEED).shuffle(shuffled_features)

features_len = len(shuffled_features)
print(features_len)

# Validation: first VAL_FRACTION of the shuffled data.
start = 0
end = start + int(features_len * VAL_FRACTION)
val_features = shuffled_features[start:end]

# Test: the next TEST_FRACTION.
start = end
end = start + int(features_len * TEST_FRACTION)
test_features = shuffled_features[start:end]

# Train: everything that is left.
start = end
train_features = shuffled_features[start:]

# The three partitions must cover the data exactly once.
assert len(train_features) + len(val_features) + len(test_features) == features_len

print(len(train_features))
print(len(val_features))
print(len(test_features))

45413
31791
6811
6811


In [21]:
def _create_a_dataset(features, batch_size):

    all_input_ids = pad_sequence([LongTensor(f.input_ids) for f in features],
                                batch_first=True)
    all_input_mask = pad_sequence([LongTensor(f.input_mask) for f in features],
                                batch_first=True)
    all_segment_ids = pad_sequence([LongTensor(f.segment_ids) for f in features],
                                batch_first=True)
    all_label_id = LongTensor([f.label_id for f in features]).unsqueeze(1)
    all_sub_idx = pad_sequence([LongTensor([f.sub_idx]) for f in features],
                                batch_first=True)
    all_obj_idx = pad_sequence([LongTensor([f.obj_idx]) for f in features],
                                batch_first=True)
    data = TensorDataset(all_input_ids, all_input_mask,
                        all_segment_ids, all_label_id, all_sub_idx,
                        all_obj_idx)
    dataloader = DataLoader(data,
                            batch_size=batch_size)
    return dataloader

In [22]:
# Build the training loader with 32 examples per batch; the cell output is the number of batches.
train_loader = _create_a_dataset(train_features, 32)
len(train_loader)

994

In [23]:
# Materialise every batch once so individual batches can be inspected by index,
# then show the shape of each tensor in the first batch.
train_batches = list(train_loader)
for item in train_batches[0]:
    print(item.shape)

torch.Size([32, 170])
torch.Size([32, 170])
torch.Size([32, 170])
torch.Size([32, 1])
torch.Size([32, 1])
torch.Size([32, 1])


In [24]:
# Inspect the raw tensors of the first training batch.
train_batches[0]

[tensor([[  101,   100,   100,  ...,     0,     0,     0],
         [  101,   100,   100,  ...,     0,     0,     0],
         [  101,   100,  2744,  ...,     0,     0,     0],
         ...,
         [  101,   100,   100,  ...,     0,     0,     0],
         [  101,   100,   100,  ...,     0,     0,     0],
         [  101, 30522,   100,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [

In [25]:
# len(tokenizer) = pre calculated
# 30538 is the value printed by TempRelationProcessor.run() after it registers
# the typed marker tokens (4 entity types x 4 markers = 16 extra tokens).
# The `tokenizer` loaded in this section has not had those tokens added, which
# is why the size is hard-coded here instead of using len(tokenizer).
tokenizer_len = 30538
# Grow BERT's embedding matrix so the marker token ids have embedding rows.
bert.resize_token_embeddings(tokenizer_len)


Embedding(30538, 768)

In [71]:
# Unpack the first training batch in the column order used by _create_a_dataset.
(input_ids, input_mask, segment_ids,
 label_ids, sub_idx, obj_idx) = train_batches[0]

# Run the encoder on the batch; per-layer hidden states and attention maps
# are not requested.
outputs = bert(
    input_ids,
    token_type_ids=segment_ids,
    attention_mask=input_mask,
    output_hidden_states=False,
    output_attentions=False,
)

In [72]:
# First element of the BERT output: one hidden vector per token, (batch, seq_len, hidden).
sequence_output = outputs[0]
sequence_output.shape

torch.Size([32, 170, 768])

In [73]:
# For every example, pick the hidden state at the subject / object start-marker
# position. sub_idx and obj_idx have shape (batch, 1), so squeeze(1) yields one
# position per example; pairing it with the row index selects
# sequence_output[b, pos_b] for each b in a single vectorised lookup instead of
# a per-example Python loop followed by cat/squeeze.
batch_positions = torch.arange(sequence_output.size(0), device=sub_idx.device)
sub_output = sequence_output[batch_positions, sub_idx.squeeze(1)]
obj_output = sequence_output[batch_positions, obj_idx.squeeze(1)]
sub_output.shape, obj_output.shape

(torch.Size([32, 768]), torch.Size([32, 768]))

In [74]:
# Concatenate subject and object vectors along the feature axis -> (batch, 2 * hidden).
rep = torch.cat((sub_output, obj_output), dim=1)
rep.shape

torch.Size([32, 1536])

In [75]:
# Normalise the pair representation; the shape is unchanged.
# NOTE: `rep` is overwritten in place, so re-running this cell alone normalises twice.
rep = layer_norm(rep)
rep.shape

torch.Size([32, 1536])

In [76]:
# Apply dropout to the pair representation; the shape is unchanged.
# NOTE(review): nn.Dropout is presumably still in training mode here (eval() is never
# called in the visible cells), so this step would be random — confirm.
rep = dropout(rep)
rep.shape

torch.Size([32, 1536])

In [77]:
# Project the pair representation to one score per relation label -> (batch, num_labels).
logits = classifer(rep)
logits.shape

torch.Size([32, 2])

In [88]:
# Cross-entropy between the logits and the gold labels for this batch.
# The views flatten logits to (N, 2) and labels from (N, 1) to (N,), as the loss expects.
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, 2), label_ids.view(-1))
loss

tensor(0.4979, grad_fn=<NllLossBackward0>)

In [90]:
# Sanity check: flattened labels have one entry per example.
label_ids.view(-1).shape

torch.Size([32])

In [85]:
# Inspect the raw (pre-softmax) scores passed to the loss.
logits.view(-1, 2)

tensor([[ 0.4741, -0.7999],
        [-0.3342, -0.4986],
        [-0.8379, -0.3600],
        [ 0.0315, -0.2533],
        [-0.3554, -0.9357],
        [ 0.2258, -0.6719],
        [ 0.6394, -0.6025],
        [ 0.1216, -0.9999],
        [ 0.1489, -1.7862],
        [ 0.1915, -1.3544],
        [ 0.0734, -0.6052],
        [ 0.5974, -0.6603],
        [ 0.2991, -0.9130],
        [ 0.0100, -0.6190],
        [-0.3008, -0.8990],
        [-0.2774, -0.9363],
        [-0.2936, -0.5965],
        [ 0.3739, -1.0638],
        [-0.6579, -0.6168],
        [-0.2230, -0.7594],
        [-0.5448, -0.9943],
        [ 0.1981, -0.2109],
        [-0.1782, -1.2148],
        [ 0.5937, -0.5384],
        [-0.3755, -0.7256],
        [-0.2742, -0.8289],
        [-0.1143, -0.2231],
        [-0.2555, -0.9518],
        [-0.7545, -0.8190],
        [ 0.0631, -0.5730],
        [ 0.2937, -0.6628],
        [ 0.2568, -0.6015]], grad_fn=<ViewBackward0>)

In [86]:
# Inspect the gold labels passed to the loss.
label_ids.view(-1)

tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0])

In [78]:
# Turn the logits into per-example class probabilities (each row sums to 1).
F.softmax(logits, dim=1)

tensor([[0.7814, 0.2186],
        [0.5410, 0.4590],
        [0.3827, 0.6173],
        [0.5707, 0.4293],
        [0.6411, 0.3589],
        [0.7105, 0.2895],
        [0.7759, 0.2241],
        [0.7543, 0.2457],
        [0.8738, 0.1262],
        [0.8243, 0.1757],
        [0.6634, 0.3366],
        [0.7786, 0.2214],
        [0.7707, 0.2293],
        [0.6523, 0.3477],
        [0.6452, 0.3548],
        [0.6590, 0.3410],
        [0.5751, 0.4249],
        [0.8081, 0.1919],
        [0.4897, 0.5103],
        [0.6310, 0.3690],
        [0.6105, 0.3895],
        [0.6009, 0.3991],
        [0.7382, 0.2618],
        [0.7562, 0.2438],
        [0.5866, 0.4134],
        [0.6352, 0.3648],
        [0.5272, 0.4728],
        [0.6674, 0.3326],
        [0.5161, 0.4839],
        [0.6539, 0.3461],
        [0.7224, 0.2776],
        [0.7023, 0.2977]], grad_fn=<SoftmaxBackward0>)

In [60]:
# Same expression as the previous cell; its lower execution count means the output
# shown below comes from an earlier run of the pipeline.
# NOTE(review): the differing probabilities are presumably due to dropout and/or a
# re-initialised head between runs — unverified.
F.softmax(logits, dim=1)

tensor([[0.8547, 0.1453],
        [0.4956, 0.5044],
        [0.5554, 0.4446],
        [0.5967, 0.4033],
        [0.6468, 0.3532],
        [0.6322, 0.3678],
        [0.8572, 0.1428],
        [0.6937, 0.3063],
        [0.8479, 0.1521],
        [0.6960, 0.3040],
        [0.6972, 0.3028],
        [0.6432, 0.3568],
        [0.7775, 0.2225],
        [0.7266, 0.2734],
        [0.7077, 0.2923],
        [0.3962, 0.6038],
        [0.6270, 0.3730],
        [0.7101, 0.2899],
        [0.5694, 0.4306],
        [0.5693, 0.4307],
        [0.5657, 0.4343],
        [0.6056, 0.3944],
        [0.7152, 0.2848],
        [0.6122, 0.3878],
        [0.5844, 0.4156],
        [0.6143, 0.3857],
        [0.5862, 0.4138],
        [0.7572, 0.2428],
        [0.4904, 0.5096],
        [0.6593, 0.3407],
        [0.6805, 0.3195],
        [0.6168, 0.3832]], grad_fn=<SoftmaxBackward0>)

In [1]:
# Process data and create dataloader for models
# For binary relation extraction

import random
import pickle
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split

from torch import LongTensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertTokenizer
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE


class TempRelationProcessor:
    """Convert raw relation records into BERT-ready ``InputFeatures``.

    Each record in ``data`` is read as a dict with:

    * ``sentence``: list of word strings,
    * ``entities``: subject at index 0 and object at index 1, each a dict with
      ``type``, ``start_pos`` and ``end_pos`` (word indices, ``end_pos``
      inclusive),
    * ``relation``: the label id stored on the resulting feature.

    After ``run()`` the results are available as ``self.features`` and the
    tokenizer (including the marker tokens) as ``self.tokenizer``.
    """

    # Entity types for which typed marker tokens are registered.
    NER_LABELS = ['concept', 'writing', 'person', 'place']

    def __init__(self, data: list[dict], model_name: str = 'bert-base-uncased', batch_size: int = 128):
        self.data = data
        self.model_name = model_name
        self.batch_size = batch_size  # stored for callers; not used by run()
        self.tokenizer = None  # set by run()
        self.features = None   # set by run()

    def run(self):
        """Load the tokenizer, register marker tokens and featurise ``self.data``.

        Prints the size of the extended tokenizer. The computed features and
        the tokenizer are kept on the instance (``self.features``,
        ``self.tokenizer``) instead of being discarded. Returns ``None``.
        """
        tokenizer = BertTokenizer.from_pretrained(self.model_name, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE))
        tokenizer = self._add_marker_tokens(tokenizer, self.NER_LABELS)

        features = self._add_typed_markers(self.data, tokenizer)
        print(len(tokenizer))

        self.tokenizer = tokenizer
        self.features = features

    def _add_marker_tokens(self, tokenizer, ner_labels):
        """Register subject/object start/end marker tokens for every entity type.

        Args:
            tokenizer: object with an ``add_tokens(list[str])`` method.
            ner_labels: iterable of entity type names.

        Returns:
            The same tokenizer object, after ``add_tokens`` was called.
        """
        templates = ('<SUBJ_START=%s>', '<SUBJ_END=%s>', '<OBJ_START=%s>', '<OBJ_END=%s>')
        # Order matters: it determines the ids assigned to the new tokens.
        new_tokens = [template % label for label in ner_labels for template in templates]

        tokenizer.add_tokens(new_tokens)

        return tokenizer

    def _add_typed_markers(self, data, tokenizer):
        """
        Inspired by PURE

        Wrap the subject and object spans of every sentence in typed marker
        tokens, split the words into word pieces and convert them to ids.

        Args:
            data: list of record dicts (see the class docstring).
            tokenizer: object providing ``tokenize(str)`` and
                ``convert_tokens_to_ids(list[str])``.

        Returns:
            A list with one ``InputFeatures`` per record. ``sub_idx`` /
            ``obj_idx`` are the positions of the subject / object start
            markers in the token sequence (0 if the start position is never
            reached).
        """
        CLS = "[CLS]"
        SEP = "[SEP]"

        def get_special_token(w):
            # Markers are looked up in lower case.
            # NOTE(review): this assumes the tokenizer lower-cases the tokens
            # registered by _add_marker_tokens — confirm for other models.
            return ('<' + w + '>').lower()

        max_tokens = 0
        total_tokens = 0
        features = []
        for (sent_idx, sentence) in enumerate(data):
            if sent_idx % 10000 == 0:
                print(f"Adding typed markers: {sent_idx} of {len(data)}")

            subj_entity = sentence['entities'][0]
            obj_entity = sentence['entities'][1]

            # Create typed markers
            SUBJECT_START_MARKER = get_special_token(f"SUBJ_START={subj_entity['type']}")
            SUBJECT_END_MARKER = get_special_token(f"SUBJ_END={subj_entity['type']}")
            OBJECT_START_MARKER = get_special_token(f"OBJ_START={obj_entity['type']}")
            OBJECT_END_MARKER = get_special_token(f"OBJ_END={obj_entity['type']}")

            # Create marked sentence
            sub_idx = 0
            obj_idx = 0
            marked_sentence = [CLS]
            for i, token in enumerate(sentence['sentence']):
                if i == subj_entity['start_pos']:
                    sub_idx = len(marked_sentence)
                    marked_sentence.append(SUBJECT_START_MARKER)
                if i == obj_entity['start_pos']:
                    obj_idx = len(marked_sentence)
                    marked_sentence.append(OBJECT_START_MARKER)

                # Append the word pieces themselves. Appending the whole word
                # once per piece would map every out-of-vocabulary word to the
                # unknown-token id.
                marked_sentence.extend(tokenizer.tokenize(token))

                if i == subj_entity['end_pos']:
                    marked_sentence.append(SUBJECT_END_MARKER)
                if i == obj_entity['end_pos']:
                    marked_sentence.append(OBJECT_END_MARKER)
            marked_sentence.append(SEP)

            max_tokens = max(max_tokens, len(marked_sentence))
            total_tokens += len(marked_sentence)

            input_ids = tokenizer.convert_tokens_to_ids(marked_sentence)
            input_mask = [1] * len(input_ids)          # every position is a real token
            segment_ids = [0] * len(marked_sentence)   # single-segment input
            label_id = sentence['relation']

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id,
                              sub_idx=sub_idx,
                              obj_idx=obj_idx))

        print("Adding typed markers: Done")
        print(f"Total tokens: {total_tokens}")
        print(f"Max tokens  : {max_tokens}")

        return features
     

class InputFeatures:
    """Plain container for the model inputs of one example.

    All constructor arguments are stored unchanged as attributes of the same
    name: ``input_ids``, ``input_mask``, ``segment_ids``, ``label_id``,
    ``sub_idx`` and ``obj_idx``.
    """

    def __init__(
        self,
        input_ids,
        input_mask,
        segment_ids,
        label_id,
        sub_idx,
        obj_idx,
    ):
        # Token-level inputs.
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        # Example-level label.
        self.label_id = label_id
        # Positions of the subject / object within the token sequence.
        self.sub_idx = sub_idx
        self.obj_idx = obj_idx

In [12]:
import json

# The file holds one JSON object per line. Reading it inside a `with` block
# guarantees the handle is closed, which a bare open() in a comprehension does not.
with open('../data/all_data_transformed.json', 'r') as f:
    data = [json.loads(line) for line in f]

processor = TempRelationProcessor(data)


In [13]:
# Featurise the whole data set; prints progress, token statistics and the tokenizer size.
processor.run()

Adding typed markers: 0 of 45413
Adding typed markers: 10000 of 45413
Adding typed markers: 20000 of 45413
Adding typed markers: 30000 of 45413
Adding typed markers: 40000 of 45413
Adding typed markers: Done
Total tokens: 3107446
Max tokens  : 170
30538


In [11]:
# Scratch cell: loads a fresh copy of the base tokenizer (no marker tokens are added to it here).
model_name = 'bert-base-uncased'
temp = BertTokenizer.from_pretrained(model_name, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE))

In [9]:
# Reloads the base encoder and tokenizer; this duplicates the load cell near the top of the notebook.
model_name = 'bert-base-uncased'
bert = BertModel.from_pretrained(model_name, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE))
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
