In [2]:
# Standard library first: `os` must be imported before it is used in the
# `sys.path` manipulation below (previously it was imported afterwards, which
# raised a NameError on a fresh kernel).
import json
import os
import pickle
import sys

import numpy as np
import pandas as pd
import torch

# Make the parent directory importable so that packages living one level above
# this notebook can be imported. The membership check keeps the cell idempotent
# when it is re-run.
_parent_dir = os.path.abspath("./../")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Exploration of Pretrained BERT

### Documentation:

- https://pypi.org/project/pytorch-pretrained-bert/
- https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

### From documentation of pytorch-pretrained-bert

This model outputs a tuple composed of:

- `encoded_layers`: controlled by the value of the `output_all_encoded_layers` argument:

    - `output_all_encoded_layers=True`: outputs a list of the encoded-hidden-states at the end of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a torch.FloatTensor of size **[batch_size, sequence_length, hidden_size]**,

    - `output_all_encoded_layers=False`: outputs only the encoded-hidden-states corresponding to the last attention block, i.e. a single torch.FloatTensor of size **[batch_size, sequence_length, hidden_size]**,

- `pooled_output`: a torch.FloatTensor of size **[batch_size, hidden_size]** which is the output of a classifier pretrained on top of the hidden state associated with the first token of the input (`[CLS]`) to train on the Next-Sentence task (see BERT's paper).

In [9]:
from pytorch_pretrained_bert import BertTokenizer, BertModel

In [44]:
# Load the tokenizer that ships with the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [53]:
# Sentence with made-up words, used to see how unknown words are broken up.
text = "I want to makiring the bloob to catch good bufcarting."
# Surround the sentence with the special start / separator markers.
marked_text = " ".join(["[CLS]", text, "[SEP]"])

# Apply only the WordPiece stage of the tokenizer to the marked sentence.
tokenizer.wordpiece_tokenizer.tokenize(marked_text)

['[CLS]',
 '[UNK]',
 'want',
 'to',
 'ma',
 '##kir',
 '##ing',
 'the',
 'b',
 '##lo',
 '##ob',
 'to',
 'catch',
 'good',
 'bu',
 '##fc',
 '##art',
 '##ing',
 '##.',
 '[SEP]']

In [197]:
# Example sentence containing made-up words, to exercise sub-word splitting.
text = "I want to makiring the bloob to catch good bufcarting."
# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"
# Split the sentence into tokens with the full tokenizer pipeline.
# (A second assignment from `tokenizer.basic_tokenizer` used to overwrite this
# result, turning the line into a dead store and feeding whole-word tokens --
# which may be missing from the vocabulary -- into the id lookup below.)
tokenized_text = tokenizer.tokenize(marked_text)
# Map the token strings to their vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Single-sentence input: every token receives the same segment id.
segments_ids = [1] * len(tokenized_text)
# Convert inputs to PyTorch tensors; the extra list level adds a batch axis of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [11]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# Put the model in "evaluation" mode, meaning feed-forward operation.
# `eval()` returns the module itself; the trailing semicolon keeps its long
# repr out of the cell output (replaces the former `print()` workaround).
model.eval();




In [None]:
# Predict hidden states features for each layer
# (gradient tracking is switched off for this forward pass).
with torch.no_grad():
    # The model returns a pair; the library documentation names the second
    # element `pooled_output`, here it is bound to `encoded_output`.
    # Assumes the first element is the per-layer list of hidden states
    # (`output_all_encoded_layers=True`), which `torch.stack` below needs -- TODO confirm.
    encoded_layers, encoded_output = model(tokens_tensor, segments_tensors)
# Stack the per-layer tensors along a new leading axis:
# [num_layers, batch_size, sequence_length, hidden_size].
token_embeddings = torch.stack(encoded_layers, dim=0)
token_embeddings.size()

In [12]:
model.config

{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

---

In [6]:
from data_processing.data_prep import EntityRelationsAligner, get_dataset
from data_processing.BERTinizer import SentenceBERTinizer

In [7]:
# Load the pre-processed NYT training file.
# Presumably `ne_set` / `rel_set` are the named-entity and relation label sets -- verify in data_prep.
data, ne_set, rel_set = get_dataset("./../data/preproc_NYT_json/train.json")

In [8]:
# NOTE(review): this rebinds `tokenizer`, which the BERT exploration cells bind
# to a `BertTokenizer`; re-running those cells after this one would use the
# `SentenceBERTinizer` instead. Consider a distinct name.
tokenizer = SentenceBERTinizer()
# The aligner is given the tokenizer plus the entity and relation sets as lists.
era = EntityRelationsAligner(tokenizer, list(ne_set), list(rel_set))

In [None]:
# CONTINUE

In [None]:
# CONTINUE

In [None]:
# CONTINUE

In [None]:
# CONTINUE

In [None]:
# CONTINUE

In [None]:
# CONTINUE

---

In [27]:
# Two toy inputs: a (seq_len, h) matrix each, with a size-1 batch axis inserted
# in the middle so the layout is (seq_len, batch, h).
trs0 = torch.tensor([[1., 2., 3.], [4., 5., 6.]]).unsqueeze(1)
trs1 = torch.tensor([[7., 8., 9.], [8., 6., 4.]]).unsqueeze(1)

trs0.size() # seq_len, batch, h

torch.Size([2, 1, 3])

In [28]:
# Build the two halves of an all-pairs grid of shape (seq_len, seq_len, batch, h):
#   trs0[i, j] is the i-th vector of the first input (repeated along dim 1),
#   trs1[i, j] is the j-th vector of the second input (repeated along dim 0).
# A singleton axis is inserted and then broadcast; -1 keeps an axis unchanged.
trs0 = trs0.unsqueeze(1).expand(-1, trs0.shape[0], -1, -1)
trs1 = trs1.unsqueeze(0).expand(trs1.shape[0], -1, -1, -1)

In [29]:
trs0

tensor([[[[1., 2., 3.]],

         [[1., 2., 3.]]],


        [[[4., 5., 6.]],

         [[4., 5., 6.]]]])

In [30]:
trs1

tensor([[[[7., 8., 9.]],

         [[8., 6., 4.]]],


        [[[7., 8., 9.]],

         [[8., 6., 4.]]]])

In [32]:
# Glue the two halves of every (i, j) pair together along the last (feature) axis.
trs = torch.cat((trs0, trs1), dim=-1)
trs.shape

torch.Size([2, 2, 1, 6])

In [33]:
trs

tensor([[[[1., 2., 3., 7., 8., 9.]],

         [[1., 2., 3., 8., 6., 4.]]],


        [[[4., 5., 6., 7., 8., 9.]],

         [[4., 5., 6., 8., 6., 4.]]]])

---

In [39]:
# Bring dim 2 (the size-1 batch axis) to the front, keeping the other axes in order.
trs.permute(2, 0, 1, 3)

tensor([[[[1., 2., 3., 7., 8., 9.],
          [1., 2., 3., 8., 6., 4.]],

         [[4., 5., 6., 7., 8., 9.],
          [4., 5., 6., 8., 6., 4.]]]])

In [38]:
# Same as above, then swap the two pair axes; equivalent to trs.permute(2, 1, 0, 3).
trs.permute(2, 0, 1, 3).transpose(1, 2)

tensor([[[[1., 2., 3., 7., 8., 9.],
          [4., 5., 6., 7., 8., 9.]],

         [[1., 2., 3., 8., 6., 4.],
          [4., 5., 6., 8., 6., 4.]]]])

In [None]:
# CONTINUE

In [None]:
# CONTINUE

In [None]:
# CONTINUE

# Other data exploration (DRAFT)

In [38]:
doc_path = "./data/re3d-master/Delegation of the European Union to Syria/documents.json"
ent_path = "./data/re3d-master/Delegation of the European Union to Syria/entities.json"
rel_path = "./data/re3d-master/Delegation of the European Union to Syria/relations.json"


def _read_text(path):
    """Return the whole content of the UTF-8 text file at `path`."""
    with open(path, "r", encoding="utf8") as f:
        return f.read()


def _parse_json_objects(text):
    """Decode every JSON value stored back-to-back in `text`.

    The files hold several JSON objects one after another rather than a single
    JSON array. Instead of splitting on a textual marker (which breaks on
    Windows line endings or when the marker occurs inside a string), values are
    decoded one at a time with `JSONDecoder.raw_decode`.

    Args:
        text: string containing zero or more JSON values separated by whitespace.

    Returns:
        List of decoded values in file order; empty list for blank input.

    Raises:
        json.JSONDecodeError: if a non-whitespace position does not start a valid JSON value.
    """
    decoder = json.JSONDecoder()
    objects = []
    pos, end = 0, len(text)
    while pos < end:
        # raw_decode does not skip leading whitespace, so step over separators manually.
        if text[pos].isspace():
            pos += 1
            continue
        obj, pos = decoder.raw_decode(text, pos)
        objects.append(obj)
    return objects


# Raw file contents are kept under their original names.
doc_text = _read_text(doc_path)
ent_text = _read_text(ent_path)
rel_text = _read_text(rel_path)

doc_json = _parse_json_objects(doc_text)
ent_json = _parse_json_objects(ent_text)
rel_json = _parse_json_objects(rel_text)

In [42]:
text = doc_json[0]["text"]

In [44]:
rel_json[0]

{'_id': '26E0189E3D774CB2B8F078A082E5088C-4-566-567-589-630-CommWith',
 'begin': 1020,
 'end': 1089,
 'sourceBegin': 1018,
 'sourceEnd': 1019,
 'source': 'I',
 'targetBegin': 1041,
 'targetEnd': 1082,
 'target': 'Turkish Foreign Minister Mevlüt Çavuşoğlu',
 'type': 'CommWith',
 'value': 'have spoken today to Turkish Foreign Minister Mevlüt Çavuşoğ lu and to',
 'documentId': '26E0189E3D774CB2B8F078A082E5088C',
 'confidence': 1}

In [49]:
# One [source, target, type] row per relation record.
l = [[rel["source"], rel["target"], rel["type"]] for rel in rel_json]

In [50]:
pd.DataFrame(l)

Unnamed: 0,0,1,2
0,I,Turkish Foreign Minister Mevlüt Çavuşoğlu,CommWith
1,I,the UN Special Envoy Staffan de Mistura,CommWith
2,Turkish Foreign Minister Mevlüt Çavuşoğlu,I,CommWith
3,the UN Special Envoy Staffan de Mistura,I,CommWith
4,We,Brussels,CoLocated
5,the EU,Brussels,CoLocated
6,over 9 billion euros,The EU,BelongsTo
7,Daesh,Syria,CoLocated
8,Mogherini,EU leaders,CommWith
9,EU leaders,Mogherini,CommWith


In [356]:
# CONTINUE