In [1]:
import os
import nltk
import pandas as pd
import numpy as np
import torch.nn.functional as F
from IPython.display import clear_output
from nltk import WordNetLemmatizer
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, random_split

from string import punctuation
import tensorflow as tf

import torch
from transformers import BertTokenizer, BertModel, BertForPreTraining, BertForSequenceClassification

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dima\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data = pd.read_csv('data/dataset_base.csv')

data.head()

Unnamed: 0,text,loc
0,"Light shone through the wintry branches, shado...",ancient ruins
1,"The columns were the only complete thing, ever...",ancient ruins
2,Pre-Columbian civilisations firmly left their ...,ancient ruins
3,Uxmal and its giant Pyramid of the Magician ap...,ancient ruins
4,"When picturing ancient ruins, the Middle East ...",ancient ruins


In [4]:
un_texts = list(data['text'])

un_labels = list(data['loc'])

print(len(un_texts))
print(len(un_labels))

261
261


In [5]:
texts = []
labels = []

for text, label_str in list(zip(un_texts, un_labels)):
    sentences = nltk.tokenize.sent_tokenize(text)
    label = label_str.split()
    for sentence in sentences:
        lowered_text = (text.translate(str.maketrans('', '', punctuation))).lower()
        if all(word in lowered_text for word in label):
            texts.append(lowered_text)
            labels.append(label)

print(texts[300])
print(labels[300])
print(len(texts))
print(len(labels))

the hills that lie friendly in the day  like the pillows of the land  are darkly ominous by night the paths that were illuminated just hours before become lost in a blackness that even moonlight cannot help the trees that are magnificent in sunshine tower over james as he steps across the borderline between the seen and unseen
['hill']
583
583


In [6]:
max_words = 256

In [7]:
def one_hot_background(words, label_array):
    words_no = words.split(' ')

    Lem = WordNetLemmatizer()
    words = [Lem.lemmatize(word) for word in words_no]
    N = max_words
    one_hot_label = np.zeros(N)
    
    for i in range(len(words)):
        if words[i] in label_array:
            one_hot_label[i] = 1
    
    return np.array(one_hot_label, dtype=np.float64)

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

max_length = 256

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  light shone through the wintry branches shadowy arms stretching across the ancient ruins what was left stood in spite of itself defying gravity in its precarious way yet this place kept secret by the trees was safe it had avoided modern mans destructive touch and so had become a sanctuary for the animals
Token IDs: tensor([  101,  2422, 14707,  2083,  1996,  2663, 11129,  5628, 22801,  2608,
        10917,  2408,  1996,  3418,  8435,  2054,  2001,  2187,  2768,  1999,
         8741,  1997,  2993, 13366, 14147,  8992,  1999,  2049,  3653, 10010,
         6313,  2126,  2664,  2023,  2173,  2921,  3595,  2011,  1996,  3628,
         2001,  3647,  2009,  2018,  9511,  2715, 16042, 15615,  3543,  1998,
         2061,  2018,  2468,  1037,  8493,  2005,  1996,  4176,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0

In [9]:
print(input_ids.size())


torch.Size([583, 256])


In [10]:
decoded_texts = list(map(lambda id: tokenizer.decode(id), input_ids))

for i  in range(len(labels)):
    label = labels[i]
    labels[i] = one_hot_background(decoded_texts[i], label)

labels = np.array(labels)
print(decoded_texts[0])
print(labels[0])

[CLS] light shone through the wintry branches shadowy arms stretching across the ancient ruins what was left stood in spite of itself defying gravity in its precarious way yet this place kept secret by the trees was safe it had avoided modern mans destructive touch and so had become a sanctuary for the animals [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [11]:
batch_size = 8
N = len(texts)

train_size = int(N * 0.8)
val_size = int(N * 0.1)
test_size = (N - train_size - val_size)

dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))
print(len(dataset))
print(sum([train_size, val_size, test_size]))

583
583


In [12]:
test_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

In [13]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print("cuda selected")
# If not...
else:
    device = torch.device("cpu")
    print("cpu selected")
    
device = torch.device( "cpu")


cuda selected


In [14]:
from torch import nn

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True, )
        self.linear = nn.Linear(768, 1)

    def forward(self, input_ids, input_mask=None, token_type_ids=None):
        output = self.model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
        output = output[0]
        # print(" BERT output: {}".format(output))
        y_pred = self.linear(output)
        return y_pred

In [15]:
model = BERT()
clear_output(wait=True)

In [16]:
torch.cuda.empty_cache()
model.load_state_dict(torch.load('models/bert-location-transformer-epoch-5.pt'))
model.to(device)
clear_output(wait=True)

In [17]:
def accuracy(pred, label, verbose=False):
    pred_words = []
    label_words = []
    for i in range(len(pred)):
        if pred[i] == 1:
            pred_words.append(i)
    for i in range(len(label)):
        if label[i] == 1:
            label_words.append(i)
    if verbose:
        print(pred_words)
        print(label_words)
    return all(word in pred_words for word in label_words)


In [20]:
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
log_steps = 5


print(5*"=", "Testing", 5*"=")

for step, batch in enumerate(test_dataloader):
    b_input_ids = batch[0]
    b_input_mask = batch[1]
    b_labels = batch[2].long()
        
    with torch.no_grad():        
        logits = model(b_input_ids, b_input_mask).squeeze()
        logits = logits.detach().cpu().numpy()
        logits = np.array([np.array(list(map(lambda x: 1 if x > 0 else 0, l))) for l in logits])
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = 0
        for (logit, label) in zip(logits, label_ids):
            tmp_eval_accuracy += accuracy(logit, label)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += batch_size
        
    if step % log_steps == 0 and step:
        print('  STEP {} accuracy: {}'.format(step, eval_accuracy  / nb_eval_steps * 100))
    
    

print("Test accuracy: {0:.2f}%".format(eval_accuracy / nb_eval_steps * 100))

===== Testing =====
tensor([[-13.0026, -13.8098,  11.4634,  ..., -14.7691, -14.7786, -14.8253],
        [ -9.2334,  -8.4910,  10.5309,  ...,  -6.7507, -11.7169, -12.4587],
        [-11.0499,  -8.7605,   6.1448,  ..., -13.9319, -14.1909, -14.2747],
        ...,
        [-12.8546,   0.3021, -13.5244,  ..., -14.8782, -14.8911, -14.8895],
        [-12.6446, -13.5865,  -5.6381,  ..., -14.9370, -14.9371, -14.9426],
        [-14.2321, -14.4980, -14.0887,  ..., -15.0386, -15.0378, -15.0399]])
torch.Size([8, 256])


ZeroDivisionError: division by zero

In [None]:
text = 'London is the capital and largest city of England and the United Kingdom, and is the largest urban area in Greater London.'

text = text.translate(str.maketrans('', '', punctuation)).lower()

encoded_dict = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

input_id = encoded_dict['input_ids'][0]
attention_mask = encoded_dict['attention_mask'][0]

print(input_id)

In [None]:
label = ['city', 'capital', 'area']

In [None]:
decoded_text = tokenizer.decode(input_id)

label = one_hot_background(decoded_text, label)

print(label)

In [None]:
input_id = torch.tensor(np.atleast_2d(input_id))
attention_mask = torch.tensor(np.atleast_2d(attention_mask))

with torch.no_grad():        
    logits = model(input_id, attention_mask).squeeze()
    logits = logits.detach().cpu().numpy()
    logits = np.array(list(map(lambda x: 1 if x > 0 else 0, logits)))
    label_ids = label
    print(accuracy(logits, label, verbose=True))
