In [24]:
import os
import nltk
import pandas as pd
import numpy as np
import torch.nn.functional as F
from IPython.display import clear_output
from nltk import WordNetLemmatizer
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, random_split

from string import punctuation
import tensorflow as tf

In [25]:
data = pd.read_csv('data/COCO-locations.csv')

In [26]:
un_texts = list(data['cap'])

un_labels = list(data['background'])

print(len(un_texts))
print(len(un_labels))

24113
24113


In [27]:
texts = []
labels = []

for i in range(len(un_labels)):
    if un_labels[i] != "[]":
        texts.append((un_texts[i].translate(str.maketrans('', '', punctuation))).lower())
        labels.append(un_labels[i])
    
labels = list(map(lambda array_str: array_str[1:-1].split(', '), labels))

labels = list(map(lambda array: list(map(lambda str: str[1:-1], array)), labels))

print(len(labels))

24113


In [28]:
max_words = 256

def get_words(text):
    words = nltk.word_tokenize(text.lower())
    no_punkt = [word for word in words if word.isalnum()]
    tags = nltk.pos_tag(no_punkt)
    nounsAndAdjs = [word for word,pos in tags]

    Lem = WordNetLemmatizer()
    words = [Lem.lemmatize(word) for word in nounsAndAdjs]
    return words

for text in texts:
    max_words = max(max_words, len(get_words(text)))

In [29]:
def one_hot_background(words, label_array):
    words_no = words.split(' ')

    Lem = WordNetLemmatizer()
    words = [Lem.lemmatize(word) for word in words_no]
    N = max_words
    one_hot_label = np.zeros(N)
    
    for i in range(len(words)):
        if words[i] in label_array:
            one_hot_label[i] = 1
    
    return np.array(one_hot_label, dtype=np.float64)

In [30]:
import torch
from transformers import BertTokenizer, BertModel, BertForPreTraining, BertForSequenceClassification

In [31]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

max_length = 256

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  a black and white image of a city street in the 1950s or early 1960s
Token IDs: tensor([ 101, 1037, 2304, 1998, 2317, 3746, 1997, 1037, 2103, 2395, 1999, 1996,
        4856, 2030, 2220, 4120,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    

In [32]:
decoded_texts = list(map(lambda id: tokenizer.decode(id), input_ids))

for i  in range(len(labels)):
    label = labels[i]
    labels[i] = one_hot_background(decoded_texts[i], label)

labels = np.array(labels)
print(labels[0])

[0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [33]:
batch_size = 8
N = len(texts)

train_size = int(N * 0.8)
val_size = int(N * 0.1)
test_size = (N - train_size - val_size)

dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))
print(len(dataset))
print(sum([train_size, val_size, test_size]))

24113
24113


In [34]:
train_data, val_data, test_data = random_split(dataset, [train_size, val_size, test_size])

train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=RandomSampler(val_data), batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=RandomSampler(test_data), batch_size=batch_size)

In [35]:
from torch import nn

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True, )
        self.linear = nn.Linear(768, 1)

    def forward(self, input_ids, input_mask=None, token_type_ids=None):
        output = self.model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
        output = output[0]
        # print(" BERT output: {}".format(output))
        y_pred = self.linear(output)
        return y_pred

In [36]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print("cuda selected")
# If not...
else:
    device = torch.device("cpu")
    print("cpu selected")
    
device = torch.device("cpu")

cuda selected


In [37]:
torch.cuda.empty_cache()
model = BERT()
clear_output(wait=True)

In [38]:
model.load_state_dict(torch.load('models/bert-location-transformer-epoch-2.pkl'))
model.to(device)
clear_output(wait=True)

In [39]:
# criterion = torch.nn.functional.binary_cross_entropy_with_logits(reduction = 'sum')
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5, eps = 1e-8)

In [40]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


The BERT model has 201 different named parameters.

==== Embedding Layer ====

model.embeddings.word_embeddings.weight                 (30522, 768)
model.embeddings.position_embeddings.weight               (512, 768)
model.embeddings.token_type_embeddings.weight               (2, 768)
model.embeddings.LayerNorm.weight                             (768,)
model.embeddings.LayerNorm.bias                               (768,)

==== First Transformer ====

model.encoder.layer.0.attention.self.query.weight         (768, 768)
model.encoder.layer.0.attention.self.query.bias               (768,)
model.encoder.layer.0.attention.self.key.weight           (768, 768)
model.encoder.layer.0.attention.self.key.bias                 (768,)
model.encoder.layer.0.attention.self.value.weight         (768, 768)
model.encoder.layer.0.attention.self.value.bias               (768,)
model.encoder.layer.0.attention.output.dense.weight       (768, 768)
model.encoder.layer.0.attention.output.dense.bias             (

In [41]:
epochs = 3
log_steps = 100
sum_loss = 0

model.train()
for epoch in range(3, epochs + 1):
    print(5*'=', ' EPOCH {} '.format(epoch), 5 * '=')
    for step, batch in enumerate(train_dataloader):
#       print('STEP: {}'.format(step))
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2].long()
        pred = model(b_input_ids, b_input_mask).squeeze()
        optimizer.zero_grad()
        loss = F.binary_cross_entropy_with_logits(pred, b_labels.type(torch.float32), reduction = 'sum')
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # if step % 10 == 0:
        #     print(' LOSS: {}'.format(loss.item()))

        sum_loss += loss.item()
        
        if step % log_steps == 0 and step:
            print('  Average loss: {}'.format(sum_loss / log_steps))
            sum_loss = 0

    torch.save(model.state_dict(), os.path.join('./models', 'bert-location-transformer-epoch-%d.pkl' % epoch))


=====  EPOCH 3  =====
  Average loss: 5.674999035596848
  Average loss: 5.657656173408031
  Average loss: 5.080606339871883
  Average loss: 5.050971520543098
  Average loss: 5.13747044056654
  Average loss: 5.042037989571691
  Average loss: 5.623338522315025
  Average loss: 5.188880655616522
  Average loss: 5.24067159563303
  Average loss: 5.916424127817154
  Average loss: 4.844217336028814
  Average loss: 5.367175480909646
  Average loss: 5.351773069426417
  Average loss: 5.3929282353818415
  Average loss: 4.971278113946319
  Average loss: 5.48309552192688
  Average loss: 5.081045084744692
  Average loss: 4.575990427434444
  Average loss: 5.773209786042571
  Average loss: 5.24028655320406
  Average loss: 5.246273134201765
  Average loss: 4.692979204505682
  Average loss: 5.298817750513553
  Average loss: 5.301729244142771


In [42]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [53]:
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

print(5*"=", "Validation", 5*"=")

for step, batch in enumerate(val_dataloader):
    # print('STEP: {}'.format(step))
    b_input_ids = batch[0]
    b_input_mask = batch[1]
    b_labels = batch[2].long()
        
    with torch.no_grad():        
        logits = model(b_input_ids, b_input_mask).squeeze()
        logits = logits.detach().cpu().numpy()
        logits = np.array([np.array(list(map(lambda x: 1 if x > 0 else 0, l))) for l in logits])
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = 0
        for (logit, label) in zip(logits, label_ids):
            tmp_eval_accuracy += int((logit == label).all())
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += batch_size

print("Validation accuracy: {0:.2f}%".format(eval_accuracy / nb_eval_steps * 100))

===== Validation =====
Validation accuracy: 84.15%


In [54]:
torch.save(model, 'models/bert-location-transformer-epoch-last.pkl')