In [1]:
import nltk
import pandas as pd
import numpy as np
import torch as torch
from IPython.display import clear_output
from nltk import WordNetLemmatizer
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, random_split

from string import punctuation
import tensorflow as tf

In [2]:
data = pd.read_csv('data/COCO-locations.csv')

In [3]:
un_texts = list(data['cap'])

un_labels = list(data['background'])

print(len(un_texts))
print(len(un_labels))

24774
24774


In [4]:
texts = []
labels = []

for i in range(len(un_labels)):
    if un_labels[i] != "[]":
        texts.append((un_texts[i].translate(str.maketrans('', '', punctuation))).lower())
        labels.append(un_labels[i])
    
labels = list(map(lambda array_str: array_str[1:-1].split(', '), labels))

labels = list(map(lambda array: list(map(lambda str: str[1:-1], array)), labels))

print(len(labels))

23626


In [5]:
max_words = 256

def get_words(text):
    words = nltk.word_tokenize(text.lower())
    no_punkt = [word for word in words if word.isalnum()]
    tags = nltk.pos_tag(no_punkt)
    nounsAndAdjs = [word for word,pos in tags]

    Lem = WordNetLemmatizer()
    words = [Lem.lemmatize(word) for word in nounsAndAdjs]
    return words

for text in texts:
    max_words = max(max_words, len(get_words(text)))

In [6]:
def one_hot_background(words, label_array):
    words_no = words.split(' ')

    Lem = WordNetLemmatizer()
    words = [Lem.lemmatize(word) for word in words_no]
    N = max_words
    one_hot_label = np.zeros(N)
    
    for i in range(len(words)):
        if words[i] in label_array:
            one_hot_label[i] = 1
    
    return np.array(one_hot_label, dtype=np.float64)

In [7]:
import torch
from transformers import BertTokenizer, BertModel

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

max_length = 256

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  a man is in a kitchen making pizzas
Token IDs: tensor([  101,  1037,  2158,  2003,  1999,  1037,  3829,  2437, 10733,  2015,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [9]:
decoded_texts = list(map(lambda id: tokenizer.decode(id), input_ids))

for i  in range(len(labels)):
    label = labels[i]
    labels[i] = one_hot_background(decoded_texts[i], label)

labels = np.array(labels)
print(labels[0])

[0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [10]:
batch_size = 8
N = len(texts)

train_size = int(N * 0.8)
val_size = int(N * 0.1)
test_size = (N - train_size - val_size)

dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))
print(len(dataset))
print(sum([train_size, val_size, test_size]))

23626
23626


In [11]:
train_data, val_data, test_data = random_split(dataset, [train_size, val_size, test_size])

train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=RandomSampler(val_data), batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=RandomSampler(test_data), batch_size=batch_size)

In [12]:
from torch import nn

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True, )
        self.linear = nn.Linear(768, 1)

    def forward(self, x, y, z):
        output = self.model(x, y, z)
        y_pred = self.linear(output['last_hidden_state'])
        return y_pred

In [13]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
# If not...
else:
    device = torch.device("cpu")


In [14]:
torch.cuda.empty_cache()
model = BERT()
model.model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [15]:
import tensorflow as tf

criterion = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters())

In [16]:
epochs = 3
log_steps = 100
sum_loss = 0

model.model.train()
for epoch in range(1, epochs + 1):
    print(5*'=', ' EPOCH {} '.format(epoch), 5 * '=')
    for step, batch in enumerate(train_dataloader):
         b_input_ids = batch[0]
         b_input_mask = batch[1]
         b_labels = batch[2].long().detach()
         
         pred = model(b_input_ids, b_input_mask, b_labels).detach().squeeze()
         optimizer.zero_grad()
         loss = criterion(pred, b_labels.type(torch.float32))
         loss.requires_grad = True
         loss.backward()
         optimizer.step()
         
         sum_loss += loss.item()
         
         if step % log_steps == 0 and step:
             print('  Average loss: {}'.format(sum_loss / log_steps))
             sum_loss = 0


=====  EPOCH 1  =====
  Average loss: 1338.5518005371093
  Average loss: 1324.7773999023439
  Average loss: 1325.7650537109375
  Average loss: 1320.7601574707032
  Average loss: 1325.4566870117187
  Average loss: 1324.1529724121094
  Average loss: 1322.8171350097657
  Average loss: 1323.980791015625
  Average loss: 1327.1594384765624
  Average loss: 1320.0510107421876
  Average loss: 1319.4956457519531
  Average loss: 1320.5223083496094
  Average loss: 1321.6315856933593
  Average loss: 1319.8626940917968
  Average loss: 1325.4214782714844
  Average loss: 1327.6216455078124
  Average loss: 1321.9420178222656
  Average loss: 1320.0827014160157
  Average loss: 1328.9084631347657
  Average loss: 1313.9796435546875
  Average loss: 1319.5895935058593
  Average loss: 1326.7041809082032
  Average loss: 1326.640118408203
=====  EPOCH 2  =====
  Average loss: 2144.668826904297
  Average loss: 1323.5594616699218
  Average loss: 1323.52615234375
  Average loss: 1321.2627551269532
  Average loss: 