In [None]:
import nltk
import pandas as pd
import numpy as np
import torch as torch
from IPython.display import clear_output
import matplotlib.pyplot as plt
from nltk import WordNetLemmatizer
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, random_split

from string import punctuation
import tensorflow as tf

In [None]:
# Load the table that holds the caption ('cap') and background-label
# ('background') columns read in the following cell. The path is relative to
# the notebook's working directory.
data = pd.read_csv(
    'data/COCO-locations.csv',
)

In [None]:
# Pull the raw ("un-filtered") caption and background columns out of the frame
# as plain Python lists; both come from the same frame, so they stay aligned
# index-by-index.
un_texts, un_labels = (list(data[column]) for column in ('cap', 'background'))

# Sanity check: the two printed lengths should be identical.
for column_values in (un_texts, un_labels):
    print(len(column_values))

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of once per caption.
strip_punctuation = str.maketrans('', '', punctuation)

# Keep only the rows whose background annotation is not the empty list "[]".
kept_rows = [
    (raw_text, raw_label)
    for raw_text, raw_label in zip(un_texts, un_labels)
    if raw_label != "[]"
]

# Captions: punctuation removed, then lower-cased.
texts = [raw_text.translate(strip_punctuation).lower() for raw_text, _ in kept_rows]

# Labels: the stored string is parsed by hand - drop the outer brackets, split
# on ', ', then drop the first and last character of every item.
# NOTE(review): this assumes each cell looks like "['beach', 'sky']" (items
# wrapped in a single quote character and separated by ', ') - confirm against
# the CSV.
labels = [
    [item[1:-1] for item in raw_label[1:-1].split(', ')]
    for _, raw_label in kept_rows
]

print(len(labels))

In [None]:
# Lower bound for the label-vector length; raised below if any caption
# contains more words than this.
max_words = 256

def get_words(text):
    """Return the lemmatized alphanumeric word tokens of ``text``.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric (punctuation and the like) are dropped,
    and every remaining token is lemmatized with ``WordNetLemmatizer``.

    The previous version also ran ``nltk.pos_tag`` over the tokens but kept
    every word regardless of its tag, so the tagging had no effect on the
    result and has been removed.

    NOTE(review): ``word_tokenize`` and ``WordNetLemmatizer`` need NLTK data
    packages; no ``nltk.download`` call is visible in this notebook, so
    confirm they are installed in the environment.

    Args:
        text: Caption string.

    Returns:
        List of lemmatized word strings, in their original order.
    """
    tokens = nltk.word_tokenize(text.lower())
    alnum_tokens = [token for token in tokens if token.isalnum()]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in alnum_tokens]

# Grow max_words to the longest caption (measured in words) if it exceeds the
# initial lower bound.
for text in texts:
    max_words = max(max_words, len(get_words(text)))

In [None]:
def one_hot_background(words, label_array):
    """Build a fixed-length 0/1 vector marking which words are background labels.

    ``words`` is split on single spaces and every piece is lemmatized with
    ``WordNetLemmatizer``. Position ``i`` of the result is 1 when the i-th
    lemmatized piece occurs in ``label_array`` and 0 otherwise.

    The vector length is the module-level ``max_words``. Pieces beyond that
    length are ignored; the previous version indexed the vector without a
    bound and raised ``IndexError`` for such inputs.

    NOTE(review): the caller feeds decoded tokenizer output, while the model
    emits one logit per token position - confirm that ``max_words`` equals the
    tokenizer's ``max_length`` so labels and logits have the same width.

    Args:
        words: Space-separated string of words.
        label_array: Collection of label strings to match against.

    Returns:
        ``np.ndarray`` of shape ``(max_words,)`` and dtype ``float64``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(piece) for piece in words.split(' ')]

    vector_length = max_words
    one_hot_label = np.zeros(vector_length, dtype=np.float64)

    # Slice first so that an over-long input cannot index past the vector.
    for position, lemma in enumerate(lemmas[:vector_length]):
        if lemma in label_array:
            one_hot_label[position] = 1

    return one_hot_label

In [None]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Every caption is padded / truncated to exactly this many token ids.
max_length = 256

# Encode all captions in one call. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`, and `truncation=True` makes the
# truncation to `max_length` explicit instead of relying on the library's
# fallback behaviour.
encoded = tokenizer(
    texts,                          # Sentences to encode.
    add_special_tokens=True,        # Add '[CLS]' and '[SEP]'.
    max_length=max_length,          # Pad & truncate all sentences.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,     # Construct attn. masks.
    return_tensors='pt',            # Return pytorch tensors.
)

# One row per caption, `max_length` columns each.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

In [None]:
# Turn every row of token ids back into a string (special tokens included).
decoded_texts = [tokenizer.decode(token_ids) for token_ids in input_ids]

# Replace each list of background words by its fixed-length 0/1 vector.
# NOTE(review): one_hot_background indexes by the position of a word in the
# decoded string; if the tokenizer splits a word into several pieces, word
# positions may no longer match token positions - confirm the alignment.
# NOTE(review): this rebinds `labels`, so re-running the cell operates on the
# already-converted array rather than on the original word lists.
labels = np.array([
    one_hot_background(decoded_text, background_words)
    for decoded_text, background_words in zip(decoded_texts, labels)
])
print(labels[0])

In [None]:
batch_size = 8
N = len(texts)

# 80 % / 10 % / remainder split; the test set absorbs whatever the integer
# truncation of the first two sizes leaves over.
train_size, val_size = int(N * 0.8), int(N * 0.1)
test_size = N - (train_size + val_size)

# Each dataset item is (input ids, attention mask, label vector).
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))

# Sanity check: both printed numbers should be equal.
print(len(dataset))
print(train_size + val_size + test_size)

In [None]:
# Seed the split so that the train / validation / test membership is the same
# on every run; without it the held-out sets change each time the notebook is
# executed.
SPLIT_SEED = 42
train_data, val_data, test_data = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are drawn in random order; validation and test batches are
# read sequentially because shuffling adds nothing at evaluation time and
# makes runs harder to compare.
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
val_dataloader = DataLoader(val_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [None]:
from torch import nn

class BERT(nn.Module):
    """Pre-trained BERT encoder followed by a one-unit linear layer per token.

    The linear layer is applied to the encoder's ``last_hidden_state``, so the
    module produces one raw score (logit) for every token position.
    """

    def __init__(self):
        super().__init__()
        # Only `last_hidden_state` is consumed in forward(); the
        # `output_hidden_states` flag is kept as it was.
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        # Read the encoder width from its config rather than hard-coding 768.
        self.linear = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, x, y, z=None):
        """Score every token position.

        Args:
            x: Token ids, passed to the encoder as ``input_ids``.
            y: Attention mask, passed as ``attention_mask``.
            z: Optional segment ids, passed as ``token_type_ids``. This is the
                slot the third positional argument has always landed in; it
                must NOT be the target labels. ``None`` lets the encoder use
                its default.

        Returns:
            Tensor with a trailing dimension of size 1 holding one logit per
            token position.
        """
        # Keyword arguments make it explicit which encoder input each value
        # feeds, instead of relying on BertModel's positional order.
        output = self.model(input_ids=x, attention_mask=y, token_type_ids=z)
        return self.linear(output['last_hidden_state'])

In [None]:
# Computation is pinned to the CPU. To run on a GPU, select
# torch.device("cuda") here when torch.cuda.is_available() is true - and make
# sure the complete model and every batch are moved to `device` before doing
# so, otherwise tensors end up on different devices.
device = torch.device('cpu')



In [None]:
# Release cached GPU memory left over from earlier runs.
torch.cuda.empty_cache()
model = BERT()
# Move the whole wrapper - encoder AND linear head - to the target device.
# Moving only `model.model` would leave the linear layer behind.
model.to(device)

In [None]:
import tensorflow as tf

# Binary cross-entropy on raw logits, summed (not averaged) over all elements
# of a batch.
criterion = nn.BCEWithLogitsLoss(
    reduction='sum',
)

# Adam over every parameter of the wrapper, with the optimizer's default
# settings.
# NOTE(review): the default learning rate is much larger than the values
# commonly used to fine-tune BERT - consider passing an explicit `lr`.
optimizer = torch.optim.Adam(
    model.parameters(),
)

In [None]:
epochs = 3
log_steps = 100
sum_loss = 0

# Put the whole wrapper (encoder and linear head) into training mode.
model.train()
for epoch in range(1, epochs + 1):
    print(5*'=', ' EPOCH {} '.format(epoch), 5 * '=')
    # Start every epoch with a clean running sum so that a partial window
    # from the previous epoch does not leak into the first report.
    sum_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device, dtype=torch.float32)

        optimizer.zero_grad()

        # The model's third argument is forwarded to the encoder as segment
        # ids. Passing the labels there (as before) leaks the targets into
        # the input; single-sentence inputs use all-zero segment ids.
        b_token_type_ids = torch.zeros_like(b_input_ids)

        # Keep the prediction attached to the graph: detaching it (and then
        # forcing `loss.requires_grad = True`) lets backward() run but never
        # delivers a gradient to any model parameter.
        # squeeze(-1) drops only the trailing size-1 dimension, so a final
        # batch with a single sample keeps its batch dimension.
        logits = model(b_input_ids, b_input_mask, b_token_type_ids).squeeze(-1)

        loss = criterion(logits, b_labels)
        loss.backward()
        optimizer.step()

        sum_loss += loss.item()

        # Report after every full window of `log_steps` steps so the divisor
        # matches the number of accumulated losses.
        if (step + 1) % log_steps == 0:
            print('  Average loss: {}'.format(sum_loss / log_steps))
            sum_loss = 0
