In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
!pip install pytorch-pretrained-bert pytorch-nlp



In [3]:
pip install transformers



In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

In [6]:
df = pd.read_csv('data/traindata.csv', sep='\t', header=None, names=['polarity', 'aspect', 'target', 'position', 'sentence'])
df.head(3)

Unnamed: 0,polarity,aspect,target,position,sentence
0,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
1,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
2,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Create sentence and label lists
first_sentences = df.sentence.values
first_tokens = [tokenizer.tokenize(sentence) for sentence in first_sentences]
first_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in first_tokens]

print('first_sentences: ', first_sentences[3], '\nfirst_input_ids: ', first_input_ids[3])

first_sentences:  The menu looked great, and the waiter was very nice, but when the food came, it was average. 
first_input_ids:  [1996, 12183, 2246, 2307, 1010, 1998, 1996, 15610, 2001, 2200, 3835, 1010, 2021, 2043, 1996, 2833, 2234, 1010, 2009, 2001, 2779, 1012]


In [8]:
auxiliary_sentences = ["" + aspect + " - " + target 
                       for aspect,target 
                       in list(zip(df.aspect.values, df.target.values))]
auxiliary_tokens = [tokenizer.tokenize(sentence) for sentence in auxiliary_sentences]
auxiliary_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in auxiliary_tokens]

print('auxiliary_sentence: ', auxiliary_sentences[3], '\nauxiliary_input_ids: ', auxiliary_input_ids[3])

auxiliary_sentence:  FOOD#STYLE_OPTIONS - menu 
auxiliary_input_ids:  [2833, 1001, 2806, 1035, 7047, 1011, 12183]


In [0]:
encoder = LabelEncoder()
labels = encoder.fit_transform(df.polarity.values)

In [10]:
len(max(first_input_ids, key=len)) + len(max(auxiliary_input_ids, key=len))

112

In [0]:
max_length = 128

In [12]:
input_ids_prepared = [tokenizer.
             prepare_for_model(
                 input_ids_0, 
                 input_ids_1,
                 max_length=max_length,
                 truncation_strategy='only_first', 
                 pad_to_max_length=True, 
                 return_token_type_ids=True,
                 return_attention_mask=True) for 
             input_ids_0, input_ids_1 in 
             list(zip(first_input_ids, auxiliary_input_ids))]

df_input_ids_prepared = pd.DataFrame(input_ids_prepared)

input_ids = df_input_ids_prepared.input_ids.values
token_type_ids = df_input_ids_prepared.token_type_ids.values
attention_masks = df_input_ids_prepared.attention_mask.values

print('input_ids',input_ids[1])
print('token_type_ids',token_type_ids[1])
print('attention_mask',attention_masks[1])

input_ids [101, 1996, 2031, 2058, 2531, 2367, 18007, 2000, 3749, 16215, 3771, 4113, 2061, 2008, 2081, 2026, 3129, 2200, 3407, 1998, 1996, 2833, 2001, 12090, 1010, 2065, 1045, 2442, 16755, 1037, 9841, 2009, 2442, 2022, 1996, 16405, 2213, 4939, 17153, 9834, 5498, 1012, 102, 2833, 1001, 3737, 1011, 2833, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [0]:
# Use train_test_split to split our data into train and validation sets for training

data = train_test_split(input_ids, labels, attention_masks,token_type_ids, random_state=2018, test_size=0.1)

train_inputs, validation_inputs = list(data[0]), list(data[1])
train_labels, validation_labels = list(data[2]), list(data[3])
train_attention_masks, validation_attention_masks = list(data[4]), list(data[5])
train_token_type_ids, validation_token_type_ids = list(data[6]), list(data[7])

In [0]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_attention_masks = torch.tensor(train_attention_masks)
validation_attention_masks = torch.tensor(validation_attention_masks)

train_token_type_ids = torch.tensor(train_token_type_ids)
validation_token_type_ids = torch.tensor(validation_token_type_ids)

In [0]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_attention_masks, train_token_type_ids, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_attention_masks, validation_token_type_ids, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [16]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [0]:

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]



In [18]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [20]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_attention_mask, b_input_token_type_ids, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    print(type(b_input_ids))
    loss = model(b_input_ids, token_type_ids = b_input_token_type_ids, attention_mask = b_input_attention_mask, labels=b_labels)
    print(loss)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_attention_mask, b_input_token_type_ids, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids = b_input_token_type_ids, attention_mask = b_input_attention_mask)  

  
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

<class 'torch.Tensor'>
tensor(1.0820, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.9732, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.9100, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.9802, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.9221, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6940, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.8086, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.7957, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6614, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.8888, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Te

Epoch:  25%|██▌       | 1/4 [00:24<01:14, 24.70s/it]

Validation Accuracy: 0.6589285714285714
<class 'torch.Tensor'>
tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.5998, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.7826, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6933, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.8805, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6240, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6905, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(1.1318, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.6408, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.5788, device='cuda:0', gra

Epoch:  50%|█████     | 2/4 [00:49<00:49, 24.70s/it]

Validation Accuracy: 0.6651785714285714
<class 'torch.Tensor'>
tensor(1.1649, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2900, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.1258, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.1901, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.4094, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.1214, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.3824, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2963, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2497, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.7116, device='cuda:0', gra

Epoch:  75%|███████▌  | 3/4 [01:14<00:24, 24.72s/it]

Validation Accuracy: 0.84375
<class 'torch.Tensor'>
tensor(0.3038, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2592, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.3255, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.0633, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2979, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2154, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2972, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2592, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
<class 'torch.Tensor'>
tensor(0.2851, device='cuda:0', grad_fn=<NllLo

Epoch: 100%|██████████| 4/4 [01:38<00:00, 24.73s/it]

Validation Accuracy: 0.86875



