In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
!pip install pytorch-pretrained-bert pytorch-nlp



In [3]:
pip install transformers



In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertModel, BertForSequenceClassification, AdamW
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [6]:
df = pd.read_csv('data/traindata.csv', sep='\t', header=0, names=['polarity', 'aspect', 'target', 'position', 'sentence'])
df.head(3)

df_dev = pd.read_csv('data/devdata.csv', sep='\t', header=0, names=['polarity', 'aspect', 'target', 'position', 'sentence'])
df

Unnamed: 0,polarity,aspect,target,position,sentence
0,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
1,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
2,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
3,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."
4,positive,FOOD#QUALITY,tuna,4:8,The tuna and wasabe potatoes are excellent.
...,...,...,...,...,...
1497,positive,DRINKS#QUALITY,expresso,29:37,One of us actually liked the expresso - that's...
1498,negative,SERVICE#GENERAL,waitress,20:28,The hostess and the waitress were incredibly r...
1499,positive,RESTAURANT#PRICES,place,12:17,this little place has a cute interior decor an...
1500,positive,RESTAURANT#GENERAL,restaurant,30:40,Nice Family owned traditional restaurant.


In [0]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased', 
    do_lower_case=True)

In [8]:
tokenizer.add_tokens(list(df.aspect.str.lower().unique()))

12

In [9]:
list(df.aspect.str.lower().unique())

['ambience#general',
 'food#quality',
 'service#general',
 'food#style_options',
 'drinks#quality',
 'restaurant#miscellaneous',
 'restaurant#general',
 'drinks#prices',
 'food#prices',
 'location#general',
 'drinks#style_options',
 'restaurant#prices']

In [10]:
def create_first_input_ids(df):
  first_sentences = df.sentence.values
  first_tokens = [tokenizer.tokenize(sentence) for sentence in first_sentences]
  first_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in first_tokens]
  print('first_sentences: ', first_sentences[3], '\nfirst_input_ids: ', first_input_ids[3])
  return first_input_ids

first_input_ids = create_first_input_ids(df)
first_input_ids_dev = create_first_input_ids(df_dev)


first_sentences:  The menu looked great, and the waiter was very nice, but when the food came, it was average. 
first_input_ids:  [1996, 12183, 2246, 2307, 1010, 1998, 1996, 15610, 2001, 2200, 3835, 1010, 2021, 2043, 1996, 2833, 2234, 1010, 2009, 2001, 2779, 1012]
first_sentences:  The food we ordered was excellent, although I wouldn't say the margaritas were anything to write home about. 
first_input_ids:  [1996, 2833, 2057, 3641, 2001, 6581, 1010, 2348, 1045, 2876, 1005, 1056, 2360, 1996, 24570, 2015, 2020, 2505, 2000, 4339, 2188, 2055, 1012]


In [11]:
def create_auxiliary_input_ids(df):
  auxiliary_sentences = ["" + aspect + " - " + target 
                         for aspect,target 
                         in list(zip(df.aspect.str.lower().values, df.target.str.lower().values))]
  # auxiliary_sentences = ['neutral' for aspect,target 
  #                        in list(zip(df.aspect.values, df.target.values))]
  auxiliary_tokens = [tokenizer.tokenize(sentence) for sentence in auxiliary_sentences]
  auxiliary_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in auxiliary_tokens]
  print('auxiliary_sentence: ', auxiliary_sentences[0], '\nauxiliary_tokens: ', auxiliary_tokens[0], '\nauxiliary_input_ids: ', auxiliary_input_ids[0])  
  return auxiliary_input_ids

auxiliary_input_ids = create_auxiliary_input_ids(df)
auxiliary_input_ids_dev = create_auxiliary_input_ids(df_dev)

auxiliary_sentence:  ambience#general - trattoria 
auxiliary_tokens:  ['ambience#general', '-', 'tr', '##att', '##oria'] 
auxiliary_input_ids:  [30522, 1011, 19817, 19321, 11069]
auxiliary_sentence:  restaurant#general - place 
auxiliary_tokens:  ['restaurant#general', '-', 'place'] 
auxiliary_input_ids:  [30528, 1011, 2173]


In [12]:
auxiliary_input_ids

[[30522, 1011, 19817, 19321, 11069],
 [30523, 1011, 2833],
 [30524, 1011, 3095],
 [30525, 1011, 12183],
 [30523, 1011, 24799],
 [30524, 1011, 3095],
 [30524, 1011, 2326],
 [30525, 1011, 22861, 4160, 10335],
 [30522, 1011, 2173],
 [30523, 1011, 10439, 20624, 6290, 1997, 9724, 2015],
 [30523, 1011, 9440],
 [30526, 1011, 8974],
 [30523, 1011, 9372],
 [30523, 1011, 5431, 8081, 2063, 12183],
 [30523, 1011, 8040, 8095, 11923],
 [30522, 1011, 2572, 11283, 5897],
 [30522, 1011, 2397, 2305, 7224],
 [30523, 1011, 28253, 17153, 2618],
 [30523, 1011, 25482, 3059, 8808],
 [30523, 1011, 5371, 2102],
 [30523, 1011, 10861, 3736, 4305, 4571],
 [30527, 1011, 2173],
 [30523, 1011, 2833],
 [30524, 1011, 2326],
 [30523, 1011, 4524, 9050],
 [30528, 1011, 4825],
 [30523, 1011, 2413, 22201],
 [30523, 1011, 5785, 10447],
 [30522, 1011, 2572, 11283, 5897],
 [30528, 1011, 2173],
 [30523, 1011, 2833],
 [30524, 1011, 15610],
 [30524, 1011, 17917, 2278],
 [30523, 1011, 23621],
 [30526, 1011, 4392],
 [30523, 1011, 9

In [0]:
encoder = LabelEncoder()
labels = encoder.fit_transform(df.polarity.values)
labels_dev = encoder.transform(df_dev.polarity.values)

In [14]:
len(max(first_input_ids, key=len)) + len(max(auxiliary_input_ids, key=len))

110

In [15]:
len(max(first_input_ids_dev, key=len)) + len(max(auxiliary_input_ids_dev, key=len))

105

In [0]:
max_length = 128

In [17]:
def prepare_input_ids(first_input_ids, auxiliary_input_ids):
  input_ids_prepared = [tokenizer.
              prepare_for_model(
                  input_ids_0, 
                  input_ids_1,
                  max_length=max_length,
                  truncation_strategy='only_first', 
                  pad_to_max_length=True, 
                  return_token_type_ids=True,
                  return_attention_mask=True) for 
              input_ids_0, input_ids_1 in 
              list(zip(first_input_ids, auxiliary_input_ids))]

  df_input_ids_prepared = pd.DataFrame(input_ids_prepared)

  input_ids = list(df_input_ids_prepared.input_ids.values)
  token_type_ids = list(df_input_ids_prepared.token_type_ids.values)
  attention_masks = list(df_input_ids_prepared.attention_mask.values)

  print('input_ids',input_ids[0])
  print('token_type_ids',token_type_ids[0])
  print('attention_mask',attention_masks[0])

  return input_ids, token_type_ids, attention_masks

input_ids, token_type_ids, attention_masks = prepare_input_ids(
    first_input_ids, auxiliary_input_ids)

input_ids_dev, token_type_ids_dev, attention_masks_dev = prepare_input_ids(
    first_input_ids_dev, auxiliary_input_ids_dev)

input_ids [101, 2023, 24209, 22325, 1998, 6298, 19817, 19321, 11069, 2003, 2012, 1996, 2327, 1997, 2026, 7128, 4825, 2862, 1012, 102, 30522, 1011, 19817, 19321, 11069, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [0]:
# Convert all of our data into torch tensors, the required datatype for our model

def convert_to_tensors(input_ids, token_type_ids, attention_masks, labels):
  input_ids = torch.tensor(input_ids)
  labels = torch.tensor(labels)
  attention_masks = torch.tensor(attention_masks)
  token_type_ids = torch.tensor(token_type_ids)
  return input_ids, token_type_ids, attention_masks, labels

input_ids, token_type_ids, attention_masks, labels = convert_to_tensors(
    input_ids, token_type_ids, attention_masks, labels)

input_ids_dev, token_type_ids_dev, attention_masks_dev, labels_dev = convert_to_tensors(
    input_ids_dev, token_type_ids_dev, attention_masks_dev, labels_dev)

In [0]:
def create_data_loader(input_ids, token_type_ids, attention_masks, labels, batch_size=32):
  data = TensorDataset(input_ids, token_type_ids, attention_masks, labels)
  sampler = RandomSampler(data)
  dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
  return dataloader

dataloader = create_data_loader(
    input_ids, token_type_ids, attention_masks, labels)

dataloader_dev = create_data_loader(
    input_ids_dev, token_type_ids_dev, attention_masks_dev, labels_dev)

In [20]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.resize_token_embeddings(len(tokenizer))
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30534, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]


In [0]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(params = model.parameters(), lr=1.5e-5, weight_decay=0.00)

In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [24]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_type_ids, b_attention_masks, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss, logits = model(b_input_ids, token_type_ids = b_token_type_ids, attention_mask = b_attention_masks, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in dataloader_dev:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_type_ids, b_attention_masks, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      (loss, logits) = model(b_input_ids, token_type_ids = b_token_type_ids, attention_mask = b_attention_masks, labels=b_labels) 
  
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Train loss: 0.77819147769441


Epoch:  25%|██▌       | 1/4 [00:33<01:41, 33.81s/it]

Validation Accuracy: 0.7647192028985508
Train loss: 0.453491322537686


Epoch:  50%|█████     | 2/4 [01:07<01:07, 33.64s/it]

Validation Accuracy: 0.8370697463768115
Train loss: 0.31842512050841715


Epoch:  75%|███████▌  | 3/4 [01:40<00:33, 33.45s/it]

Validation Accuracy: 0.8464673913043478
Train loss: 0.2481942484353451


Epoch: 100%|██████████| 4/4 [02:13<00:00, 33.37s/it]

Validation Accuracy: 0.8516757246376812



