In [None]:
import tensorflow as tf
# Ask TensorFlow for the GPU device name and stop the notebook right away when
# it is not the expected first GPU.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not  found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch
# Pick the torch device used by the rest of the notebook: CUDA when PyTorch
# reports it as available, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Install the pinned transformers release; later cells import BertTokenizer,
# BertForSequenceClassification, AdamW and the LR scheduler helper from it.
!pip install transformers==3.1.0

In [None]:
# Both splits are tab-separated files without a header row, so the column names
# are supplied explicitly and shared between the two reads.
column_names = ['sentiment', 'general', 'aspect_term', 'location', 'review']
train = pd.read_csv("./traindata.csv", delimiter='\t', header=None, names=column_names)
test = pd.read_csv("./devdata.csv", delimiter='\t', header=None, names=column_names)
# Preview the first rows of the training split.
train.head()

Unnamed: 0,sentiment,general,aspect_term,location,review
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."


In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the sentiment-string -> integer mapping from the training split only,
# then apply that same fitted mapping to the dev split. Refitting on the dev
# split (the previous behavior) could assign different integers to the same
# sentiment whenever the two splits do not contain the same set of labels.
# `transform` raises ValueError if the dev split holds a label that never
# appears in training, which surfaces the mismatch instead of hiding it.
lab_enc = LabelEncoder()
train['sentiment'] = lab_enc.fit_transform(train['sentiment'])
test['sentiment'] = lab_enc.transform(test['sentiment'])


In [None]:
# Disabled alternative to the LabelEncoder step: an explicit sentiment -> integer
# mapping. It is wrapped in a bare string literal, so none of it executes.
'''stn = {'positive': 1,'neutral':0,'negative': -1}
train['sentiment'] = train['sentiment'].map(stn)
test['sentiment'] = test['sentiment'].map(stn)
train.head()'''

In [None]:
# Build the model input for each split: the review followed by a single space
# and the aspect term the sentiment refers to.
for frame in (train, test):
    frame["text"] = frame["review"] + " " + frame["aspect_term"]
# Preview the dev split with the new column.
test.head()

Unnamed: 0,sentiment,general,aspect_term,location,review,text
0,2,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ...","great food, great wine list, great service in ..."
1,0,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.,I thought this place was totally overrated. place
2,2,FOOD#QUALITY,Fish,0:4,Fish is so very fresh.,Fish is so very fresh. Fish
3,0,SERVICE#GENERAL,manager,19:26,"I showed it to the manager, and he smilingly a...","I showed it to the manager, and he smilingly a..."
4,1,DRINKS#QUALITY,margaritas,63:73,"The food we ordered was excellent, although I ...","The food we ordered was excellent, although I ..."


In [None]:
# Naming convention: the `_t` suffix is the training split, `_v` the validation split.
# Pull the input texts and the encoded labels out of each frame as arrays.
sentences_t, labels_t = train["text"].values, train["sentiment"].values
sentences_v, labels_v = test["text"].values, test["sentiment"].values

In [None]:
from transformers import BertTokenizer
# Load the tokenizer that matches the uncased BERT base checkpoint; inputs are
# lower-cased before tokenization.
print('Loading BERT tokenizer...')
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name, do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# When tokenizing a large batch of sentences, we use tokenizer.encode.

In [None]:
# Encode every sentence of both splits. For each sentence, `tokenizer.encode`
# with add_special_tokens=True will:
#   (1) tokenize the sentence,
#   (2) prepend the `[CLS]` token,
#   (3) append the `[SEP]` token,
#   (4) map the tokens to their vocabulary IDs.

# Training split.
input_ids_t = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in sentences_t
]

# Show the first training sentence next to its list of token IDs.
print('Original: ', sentences_t[0])
print('Token IDs:', input_ids_t[0])

# Validation split, encoded the same way.
input_ids_v = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in sentences_v
]






Original:  short and sweet – seating is great:it's romantic,cozy and private. seating
Token IDs: [101, 2460, 1998, 4086, 1516, 10747, 2003, 2307, 1024, 2009, 1005, 1055, 6298, 1010, 26931, 1998, 2797, 1012, 10747, 102]


In [None]:
# Longest tokenized sequence across BOTH splits. The padding step applies one
# MAX_LEN to training and validation alike and truncates at the end of the
# sequence, which is where the aspect term was appended. Measuring only the
# validation split could therefore hide training sentences that get cut.
print('Max sentence length: ', max(len(sen) for ids in (input_ids_t, input_ids_v) for sen in ids))

Max sentence length:  94


In [None]:
from keras.preprocessing.sequence import pad_sequences
# Bring every token-ID sequence to exactly MAX_LEN entries: shorter ones are
# filled with 0 at the end, longer ones are cut at the end.
MAX_LEN = 100
pad_options = dict(maxlen=MAX_LEN, dtype="long", value=0,
                   truncating="post", padding="post")
input_ids_t = pad_sequences(input_ids_t, **pad_options)
input_ids_v = pad_sequences(input_ids_v, **pad_options)


In [None]:
# Build the attention masks, one per padded sequence:
#   - token ID 0 is padding      -> mask value 0
#   - token ID > 0 is a real token -> mask value 1

# Training split.
attention_masks_t = [
    [int(token_id > 0) for token_id in sequence]
    for sequence in input_ids_t
]

# Validation split.
attention_masks_v = [
    [int(token_id > 0) for token_id in sequence]
    for sequence in input_ids_v
]

In [None]:
# Training / validation aliases.
# Nothing is computed here: the inputs, masks and labels of each split are
# simply bound to the names the following cells use.
train_inputs, train_masks, train_labels = input_ids_t, attention_masks_t, labels_t
validation_inputs, validation_masks, validation_labels = input_ids_v, attention_masks_v, labels_v

In [None]:
import torch
# Convert the inputs, masks and labels of each split into torch tensors,
# rebinding the same names.
train_inputs, train_masks, train_labels = [
    torch.tensor(data) for data in (train_inputs, train_masks, train_labels)
]
validation_inputs, validation_masks, validation_labels = [
    torch.tensor(data) for data in (validation_inputs, validation_masks, validation_labels)
]

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The BERT authors recommend a batch size of 16 or 32.
batch_size = 32

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order (much like shuffle=True);
# validation is read sequentially.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)
# The whole validation set is served as one single batch.
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=len(validation_data),
)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# Load BertForSequenceClassification: the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # One output per encoded sentiment class (the labels take
                    # the values 0, 1 and 2 after label encoding).
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Move the model to the device selected at the top of the notebook. Using
# `device` instead of an unconditional `.cuda()` keeps the CPU fallback chosen
# there working, and matches the `.to(device)` calls used for the batches.
model.to(device)

In [None]:
from transformers import get_linear_schedule_with_warmup

# AdamW over all model parameters. The epsilon parameter eps = 1e-8 is "a very
# small number to prevent any division by zero in the implementation".
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Number of training epochs (authors recommend between 2 and 4).
epochs = 4

# One scheduler step is taken per training batch, so the total number of steps
# is the number of batches times the number of epochs.
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule with no warmup steps. This step could be left out,
# but training is better with it.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the argmax along axis 1.
        labels: array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = (predicted_classes == true_classes).sum()
    return n_correct / len(true_classes)

In [None]:
import time
import datetime
def format_time(elapsed):
    '''
    Render a duration given in seconds as a string such as "0:00:21".

    The value is rounded to a whole number of seconds first, then formatted
    with the string form of `datetime.timedelta` (h:mm:ss, with a leading
    "N day(s), " part once the duration reaches 24 hours).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import random
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch: one full training pass, then one validation pass.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model in training mode (the counterpart of `model.eval()` below).
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted as a string.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader and copy each tensor
        # to the selected device using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients left over from the previous step. PyTorch adds
        # new gradients to the existing ones on every `backward()` call, so
        # without this reset each update would use the sum of the gradients of
        # all batches seen so far instead of the current batch only.
        optimizer.zero_grad()

        # Forward pass. Because `labels` is passed, the model also computes
        # the loss for this batch.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # The call to `model` always returns a tuple, so we need to pull the
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the loss as a plain Python number for the epoch average.
        total_loss += loss.item()

        # Backward pass: compute the gradients for this batch.
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule
        # (one scheduler step per batch, matching `total_steps`).
        optimizer.step()
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    # Keep it for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch.
    # NOTE: the validation dataloader is configured with the whole validation
    # set as a single batch, so this loop runs exactly once and could be
    # omitted; it is kept so the code also works with smaller batches.
    for batch in validation_dataloader:

        # Add batch to the selected device.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader.
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory
        # and speeding up validation.
        with torch.no_grad():
            # No `labels` are passed here, so the first element of the output
            # is the logits rather than a loss.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of validation sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches.
        nb_eval_steps += 1

    # Report the final accuracy for this validation run (mean over batches).
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")



Training...
  Batch    40  of     47.    Elapsed: 0:00:21.

  Average training loss: 0.67
  Training epcoh took: 0:00:25

Running Validation...
  Accuracy: 0.85
  Validation took: 0:00:03

Training...
  Batch    40  of     47.    Elapsed: 0:00:22.

  Average training loss: 0.40
  Training epcoh took: 0:00:26

Running Validation...
  Accuracy: 0.86
  Validation took: 0:00:03

Training...
  Batch    40  of     47.    Elapsed: 0:00:22.

  Average training loss: 0.30
  Training epcoh took: 0:00:26

Running Validation...
  Accuracy: 0.86
  Validation took: 0:00:02

Training...
  Batch    40  of     47.    Elapsed: 0:00:21.

  Average training loss: 0.26
  Training epcoh took: 0:00:25

Running Validation...
  Accuracy: 0.86
  Validation took: 0:00:02

Training complete!
