In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

We will use the GPU: NVIDIA GeForce RTX 3060 Ti


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


In [7]:
df = pd.read_csv("./tweets_BERT.csv",  encoding='latin-1', names=['target', 'ids', 'date', 'flag', 'user', 'text'])
print('Number of dataset sentences: {:,}\n'.format(df.shape[0]))
print(df['target'].value_counts())
df.sample(10)

Number of dataset sentences: 1,600,000

0    800000
4    800000
Name: target, dtype: int64


Unnamed: 0,target,ids,date,flag,user,text
279393,0,1991775067,Mon Jun 01 07:38:32 PDT 2009,NO_QUERY,nadiaparry,I have a massive headache Boohoo me!
1555186,4,2185059028,Mon Jun 15 16:49:25 PDT 2009,NO_QUERY,iammerveilleuse,@nacturnal thankyou ;)
248884,0,1982820309,Sun May 31 11:58:07 PDT 2009,NO_QUERY,mkoby,I think somethings wrong with the ac compresso...
889418,4,1687627980,Sun May 03 09:11:22 PDT 2009,NO_QUERY,Yamakizi,@pratama nope I'm prefer the 200... I'm gonna...
716679,0,2259769838,Sat Jun 20 18:07:59 PDT 2009,NO_QUERY,ThePheNom24,907 and no ones here yet
1553990,4,2184762757,Mon Jun 15 16:23:03 PDT 2009,NO_QUERY,ashleyyross,showering then stephs
1524423,4,2176723372,Mon Jun 15 04:36:02 PDT 2009,NO_QUERY,ErikaDunne,had a great day yesterday stilll don't know w...
505728,0,2188467466,Mon Jun 15 21:57:15 PDT 2009,NO_QUERY,jmbatchelor,@CHRISDJMOYLES I know that feeling... What's w...
1597735,4,2193006811,Tue Jun 16 07:51:59 PDT 2009,NO_QUERY,makenziecrane,hopefully today will be a better day
847480,4,1564602721,Mon Apr 20 04:13:36 PDT 2009,NO_QUERY,SirCrumpet,#Tweetie #Help @atebits_support Nevermind - Cm...


In [11]:
df = df[['target', 'text']]
df.sample(10)

Unnamed: 0,target,text
354065,0,@Croconaw @Buizels I miss soup so much
22054,0,Bye Indiana See you again this summer.
1148862,4,"I apologize in advance, I may tweet a lot toni..."
1090885,4,@lucyleopard i hope you have a super saturday x
112577,0,"@kylerhea (sorry very delayed) but YES, and th..."
111121,0,a great cycling training of 46km and the half ...
1022150,4,"Going to the strawberry patch this am, then le..."
1047819,4,Watching Pokemon with Em
1349844,4,@djblootx attempting to blow-out my surround s...
291629,0,miss you


In [12]:
#Clean emojis from text
def strip_emoji(text):
    return re.sub(emoji.get_emoji_regexp(), r"", text) #remove emoji

#Remove punctuations, links, stopwords, mentions and \r\n new line characters
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    text =' '.join(word for word in text.split() if len(word) < 14) # remove words longer than 14 characters
    return text

#remove contractions
def decontract(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the "#" symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as "&" and "$" present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

#Remove multiple sequential spaces
def remove_mult_spaces(text):
    return re.sub("\s\s+" , " ", text)

#Stemming
def stemmer(text):
    tokenized = nltk.word_tokenize(text)
    ps = PorterStemmer()
    return ' '.join([ps.stem(words) for words in tokenized])

#Lemmatization 
def lemmatize(text):
    tokenized = nltk.word_tokenize(text)
    lm = WordNetLemmatizer()
    return ' '.join([lm.lemmatize(words) for words in tokenized])

def deep_clean(text):
    text = strip_emoji(text)
    text = decontract(text)
    text = strip_all_entities(text)
    text = clean_hashtags(text)
    text = filter_chars(text)
    text = remove_mult_spaces(text)
    text = stemmer(text)
    return text

In [13]:
#Get the cleaned tweets
texts_new = []
i = 0
for t in df.text:
    i = i+1
    if i%100000==0:
        print(i)
    texts_new.append(deep_clean(t))
df['text_clean'] = texts_new
df.drop_duplicates("text_clean", inplace=True)
df.head()

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000


Unnamed: 0,target,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save 50 rest go bound
3,0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behav mad see


In [14]:
df = df[['target', 'text_clean']]
df.to_csv("tweets_cleaned.csv", encoding='utf-8')

In [3]:
df = pd.read_csv("./tweets_cleaned.csv")
df = df[['target', 'text_clean']]
df = df.dropna()
print('Number of dataset sentences: {:,}\n'.format(df.shape[0]))
print(df['target'].value_counts())
df['target'] = df['target'].replace(4, 1)
df.sample(10)

Number of dataset sentences: 1,469,782

0    744029
4    725753
Name: target, dtype: int64


Unnamed: 0,target,text_clean
781143,1,thank fun beach today
836781,1,laugh loud bu read statement ethic peopl move ...
1293984,1,love first week work x
976985,1,look yuppi stiam de mai demult ca merg sa fac ...
882285,1,good night follow soontob till manana
684840,0,ohh horribl feel go away
1299017,1,awh darn yeah went pretti soon went escal like...
398722,0,wow mess guess peopl never understand let get ...
1175006,1,woop woop smokk danc lol realli want one els c...
410135,0,phone might broken


In [4]:
sentences = df.text_clean.values
labels = df.target.values

In [5]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [6]:
# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  awww bummer shoulda got david carr third day
Tokenized:  ['aw', '##w', '##w', 'bum', '##mer', 'should', '##a', 'got', 'david', 'carr', 'third', 'day']
Token IDs:  [22091, 2860, 2860, 26352, 5017, 2323, 2050, 2288, 2585, 12385, 2353, 2154]


In [None]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

In [7]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 70,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  awww bummer shoulda got david carr third day
Token IDs: tensor([  101, 22091,  2860,  2860, 26352,  5017,  2323,  2050,  2288,  2585,
        12385,  2353,  2154,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [9]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Calculate the number of samples to include in each set.
train_size = int(0.7 * len(dataset))
test_size = int(0.1 * len(dataset))
val_size = len(dataset) - train_size - test_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

1,028,847 training samples
293,957 validation samples
146,978 test samples


In [10]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

test_dataloader = DataLoader(
            test_dataset, # The test samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [12]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    return_dict=False,
)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [14]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [22]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [16]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [17]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [21]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of  32,152.    Elapsed: 0:00:10.
  Batch    80  of  32,152.    Elapsed: 0:00:16.
  Batch   120  of  32,152.    Elapsed: 0:00:22.
  Batch   160  of  32,152.    Elapsed: 0:00:28.
  Batch   200  of  32,152.    Elapsed: 0:00:35.
  Batch   240  of  32,152.    Elapsed: 0:00:41.
  Batch   280  of  32,152.    Elapsed: 0:00:48.
  Batch   320  of  32,152.    Elapsed: 0:00:54.
  Batch   360  of  32,152.    Elapsed: 0:01:00.
  Batch   400  of  32,152.    Elapsed: 0:01:06.
  Batch   440  of  32,152.    Elapsed: 0:01:12.
  Batch   480  of  32,152.    Elapsed: 0:01:18.
  Batch   520  of  32,152.    Elapsed: 0:01:24.
  Batch   560  of  32,152.    Elapsed: 0:01:31.
  Batch   600  of  32,152.    Elapsed: 0:01:37.
  Batch   640  of  32,152.    Elapsed: 0:01:44.
  Batch   680  of  32,152.    Elapsed: 0:01:50.
  Batch   720  of  32,152.    Elapsed: 0:01:56.
  Batch   760  of  32,152.    Elapsed: 0:02:03.
  Batch   800  of  32,152.    Elapsed: 0:02:09.
  Batch   840  of  32,152. 

  Batch 6,840  of  32,152.    Elapsed: 0:18:13.
  Batch 6,880  of  32,152.    Elapsed: 0:18:20.
  Batch 6,920  of  32,152.    Elapsed: 0:18:26.
  Batch 6,960  of  32,152.    Elapsed: 0:18:32.
  Batch 7,000  of  32,152.    Elapsed: 0:18:38.
  Batch 7,040  of  32,152.    Elapsed: 0:18:45.
  Batch 7,080  of  32,152.    Elapsed: 0:18:51.
  Batch 7,120  of  32,152.    Elapsed: 0:18:57.
  Batch 7,160  of  32,152.    Elapsed: 0:19:03.
  Batch 7,200  of  32,152.    Elapsed: 0:19:09.
  Batch 7,240  of  32,152.    Elapsed: 0:19:15.
  Batch 7,280  of  32,152.    Elapsed: 0:19:21.
  Batch 7,320  of  32,152.    Elapsed: 0:19:28.
  Batch 7,360  of  32,152.    Elapsed: 0:19:34.
  Batch 7,400  of  32,152.    Elapsed: 0:19:40.
  Batch 7,440  of  32,152.    Elapsed: 0:19:46.
  Batch 7,480  of  32,152.    Elapsed: 0:19:52.
  Batch 7,520  of  32,152.    Elapsed: 0:19:58.
  Batch 7,560  of  32,152.    Elapsed: 0:20:04.
  Batch 7,600  of  32,152.    Elapsed: 0:20:10.
  Batch 7,640  of  32,152.    Elapsed: 0

  Batch 13,600  of  32,152.    Elapsed: 0:35:54.
  Batch 13,640  of  32,152.    Elapsed: 0:36:00.
  Batch 13,680  of  32,152.    Elapsed: 0:36:06.
  Batch 13,720  of  32,152.    Elapsed: 0:36:12.
  Batch 13,760  of  32,152.    Elapsed: 0:36:18.
  Batch 13,800  of  32,152.    Elapsed: 0:36:24.
  Batch 13,840  of  32,152.    Elapsed: 0:36:30.
  Batch 13,880  of  32,152.    Elapsed: 0:36:37.
  Batch 13,920  of  32,152.    Elapsed: 0:36:43.
  Batch 13,960  of  32,152.    Elapsed: 0:36:49.
  Batch 14,000  of  32,152.    Elapsed: 0:36:55.
  Batch 14,040  of  32,152.    Elapsed: 0:37:01.
  Batch 14,080  of  32,152.    Elapsed: 0:37:07.
  Batch 14,120  of  32,152.    Elapsed: 0:37:13.
  Batch 14,160  of  32,152.    Elapsed: 0:37:20.
  Batch 14,200  of  32,152.    Elapsed: 0:37:26.
  Batch 14,240  of  32,152.    Elapsed: 0:37:32.
  Batch 14,280  of  32,152.    Elapsed: 0:37:38.
  Batch 14,320  of  32,152.    Elapsed: 0:37:45.
  Batch 14,360  of  32,152.    Elapsed: 0:37:51.
  Batch 14,400  of  

  Batch 20,320  of  32,152.    Elapsed: 0:53:56.
  Batch 20,360  of  32,152.    Elapsed: 0:54:02.
  Batch 20,400  of  32,152.    Elapsed: 0:54:08.
  Batch 20,440  of  32,152.    Elapsed: 0:54:14.
  Batch 20,480  of  32,152.    Elapsed: 0:54:20.
  Batch 20,520  of  32,152.    Elapsed: 0:54:26.
  Batch 20,560  of  32,152.    Elapsed: 0:54:33.
  Batch 20,600  of  32,152.    Elapsed: 0:54:39.
  Batch 20,640  of  32,152.    Elapsed: 0:54:45.
  Batch 20,680  of  32,152.    Elapsed: 0:54:51.
  Batch 20,720  of  32,152.    Elapsed: 0:54:57.
  Batch 20,760  of  32,152.    Elapsed: 0:55:03.
  Batch 20,800  of  32,152.    Elapsed: 0:55:09.
  Batch 20,840  of  32,152.    Elapsed: 0:55:16.
  Batch 20,880  of  32,152.    Elapsed: 0:55:22.
  Batch 20,920  of  32,152.    Elapsed: 0:55:28.
  Batch 20,960  of  32,152.    Elapsed: 0:55:34.
  Batch 21,000  of  32,152.    Elapsed: 0:55:40.
  Batch 21,040  of  32,152.    Elapsed: 0:55:46.
  Batch 21,080  of  32,152.    Elapsed: 0:55:53.
  Batch 21,120  of  

  Batch 27,040  of  32,152.    Elapsed: 1:11:15.
  Batch 27,080  of  32,152.    Elapsed: 1:11:21.
  Batch 27,120  of  32,152.    Elapsed: 1:11:27.
  Batch 27,160  of  32,152.    Elapsed: 1:11:33.
  Batch 27,200  of  32,152.    Elapsed: 1:11:39.
  Batch 27,240  of  32,152.    Elapsed: 1:11:45.
  Batch 27,280  of  32,152.    Elapsed: 1:11:52.
  Batch 27,320  of  32,152.    Elapsed: 1:11:58.
  Batch 27,360  of  32,152.    Elapsed: 1:12:04.
  Batch 27,400  of  32,152.    Elapsed: 1:12:10.
  Batch 27,440  of  32,152.    Elapsed: 1:12:16.
  Batch 27,480  of  32,152.    Elapsed: 1:12:22.
  Batch 27,520  of  32,152.    Elapsed: 1:12:28.
  Batch 27,560  of  32,152.    Elapsed: 1:12:35.
  Batch 27,600  of  32,152.    Elapsed: 1:12:41.
  Batch 27,640  of  32,152.    Elapsed: 1:12:47.
  Batch 27,680  of  32,152.    Elapsed: 1:12:53.
  Batch 27,720  of  32,152.    Elapsed: 1:12:59.
  Batch 27,760  of  32,152.    Elapsed: 1:13:05.
  Batch 27,800  of  32,152.    Elapsed: 1:13:11.
  Batch 27,840  of  

  Batch 1,480  of  32,152.    Elapsed: 0:03:48.
  Batch 1,520  of  32,152.    Elapsed: 0:03:54.
  Batch 1,560  of  32,152.    Elapsed: 0:04:00.
  Batch 1,600  of  32,152.    Elapsed: 0:04:06.
  Batch 1,640  of  32,152.    Elapsed: 0:04:12.
  Batch 1,680  of  32,152.    Elapsed: 0:04:19.
  Batch 1,720  of  32,152.    Elapsed: 0:04:25.
  Batch 1,760  of  32,152.    Elapsed: 0:04:31.
  Batch 1,800  of  32,152.    Elapsed: 0:04:37.
  Batch 1,840  of  32,152.    Elapsed: 0:04:43.
  Batch 1,880  of  32,152.    Elapsed: 0:04:49.
  Batch 1,920  of  32,152.    Elapsed: 0:04:56.
  Batch 1,960  of  32,152.    Elapsed: 0:05:02.
  Batch 2,000  of  32,152.    Elapsed: 0:05:08.
  Batch 2,040  of  32,152.    Elapsed: 0:05:14.
  Batch 2,080  of  32,152.    Elapsed: 0:05:20.
  Batch 2,120  of  32,152.    Elapsed: 0:05:26.
  Batch 2,160  of  32,152.    Elapsed: 0:05:33.
  Batch 2,200  of  32,152.    Elapsed: 0:05:39.
  Batch 2,240  of  32,152.    Elapsed: 0:05:45.
  Batch 2,280  of  32,152.    Elapsed: 0

  Batch 8,320  of  32,152.    Elapsed: 0:21:22.
  Batch 8,360  of  32,152.    Elapsed: 0:21:28.
  Batch 8,400  of  32,152.    Elapsed: 0:21:34.
  Batch 8,440  of  32,152.    Elapsed: 0:21:40.
  Batch 8,480  of  32,152.    Elapsed: 0:21:46.
  Batch 8,520  of  32,152.    Elapsed: 0:21:52.
  Batch 8,560  of  32,152.    Elapsed: 0:21:58.
  Batch 8,600  of  32,152.    Elapsed: 0:22:04.
  Batch 8,640  of  32,152.    Elapsed: 0:22:10.
  Batch 8,680  of  32,152.    Elapsed: 0:22:16.
  Batch 8,720  of  32,152.    Elapsed: 0:22:23.
  Batch 8,760  of  32,152.    Elapsed: 0:22:29.
  Batch 8,800  of  32,152.    Elapsed: 0:22:35.
  Batch 8,840  of  32,152.    Elapsed: 0:22:41.
  Batch 8,880  of  32,152.    Elapsed: 0:22:47.
  Batch 8,920  of  32,152.    Elapsed: 0:22:53.
  Batch 8,960  of  32,152.    Elapsed: 0:22:59.
  Batch 9,000  of  32,152.    Elapsed: 0:23:05.
  Batch 9,040  of  32,152.    Elapsed: 0:23:11.
  Batch 9,080  of  32,152.    Elapsed: 0:23:17.
  Batch 9,120  of  32,152.    Elapsed: 0

  Batch 15,080  of  32,152.    Elapsed: 0:38:34.
  Batch 15,120  of  32,152.    Elapsed: 0:38:40.
  Batch 15,160  of  32,152.    Elapsed: 0:38:46.
  Batch 15,200  of  32,152.    Elapsed: 0:38:52.
  Batch 15,240  of  32,152.    Elapsed: 0:38:58.
  Batch 15,280  of  32,152.    Elapsed: 0:39:05.
  Batch 15,320  of  32,152.    Elapsed: 0:39:11.
  Batch 15,360  of  32,152.    Elapsed: 0:39:17.
  Batch 15,400  of  32,152.    Elapsed: 0:39:23.
  Batch 15,440  of  32,152.    Elapsed: 0:39:29.
  Batch 15,480  of  32,152.    Elapsed: 0:39:35.
  Batch 15,520  of  32,152.    Elapsed: 0:39:41.
  Batch 15,560  of  32,152.    Elapsed: 0:39:47.
  Batch 15,600  of  32,152.    Elapsed: 0:39:53.
  Batch 15,640  of  32,152.    Elapsed: 0:39:59.
  Batch 15,680  of  32,152.    Elapsed: 0:40:06.
  Batch 15,720  of  32,152.    Elapsed: 0:40:12.
  Batch 15,760  of  32,152.    Elapsed: 0:40:18.
  Batch 15,800  of  32,152.    Elapsed: 0:40:24.
  Batch 15,840  of  32,152.    Elapsed: 0:40:30.
  Batch 15,880  of  

  Batch 21,800  of  32,152.    Elapsed: 0:55:44.
  Batch 21,840  of  32,152.    Elapsed: 0:55:50.
  Batch 21,880  of  32,152.    Elapsed: 0:55:56.
  Batch 21,920  of  32,152.    Elapsed: 0:56:02.
  Batch 21,960  of  32,152.    Elapsed: 0:56:08.
  Batch 22,000  of  32,152.    Elapsed: 0:56:14.
  Batch 22,040  of  32,152.    Elapsed: 0:56:21.
  Batch 22,080  of  32,152.    Elapsed: 0:56:27.
  Batch 22,120  of  32,152.    Elapsed: 0:56:33.
  Batch 22,160  of  32,152.    Elapsed: 0:56:39.
  Batch 22,200  of  32,152.    Elapsed: 0:56:45.
  Batch 22,240  of  32,152.    Elapsed: 0:56:51.
  Batch 22,280  of  32,152.    Elapsed: 0:56:57.
  Batch 22,320  of  32,152.    Elapsed: 0:57:03.
  Batch 22,360  of  32,152.    Elapsed: 0:57:09.
  Batch 22,400  of  32,152.    Elapsed: 0:57:16.
  Batch 22,440  of  32,152.    Elapsed: 0:57:22.
  Batch 22,480  of  32,152.    Elapsed: 0:57:28.
  Batch 22,520  of  32,152.    Elapsed: 0:57:34.
  Batch 22,560  of  32,152.    Elapsed: 0:57:40.
  Batch 22,600  of  

  Batch 28,520  of  32,152.    Elapsed: 1:12:53.
  Batch 28,560  of  32,152.    Elapsed: 1:12:59.
  Batch 28,600  of  32,152.    Elapsed: 1:13:05.
  Batch 28,640  of  32,152.    Elapsed: 1:13:11.
  Batch 28,680  of  32,152.    Elapsed: 1:13:17.
  Batch 28,720  of  32,152.    Elapsed: 1:13:24.
  Batch 28,760  of  32,152.    Elapsed: 1:13:30.
  Batch 28,800  of  32,152.    Elapsed: 1:13:36.
  Batch 28,840  of  32,152.    Elapsed: 1:13:42.
  Batch 28,880  of  32,152.    Elapsed: 1:13:49.
  Batch 28,920  of  32,152.    Elapsed: 1:13:55.
  Batch 28,960  of  32,152.    Elapsed: 1:14:01.
  Batch 29,000  of  32,152.    Elapsed: 1:14:08.
  Batch 29,040  of  32,152.    Elapsed: 1:14:14.
  Batch 29,080  of  32,152.    Elapsed: 1:14:21.
  Batch 29,120  of  32,152.    Elapsed: 1:14:27.
  Batch 29,160  of  32,152.    Elapsed: 1:14:34.
  Batch 29,200  of  32,152.    Elapsed: 1:14:40.
  Batch 29,240  of  32,152.    Elapsed: 1:14:47.
  Batch 29,280  of  32,152.    Elapsed: 1:14:53.
  Batch 29,320  of  

  Batch 3,000  of  32,152.    Elapsed: 0:07:46.
  Batch 3,040  of  32,152.    Elapsed: 0:07:52.
  Batch 3,080  of  32,152.    Elapsed: 0:07:59.
  Batch 3,120  of  32,152.    Elapsed: 0:08:05.
  Batch 3,160  of  32,152.    Elapsed: 0:08:11.
  Batch 3,200  of  32,152.    Elapsed: 0:08:17.
  Batch 3,240  of  32,152.    Elapsed: 0:08:23.
  Batch 3,280  of  32,152.    Elapsed: 0:08:29.
  Batch 3,320  of  32,152.    Elapsed: 0:08:35.
  Batch 3,360  of  32,152.    Elapsed: 0:08:41.
  Batch 3,400  of  32,152.    Elapsed: 0:08:48.
  Batch 3,440  of  32,152.    Elapsed: 0:08:54.
  Batch 3,480  of  32,152.    Elapsed: 0:09:00.
  Batch 3,520  of  32,152.    Elapsed: 0:09:06.
  Batch 3,560  of  32,152.    Elapsed: 0:09:12.
  Batch 3,600  of  32,152.    Elapsed: 0:09:18.
  Batch 3,640  of  32,152.    Elapsed: 0:09:25.
  Batch 3,680  of  32,152.    Elapsed: 0:09:31.
  Batch 3,720  of  32,152.    Elapsed: 0:09:38.
  Batch 3,760  of  32,152.    Elapsed: 0:09:44.
  Batch 3,800  of  32,152.    Elapsed: 0

  Batch 9,840  of  32,152.    Elapsed: 0:25:10.
  Batch 9,880  of  32,152.    Elapsed: 0:25:17.
  Batch 9,920  of  32,152.    Elapsed: 0:25:23.
  Batch 9,960  of  32,152.    Elapsed: 0:25:29.
  Batch 10,000  of  32,152.    Elapsed: 0:25:35.
  Batch 10,040  of  32,152.    Elapsed: 0:25:41.
  Batch 10,080  of  32,152.    Elapsed: 0:25:47.
  Batch 10,120  of  32,152.    Elapsed: 0:25:53.
  Batch 10,160  of  32,152.    Elapsed: 0:25:59.
  Batch 10,200  of  32,152.    Elapsed: 0:26:05.
  Batch 10,240  of  32,152.    Elapsed: 0:26:11.
  Batch 10,280  of  32,152.    Elapsed: 0:26:17.
  Batch 10,320  of  32,152.    Elapsed: 0:26:24.
  Batch 10,360  of  32,152.    Elapsed: 0:26:30.
  Batch 10,400  of  32,152.    Elapsed: 0:26:36.
  Batch 10,440  of  32,152.    Elapsed: 0:26:42.
  Batch 10,480  of  32,152.    Elapsed: 0:26:48.
  Batch 10,520  of  32,152.    Elapsed: 0:26:54.
  Batch 10,560  of  32,152.    Elapsed: 0:27:00.
  Batch 10,600  of  32,152.    Elapsed: 0:27:06.
  Batch 10,640  of  32,1

  Batch 16,560  of  32,152.    Elapsed: 0:42:16.
  Batch 16,600  of  32,152.    Elapsed: 0:42:22.
  Batch 16,640  of  32,152.    Elapsed: 0:42:28.
  Batch 16,680  of  32,152.    Elapsed: 0:42:34.
  Batch 16,720  of  32,152.    Elapsed: 0:42:40.
  Batch 16,760  of  32,152.    Elapsed: 0:42:46.
  Batch 16,800  of  32,152.    Elapsed: 0:42:52.
  Batch 16,840  of  32,152.    Elapsed: 0:42:59.
  Batch 16,880  of  32,152.    Elapsed: 0:43:05.
  Batch 16,920  of  32,152.    Elapsed: 0:43:11.
  Batch 16,960  of  32,152.    Elapsed: 0:43:17.
  Batch 17,000  of  32,152.    Elapsed: 0:43:23.
  Batch 17,040  of  32,152.    Elapsed: 0:43:29.
  Batch 17,080  of  32,152.    Elapsed: 0:43:35.
  Batch 17,120  of  32,152.    Elapsed: 0:43:41.
  Batch 17,160  of  32,152.    Elapsed: 0:43:47.
  Batch 17,200  of  32,152.    Elapsed: 0:43:54.
  Batch 17,240  of  32,152.    Elapsed: 0:44:00.
  Batch 17,280  of  32,152.    Elapsed: 0:44:06.
  Batch 17,320  of  32,152.    Elapsed: 0:44:12.
  Batch 17,360  of  

  Batch 23,280  of  32,152.    Elapsed: 0:59:23.
  Batch 23,320  of  32,152.    Elapsed: 0:59:29.
  Batch 23,360  of  32,152.    Elapsed: 0:59:35.
  Batch 23,400  of  32,152.    Elapsed: 0:59:41.
  Batch 23,440  of  32,152.    Elapsed: 0:59:47.
  Batch 23,480  of  32,152.    Elapsed: 0:59:53.
  Batch 23,520  of  32,152.    Elapsed: 0:59:59.
  Batch 23,560  of  32,152.    Elapsed: 1:00:05.
  Batch 23,600  of  32,152.    Elapsed: 1:00:11.
  Batch 23,640  of  32,152.    Elapsed: 1:00:17.
  Batch 23,680  of  32,152.    Elapsed: 1:00:24.
  Batch 23,720  of  32,152.    Elapsed: 1:00:30.
  Batch 23,760  of  32,152.    Elapsed: 1:00:36.
  Batch 23,800  of  32,152.    Elapsed: 1:00:42.
  Batch 23,840  of  32,152.    Elapsed: 1:00:48.
  Batch 23,880  of  32,152.    Elapsed: 1:00:54.
  Batch 23,920  of  32,152.    Elapsed: 1:01:00.
  Batch 23,960  of  32,152.    Elapsed: 1:01:06.
  Batch 24,000  of  32,152.    Elapsed: 1:01:12.
  Batch 24,040  of  32,152.    Elapsed: 1:01:18.
  Batch 24,080  of  

  Batch 30,000  of  32,152.    Elapsed: 1:16:32.
  Batch 30,040  of  32,152.    Elapsed: 1:16:38.
  Batch 30,080  of  32,152.    Elapsed: 1:16:45.
  Batch 30,120  of  32,152.    Elapsed: 1:16:51.
  Batch 30,160  of  32,152.    Elapsed: 1:16:57.
  Batch 30,200  of  32,152.    Elapsed: 1:17:03.
  Batch 30,240  of  32,152.    Elapsed: 1:17:09.
  Batch 30,280  of  32,152.    Elapsed: 1:17:15.
  Batch 30,320  of  32,152.    Elapsed: 1:17:21.
  Batch 30,360  of  32,152.    Elapsed: 1:17:27.
  Batch 30,400  of  32,152.    Elapsed: 1:17:33.
  Batch 30,440  of  32,152.    Elapsed: 1:17:39.
  Batch 30,480  of  32,152.    Elapsed: 1:17:46.
  Batch 30,520  of  32,152.    Elapsed: 1:17:52.
  Batch 30,560  of  32,152.    Elapsed: 1:17:58.
  Batch 30,600  of  32,152.    Elapsed: 1:18:04.
  Batch 30,640  of  32,152.    Elapsed: 1:18:10.
  Batch 30,680  of  32,152.    Elapsed: 1:18:16.
  Batch 30,720  of  32,152.    Elapsed: 1:18:22.
  Batch 30,760  of  32,152.    Elapsed: 1:18:28.
  Batch 30,800  of  

  Batch 4,520  of  32,152.    Elapsed: 0:11:32.
  Batch 4,560  of  32,152.    Elapsed: 0:11:38.
  Batch 4,600  of  32,152.    Elapsed: 0:11:44.
  Batch 4,640  of  32,152.    Elapsed: 0:11:51.
  Batch 4,680  of  32,152.    Elapsed: 0:11:57.
  Batch 4,720  of  32,152.    Elapsed: 0:12:03.
  Batch 4,760  of  32,152.    Elapsed: 0:12:09.
  Batch 4,800  of  32,152.    Elapsed: 0:12:15.
  Batch 4,840  of  32,152.    Elapsed: 0:12:21.
  Batch 4,880  of  32,152.    Elapsed: 0:12:27.
  Batch 4,920  of  32,152.    Elapsed: 0:12:33.
  Batch 4,960  of  32,152.    Elapsed: 0:12:39.
  Batch 5,000  of  32,152.    Elapsed: 0:12:46.
  Batch 5,040  of  32,152.    Elapsed: 0:12:52.
  Batch 5,080  of  32,152.    Elapsed: 0:12:58.
  Batch 5,120  of  32,152.    Elapsed: 0:13:04.
  Batch 5,160  of  32,152.    Elapsed: 0:13:10.
  Batch 5,200  of  32,152.    Elapsed: 0:13:16.
  Batch 5,240  of  32,152.    Elapsed: 0:13:22.
  Batch 5,280  of  32,152.    Elapsed: 0:13:28.
  Batch 5,320  of  32,152.    Elapsed: 0

  Batch 11,320  of  32,152.    Elapsed: 0:28:51.
  Batch 11,360  of  32,152.    Elapsed: 0:28:58.
  Batch 11,400  of  32,152.    Elapsed: 0:29:04.
  Batch 11,440  of  32,152.    Elapsed: 0:29:10.
  Batch 11,480  of  32,152.    Elapsed: 0:29:16.
  Batch 11,520  of  32,152.    Elapsed: 0:29:22.
  Batch 11,560  of  32,152.    Elapsed: 0:29:28.
  Batch 11,600  of  32,152.    Elapsed: 0:29:34.
  Batch 11,640  of  32,152.    Elapsed: 0:29:41.
  Batch 11,680  of  32,152.    Elapsed: 0:29:47.
  Batch 11,720  of  32,152.    Elapsed: 0:29:53.
  Batch 11,760  of  32,152.    Elapsed: 0:29:59.
  Batch 11,800  of  32,152.    Elapsed: 0:30:05.
  Batch 11,840  of  32,152.    Elapsed: 0:30:11.
  Batch 11,880  of  32,152.    Elapsed: 0:30:17.
  Batch 11,920  of  32,152.    Elapsed: 0:30:23.
  Batch 11,960  of  32,152.    Elapsed: 0:30:29.
  Batch 12,000  of  32,152.    Elapsed: 0:30:36.
  Batch 12,040  of  32,152.    Elapsed: 0:30:42.
  Batch 12,080  of  32,152.    Elapsed: 0:30:48.
  Batch 12,120  of  

  Batch 18,040  of  32,152.    Elapsed: 0:46:08.
  Batch 18,080  of  32,152.    Elapsed: 0:46:15.
  Batch 18,120  of  32,152.    Elapsed: 0:46:22.
  Batch 18,160  of  32,152.    Elapsed: 0:46:29.
  Batch 18,200  of  32,152.    Elapsed: 0:46:36.
  Batch 18,240  of  32,152.    Elapsed: 0:46:43.
  Batch 18,280  of  32,152.    Elapsed: 0:46:50.
  Batch 18,320  of  32,152.    Elapsed: 0:46:57.
  Batch 18,360  of  32,152.    Elapsed: 0:47:05.
  Batch 18,400  of  32,152.    Elapsed: 0:47:12.
  Batch 18,440  of  32,152.    Elapsed: 0:47:19.
  Batch 18,480  of  32,152.    Elapsed: 0:47:25.
  Batch 18,520  of  32,152.    Elapsed: 0:47:31.
  Batch 18,560  of  32,152.    Elapsed: 0:47:37.
  Batch 18,600  of  32,152.    Elapsed: 0:47:43.
  Batch 18,640  of  32,152.    Elapsed: 0:47:49.
  Batch 18,680  of  32,152.    Elapsed: 0:47:55.
  Batch 18,720  of  32,152.    Elapsed: 0:48:02.
  Batch 18,760  of  32,152.    Elapsed: 0:48:08.
  Batch 18,800  of  32,152.    Elapsed: 0:48:14.
  Batch 18,840  of  

  Batch 24,760  of  32,152.    Elapsed: 1:04:18.
  Batch 24,800  of  32,152.    Elapsed: 1:04:24.
  Batch 24,840  of  32,152.    Elapsed: 1:04:30.
  Batch 24,880  of  32,152.    Elapsed: 1:04:36.
  Batch 24,920  of  32,152.    Elapsed: 1:04:42.
  Batch 24,960  of  32,152.    Elapsed: 1:04:49.
  Batch 25,000  of  32,152.    Elapsed: 1:04:55.
  Batch 25,040  of  32,152.    Elapsed: 1:05:01.
  Batch 25,080  of  32,152.    Elapsed: 1:05:07.
  Batch 25,120  of  32,152.    Elapsed: 1:05:13.
  Batch 25,160  of  32,152.    Elapsed: 1:05:19.
  Batch 25,200  of  32,152.    Elapsed: 1:05:25.
  Batch 25,240  of  32,152.    Elapsed: 1:05:31.
  Batch 25,280  of  32,152.    Elapsed: 1:05:37.
  Batch 25,320  of  32,152.    Elapsed: 1:05:43.
  Batch 25,360  of  32,152.    Elapsed: 1:05:49.
  Batch 25,400  of  32,152.    Elapsed: 1:05:56.
  Batch 25,440  of  32,152.    Elapsed: 1:06:02.
  Batch 25,480  of  32,152.    Elapsed: 1:06:08.
  Batch 25,520  of  32,152.    Elapsed: 1:06:14.
  Batch 25,560  of  

  Batch 31,480  of  32,152.    Elapsed: 1:21:23.
  Batch 31,520  of  32,152.    Elapsed: 1:21:29.
  Batch 31,560  of  32,152.    Elapsed: 1:21:35.
  Batch 31,600  of  32,152.    Elapsed: 1:21:41.
  Batch 31,640  of  32,152.    Elapsed: 1:21:48.
  Batch 31,680  of  32,152.    Elapsed: 1:21:54.
  Batch 31,720  of  32,152.    Elapsed: 1:22:00.
  Batch 31,760  of  32,152.    Elapsed: 1:22:06.
  Batch 31,800  of  32,152.    Elapsed: 1:22:12.
  Batch 31,840  of  32,152.    Elapsed: 1:22:18.
  Batch 31,880  of  32,152.    Elapsed: 1:22:24.
  Batch 31,920  of  32,152.    Elapsed: 1:22:30.
  Batch 31,960  of  32,152.    Elapsed: 1:22:36.
  Batch 32,000  of  32,152.    Elapsed: 1:22:43.
  Batch 32,040  of  32,152.    Elapsed: 1:22:49.
  Batch 32,080  of  32,152.    Elapsed: 1:22:55.
  Batch 32,120  of  32,152.    Elapsed: 1:23:01.

  Average training loss: 0.29
  Training epcoh took: 1:23:06

Running Validation...
  Accuracy: 0.80
  Validation Loss: 0.51
  Validation took: 0:06:23

Training comp

In [22]:
torch.save(model.state_dict(), "./model/BERT.pt")

In [23]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.45989577979883856,
  'Valid. Loss': 0.4428888430993887,
  'Valid. Accur.': 0.796372591705671,
  'Training Time': '1:24:22',
  'Validation Time': '0:06:21'},
 {'epoch': 2,
  'Training Loss': 0.4021230549775166,
  'Valid. Loss': 0.4314697826735548,
  'Valid. Accur.': 0.8017640415804942,
  'Training Time': '1:22:12',
  'Validation Time': '0:06:21'},
 {'epoch': 3,
  'Training Loss': 0.34604859756920825,
  'Valid. Loss': 0.46742750130592964,
  'Valid. Accur.': 0.7982556873843474,
  'Training Time': '1:22:00',
  'Validation Time': '0:06:21'},
 {'epoch': 4,
  'Training Loss': 0.291953750455353,
  'Valid. Loss': 0.5094183790417903,
  'Valid. Accur.': 0.7954426091215848,
  'Training Time': '1:23:06',
  'Validation Time': '0:06:23'}]

In [23]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    return_dict=False,
)

model.load_state_dict(torch.load("./model/BERT_4.pt"))

# Tell pytorch to run this model on the GPU.
model.cuda()





# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    
    torch.save(model.state_dict(), "./model/BERT_"+str(epoch_i)+".pt")

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Training...
  Batch    40  of  32,152.    Elapsed: 0:00:05.
  Batch    80  of  32,152.    Elapsed: 0:00:10.
  Batch   120  of  32,152.    Elapsed: 0:00:15.
  Batch   160  of  32,152.    Elapsed: 0:00:20.
  Batch   200  of  32,152.    Elapsed: 0:00:26.
  Batch   240  of  32,152.    Elapsed: 0:00:31.
  Batch   280  of  32,152.    Elapsed: 0:00:36.
  Batch   320  of  32,152.    Elapsed: 0:00:41.
  Batch   360  of  32,152.    Elapsed: 0:00:46.
  Batch   400  of  32,152.    Elapsed: 0:00:51.
  Batch   440  of  32,152.    Elapsed: 0:00:56.
  Batch   480  of  32,152.    Elapsed: 0:01:01.
  Batch   520  of  32,152.    Elapsed: 0:01:06.
  Batch   560  of  32,152.    Elapsed: 0:01:11.
  Batch   600  of  32,152.    Elapsed: 0:01:16.
  Batch   640  of  32,152.    Elapsed: 0:01:21.
  Batch   680  of  32,152.    Elapsed: 0:01:27.
  Batch   720  of  32,152.    Elapsed: 0:01:32.
  Batch   760  of  32,152.    Elapsed: 0:01:37.
  Batch   800  of  32,152.    Elapsed: 0:01:42.
  Batch   840  of  32,152. 

  Batch 6,840  of  32,152.    Elapsed: 0:14:40.
  Batch 6,880  of  32,152.    Elapsed: 0:14:45.
  Batch 6,920  of  32,152.    Elapsed: 0:14:50.
  Batch 6,960  of  32,152.    Elapsed: 0:14:55.
  Batch 7,000  of  32,152.    Elapsed: 0:15:00.
  Batch 7,040  of  32,152.    Elapsed: 0:15:06.
  Batch 7,080  of  32,152.    Elapsed: 0:15:11.
  Batch 7,120  of  32,152.    Elapsed: 0:15:16.
  Batch 7,160  of  32,152.    Elapsed: 0:15:21.
  Batch 7,200  of  32,152.    Elapsed: 0:15:26.
  Batch 7,240  of  32,152.    Elapsed: 0:15:31.
  Batch 7,280  of  32,152.    Elapsed: 0:15:36.
  Batch 7,320  of  32,152.    Elapsed: 0:15:42.
  Batch 7,360  of  32,152.    Elapsed: 0:15:47.
  Batch 7,400  of  32,152.    Elapsed: 0:15:52.
  Batch 7,440  of  32,152.    Elapsed: 0:15:57.
  Batch 7,480  of  32,152.    Elapsed: 0:16:02.
  Batch 7,520  of  32,152.    Elapsed: 0:16:07.
  Batch 7,560  of  32,152.    Elapsed: 0:16:12.
  Batch 7,600  of  32,152.    Elapsed: 0:16:17.
  Batch 7,640  of  32,152.    Elapsed: 0

  Batch 13,600  of  32,152.    Elapsed: 0:29:07.
  Batch 13,640  of  32,152.    Elapsed: 0:29:12.
  Batch 13,680  of  32,152.    Elapsed: 0:29:17.
  Batch 13,720  of  32,152.    Elapsed: 0:29:22.
  Batch 13,760  of  32,152.    Elapsed: 0:29:27.
  Batch 13,800  of  32,152.    Elapsed: 0:29:32.
  Batch 13,840  of  32,152.    Elapsed: 0:29:37.
  Batch 13,880  of  32,152.    Elapsed: 0:29:43.
  Batch 13,920  of  32,152.    Elapsed: 0:29:48.
  Batch 13,960  of  32,152.    Elapsed: 0:29:53.
  Batch 14,000  of  32,152.    Elapsed: 0:29:58.
  Batch 14,040  of  32,152.    Elapsed: 0:30:03.
  Batch 14,080  of  32,152.    Elapsed: 0:30:08.
  Batch 14,120  of  32,152.    Elapsed: 0:30:13.
  Batch 14,160  of  32,152.    Elapsed: 0:30:19.
  Batch 14,200  of  32,152.    Elapsed: 0:30:24.
  Batch 14,240  of  32,152.    Elapsed: 0:30:29.
  Batch 14,280  of  32,152.    Elapsed: 0:30:34.
  Batch 14,320  of  32,152.    Elapsed: 0:30:39.
  Batch 14,360  of  32,152.    Elapsed: 0:30:44.
  Batch 14,400  of  

  Batch 20,320  of  32,152.    Elapsed: 0:43:28.
  Batch 20,360  of  32,152.    Elapsed: 0:43:34.
  Batch 20,400  of  32,152.    Elapsed: 0:43:39.
  Batch 20,440  of  32,152.    Elapsed: 0:43:44.
  Batch 20,480  of  32,152.    Elapsed: 0:43:49.
  Batch 20,520  of  32,152.    Elapsed: 0:43:54.
  Batch 20,560  of  32,152.    Elapsed: 0:44:00.
  Batch 20,600  of  32,152.    Elapsed: 0:44:05.
  Batch 20,640  of  32,152.    Elapsed: 0:44:10.
  Batch 20,680  of  32,152.    Elapsed: 0:44:15.
  Batch 20,720  of  32,152.    Elapsed: 0:44:20.
  Batch 20,760  of  32,152.    Elapsed: 0:44:25.
  Batch 20,800  of  32,152.    Elapsed: 0:44:30.
  Batch 20,840  of  32,152.    Elapsed: 0:44:35.
  Batch 20,880  of  32,152.    Elapsed: 0:44:41.
  Batch 20,920  of  32,152.    Elapsed: 0:44:46.
  Batch 20,960  of  32,152.    Elapsed: 0:44:51.
  Batch 21,000  of  32,152.    Elapsed: 0:44:56.
  Batch 21,040  of  32,152.    Elapsed: 0:45:01.
  Batch 21,080  of  32,152.    Elapsed: 0:45:06.
  Batch 21,120  of  

  Batch 27,040  of  32,152.    Elapsed: 0:57:50.
  Batch 27,080  of  32,152.    Elapsed: 0:57:55.
  Batch 27,120  of  32,152.    Elapsed: 0:58:00.
  Batch 27,160  of  32,152.    Elapsed: 0:58:05.
  Batch 27,200  of  32,152.    Elapsed: 0:58:10.
  Batch 27,240  of  32,152.    Elapsed: 0:58:15.
  Batch 27,280  of  32,152.    Elapsed: 0:58:20.
  Batch 27,320  of  32,152.    Elapsed: 0:58:25.
  Batch 27,360  of  32,152.    Elapsed: 0:58:31.
  Batch 27,400  of  32,152.    Elapsed: 0:58:36.
  Batch 27,440  of  32,152.    Elapsed: 0:58:41.
  Batch 27,480  of  32,152.    Elapsed: 0:58:46.
  Batch 27,520  of  32,152.    Elapsed: 0:58:51.
  Batch 27,560  of  32,152.    Elapsed: 0:58:56.
  Batch 27,600  of  32,152.    Elapsed: 0:59:01.
  Batch 27,640  of  32,152.    Elapsed: 0:59:06.
  Batch 27,680  of  32,152.    Elapsed: 0:59:12.
  Batch 27,720  of  32,152.    Elapsed: 0:59:17.
  Batch 27,760  of  32,152.    Elapsed: 0:59:22.
  Batch 27,800  of  32,152.    Elapsed: 0:59:27.
  Batch 27,840  of  

  Batch 1,480  of  32,152.    Elapsed: 0:03:11.
  Batch 1,520  of  32,152.    Elapsed: 0:03:16.
  Batch 1,560  of  32,152.    Elapsed: 0:03:21.
  Batch 1,600  of  32,152.    Elapsed: 0:03:26.
  Batch 1,640  of  32,152.    Elapsed: 0:03:31.
  Batch 1,680  of  32,152.    Elapsed: 0:03:36.
  Batch 1,720  of  32,152.    Elapsed: 0:03:42.
  Batch 1,760  of  32,152.    Elapsed: 0:03:47.
  Batch 1,800  of  32,152.    Elapsed: 0:03:52.
  Batch 1,840  of  32,152.    Elapsed: 0:03:57.
  Batch 1,880  of  32,152.    Elapsed: 0:04:02.
  Batch 1,920  of  32,152.    Elapsed: 0:04:07.
  Batch 1,960  of  32,152.    Elapsed: 0:04:12.
  Batch 2,000  of  32,152.    Elapsed: 0:04:18.
  Batch 2,040  of  32,152.    Elapsed: 0:04:23.
  Batch 2,080  of  32,152.    Elapsed: 0:04:28.
  Batch 2,120  of  32,152.    Elapsed: 0:04:33.
  Batch 2,160  of  32,152.    Elapsed: 0:04:38.
  Batch 2,200  of  32,152.    Elapsed: 0:04:43.
  Batch 2,240  of  32,152.    Elapsed: 0:04:49.
  Batch 2,280  of  32,152.    Elapsed: 0

  Batch 8,320  of  32,152.    Elapsed: 0:17:52.
  Batch 8,360  of  32,152.    Elapsed: 0:17:57.
  Batch 8,400  of  32,152.    Elapsed: 0:18:02.
  Batch 8,440  of  32,152.    Elapsed: 0:18:07.
  Batch 8,480  of  32,152.    Elapsed: 0:18:12.
  Batch 8,520  of  32,152.    Elapsed: 0:18:17.
  Batch 8,560  of  32,152.    Elapsed: 0:18:23.
  Batch 8,600  of  32,152.    Elapsed: 0:18:28.
  Batch 8,640  of  32,152.    Elapsed: 0:18:33.
  Batch 8,680  of  32,152.    Elapsed: 0:18:38.
  Batch 8,720  of  32,152.    Elapsed: 0:18:43.
  Batch 8,760  of  32,152.    Elapsed: 0:18:48.
  Batch 8,800  of  32,152.    Elapsed: 0:18:54.
  Batch 8,840  of  32,152.    Elapsed: 0:18:59.
  Batch 8,880  of  32,152.    Elapsed: 0:19:04.
  Batch 8,920  of  32,152.    Elapsed: 0:19:09.
  Batch 8,960  of  32,152.    Elapsed: 0:19:14.
  Batch 9,000  of  32,152.    Elapsed: 0:19:19.
  Batch 9,040  of  32,152.    Elapsed: 0:19:25.
  Batch 9,080  of  32,152.    Elapsed: 0:19:30.
  Batch 9,120  of  32,152.    Elapsed: 0

  Batch 15,080  of  32,152.    Elapsed: 0:32:23.
  Batch 15,120  of  32,152.    Elapsed: 0:32:28.
  Batch 15,160  of  32,152.    Elapsed: 0:32:33.
  Batch 15,200  of  32,152.    Elapsed: 0:32:38.
  Batch 15,240  of  32,152.    Elapsed: 0:32:44.
  Batch 15,280  of  32,152.    Elapsed: 0:32:49.
  Batch 15,320  of  32,152.    Elapsed: 0:32:54.
  Batch 15,360  of  32,152.    Elapsed: 0:32:59.
  Batch 15,400  of  32,152.    Elapsed: 0:33:04.
  Batch 15,440  of  32,152.    Elapsed: 0:33:09.
  Batch 15,480  of  32,152.    Elapsed: 0:33:14.
  Batch 15,520  of  32,152.    Elapsed: 0:33:20.
  Batch 15,560  of  32,152.    Elapsed: 0:33:25.
  Batch 15,600  of  32,152.    Elapsed: 0:33:30.
  Batch 15,640  of  32,152.    Elapsed: 0:33:35.
  Batch 15,680  of  32,152.    Elapsed: 0:33:40.
  Batch 15,720  of  32,152.    Elapsed: 0:33:45.
  Batch 15,760  of  32,152.    Elapsed: 0:33:51.
  Batch 15,800  of  32,152.    Elapsed: 0:33:56.
  Batch 15,840  of  32,152.    Elapsed: 0:34:01.
  Batch 15,880  of  

  Batch 21,800  of  32,152.    Elapsed: 0:46:49.
  Batch 21,840  of  32,152.    Elapsed: 0:46:55.
  Batch 21,880  of  32,152.    Elapsed: 0:47:00.
  Batch 21,920  of  32,152.    Elapsed: 0:47:05.
  Batch 21,960  of  32,152.    Elapsed: 0:47:10.
  Batch 22,000  of  32,152.    Elapsed: 0:47:15.
  Batch 22,040  of  32,152.    Elapsed: 0:47:20.
  Batch 22,080  of  32,152.    Elapsed: 0:47:26.
  Batch 22,120  of  32,152.    Elapsed: 0:47:31.
  Batch 22,160  of  32,152.    Elapsed: 0:47:36.
  Batch 22,200  of  32,152.    Elapsed: 0:47:41.
  Batch 22,240  of  32,152.    Elapsed: 0:47:46.
  Batch 22,280  of  32,152.    Elapsed: 0:47:51.
  Batch 22,320  of  32,152.    Elapsed: 0:47:56.
  Batch 22,360  of  32,152.    Elapsed: 0:48:02.
  Batch 22,400  of  32,152.    Elapsed: 0:48:07.
  Batch 22,440  of  32,152.    Elapsed: 0:48:12.
  Batch 22,480  of  32,152.    Elapsed: 0:48:17.
  Batch 22,520  of  32,152.    Elapsed: 0:48:22.
  Batch 22,560  of  32,152.    Elapsed: 0:48:27.
  Batch 22,600  of  

  Batch 28,520  of  32,152.    Elapsed: 1:01:16.
  Batch 28,560  of  32,152.    Elapsed: 1:01:21.
  Batch 28,600  of  32,152.    Elapsed: 1:01:27.
  Batch 28,640  of  32,152.    Elapsed: 1:01:32.
  Batch 28,680  of  32,152.    Elapsed: 1:01:37.
  Batch 28,720  of  32,152.    Elapsed: 1:01:42.
  Batch 28,760  of  32,152.    Elapsed: 1:01:47.
  Batch 28,800  of  32,152.    Elapsed: 1:01:52.
  Batch 28,840  of  32,152.    Elapsed: 1:01:57.
  Batch 28,880  of  32,152.    Elapsed: 1:02:03.
  Batch 28,920  of  32,152.    Elapsed: 1:02:08.
  Batch 28,960  of  32,152.    Elapsed: 1:02:13.
  Batch 29,000  of  32,152.    Elapsed: 1:02:18.
  Batch 29,040  of  32,152.    Elapsed: 1:02:23.
  Batch 29,080  of  32,152.    Elapsed: 1:02:28.
  Batch 29,120  of  32,152.    Elapsed: 1:02:34.
  Batch 29,160  of  32,152.    Elapsed: 1:02:39.
  Batch 29,200  of  32,152.    Elapsed: 1:02:44.
  Batch 29,240  of  32,152.    Elapsed: 1:02:49.
  Batch 29,280  of  32,152.    Elapsed: 1:02:54.
  Batch 29,320  of  

  Batch 3,000  of  32,152.    Elapsed: 0:06:28.
  Batch 3,040  of  32,152.    Elapsed: 0:06:33.
  Batch 3,080  of  32,152.    Elapsed: 0:06:38.
  Batch 3,120  of  32,152.    Elapsed: 0:06:44.
  Batch 3,160  of  32,152.    Elapsed: 0:06:49.
  Batch 3,200  of  32,152.    Elapsed: 0:06:54.
  Batch 3,240  of  32,152.    Elapsed: 0:07:00.
  Batch 3,280  of  32,152.    Elapsed: 0:07:05.
  Batch 3,320  of  32,152.    Elapsed: 0:07:10.
  Batch 3,360  of  32,152.    Elapsed: 0:07:15.
  Batch 3,400  of  32,152.    Elapsed: 0:07:20.
  Batch 3,440  of  32,152.    Elapsed: 0:07:25.
  Batch 3,480  of  32,152.    Elapsed: 0:07:30.
  Batch 3,520  of  32,152.    Elapsed: 0:07:36.
  Batch 3,560  of  32,152.    Elapsed: 0:07:41.
  Batch 3,600  of  32,152.    Elapsed: 0:07:46.
  Batch 3,640  of  32,152.    Elapsed: 0:07:51.
  Batch 3,680  of  32,152.    Elapsed: 0:07:56.
  Batch 3,720  of  32,152.    Elapsed: 0:08:01.
  Batch 3,760  of  32,152.    Elapsed: 0:08:06.
  Batch 3,800  of  32,152.    Elapsed: 0

  Batch 9,840  of  32,152.    Elapsed: 0:21:12.
  Batch 9,880  of  32,152.    Elapsed: 0:21:17.
  Batch 9,920  of  32,152.    Elapsed: 0:21:22.
  Batch 9,960  of  32,152.    Elapsed: 0:21:27.
  Batch 10,000  of  32,152.    Elapsed: 0:21:32.
  Batch 10,040  of  32,152.    Elapsed: 0:21:37.
  Batch 10,080  of  32,152.    Elapsed: 0:21:42.
  Batch 10,120  of  32,152.    Elapsed: 0:21:47.
  Batch 10,160  of  32,152.    Elapsed: 0:21:53.
  Batch 10,200  of  32,152.    Elapsed: 0:21:58.
  Batch 10,240  of  32,152.    Elapsed: 0:22:03.
  Batch 10,280  of  32,152.    Elapsed: 0:22:08.
  Batch 10,320  of  32,152.    Elapsed: 0:22:13.
  Batch 10,360  of  32,152.    Elapsed: 0:22:18.
  Batch 10,400  of  32,152.    Elapsed: 0:22:24.
  Batch 10,440  of  32,152.    Elapsed: 0:22:29.
  Batch 10,480  of  32,152.    Elapsed: 0:22:34.
  Batch 10,520  of  32,152.    Elapsed: 0:22:39.
  Batch 10,560  of  32,152.    Elapsed: 0:22:44.
  Batch 10,600  of  32,152.    Elapsed: 0:22:49.
  Batch 10,640  of  32,1

  Batch 16,560  of  32,152.    Elapsed: 0:35:36.
  Batch 16,600  of  32,152.    Elapsed: 0:35:42.
  Batch 16,640  of  32,152.    Elapsed: 0:35:47.
  Batch 16,680  of  32,152.    Elapsed: 0:35:52.
  Batch 16,720  of  32,152.    Elapsed: 0:35:57.
  Batch 16,760  of  32,152.    Elapsed: 0:36:02.
  Batch 16,800  of  32,152.    Elapsed: 0:36:07.
  Batch 16,840  of  32,152.    Elapsed: 0:36:13.
  Batch 16,880  of  32,152.    Elapsed: 0:36:18.
  Batch 16,920  of  32,152.    Elapsed: 0:36:23.
  Batch 16,960  of  32,152.    Elapsed: 0:36:28.
  Batch 17,000  of  32,152.    Elapsed: 0:36:33.
  Batch 17,040  of  32,152.    Elapsed: 0:36:38.
  Batch 17,080  of  32,152.    Elapsed: 0:36:43.
  Batch 17,120  of  32,152.    Elapsed: 0:36:49.
  Batch 17,160  of  32,152.    Elapsed: 0:36:54.
  Batch 17,200  of  32,152.    Elapsed: 0:36:59.
  Batch 17,240  of  32,152.    Elapsed: 0:37:04.
  Batch 17,280  of  32,152.    Elapsed: 0:37:09.
  Batch 17,320  of  32,152.    Elapsed: 0:37:14.
  Batch 17,360  of  

  Batch 23,280  of  32,152.    Elapsed: 0:50:25.
  Batch 23,320  of  32,152.    Elapsed: 0:50:31.
  Batch 23,360  of  32,152.    Elapsed: 0:50:36.
  Batch 23,400  of  32,152.    Elapsed: 0:50:42.
  Batch 23,440  of  32,152.    Elapsed: 0:50:48.
  Batch 23,480  of  32,152.    Elapsed: 0:50:54.
  Batch 23,520  of  32,152.    Elapsed: 0:51:00.
  Batch 23,560  of  32,152.    Elapsed: 0:51:06.
  Batch 23,600  of  32,152.    Elapsed: 0:51:12.
  Batch 23,640  of  32,152.    Elapsed: 0:51:18.
  Batch 23,680  of  32,152.    Elapsed: 0:51:23.
  Batch 23,720  of  32,152.    Elapsed: 0:51:28.
  Batch 23,760  of  32,152.    Elapsed: 0:51:33.
  Batch 23,800  of  32,152.    Elapsed: 0:51:39.
  Batch 23,840  of  32,152.    Elapsed: 0:51:44.
  Batch 23,880  of  32,152.    Elapsed: 0:51:50.
  Batch 23,920  of  32,152.    Elapsed: 0:51:57.
  Batch 23,960  of  32,152.    Elapsed: 0:52:03.
  Batch 24,000  of  32,152.    Elapsed: 0:52:09.
  Batch 24,040  of  32,152.    Elapsed: 0:52:15.
  Batch 24,080  of  

  Batch 30,000  of  32,152.    Elapsed: 1:05:52.
  Batch 30,040  of  32,152.    Elapsed: 1:05:57.
  Batch 30,080  of  32,152.    Elapsed: 1:06:03.
  Batch 30,120  of  32,152.    Elapsed: 1:06:08.
  Batch 30,160  of  32,152.    Elapsed: 1:06:13.
  Batch 30,200  of  32,152.    Elapsed: 1:06:18.
  Batch 30,240  of  32,152.    Elapsed: 1:06:23.
  Batch 30,280  of  32,152.    Elapsed: 1:06:28.
  Batch 30,320  of  32,152.    Elapsed: 1:06:33.
  Batch 30,360  of  32,152.    Elapsed: 1:06:38.
  Batch 30,400  of  32,152.    Elapsed: 1:06:44.
  Batch 30,440  of  32,152.    Elapsed: 1:06:49.
  Batch 30,480  of  32,152.    Elapsed: 1:06:54.
  Batch 30,520  of  32,152.    Elapsed: 1:06:59.
  Batch 30,560  of  32,152.    Elapsed: 1:07:04.
  Batch 30,600  of  32,152.    Elapsed: 1:07:09.
  Batch 30,640  of  32,152.    Elapsed: 1:07:14.
  Batch 30,680  of  32,152.    Elapsed: 1:07:20.
  Batch 30,720  of  32,152.    Elapsed: 1:07:25.
  Batch 30,760  of  32,152.    Elapsed: 1:07:30.
  Batch 30,800  of  

In [29]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.26662927959264937,
  'Valid. Loss': 0.5094183790417903,
  'Valid. Accur.': 0.7954426091215848,
  'Training Time': '1:22:57',
  'Validation Time': '0:06:20'},
 {'epoch': 2,
  'Training Loss': 0.26737757947754265,
  'Valid. Loss': 0.5094183790417903,
  'Valid. Accur.': 0.7954426091215848,
  'Training Time': '1:21:57',
  'Validation Time': '0:06:21'}]

In [20]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.33739492187680153,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:59',
  'Validation Time': '0:06:21'},
 {'epoch': 2,
  'Training Loss': 0.33751966777918285,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:33',
  'Validation Time': '0:06:20'},
 {'epoch': 3,
  'Training Loss': 0.33728835648750694,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:52',
  'Validation Time': '0:06:20'},
 {'epoch': 4,
  'Training Loss': 0.3375051451361947,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:51',
  'Validation Time': '0:06:20'},
 {'epoch': 5,
  'Training Loss': 0.33707942959983905,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:51',
  'Validation Time': '0:06:20'}]

In [21]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.33739492187680153,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:59',
  'Validation Time': '0:06:21'},
 {'epoch': 2,
  'Training Loss': 0.33751966777918285,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:33',
  'Validation Time': '0:06:20'},
 {'epoch': 3,
  'Training Loss': 0.33728835648750694,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:52',
  'Validation Time': '0:06:20'},
 {'epoch': 4,
  'Training Loss': 0.3375051451361947,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:51',
  'Validation Time': '0:06:20'},
 {'epoch': 5,
  'Training Loss': 0.33707942959983905,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:51',
  'Validation Time': '0:06:20'}]

In [24]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.33739492187680153,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:08:45',
  'Validation Time': '0:06:21'},
 {'epoch': 2,
  'Training Loss': 0.33751966777918285,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:09:04',
  'Validation Time': '0:06:21'},
 {'epoch': 3,
  'Training Loss': 0.33728835648750694,
  'Valid. Loss': 0.30257358304289284,
  'Valid. Accur.': 0.8809976053118538,
  'Training Time': '1:10:29',
  'Validation Time': '0:06:22'}]

In [None]:
# ========================================
#               Test
# ========================================

print("")
print("Running Test...")

t0 = time.time()

# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

# Tracking variables 
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0

# Evaluate data for one epoch
for batch in test_dataloader:

    # Unpack this training batch from our dataloader. 
    #
    # As we unpack the batch, we'll also copy each tensor to the GPU using 
    # the `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids 
    #   [1]: attention masks
    #   [2]: labels 
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        (loss, logits) = model(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=b_input_mask,
                               labels=b_labels)

    # Accumulate the validation loss.
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    total_eval_accuracy += flat_accuracy(logits, label_ids)


# Report the final accuracy for this validation run.
avg_test_accuracy = total_eval_accuracy / len(test_dataloader)
print("  Accuracy: {0:.2f}".format(avg_test_accuracy))

# Calculate the average loss over all of the batches.
avg_test_loss = total_eval_loss / len(test_dataloader)

# Measure how long the validation run took.
test_time = format_time(time.time() - t0)

print("  Test Loss: {0:.2f}".format(avg_test_loss))
print("  Test took: {:}".format(test_time))
