In [89]:
SEED = 42
import os
import logging

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger("bert")

In [120]:
import json
import gc

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
from transformers import BertForNextSentencePrediction, BertTokenizer, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler, RandomSampler
from fastprogress import master_bar, progress_bar
from sklearn.model_selection import train_test_split

In [91]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [92]:
import random
import numpy as np

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == torch.device("cuda"):
    torch.cuda.manual_seed_all(SEED)

## Data Preparation

In [93]:
#df = pd.read_json('title_abs_sample.json')
df_full = pd.read_json('title_abs_sample.json')

In [94]:
pd.Series(len(x.split(' ')) for x in df_full.title).describe()

count    100000.000000
mean          8.024900
std           4.461652
min           1.000000
25%           5.000000
50%           7.000000
75%          10.000000
max          72.000000
dtype: float64

In [45]:
pd.Series(len(x.split(' ')) for x in df_full.abstract).describe()

count    100000.000000
mean        125.629140
std          65.497318
min           4.000000
25%          86.000000
50%         121.000000
75%         147.000000
max        4047.000000
dtype: float64

In [96]:
df = df_full.drop(['publication_number'], axis=1).reset_index(drop=True)

In [97]:
df['label'] = 1
df1 = df[0:49999]
df2 = df[50000:100000]

In [101]:
df2.abstract = np.random.permutation(df2.abstract.values)
df2.label = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [104]:
df = pd.concat([df1, df2]).sample(frac=1)

In [106]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, target):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.target = target

In [107]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [153]:
def convert_sentence_pair(titles, abstracts, targets, max_seq_length, tokenizer):
    features = []
    for (ex_index, (title, abss, labs)) in enumerate(zip(titles, abstracts, targets)):
        tokens_a = tokenizer.tokenize(title)

        tokens_b = None
        tokens_b = tokenizer.tokenize(abss)
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        features.append(
                InputFeatures(
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    target=labs
        ))
    return features

In [139]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

03/01/2020 01:09:20 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ubuntu/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [150]:

xTrain, xTest, yTrain, yTest = train_test_split(df[['title', 'abstract']], df['label'], test_size = 0.2, random_state = 0)

In [157]:
train_data = convert_sentence_pair(xTrain.title.tolist(), xTrain.abstract.tolist(), yTrain.tolist(),max_seq_length=200, tokenizer=tokenizer)
train_inputs = torch.tensor([f.input_ids for f in train_data], dtype=torch.long)
train_masks = torch.tensor([f.input_mask for f in train_data], dtype=torch.long)
train_toks = torch.tensor([f.segment_ids for f in train_data], dtype=torch.long)
train_labels = torch.tensor([f.target for f in train_data], dtype=torch.long)

In [158]:
val_data = convert_sentence_pair(xTest.title.tolist(), xTest.abstract.tolist(), yTest.tolist(),max_seq_length=200, tokenizer=tokenizer)
validation_inputs = torch.tensor([f.input_ids for f in val_data], dtype=torch.long)
validation_masks = torch.tensor([f.input_mask for f in val_data], dtype=torch.long)
validation_toks = torch.tensor([f.segment_ids for f in val_data], dtype=torch.long)
validation_labels = torch.tensor([f.target for f in val_data], dtype=torch.long)

In [140]:
#new_data = convert_sentence_pair(df.title.tolist(), 
#                                      df.abstract.tolist(),
#                                      df.label.tolist(),
#                                      max_seq_length=200, tokenizer=tokenizer)

## Model

In [112]:
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased",num_labels=2).to(device)

02/29/2020 23:57:21 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ubuntu/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
02/29/2020 23:57:21 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings

In [159]:
batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_toks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_toks)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [160]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [161]:
epochs = 2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [162]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [167]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [168]:
# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this trai
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_toks = batch[3].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                    token_type_ids=b_toks, 
                    attention_mask=b_input_mask, 
                    next_sentence_label=b_labels)
        
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
            
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_toks = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=b_toks, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of  2,500.    Elapsed: 0:01:17.
  Batch    80  of  2,500.    Elapsed: 0:02:34.
  Batch   120  of  2,500.    Elapsed: 0:03:52.
  Batch   160  of  2,500.    Elapsed: 0:05:11.
  Batch   200  of  2,500.    Elapsed: 0:06:29.


KeyboardInterrupt: 

In [169]:
total_step = len(train_dataloader)

# Store our loss and accuracy for plotting
train_loss_set = []

epochs = 2

# trange is a tqdm wrapper around the normal python range
for epoch_i in range(0, epochs):
    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for i, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_toks = batch[3].to(device)

        # Forward pass
        outputs = model(b_input_ids, 
                    token_type_ids=b_toks, 
                    attention_mask=b_input_mask, 
                    next_sentence_label=b_labels)
        loss = outputs[0]
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if (i) % 50 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                       .format(epoch+1, epochs, i+1, total_step, loss.item()))

Epoch [1/2], Step [1/2500], Loss: 0.0045
Epoch [1/2], Step [51/2500], Loss: 0.1213
Epoch [1/2], Step [101/2500], Loss: 0.0637
Epoch [1/2], Step [151/2500], Loss: 0.0159
Epoch [1/2], Step [201/2500], Loss: 0.0083
Epoch [1/2], Step [251/2500], Loss: 0.0621
Epoch [1/2], Step [301/2500], Loss: 0.2311
Epoch [1/2], Step [351/2500], Loss: 0.0016
Epoch [1/2], Step [401/2500], Loss: 0.0810
Epoch [1/2], Step [451/2500], Loss: 0.0052
Epoch [1/2], Step [501/2500], Loss: 0.2726
Epoch [1/2], Step [551/2500], Loss: 0.1450
Epoch [1/2], Step [601/2500], Loss: 0.0104
Epoch [1/2], Step [651/2500], Loss: 0.2704
Epoch [1/2], Step [701/2500], Loss: 0.0080
Epoch [1/2], Step [751/2500], Loss: 0.0202
Epoch [1/2], Step [801/2500], Loss: 0.0222
Epoch [1/2], Step [851/2500], Loss: 0.1049
Epoch [1/2], Step [901/2500], Loss: 0.0843
Epoch [1/2], Step [951/2500], Loss: 0.0232
Epoch [1/2], Step [1001/2500], Loss: 0.0725
Epoch [1/2], Step [1051/2500], Loss: 0.2253
Epoch [1/2], Step [1101/2500], Loss: 0.0091
Epoch [1/2]

In [171]:
torch.save(model.state_dict(), 'bert_nsp_model.ckpt')

In [187]:
torch.save(model, 'bert_nsp_model.pth')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [175]:
model.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# Evaluate data for one epoch
mb = progress_bar(validation_dataloader)
for batch in mb:
    
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_toks = batch
    
    with torch.no_grad():        
        outputs = model(b_input_ids, token_type_ids=b_toks, attention_mask=b_input_mask)
    
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # Calculate the accuracy for this batch of test sentences.
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    # Accumulate the total accuracy.
    eval_accuracy += tmp_eval_accuracy

    nb_eval_steps += 1

# Report the final accuracy for this validation run.
print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("  Validation took: {:}".format(format_time(time.time() - t0)))

  Accuracy: 0.99
  Validation took: 3:41:33


In [186]:
eval_accuracy/nb_eval_steps

0.98535

In [176]:
eats_df = pd.read_csv('eats.csv')

In [178]:
eats_df = eats_df.drop(['Unnamed: 0', 'claims', 'Publication_Number'], axis=1)

In [None]:
test_data = convert_sentence_pair(eats_df.title.tolist(), eats_df.abstract.tolist(), eats_df.Label.tolist(),max_seq_length=200, tokenizer=tokenizer)
train_inputs = torch.tensor([f.input_ids for f in train_data], dtype=torch.long)
train_masks = torch.tensor([f.input_mask for f in train_data], dtype=torch.long)
train_toks = torch.tensor([f.segment_ids for f in train_data], dtype=torch.long)
train_labels = torch.tensor([f.target for f in train_data], dtype=torch.long)

In [184]:
def run_test(idx):
    rep_title = [eats_df.iloc[idx]["title"]] * (eats_df.shape[0] - 1)
    abss = eats_df.abstract.tolist()
    abss.pop(idx)
    labels = eats_df.Label.tolist()
    labels.pop(idx)
    sentence_pairs = convert_sentence_pair(rep_title, abss, labels, max_seq_length=200, tokenizer=tokenizer)
    
    BATCH_SIZE = 256
    all_input_ids = torch.tensor([f.input_ids for f in sentence_pairs], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in sentence_pairs], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in sentence_pairs], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE)

    model.eval()

    res = []

    mb = progress_bar(eval_dataloader)
    for input_ids, input_mask, segment_ids in mb:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            res.append(nn.functional.softmax(
                model(input_ids, segment_ids, input_mask), dim=1
            )[:, 0].detach().cpu().numpy())

    res = np.concatenate(res)
    res = [int(round(x)) for x in res]
    pred_label = np.stack((res, labels))
    
    return pred_label

In [185]:
pred = []
lab = []
for p in range(eats_df.shape[0]):
    print(p)
    pl = run_test(p)
    pred = np.append(pred, pl[0])
    lab = np.append(lab, pl[1])

0


AttributeError: 'tuple' object has no attribute 'softmax'

In [28]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report

In [29]:
results = confusion_matrix(lab, pred)

In [31]:
print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(lab, pred)) 
print('Report : ')
print(classification_report(lab, pred))

Confusion Matrix :
[[225569 220623]
 [ 31339  21611]]
Accuracy Score : 0.49520977998244986
Report : 
              precision    recall  f1-score   support

         0.0       0.88      0.51      0.64    446192
         1.0       0.09      0.41      0.15     52950

   micro avg       0.50      0.50      0.50    499142
   macro avg       0.48      0.46      0.39    499142
weighted avg       0.79      0.50      0.59    499142



In [35]:
len(eats_df.title[0].split(' '))

12

7.973125884016973