In [2]:
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# (Most) Installs

In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |▍                               | 10kB 26.1MB/s eta 0:00:01[K     |▊                               | 20kB 2.1MB/s eta 0:00:01[K     |█▏                              | 30kB 2.7MB/s eta 0:00:01[K     |█▌                              | 40kB 3.1MB/s eta 0:00:01[K     |█▉                              | 51kB 2.4MB/s eta 0:00:01[K     |██▎                             | 61kB 2.7MB/s eta 0:00:01[K     |██▋                             | 71kB 3.0MB/s eta 0:00:01[K     |███                             | 81kB 3.3MB/s eta 0:00:01[K     |███▍                            | 92kB 3.6MB/s eta 0:00:01[K     |███▊                            | 102kB 3.4MB/s eta 0:00:01[K     |████                            | 112kB 3.4MB/s eta 0:00:01[K     |████▌                           | 122kB 3.4M

In [4]:
#standard library imports
import itertools

#3rd party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

#local app & library specific imports
from scipy.special import softmax
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import DataLoader, TensorDataset

# Setup & global vars

In [5]:
# alert for cells
from IPython.display import Audio

alert = Audio(np.sin(np.linspace(0, 3000, 20000)), rate=20000, autoplay=True)

In [6]:
%cd drive/'My Drive'/rosie/

/content/drive/My Drive/rosie


In [7]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [10]:
#Global vars - use those from the best experimental run

MODEL = "bert-base-cased"
BATCH_SIZE = 16
max_len = 204

output_dir = "selected_model/"

# Load saved model

In [11]:
if MODEL == "bert-base-cased":

  from transformers import BertTokenizer, BertForSequenceClassification

  # Load a trained model and vocabulary that you have fine-tuned
  model = BertForSequenceClassification.from_pretrained(output_dir)
  tokenizer = BertTokenizer.from_pretrained(output_dir)

elif MODEL == "distilbert-base-cased":

  from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

  # Load a trained model and vocabulary that you have fine-tuned
  model = DistilBertForSequenceClassification.from_pretrained(output_dir)
  tokenizer = DistilBertTokenizer.from_pretrained(output_dir)

else:
  raise ValueError('Unknown model specified. Check MODEL var.')

# Copy the model to the GPU.
model.to(device)

Some weights of the model checkpoint at selected_model/ were not used when initializing BertForSequenceClassification: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [17]:
# label2id = model.config.label2id # these are the labels the model already "knows"

# Load prediction data

In [18]:
# Load the dataset into a pandas dataframe.
pred_df = pd.read_parquet(
    "data/one_perct_sample/2008_one_perct_sample.parquet") 

# Report the number of sentences.
print('Number of instances for prediction: {:,}\n'.format(pred_df.shape[0]))

# test_df["label_encoded"] = test_df["label"].map(label2id) #use same mapping from model

sentences = pred_df.body.values

Number of instances for prediction: 30,244



# Tokenize the data

In [19]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_dataloader = DataLoader(prediction_data, batch_size = BATCH_SIZE)



# Generate predictions

In [21]:
%%time
# ~6min

print('Predicting labels for {:,} instances...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  # b_input_ids, b_input_mask, b_labels = batch
  b_input_ids, b_input_mask = batch

  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)

print('    DONE.')

alert

Predicting labels for 30,244 instances...
    DONE.
CPU times: user 4min 18s, sys: 2min 31s, total: 6min 50s
Wall time: 6min 50s
