In [9]:
import numpy as np
import json
import torch
import pandas as pd

# Load the archive holding the id-encoded sentences and labels; later cells
# index it by key ("train_x", "train_y", "val_x", "val_y").
npzfile = np.load("train_and_val.npz")

# Read the vocabulary inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left to the garbage collector.
with open("vocab.json", "r") as vocab_file:
    vocab = json.load(vocab_file)

In [4]:
# Invert the vocabulary into an id -> word lookup. The id is taken from the
# first element of each vocab value; a repeated id keeps the last word seen.
numToWord = {}
for word, entry in vocab.items():
    numToWord[entry[0]] = word


In [29]:
# Number of trailing validation rows that are held out as the test split.
HOLDOUT_SIZE = 1000


def _decode_rows(id_rows):
    """Turn rows of token ids into lists of words.

    Each id is looked up once in ``numToWord``; ids whose word is the empty
    string are dropped from the row.
    """
    decoded = []
    for row in id_rows:
        words = (numToWord[i] for i in row)
        decoded.append([word for word in words if word != ""])
    return decoded


def _rows_to_text(word_rows):
    """Join every list of words into a single space-separated, stripped string."""
    return [" ".join(words).strip() for words in word_rows]


# Index the archive once per array and slice the result, rather than looking
# the same key up twice.
val_x_all = npzfile["val_x"]
val_y_all = npzfile["val_y"]

train = _decode_rows(npzfile["train_x"])
train_y = npzfile["train_y"]

# Validation keeps everything except the last HOLDOUT_SIZE rows ...
val = _decode_rows(val_x_all[:-HOLDOUT_SIZE])
val_y = val_y_all[:-HOLDOUT_SIZE]

# ... and those last HOLDOUT_SIZE rows become the test split.
test = _decode_rows(val_x_all[-HOLDOUT_SIZE:])
test_y = val_y_all[-HOLDOUT_SIZE:]

train_tostr = _rows_to_text(train)
val_tostr = _rows_to_text(val)
test_tostr = _rows_to_text(test)

# Display one decoded training sentence as a sanity check.
train_tostr[294]

'about to pass out waking up at 4:30 is not fun'

In [31]:
# pretty balanced
# Each ratio is the fraction of labels in the split that are not equal to 1.0.
# The mean of the boolean mask is computed inside numpy rather than by
# iterating the array element by element with Python's built-in sum().
print("train balance ratio: ", np.mean(train_y != 1.0))
print("val balance ratio: ", np.mean(val_y != 1.0))
print("test balance ratio: ", np.mean(test_y != 1.0))

train balance ratio:  0.5081475
val balance ratio:  0.5078571428571429
test balance ratio:  0.499


In [32]:
# Number of distinct token ids that occur in the training inputs.
# np.unique flattens its input when no axis is given, so an explicit
# .flatten() copy is unnecessary.
len(np.unique(npzfile["train_x"]))

8461

In [33]:
# Number of entries in the id -> word lookup built from vocab.json.
len(numToWord)

8475

In [34]:
from transformers import BertTokenizer

# Announce the load, then fetch the pretrained tokenizer for the uncased
# base BERT checkpoint with lower-casing enabled.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


In [36]:
# Show one training sentence at each stage: raw text, tokens, token ids.
sample_sentence = train_tostr[294]
# Tokenize once and reuse the result for both the token and the id printout.
sample_tokens = tokenizer.tokenize(sample_sentence)

# Print the original sentence.
print(' Original: ', sample_sentence)

# Print the sentence split into tokens.
print('Tokenized: ', sample_tokens)

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

 Original:  about to pass out waking up at 4:30 is not fun
Tokenized:  ['about', 'to', 'pass', 'out', 'waking', 'up', 'at', '4', ':', '30', 'is', 'not', 'fun']
Token IDs:  [2055, 2000, 3413, 2041, 12447, 2039, 2012, 1018, 1024, 2382, 2003, 2025, 4569]


In [38]:
# Longest training sentence as measured by len() on the joined string;
# falls back to -1 when there are no sentences.
# NOTE(review): len() on a string counts characters, whereas the tokenizer's
# max_length counts tokens — confirm this is the intended unit.
longest_sentence = max(map(len, train_tostr), default=-1)

maxlen = int(longest_sentence * 1.5)  # in case of longer test sentence
print(maxlen)

216


In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in train_tostr:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        truncation = True,         # Cut sentences longer than `max_length`.
                        max_length = maxlen,       # Reuse the length computed earlier instead of a hard-coded 216.
                        padding = 'max_length',    # Replaces the deprecated `pad_to_max_length = True`.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# The training labels live in `train_y`; no `labels` variable is defined
# before this cell, so the original line raised NameError on a fresh kernel.
# NOTE(review): the tensor keeps train_y's dtype — confirm whether the
# downstream loss expects integer class labels and cast if so.
labels = torch.tensor(train_y)

# Print sentence 0, now as a list of IDs.
# (`train_tostr` holds the sentences encoded above; `sentences` was never defined.)
print('Original: ', train_tostr[0])
print('Token IDs:', input_ids[0])