In [1]:
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import pandas as pd 
import numpy as np
import torch

In [2]:
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased')]

In [3]:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
df = pd.read_csv('data/train.csv')
df.drop("id", axis=1, inplace=True)

np.random.seed(42)
df = df.sample(frac=1)
df = df.reset_index(drop=True)

In [6]:
df = df[:10000]

In [7]:
def tokenize_cut_pad(df):
    
    max_input_size = tokenizer.max_model_input_sizes['bert-base-uncased']
    
    # shorten sequences longer than BERT max input size
    df["comment_text"] = [text[:max_input_size - 2] for text in df["comment_text"].values] 
    tokenized = df["comment_text"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            # max_len will be equal to longest sequence in the tokenized values
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    
    return torch.tensor(padded)

In [8]:
# input_ids are the tokenized, cut and padded tokens for each word, ready to be fed into BERT
input_ids = tokenize_cut_pad(df)

In [9]:
input_ids.shape

torch.Size([10000, 308])

In [10]:
input_ids = input_ids[:2000] # we run out of memory if we try to use any more examples

In [11]:
input_ids = input_ids.to(device)

In [12]:
bert_model = bert_model.to(device)

In [15]:
input_ids

2000

### This method of getting feature embeddings for all features in a dataset only works for small datasets with short sequence lengths due to OOM

In [14]:
with torch.no_grad():
    # hidden_states acts as an embedding layer returning 768 dim embeddings for our sentences
    hidden_states = bert_model(input_ids)[0]

RuntimeError: CUDA out of memory. Tried to allocate 8.48 GiB (GPU 0; 15.90 GiB total capacity; 11.00 GiB already allocated; 4.27 GiB free; 47.25 MiB cached)

In [13]:
features = hidden_states[:, 0, :].numpy()

In [16]:
features.shape

(100, 768)

In [17]:
labels_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [18]:
labels = df[labels_list].values

In [19]:
labels.shape

(10000, 6)

In [20]:
labels = torch.tensor(labels)

In [21]:
labels

tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ...,
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]])

In [20]:
# train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [21]:
# train_features, valid_features, train_labels, valid_labels = train_test_split(train_features, train_labels)

In [None]:
# Define criterion
# Define optimizer
# Define metrics 
# Define model 
# Define training loop
# Define prediction func

### Example of custom tensor dataset, without using PyTorch

In [22]:
class create_dataset:
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    
    def __len__(self):
        return (len(self.inputs))
    
    def __iter__(self):
        yield(self.inputs, self.labels)
        
dataset = create_dataset(input_ids, labels)

In [23]:
next(iter(dataset))

(tensor([[  101, 20277,  2480,  ...,     0,     0,     0],
         [  101,  2482,  3695,  ...,     0,     0,     0],
         [  101,  1000,  5798,  ...,     0,     0,     0],
         ...,
         [  101,  2498,  1037,  ...,     0,     0,     0],
         [  101,  1000,  1996,  ...,     0,     0,     0],
         [  101,  2047,  5198,  ...,     0,     0,     0]]),
 tensor([[0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0],
         ...,
         [0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0]]))

In [24]:
len(dataset)

10000