This notebook is heavily inspired by https://towardsdatascience.com/build-a-bert-sci-kit-transformer-59d60ddd54a5

In [1]:
import transformers
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

In [2]:
# Switch the transformers logger to INFO so that tokenizer/model loading steps are reported in the cell output.
transformers.logging.set_verbosity_info()

In [3]:
import numpy as np
from numpy.random import RandomState
import torch
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from typing import Callable, List, Optional, Tuple
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
import re
import matplotlib.pyplot as plt
import torch.nn as nn

In [4]:
from preprocessing import *

## Set up

In [5]:
# Load the pretrained tokenizer of the "distilbert-base-uncased" checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\Matteo/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\Matteo/.cache\huggingface\transformers\45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f

In [6]:
# Load the pretrained encoder weights of the "distilbert-base-uncased" checkpoint.
bert_model = AutoModel.from_pretrained("distilbert-base-uncased")

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\Matteo/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\Matteo/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e366

In [42]:
class BertSimple(nn.Module):
    """Two-class classifier built on top of a pretrained transformer encoder.

    The encoder's last hidden states are mean-pooled over the sequence
    dimension (padded positions are excluded through the attention mask),
    passed through dropout and projected to two logits.
    """

    def __init__(
            self,
            bert_model
    ):
        """
        Args:
            bert_model: encoder module. It must expose ``config.hidden_size``
                and, when called with ``input_ids`` / ``attention_mask``
                keywords, return an object indexable by
                ``'last_hidden_state'``.
        """
        super(BertSimple, self).__init__()
        self.model = bert_model
        self.drop = nn.Dropout(p=.33)
        self.out = nn.Linear(self.model.config.hidden_size, 2)

    def forward(self, input_ids, input_attention):
        """Return the two class logits for every sequence in the batch.

        Args:
            input_ids: token-id tensor fed to the encoder as ``input_ids``.
            input_attention: attention mask (1 = real token, 0 = padding),
                fed to the encoder as ``attention_mask`` and reused as the
                pooling weights.

        Returns:
            Tensor with one row of 2 logits per input sequence.
        """
        encoder_out = self.model(input_ids=input_ids, attention_mask=input_attention)
        # The encoder returns a container; the tensor must be extracted
        # *before* pooling (the container itself has no ``mean``).
        last_hidden = encoder_out['last_hidden_state']

        # Pooling strategy:
        # - average the last hidden state vectors of the real tokens only,
        #   so that the amount of padding does not change the sentence vector
        mask = input_attention.unsqueeze(-1).to(last_hidden.dtype)
        # clamp avoids a division by zero for a fully masked sequence
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled_out = (last_hidden * mask).sum(dim=1) / token_counts

        output = self.drop(pooled_out)
        return self.out(output)

In [8]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one pre-split sentence per item.

    Args:
        chunks: indexable collection of sentences, each already split into
            words (the tokenizer is called with ``is_split_into_words=True``).
        labels: indexable collection of integer class labels, aligned with
            ``chunks``.
        tokenizer: object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword interface.
        max_len: fixed length every encoded sample is padded/truncated to.
    """

    def __init__(self, chunks, labels, tokenizer, max_len):
        self.chunks = chunks
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples; ``len`` works for arrays as well as lists."""
        return len(self.chunks)

    def __getitem__(self, item):
        """Encode sample ``item``.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and a ``'labels'`` tensor of dtype ``torch.long``.
        """
        sentence = self.chunks[item]
        labels = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            sentence,
            is_split_into_words=True,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without explicit truncation, samples longer than max_len keep
            # their full length and cannot be stacked into a batch.
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [9]:
# Seeded NumPy random generator; used below for shuffling and for the train/val/test split.
rng = RandomState(124)

## Import data

In [10]:
# Read the negative training file into a list, one entry per line
# (each entry keeps its trailing newline).
with open("twitter-datasets/train_neg_u.txt", "r") as neg_file:
    neg_train = list(neg_file)

In [11]:
# Read the positive training file into a list, one entry per line
# (each entry keeps its trailing newline).
with open("twitter-datasets/train_pos_u.txt", "r") as pos_file:
    pos_train = list(pos_file)

In [12]:
# Show the number of negative and positive samples that were loaded.
len(neg_train), len(pos_train)

(91088, 90233)

## WARNING: I get rid of some negative samples to re-establish class equilibrium
Imbalanced classes are a pain

In [13]:
# Keep at most as many negative samples as there are positive ones.
# Slicing by the positive count (instead of by the negative length difference)
# keeps this cell idempotent: a difference of 0 would slice to an empty list
# when the cell is re-run on already balanced data.
neg_train = neg_train[:len(pos_train)]

In [14]:
# Check the number of negative samples left after balancing.
len(neg_train)

90233

#### Trim the dataset used for training
Right now, I only use 40K samples from each category (`samples_num_by_cat`) to test the training
(17mins to train the model)

In [15]:
# Number of samples to keep from each class.
samples_num_by_cat = 40000

In [16]:
# Keep only the first `samples_num_by_cat` samples of each class.
neg_train, pos_train = neg_train[:samples_num_by_cat], pos_train[:samples_num_by_cat]

In [17]:
# Negatives come first, then positives; label 0 marks a negative sample and 1 a positive one.
train_data = np.concatenate([neg_train, pos_train])

train_labels = np.array([0] * len(neg_train) + [1] * len(pos_train))

In [18]:
# One index per sample; this index array is what gets permuted for the shuffle.
shuffling = np.arange(len(train_data))
len(shuffling)

80000

In [19]:
# Permute the index array in place with the seeded generator.
rng.shuffle(shuffling)

In [20]:
# Apply the same permutation to samples and labels so they stay aligned.
# Note: both names are rebound, so re-running this cell permutes the data again.
train_data, train_labels = train_data[shuffling], train_labels[shuffling]

In [21]:
# Split every raw line on single spaces and run the word array through
# `process_sentence` with `preproc_pipeline` (both come from the star import of `preprocessing`).
train_tokenized = [
    process_sentence(np.array(raw_line.split(" ")), preproc_pipeline)
    for raw_line in train_data
]

In [22]:
# Processed sentences are token sequences of differing lengths, so the array
# must be created with dtype=object: recent NumPy versions refuse to build a
# ragged array implicitly. The result can still be filtered with a boolean mask.
train_data = np.array(train_tokenized, dtype=object)

In [23]:
# Draw a "train"/"val"/"test" tag for every sample with probabilities 0.8/0.1/0.1.
# The draw is independent per sample, so the resulting proportions are only approximate.
split = rng.choice(
    ["train", "val", "test"],
    size=len(train_data),
    p=[.8, .1, .1]
)
split

array(['train', 'train', 'train', ..., 'val', 'train', 'train'],
      dtype='<U5')

In [24]:
# Select the samples and labels tagged "train" by the random split.
bert_x_data = train_data[split == "train"]
bert_labels = train_labels[split == "train"]

In [25]:
# Length (in tokens) every encoded sample is padded to; passed to SentimentDataset as max_len.
MAX_LENGTH = 200
# Number of samples per batch; read by get_loader.
BATCH_SIZE = 16

In [26]:
def get_loader(dataset):
    """Wrap ``dataset`` in a DataLoader producing batches of ``BATCH_SIZE`` items.

    Loading happens in the calling process (``num_workers=0``) and no
    shuffling is requested, so items are served in dataset order.
    """
    loader_options = {"batch_size": BATCH_SIZE, "num_workers": 0}
    return torch.utils.data.DataLoader(dataset, **loader_options)

In [27]:
# Dataset over the samples tagged "train", tokenized on access with the DistilBERT tokenizer.
train_dataset = SentimentDataset(
    chunks=train_data[split == "train"],
    labels=train_labels[split == "train"],
    tokenizer=bert_tokenizer,
    max_len=MAX_LENGTH,
)

In [28]:
# Batched loader over the training dataset.
train_loader = get_loader(train_dataset)

In [29]:
# Inspect the first encoded sample.
train_dataset[0]

{'input_ids': tensor([  101,  4862,  2278,  4931,  3336,  1010,  2017, 14071,  2078,  4060,
          2033,  2039,  1037, 18178,  2063,  2175,  4103, 13675,  4609,  2089,
          2497,  1029,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [30]:
# Number of training samples.
len(train_dataset)

64063

In [31]:
# Fetch and display the first batch produced by the loader.
next(iter(train_loader))

{'input_ids': tensor([[ 101, 4862, 2278,  ...,    0,    0,    0],
         [ 101, 4862, 2615,  ...,    0,    0,    0],
         [ 101, 4862, 2278,  ...,    0,    0,    0],
         ...,
         [ 101, 4862, 2278,  ...,    0,    0,    0],
         [ 101, 4862, 2278,  ...,    0,    0,    0],
         [ 101, 5650, 7632,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1])}

## Run the model

In [32]:
# Report CUDA availability and, only when a GPU is present, its properties
# (querying the properties without CUDA raises).
torch.cuda.is_available(), (torch.cuda.get_device_properties('cuda:0') if torch.cuda.is_available() else None)

(True,
 _CudaDeviceProperties(name='GeForce GTX 1650', major=7, minor=5, total_memory=4096MB, multi_processor_count=16))

In [33]:
# Device used for the model and the batches: the first CUDA GPU when one is
# available, otherwise the CPU so the notebook still runs end to end.
gpu = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [34]:
# Build the classifier around the pretrained encoder and move it to the target device.
bert_classification = BertSimple(bert_model).to(gpu)

In [35]:
# Number of epochs; used below to compute the scheduler's total number of steps.
EPOCHS = 10

In [36]:
# AdamW over every parameter of the classifier (encoder and linear head),
# with a learning rate of 2e-5 and correct_bias=False.
optimizer = AdamW(bert_classification.parameters(), lr=2e-5, correct_bias=False)
# Number of batches per epoch times the number of epochs; passed to the scheduler as its total step count.
tot_steps = EPOCHS * len(train_loader)

In [37]:
# Linear learning-rate schedule with no warmup steps, spanning `tot_steps` training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=tot_steps
)

In [38]:
# Cross-entropy loss over the two class logits, placed on the same device as the model.
loss_fn = nn.CrossEntropyLoss().to(gpu)

In [39]:
# Keep the first batch around as a small fixture for the sanity checks below.
test_dp = next(iter(train_loader))
test_dp

{'input_ids': tensor([[ 101, 4862, 2278,  ...,    0,    0,    0],
         [ 101, 4862, 2615,  ...,    0,    0,    0],
         [ 101, 4862, 2278,  ...,    0,    0,    0],
         ...,
         [ 101, 4862, 2278,  ...,    0,    0,    0],
         [ 101, 4862, 2278,  ...,    0,    0,    0],
         [ 101, 5650, 7632,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1])}

In [40]:
# Shape of the batch of token ids.
test_dp['input_ids'].shape

torch.Size([16, 200])

In [44]:
# Sanity-check a forward pass of the bare encoder on one batch.
# This is inference only, so it runs in eval mode and under no_grad: otherwise
# `test_out` keeps the whole autograd graph (and its activations) alive on the
# device, which is wasted memory on a small GPU.
bert_model = bert_model.eval()
with torch.no_grad():
    test_out = bert_model(test_dp["input_ids"].to(gpu), test_dp["attention_mask"].to(gpu))

In [45]:
# Ask PyTorch to release the cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [50]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: module called as ``model(input_ids=..., input_attention=...)``
            that returns one row of class scores per example.
        data_loader: iterable of dict batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: device the batch tensors are moved to before the forward pass.
        n_examples: denominator of the accuracy; pass the number of examples
            the loader yields to obtain a fraction in [0, 1].

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the
        unweighted mean of the per-batch losses, or NaN when the loader
        yields no batch.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator: a plain int 0 would have no ``.double()`` when the
    # loader is empty, making the return statement fail.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                input_attention=attention_mask
            )
            # Predicted class = index of the highest score in each row.
            preds = torch.argmax(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    # np.mean([]) only warns and returns NaN; make the empty case explicit.
    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [51]:
# eval_model divides the number of correct predictions by n_examples, so it
# must be the number of samples the loader iterates over; a fixed 10 would
# not yield a fraction in [0, 1].
eval_model(bert_classification, train_loader, loss_fn, gpu, len(train_dataset))

RuntimeError: CUDA out of memory. Tried to allocate 38.00 MiB (GPU 0; 4.00 GiB total capacity; 1.57 GiB already allocated; 32.60 MiB free; 1.66 GiB reserved in total by PyTorch)

In [None]:
def train_one_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    gpu_device,
    scheduler,
    n_examples
):
    model = model.train()
    losses = []
    
    correct = 0
    
    for d in data_loader:
        inputs_ids = d["input_ids"].to(gpu_device)
        attention_mask = d["attention_mask"].to(gpu_device)
        labels = d["labels"].to(gpu_device)
        
        outs = model(
            inputs_ids,
            attention_mask
        )
        
        _, preds = torch.max(outs, dim=1)