Inspired by https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

### Main features
 1. RoBERTa + Dropout + Linear
 2. CrossEntropy Loss
 3. Finetuning RoBERTa
 4. AdamW optimizer, i.e. Adam with decoupled weight decay (reference: https://arxiv.org/abs/1711.05101)
 5. Cosine learning-rate schedule
 6. Preprocessing ('standard' or 'extended')

In [1]:
# !pip install transformers
# !pip install wordsegment
# !pip install nltk

In [9]:
import transformers
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW, get_cosine_schedule_with_warmup

ImportError: ray.tune in ray > 0.7.5 requires 'tabulate'. Please re-run 'pip install ray[tune]' or 'pip install ray[rllib]'.

In [8]:
transformers.logging.set_verbosity_info()

NameError: name 'transformers' is not defined

In [3]:
import numpy as np
from numpy.random import RandomState
import torch
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from typing import Callable, List, Optional, Tuple
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
import re
import matplotlib.pyplot as plt
import torch.nn as nn

In [4]:
from preprocessing import *

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/matteopariset/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matteopariset/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Set up

In [5]:
bert_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

NameError: name 'AutoTokenizer' is not defined

In [None]:
bert_tokenizer.add_prefix_space = False

In [6]:
sample_sentence = "that's a #verybad sentence <user> <url> youre gonna love it. lemme know what u think :-/"
print("Trying tokenizer:", bert_tokenizer.tokenize(" ".join(process_sentence(sample_sentence.split(" "), standard_pipeline))))

NameError: name 'bert_tokenizer' is not defined

In [9]:
bert_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/matteopariset/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at /home/matteopariset/.cache/huggingface/transformers/51ba668f7ff34e7cdf

In [10]:
class RobertaSimple(nn.Module):
    """Thin ``nn.Module`` wrapper that delegates to a classification model.

    The wrapped model is stored under the attribute ``model`` (so the keys of
    ``state_dict()`` are prefixed with ``model.``) and is called with the
    keyword arguments ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(
            self,
            bert_model
    ):
        """Keep a reference to the model that ``forward`` delegates to.

        Args:
            bert_model: callable accepting ``input_ids``, ``attention_mask``
                and ``labels`` keyword arguments (this notebook passes a
                ``RobertaForSequenceClassification`` instance).
        """
        super(RobertaSimple, self).__init__()
        self.model = bert_model

    def forward(self, input_ids, input_attention, labels=None):
        """Run the wrapped model and return its output unchanged.

        Args:
            input_ids: token ids, forwarded as ``input_ids``.
            input_attention: attention mask, forwarded as ``attention_mask``.
            labels: optional targets, forwarded as ``labels``. Defaults to
                ``None`` so that inference code does not have to fabricate
                dummy labels; what the wrapped model does with ``None`` is up
                to that model.

        Returns:
            Whatever the wrapped model returns.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=input_attention,
            labels=labels,
        )

In [12]:
def apply_preprocessing(tweet):
    """Preprocess a raw tweet with the standard pipeline.

    The tweet is split on single spaces, the resulting tokens are handed to
    ``process_sentence`` together with ``standard_pipeline``, and whatever
    tokens come back are joined with single spaces.
    """
    raw_tokens = tweet.split(" ")
    processed_tokens = process_sentence(raw_tokens, standard_pipeline)
    return " ".join(processed_tokens)

In [13]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that preprocesses and tokenizes one text per item.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. Note that ``labels`` simply carries the integer stored at the
    same position in ``labels``; ``prepare_submission`` reuses it to carry
    sample ids instead of class labels.
    """

    def __init__(self, chunks, labels, tokenizer, max_len):
        """Store the raw data; nothing is tokenized until an item is requested.

        Args:
            chunks: indexable sequence of raw texts.
            labels: indexable sequence of integers aligned with ``chunks``.
            tokenizer: object exposing ``encode_plus`` (a HuggingFace
                tokenizer in this notebook).
            max_len: length every encoded sequence is padded/truncated to.
        """
        self.chunks = chunks
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        # len() equals shape[0] for arrays and also works for plain lists.
        return len(self.chunks)

    def __getitem__(self, item):
        """Preprocess, tokenize and return the sample at position ``item``."""
        sentence = self.chunks[item]
        labels = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            apply_preprocessing(sentence),
            add_special_tokens=True,
            max_length=self.max_len,
            # padding="max_length" only pads. Without explicit truncation a
            # text longer than max_len yields a longer tensor, and batches of
            # unequal length cannot be stacked by the default collate function.
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns tensors with a leading batch axis of size
            # 1; flatten() drops it so the DataLoader can add its own.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [14]:
rng = RandomState(124)

## Import data

In [17]:
# Download negative small
# !wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQyeURtYWFXMzZoMnVEeGc_ZT1IMnhQ/root/content -O neg_small.txt

In [18]:
# Download positive small
# !wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQxYUNPOENKdTBrX19hY2c_ZT1WNW5Y/root/content -O pos_small.txt

In [19]:
# Download negative full
!wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content -O neg_full.txt

--2020-12-08 23:03:03--  https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content
Resolving api.onedrive.com (api.onedrive.com)... 13.107.42.12
Connecting to api.onedrive.com (api.onedrive.com)|13.107.42.12|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://d6fldw.db.files.1drv.com/y4mT677vxr3BV8LZI_OqX1DbpwG5d-npIsv2PqEvg4r49RqRI9-QE__dsFsHCdTdnNP-IkyKZpesyPoVdD_kAWg6MQzntgpFWy1saRspUrpOnctnGcSlvikJjOFHtMM8laRD96sUbU0t_1sPyMUHjdAD1iy2w7_TLMAX3ig614_7AkB-b2utLC3cHtP0X4uier5OGQv-NqKuA8ZPUODIjN0vw/train_neg_full_u.txt [following]
--2020-12-08 23:03:03--  https://d6fldw.db.files.1drv.com/y4mT677vxr3BV8LZI_OqX1DbpwG5d-npIsv2PqEvg4r49RqRI9-QE__dsFsHCdTdnNP-IkyKZpesyPoVdD_kAWg6MQzntgpFWy1saRspUrpOnctnGcSlvikJjOFHtMM8laRD96sUbU0t_1sPyMUHjdAD1iy2w7_TLMAX3ig614_7AkB-b2utLC3cHtP0X4uier5OGQv-NqKuA8ZPUODIjN0vw/train_neg_full_u.txt
Resolving d6fldw.db.files.1drv.com (d6fldw.db.files.1drv.com)

In [20]:
# Download positive full
!wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content -O pos_full.txt

--2020-12-08 23:03:11--  https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content
Resolving api.onedrive.com (api.onedrive.com)... 13.107.42.12
Connecting to api.onedrive.com (api.onedrive.com)|13.107.42.12|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://pknvjw.db.files.1drv.com/y4mB6CuxdVyCAa1f_jeMpRk339mvvQqxnZRyH6_43ppSYfXDvaPBdmK92XPj-ptktso47h95B_PKWCJw0Yy5yxj6_pF5I5eRggN0bTDBdc9NkAGry8mcM3jdkFMlp4TRx76UK-2-KMAMX2cG5Hmi9tKLomHJrTrQ1WfC6KoqiueRMA_-IcQZIFUYbsBWxGJiM16U5uTVSurs_j8ejysi5y-vw/train_pos_full_u.txt [following]
--2020-12-08 23:03:11--  https://pknvjw.db.files.1drv.com/y4mB6CuxdVyCAa1f_jeMpRk339mvvQqxnZRyH6_43ppSYfXDvaPBdmK92XPj-ptktso47h95B_PKWCJw0Yy5yxj6_pF5I5eRggN0bTDBdc9NkAGry8mcM3jdkFMlp4TRx76UK-2-KMAMX2cG5Hmi9tKLomHJrTrQ1WfC6KoqiueRMA_-IcQZIFUYbsBWxGJiM16U5uTVSurs_j8ejysi5y-vw/train_pos_full_u.txt
Resolving pknvjw.db.files.1drv.com (pknvjw.db.files.1drv.com)

In [15]:
# Read every line of the negative-tweet file (line terminators are kept).
neg_train = []
with open("neg_full.txt", "r") as neg_file:
    neg_train.extend(neg_file)

In [16]:
# Read every line of the positive-tweet file (line terminators are kept).
pos_train = []
with open("pos_full.txt", "r") as pos_file:
    pos_train.extend(pos_file)

In [17]:
print("Dataset size: \t negative %d \t positive %d" % (len(neg_train), len(pos_train)))

Dataset size: 	 negative 1142838 	 positive 1127644


## WARNING: the larger class is truncated to re-establish class balance
In this dataset the negative class is the larger one, so some negative samples are dropped. Imbalanced classes are a pain.

In [18]:
# Balance the classes by truncating BOTH lists to the size of the smaller one.
# The previous version rebuilt `pos_train` from a slice of `neg_train` when the
# positive class was the larger one, which would have labelled negative tweets
# as positive.
balanced_size = min(len(neg_train), len(pos_train))
neg_train = neg_train[:balanced_size]
pos_train = pos_train[:balanced_size]

In [19]:
assert len(neg_train) == len(pos_train)

#### Trim the dataset used for training

In [20]:
samples_num_by_cat = 1_120_000

In [21]:
neg_train = neg_train[:samples_num_by_cat]
pos_train = pos_train[:samples_num_by_cat]

In [22]:
# Label convention: 0 for every negative sample, 1 for every positive sample.
train_labels = np.concatenate([[0] * len(neg_train), [1] * len(pos_train)])

# Texts are concatenated in the same order (negatives first), so
# train_data[i] and train_labels[i] refer to the same sample.
train_data = np.concatenate([neg_train, pos_train])

In [23]:
shuffling = np.arange(0, train_data.shape[0])
len(shuffling)

2240000

In [24]:
rng.shuffle(shuffling)

In [25]:
train_labels = train_labels[shuffling]
train_data = train_data[shuffling]

In [26]:
# Draw one tag per sample: "train" with probability 0.9, "val" with
# probability 0.1. Tags are drawn per sample, so the resulting subset sizes
# are only approximately 90% / 10%.
split = rng.choice(
    ["train", "val"],
    size=len(train_data),
    p=[.9, .1]
)
split

array(['train', 'train', 'train', ..., 'train', 'train', 'train'],
      dtype='<U5')

In [27]:
# Training subset selected with the boolean mask `split == "train"`.
# NOTE(review): neither name is referenced by any other cell visible in this
# notebook (the datasets below index train_data / train_labels directly), so
# these look like unused copies of a large array - confirm before removing.
bert_x_data = train_data[split == "train"]
bert_labels = train_labels[split == "train"]

In [28]:
MAX_LENGTH = 200
BATCH_SIZE = 32

In [29]:
def get_loader(dataset, batch_size=None, shuffle=False):
    """Wrap ``dataset`` in a single-process ``DataLoader``.

    Args:
        dataset: any dataset accepted by ``torch.utils.data.DataLoader``.
        batch_size: number of samples per batch. ``None`` (the default) falls
            back to the notebook-level ``BATCH_SIZE`` constant.
        shuffle: whether the loader reshuffles the samples every time it is
            iterated. Defaults to ``False``, i.e. the samples are served in
            dataset order on every pass.

    Returns:
        A ``DataLoader`` that loads data in the main process (``num_workers=0``).
    """
    return torch.utils.data.DataLoader(
        dataset,
        # BATCH_SIZE is only looked up when no explicit size is given.
        batch_size=BATCH_SIZE if batch_size is None else batch_size,
        shuffle=shuffle,
        num_workers=0
    )

In [30]:
train_dataset = SentimentDataset(
    train_data[split == "train"], 
    train_labels[split == "train"], 
    tokenizer=bert_tokenizer, 
    max_len=MAX_LENGTH
)

In [31]:
train_loader = get_loader(train_dataset)

In [32]:
train_dataset.__getitem__(300)

{'input_ids': tensor([    0,   611,  7474,  1855,  7474,  1855,  7474,   132, 39542,   359,
           155,   475,   282, 15561,   536,    79,    26,    79,   473,    45,
           283,   885,  1589,   143, 26848,  2407,  3999,  1942,    69, 13561,
           438,   939,    74,   202,   236,   132,   428,   176,  6460,  1843,
           306,  3623,   338, 28696,  6423, 15698,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [33]:
train_dataset.__len__()

2015851

In [34]:
val_dataset = SentimentDataset(
    train_data[split == "val"], 
    train_labels[split == "val"], 
    tokenizer=bert_tokenizer, 
    max_len=MAX_LENGTH
)

In [35]:
val_loader = get_loader(val_dataset)

In [36]:
len(val_dataset)

224149

## Run the model

In [37]:
print("GPU detected:", torch.cuda.get_device_properties('cuda:0'))

GPU detected: _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)


In [38]:
gpu = torch.device('cuda:0')

In [57]:
bert_classification = RobertaSimple(bert_model)
bert_classification = bert_classification.to(gpu)

In [41]:
EPOCHS = 2

In [42]:
# Optimise every parameter of the wrapped model with a learning rate of 2e-5.
# NOTE(review): `correct_bias=False` presumably disables Adam's bias
# correction as in the reference BERT implementation - confirm against the
# transformers AdamW documentation.
optimizer = AdamW(bert_classification.parameters(), lr=2e-5, correct_bias=False)
# train_epoch calls scheduler.step() once per batch, so the schedule has to
# span EPOCHS full passes over train_loader.
tot_steps = EPOCHS * len(train_loader)

NameError: ignored

In [None]:
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=tot_steps
)

In [50]:
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: module called with ``input_ids``, ``input_attention`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: iterable of dicts with the keys ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor (correct
        predictions divided by the number of targets seen) and ``mean_loss``
        is the unweighted mean of the per-batch losses.
    """
    net = model.eval()
    batch_losses = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            targets = batch["labels"].to(device)
            outputs = net(
                input_ids=batch["input_ids"].to(device),
                input_attention=batch["attention_mask"].to(device),
                labels=targets,
            )

            # Predicted class = index of the largest logit in each row.
            predicted = outputs.logits.argmax(dim=1)

            n_correct += (predicted == targets).sum()
            n_seen += targets.shape[0]
            batch_losses.append(outputs.loss.item())

    accuracy = n_correct.double() / float(n_seen)
    return accuracy, np.mean(batch_losses)

In [64]:
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch: forward pass, backward pass, gradient-norm clipping to
    1.0, one optimizer step, one scheduler step, then the gradients are reset.

    Args:
        model: module called with ``input_ids``, ``input_attention`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: iterable of dicts with the keys ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor (correct
        predictions divided by ``n_examples``) and ``mean_loss`` is the
        unweighted mean of the per-batch losses.
    """
    net = model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        targets = batch["labels"].to(device)
        outputs = net(
            input_ids=batch["input_ids"].to(device),
            input_attention=batch["attention_mask"].to(device),
            labels=targets,
        )
        loss = outputs.loss

        # Predicted class = index of the largest logit in each row.
        predicted = outputs.logits.argmax(dim=1)
        n_correct += (predicted == targets).sum()
        batch_losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return n_correct.double() / n_examples, np.mean(batch_losses)


In [54]:
def save_model(filename, model):
    """Write the model's ``state_dict`` to ``<filename>.pth`` and print a confirmation.

    Args:
        filename: checkpoint path without the ``.pth`` extension.
        model: object exposing ``state_dict()``.
    """
    checkpoint_path = filename + ".pth"
    weights = model.state_dict()
    torch.save(weights, checkpoint_path)
    print("Model saved")

In [66]:
# Fine-tune for EPOCHS epochs. After each epoch, report the (accuracy, loss)
# pairs for the training and validation sets and save a checkpoint.
for epch in range(EPOCHS):
    print("EPOCH: ", epch)

    train_metrics = train_epoch(
        bert_classification, train_loader, optimizer, gpu, scheduler, len(train_dataset)
    )
    print("\t Train: ", train_metrics)

    val_metrics = eval_model(bert_classification, val_loader, gpu)
    print("\t Validation: ", val_metrics)

    checkpoint_name = "RoBERTa_preproc_" + str(epch) + "epch"
    save_model(checkpoint_name, bert_classification)

EPOCH:  0
	 Train:  (tensor(0.8844, device='cuda:0', dtype=torch.float64), 0.2703242950417377)
	 Validation:  (tensor(0.9015, device='cuda:0', dtype=torch.float64), 0.24347110731848282)
Model saved
EPOCH:  1
	 Train:  (tensor(0.9226, device='cuda:0', dtype=torch.float64), 0.1919433712967568)
	 Validation:  (tensor(0.9064, device='cuda:0', dtype=torch.float64), 0.247945890677116)
Model saved


In [None]:
# NOTE(review): `all_neg_loader` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel; the output below presumably
# comes from an earlier session - confirm or remove.
eval_model(bert_classification, all_neg_loader, gpu)

(tensor(0.8698, device='cuda:0', dtype=torch.float64), 0.3341606027943599)

## Predict

In [67]:
# !onelink() { echo -n "$1"|base64|sed "s/=$//;s/\//\_/g;s/\+/\-/g;s/^/https:\/\/api\.onedrive\.com\/v1\.0\/shares\/u\!/;s/$/\/root\/content/"; }; onelink "https://1drv.ms/t/s!ArTDgu9z7IOVjp4yCxhYs8OaIwRKzw?e=aIxt9i"

In [68]:
!wget -O test_data.txt https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDR5Q3hoWXM4T2FJd1JLenc_ZT1hSXh0/root/content

--2020-12-08 21:03:39--  https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDR5Q3hoWXM4T2FJd1JLenc_ZT1hSXh0/root/content
Resolving api.onedrive.com (api.onedrive.com)... 13.107.42.12
Connecting to api.onedrive.com (api.onedrive.com)|13.107.42.12|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://43loag.db.files.1drv.com/y4mXjx_iztEzPDr2_yEraDIdBKOZgu7urAv8l930TMSuHIzGvuuFoS5EfKK4GgVbyI14jS0zrrS931mmVdpBJy7ijfAC-JdaNmzUA6UaAGlRPgFHOMpuv1AGSx8mXlfcvy3wvWFXD_SU74GTlcsVczeKhgAYKm143iI_FhQ3xJt4LHHaGElsHNgoLfjIrFmv55BCkb-Wn44B_ej_zp_5Xu4yg/test_data.txt [following]
--2020-12-08 21:03:39--  https://43loag.db.files.1drv.com/y4mXjx_iztEzPDr2_yEraDIdBKOZgu7urAv8l930TMSuHIzGvuuFoS5EfKK4GgVbyI14jS0zrrS931mmVdpBJy7ijfAC-JdaNmzUA6UaAGlRPgFHOMpuv1AGSx8mXlfcvy3wvWFXD_SU74GTlcsVczeKhgAYKm143iI_FhQ3xJt4LHHaGElsHNgoLfjIrFmv55BCkb-Wn44B_ej_zp_5Xu4yg/test_data.txt
Resolving 43loag.db.files.1drv.com (43loag.db.files.1drv.com)... 13.107.42.

In [39]:
def predict(model, data_loader, device):
    """Predict a class for every sample served by ``data_loader``.

    Args:
        model: module called with ``input_ids``, ``input_attention`` and
            ``labels``; its output must expose ``.logits``.
        data_loader: iterable of dicts with the keys ``input_ids``,
            ``attention_mask`` and ``labels``. Here ``labels`` is treated as
            a sample identifier and is passed through untouched.
        device: device the model inputs are moved to.

    Returns:
        ``(ids, predictions)``: two 1-D numpy arrays in loader order. Both are
        empty when the loader yields no batch.
    """
    model = model.eval()

    idxs = []
    predictions = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            # The ids are only passed through, so they never need to visit
            # the compute device.
            idx = d["labels"].cpu()

            outputs = model(
                input_ids=input_ids,
                input_attention=attention_mask,
                # Placeholder targets: the model is called with a `labels`
                # argument, but no real labels exist at prediction time.
                labels=torch.zeros(idx.shape[0], dtype=torch.long, device=device),
            )

            preds = torch.argmax(outputs.logits, dim=1)

            idxs.append(idx.numpy())
            predictions.append(preds.cpu().numpy())

    if not idxs:
        # np.concatenate raises on an empty list; return empty results instead.
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)

    return np.concatenate(idxs), np.concatenate(predictions)

In [40]:
def prepare_submission(model, device, test_path="test_data.txt", expected_size=10000):
    """Load the unlabelled test file and predict a class for every line.

    Every line of the file is expected to look like ``<id>,<text>``; only the
    first comma separates the id from the text, and the text keeps its line
    terminator.

    Args:
        model: model handed to ``predict``.
        device: device handed to ``predict``.
        test_path: path of the test file. Defaults to ``"test_data.txt"``.
        expected_size: number of samples the file must contain; pass ``None``
            to skip this sanity check. Defaults to ``10000``.

    Returns:
        Whatever ``predict`` returns for the test samples (ids and predicted
        classes).

    Raises:
        ValueError: if a line has no comma or its id is not an integer.
        AssertionError: if the number of samples differs from
            ``expected_size``.
    """
    print("Loading file...")
    unk_ids = []
    unk_data = []
    with open(test_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            sample_id, separator, text = line.partition(",")
            if not separator:
                # Without this check a line lacking a comma would be
                # silently mis-parsed instead of being reported.
                raise ValueError(
                    "Line %d of %s has no ',' separating id and text" % (line_number, test_path)
                )
            unk_ids.append(int(sample_id))
            unk_data.append(text)

    # Sanity check
    if expected_size is not None:
        assert len(unk_data) == expected_size

    print("Content:", unk_ids[:2], unk_data[:2])

    print("Create dataloader...")
    # The ids travel through the dataset's `labels` slot so that `predict`
    # can hand them back next to the predictions.
    dataset = SentimentDataset(
        np.array(unk_data),
        np.array(unk_ids),
        tokenizer=bert_tokenizer,
        max_len=MAX_LENGTH
    )

    d_loader = get_loader(dataset)

    print("Generating predictions...")
    return predict(model, d_loader, device)

In [77]:
submission_idxs, submission_labels = prepare_submission(bert_classification, gpu)
submission_idxs, submission_labels

Loading file...
Content: [1, 2] ['sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air , stay longer in the water and ... <url>\n', "<user> shucks well i work all week so now i can't come cheer you on ! oh and put those batteries in your calculator ! ! !\n"]
Create dataloader...
Generating predictions...


(array([    1,     2,     3, ...,  9998,  9999, 10000]),
 array([0, 0, 0, ..., 0, 1, 0]))

In [78]:
submission_labels.mean()

0.6265

In [41]:
def write_submission(filename, idxs, labels):
    """Write ids and predictions to ``filename`` as a two-column CSV.

    Labels are rescaled with ``label * 2 - 1`` (0 becomes -1, 1 stays 1), the
    resulting table is printed, and the file is written with the header
    ``Id,Prediction``.

    Args:
        filename: path of the CSV file to write.
        idxs: numpy array of sample ids.
        labels: numpy array of predicted classes, aligned with ``idxs``.
    """
    # Convert to -1, 1
    signed_labels = (labels * 2 - 1).astype(int)
    int_ids = idxs.astype(int)

    # Give both arrays a trailing axis, then join them side by side.
    columns = (int_ids[..., None], signed_labels[..., None])
    submission_content = np.concatenate(columns, axis=1).astype(int)

    print(submission_content)
    np.savetxt(filename, submission_content, fmt='%d', delimiter=',', header="Id,Prediction", comments="")

In [80]:
write_submission("sub_preproc.csv", submission_idxs, submission_labels)

[[    1    -1]
 [    2    -1]
 [    3    -1]
 ...
 [ 9998    -1]
 [ 9999     1]
 [10000    -1]]


In [42]:
# Not yet tested
def load_model(filename, bert_model, device):
    """Rebuild a ``RobertaSimple`` around ``bert_model`` and load saved weights.

    ``RobertaSimple`` keeps a reference to ``bert_model`` rather than a copy,
    so loading the checkpoint overwrites the weights of the ``bert_model``
    object passed in; any other wrapper sharing that object changes too.

    Args:
        filename: checkpoint path without the ``.pth`` extension (the
            counterpart of ``save_model``).
        bert_model: model instance to wrap and load the weights into.
        device: device the model is moved to and the checkpoint is mapped to.

    Returns:
        The wrapper, on ``device`` and in evaluation mode.
    """
    model = RobertaSimple(bert_model)
    model = model.to(device)
    # map_location lets a checkpoint saved on one device (e.g. a GPU) be
    # restored on another instead of failing when that device is missing.
    state_dict = torch.load(filename + ".pth", map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    print("Model loaded")
    return model

In [14]:
# save_model("sub2", bert_classification)

NameError: ignored

In [45]:
reloaded_model = load_model("RoBERTa_preproc_std_0epch", bert_model, gpu)

Model loaded


In [46]:
_, reloaded_preds = prepare_submission(reloaded_model, gpu)

Loading file...
Content: [1, 2] ['sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air , stay longer in the water and ... <url>\n', "<user> shucks well i work all week so now i can't come cheer you on ! oh and put those batteries in your calculator ! ! !\n"]
Create dataloader...
Generating predictions...


In [47]:
reloaded_preds.mean()

0.5035

In [48]:
# NOTE(review): the ids returned by prepare_submission were discarded (bound
# to `_`) and are regenerated here as 1..10000; this is only correct if the
# test file lists exactly those ids in ascending order - confirm.
write_submission("RoBERTa_std_epch0.csv", np.arange(1, 10001, 1), reloaded_preds)

[[    1    -1]
 [    2    -1]
 [    3     1]
 ...
 [ 9998    -1]
 [ 9999     1]
 [10000    -1]]
