# Model

## Description
 1. RoBERTa + Dropout + Linear
 2. CrossEntropy Loss
 3. Finetuning RoBERTa
 4. Adam with Weight decay optimizer (https://arxiv.org/abs/1711.05101)
 5. Cosine schedule
 6. Preprocessing ('standard' + 'extended')

## Notes
GPU required

## Credits
Some ideas were taken from https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

## Set up

In [1]:
# !pip install transformers
# !pip install wordsegment
# !pip install nltk

In [2]:
import transformers
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW, get_cosine_schedule_with_warmup

In [None]:
# Switch the transformers library logger to INFO verbosity for the rest of the session.
transformers.logging.set_verbosity_info()

In [None]:
import numpy as np
from numpy.random import RandomState
import torch
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from typing import Callable, List, Optional, Tuple
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
import re
import matplotlib.pyplot as plt
import torch.nn as nn

In [None]:
from preprocessing_v6 import *

## GPU check

In [4]:
# Fail fast when no CUDA device is present: later cells move the model and every batch to 'cuda:0'.
assert torch.cuda.is_available(), "A CUDA-enabled GPU is required to execute this notebook (in a reasonable time)"

True

In [None]:
# Show the properties reported for CUDA device 0.
print("GPU detected:", torch.cuda.get_device_properties('cuda:0'))

In [None]:
# Device handle passed to the model and to the train/eval/predict helpers below.
gpu = torch.device('cuda:0')

## Load components

In [None]:
# Tokenizer of the pretrained "roberta-base" checkpoint (named bert_tokenizer, but it is the RoBERTa one).
bert_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# NOTE(review): this sets the attribute on an already-constructed tokenizer; confirm that it
# actually changes tokenization for the tokenizer class returned above.
bert_tokenizer.add_prefix_space = False

In [None]:
def apply_preprocessing(tweet):
    """Preprocess a raw tweet and return it as one space-joined string.

    The tweet is split on single spaces, the resulting words are passed to
    ``process_sentence`` together with a pipeline built by
    ``extended_pipeline(bert_tokenizer)``, and the processed words are joined
    back with single spaces.

    NOTE(review): ``process_sentence`` and ``extended_pipeline`` are not
    defined in this notebook; presumably they come from the
    ``preprocessing_v6`` star import - confirm.
    """
    words = tweet.split(" ")
    pipeline = extended_pipeline(bert_tokenizer)
    processed_words = process_sentence(words, pipeline)
    return " ".join(processed_words)

In [None]:
# Test preprocessing
# Smoke test: push one hand-written tweet through the preprocessing pipeline and
# the tokenizer so that problems show up before the long training run starts.
sample_sentence = "that's a #verybad sentence <user> <url> youre gonna love it. lemme know what u think :-/"
print("Testing preprocessing & tokenizer...")
print("Original sentence:", sample_sentence)
# tokenize() needs the text to tokenize; give it the preprocessed sentence, which
# is the same text SentimentDataset hands to the tokenizer.
print("Processed sentence:", bert_tokenizer.tokenize(apply_preprocessing(sample_sentence)))

In [None]:
# Pretrained "roberta-base" wrapped in a sequence-classification model.
# NOTE(review): no num_labels is passed, so the library default applies - confirm it matches
# the two classes (0 = negative, 1 = positive) built further down.
bert_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

In [None]:
class RobertaSimple(nn.Module):
    """Thin ``nn.Module`` wrapper around a sequence-classification model.

    The wrapped model is kept as ``self.model`` and is always called with the
    keyword arguments ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(self, bert_model):
        """Store ``bert_model`` (by reference, no copy) as the wrapped model."""
        super().__init__()
        self.model = bert_model

    def forward(self, input_ids, input_attention, labels):
        """Call the wrapped model and return its output unchanged.

        ``input_attention`` is forwarded as the model's ``attention_mask``.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=input_attention,
            labels=labels,
        )

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Dataset that preprocesses and tokenizes tweets lazily, one item at a time.

    Args:
        chunks: indexable, sized collection of raw tweet strings.
        labels: indexable collection of integers aligned with ``chunks``
            (class labels for training/validation; ``prepare_submission``
            stores tweet ids here instead).
        tokenizer: tokenizer object exposing ``encode_plus``.
        max_len: length, in tokens, that every encoded tweet is padded or
            truncated to.
    """

    def __init__(self, chunks, labels, tokenizer, max_len):
        self.chunks = chunks
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # len() rather than .shape[0] so plain lists work as well as numpy arrays.
        return len(self.chunks)

    def __getitem__(self, item):
        """Return the encoded tweet at position ``item``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            0-dim ``torch.long`` ``labels`` tensor.
        """
        sentence = self.chunks[item]
        labels = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            apply_preprocessing(sentence),
            add_special_tokens=True,
            max_length=self.max_len,
            # Padding alone does not shorten anything: without explicit
            # truncation a tweet longer than max_len produces a longer tensor
            # than its neighbours and batching the items fails.
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis of size 1; drop it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [None]:
# Initialize random state (for reproducibility)
# The same seeded RandomState drives both the shuffle and the train/val split below.
rng = RandomState(124)

## Define parameters

In [None]:
# Max number of tokens in each tweet
# (passed as max_len to SentimentDataset, which pads every encoded tweet to this length)
MAX_LENGTH = 200
# Batch size
# (used by get_loader for every DataLoader)
BATCH_SIZE = 32
# Number of training epochs
# (also fixes the total number of scheduler steps: EPOCHS * batches per epoch)
EPOCHS = 2

## Import data

In [None]:
# Download negative small
# !wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQyeURtYWFXMzZoMnVEeGc_ZT1IMnhQ/root/content -O neg_small.txt

In [None]:
# Download positive small
# !wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQxYUNPOENKdTBrX19hY2c_ZT1WNW5Y/root/content -O pos_small.txt

In [None]:
# Download negative full
# (IPython shell escape; the file is saved as neg_full.txt in the working directory)
!wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content -O neg_full.txt

In [None]:
# Download positive full
# (IPython shell escape; the file is saved as pos_full.txt in the working directory)
!wget https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content -O pos_full.txt

In [None]:
# Load every negative tweet, one list entry per line (trailing newlines are kept).
neg_train = []
with open("neg_full.txt", "r") as neg_file:
    neg_train.extend(neg_file.readlines())

In [None]:
# Load every positive tweet, one list entry per line (trailing newlines are kept).
pos_train = []
with open("pos_full.txt", "r") as pos_file:
    pos_train.extend(pos_file.readlines())

In [None]:
# Report how many tweets were read for each class.
print("Dataset loaded. Size: \t negative %d \t positive %d" % (len(neg_train), len(pos_train)))

### Re-establish balance between classes

In [None]:
# Truncate the larger class so both classes hold the same number of tweets.
# Each list is sliced from itself: the previous version filled pos_train with a
# slice of neg_train when the negative class was the smaller one, which put
# negative tweets under the positive label and left the lengths unequal.
samples_per_class = min(len(neg_train), len(pos_train))
neg_train = neg_train[:samples_per_class]
pos_train = pos_train[:samples_per_class]

In [None]:
# Sanity check: after balancing, both classes must contain the same number of tweets.
assert len(neg_train) == len(pos_train)

### Trim and shuffle

In [None]:
# Upper bound on the number of tweets kept per class.
samples_num_by_cat = 1_120_000

In [None]:
# Keep at most samples_num_by_cat tweets of each class (the first ones in file order).
neg_train = neg_train[:samples_num_by_cat]
pos_train = pos_train[:samples_num_by_cat]

In [None]:
# Labels: 0 for every negative tweet, 1 for every positive tweet, in the same order as train_data.
train_labels = np.concatenate([[0] * len(neg_train), [1] * len(pos_train)])
train_data = np.concatenate([neg_train, pos_train])

In [None]:
# Index permutation, so that data and labels can be shuffled identically.
shuffling = np.arange(0, train_data.shape[0])
len(shuffling)

In [None]:
# In-place shuffle driven by the seeded RandomState.
rng.shuffle(shuffling)

In [None]:
# Apply the same permutation to labels and tweets to keep them aligned.
train_labels = train_labels[shuffling]
train_data = train_data[shuffling]

In [None]:
# Draw a "train"/"val" tag per sample with probabilities 0.9 / 0.1
# (a random assignment, so the resulting split sizes are only approximately 90/10).
split = rng.choice(
    ["train", "val"],
    size=len(train_data),
    p=[.9, .1]
)
split

In [None]:
# NOTE(review): bert_x_data / bert_labels are not referenced again in the visible notebook;
# the datasets below index train_data / train_labels directly - possibly leftovers.
bert_x_data = train_data[split == "train"]
bert_labels = train_labels[split == "train"]

In [None]:
def get_loader(dataset):
    """Wrap ``dataset`` in a ``DataLoader`` using the notebook-wide ``BATCH_SIZE``.

    No ``shuffle`` argument is passed and ``num_workers`` is 0, so batches are
    built in the calling process.
    """
    loader_options = {"batch_size": BATCH_SIZE, "num_workers": 0}
    return torch.utils.data.DataLoader(dataset, **loader_options)

In [None]:
# Training set: the samples tagged "train" by the random split.
train_dataset = SentimentDataset(
    train_data[split == "train"], 
    train_labels[split == "train"], 
    tokenizer=bert_tokenizer, 
    max_len=MAX_LENGTH
)

In [None]:
# get_loader passes no shuffle argument; the data was already shuffled once before the split.
train_loader = get_loader(train_dataset)

In [None]:
# Despite the message, this always shows the item at index 1.
print("Random sample:")
train_dataset.__getitem__(1)

In [None]:
# Validation set: the samples tagged "val" by the random split.
val_dataset = SentimentDataset(
    train_data[split == "val"], 
    train_labels[split == "val"], 
    tokenizer=bert_tokenizer, 
    max_len=MAX_LENGTH
)

In [None]:
val_loader = get_loader(val_dataset)

In [None]:
# Report the realised split sizes.
print("Training set size: %d \t Validation set size: %d" % (len(train_dataset), len(val_dataset)))

## Run the model

In [None]:
# Wrap the pretrained model and move it to the GPU.
bert_classification = RobertaSimple(bert_model)
bert_classification = bert_classification.to(gpu)

In [None]:
# All parameters of the wrapped model are handed to the optimizer.
optimizer = AdamW(bert_classification.parameters(), lr=2e-5, correct_bias=False)
# One scheduler step is taken per batch, hence epochs * batches-per-epoch steps in total.
tot_steps = EPOCHS * len(train_loader)

In [None]:
# Cosine learning-rate schedule over the whole run, without warmup steps.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=tot_steps
)

In [None]:
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on every batch of ``data_loader`` with gradients disabled.

    Each batch is a dict holding "input_ids", "attention_mask" and "labels"
    tensors; all three are moved to ``device`` before the forward pass. The
    model is called with ``input_ids``, ``input_attention`` and ``labels`` and
    its output must expose ``.loss`` and ``.logits``.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is a 0-dim double tensor
        (correct predictions / number of targets seen) and mean_loss is the
        NumPy mean of the per-batch loss values.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            outputs = model(input_ids=token_ids, input_attention=mask, labels=targets)

            # Predicted class = index of the largest logit in each row.
            predicted = outputs.logits.argmax(dim=1)

            n_correct = n_correct + (predicted == targets).sum()
            n_seen = n_seen + targets.shape[0]
            batch_losses.append(outputs.loss.item())

    accuracy = n_correct.double() / float(n_seen)
    return accuracy, np.mean(batch_losses)

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch: forward pass, backward pass, gradient-norm clipping to
    1.0, one optimizer step, one scheduler step, then the gradients are
    cleared.

    Args:
        model: module called with ``input_ids``, ``input_attention`` and
            ``labels``; its output must expose ``.logits`` and ``.loss``.
        data_loader: iterable of dicts with "input_ids", "attention_mask" and
            "labels" tensors.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the returned accuracy.

    Returns:
        ``(accuracy, mean_loss)``: correct predictions / ``n_examples`` as a
        0-dim double tensor, and the NumPy mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        token_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        outputs = model(input_ids=token_ids, input_attention=mask, labels=targets)
        batch_loss = outputs.loss

        # Predicted class = index of the largest logit in each row.
        predicted = outputs.logits.argmax(dim=1)
        n_correct = n_correct + (predicted == targets).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        # Clip before stepping so the update uses the clipped gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)


In [None]:
def save_model(filename, model):
    """Save ``model``'s state dict to ``filename`` with ".pth" appended."""
    target_path = filename + ".pth"
    weights = model.state_dict()
    torch.save(weights, target_path)
    print("Model saved")

In [None]:
# Train the model and store it
# One iteration per epoch: train, validate, then save a checkpoint whose file
# name carries the epoch number.
# The loop variable is `epoch`, matching every use in the body (it used to be
# declared as `epch`, which made the first print raise NameError).
for epoch in range(EPOCHS):
    print("EPOCH: ", epoch)
    print("\t Train: ", train_epoch(bert_classification, train_loader, optimizer, gpu, scheduler, len(train_dataset)))
    print("\t Validation: ", eval_model(bert_classification, val_loader, gpu))
    save_model("RoBERTa_preproc_" + str(epoch) + "epch", bert_classification)

In [None]:
# NOTE(review): `all_neg_loader` is not defined anywhere in the visible notebook, so this cell
# raises NameError as written - presumably a leftover from an earlier experiment; confirm or remove.
eval_model(bert_classification, all_neg_loader, gpu)

## Predict

In [None]:
# Download the test set (IPython shell escape); saved as test_data.txt, the default input of prepare_submission.
!wget -O test_data.txt https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDR5Q3hoWXM4T2FJd1JLenc_ZT1hSXh0/root/content

In [None]:
def predict(model, data_loader, device):
    """Predict a class for every item served by ``data_loader``.

    The "labels" entry of each batch is not used as a target here: it is
    carried through untouched and returned next to the predictions
    (``prepare_submission`` stores the tweet ids in it).

    Args:
        model: module called with ``input_ids``, ``input_attention`` and
            ``labels``; its output must expose ``.logits``.
        data_loader: iterable of dicts with "input_ids", "attention_mask" and
            "labels" tensors.
        device: device the model inputs are moved to.

    Returns:
        ``(ids, predictions)``: two NumPy arrays concatenated over all
        batches, in loader order.
    """
    model = model.eval()

    idxs = []
    predictions = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            # The ids are only passed through, so they are not moved to the device.
            idx = d["labels"]

            # The model is always called with a `labels` argument; feed
            # placeholder zeros, since only the logits are used below.
            placeholder_labels = torch.zeros(idx.shape[0], dtype=torch.long, device=device)

            outputs = model(
                input_ids=input_ids,
                input_attention=attention_mask,
                labels=placeholder_labels,
            )

            preds = torch.argmax(outputs.logits, dim=1)

            idxs.append(idx.cpu())
            predictions.append(preds.cpu())

    return np.concatenate(idxs), np.concatenate(predictions)

In [None]:
def prepare_submission(model, device, filename="test_data.txt"):
    """Read the test file and return the model's predictions for it.

    Every line of ``filename`` is split at its first comma: the part before
    it is parsed as the integer id, the rest of the line (trailing newline
    included) is the tweet text. Exactly 10000 lines are expected.

    Returns:
        Whatever ``predict`` returns for the resulting loader: the ids and
        the predicted classes.
    """
    print("Loading file...")
    unk_ids, unk_data = [], []
    with open(filename, "r") as test_file:
        for row in test_file:
            first_comma = row.find(",")
            unk_ids.append(int(row[:first_comma]))
            unk_data.append(row[first_comma + 1:])

    # Sanity check
    assert len(unk_data) == 10000

    print("Content:", unk_ids[:2], unk_data[:2])

    print("Create dataloader...")
    # The ids travel in the dataset's `labels` slot so that predict() can
    # return them next to the predictions.
    test_dataset = SentimentDataset(
        np.array(unk_data),
        np.array(unk_ids),
        tokenizer=bert_tokenizer,
        max_len=MAX_LENGTH,
    )
    test_loader = get_loader(test_dataset)

    print("Generating predictions...")
    return predict(model, test_loader, device)

In [None]:
# Predict the test set with the model trained above; the second line displays the result in the notebook.
submission_idxs, submission_labels = prepare_submission(bert_classification, gpu)
submission_idxs, submission_labels

In [None]:
def write_submission(filename, idxs, labels):
    """Write predictions to ``filename`` as CSV with the header "Id,Prediction".

    Args:
        filename: path of the CSV file to create.
        idxs: 1-D NumPy array of ids.
        labels: 1-D NumPy array of 0/1 classes, aligned with ``idxs``; class
            0 is written as -1 and class 1 as 1.
    """
    # Convert to -1, 1
    signed_labels = (labels * 2 - 1).astype(int)
    ids = idxs.astype(int)
    # One row per prediction: [id, signed label].
    columns = [ids[..., None], signed_labels[..., None]]
    submission_content = np.concatenate(columns, axis=1).astype(int)
    print(submission_content)
    np.savetxt(
        filename,
        submission_content,
        fmt='%d',
        delimiter=',',
        header="Id,Prediction",
        comments="",
    )

In [None]:
# Filename of predictions
PREDICTIONS_FILENAME = "sub_preproc.csv"
# Write the ids and predicted classes obtained above as the submission CSV.
write_submission(PREDICTIONS_FILENAME, submission_idxs, submission_labels)

In [None]:
# Not yet tested
def load_model(filename, bert_model, device):
    """Rebuild a ``RobertaSimple`` and load the weights stored in ``filename`` + ".pth".

    Args:
        filename: checkpoint path without the ".pth" extension.
        bert_model: model instance to wrap. The wrapper keeps a reference to
            it (no copy), so loading overwrites the weights of the object
            passed in.
        device: device the model is moved to and the checkpoint is mapped onto.

    Returns:
        The wrapped model, on ``device`` and in eval mode.
    """
    model = RobertaSimple(bert_model)
    model = model.to(device)
    # map_location keeps the checkpoint loadable even when it was saved from a
    # different device than the one in use now.
    state_dict = torch.load(filename + ".pth", map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    print("Model loaded")
    return model

In [None]:
# save_model("sub2", bert_classification)

In [None]:
# NOTE(review): the training loop above saves checkpoints named "RoBERTa_preproc_<epoch>epch";
# "RoBERTa_preproc_std_0epch" is presumably a checkpoint from a different run - confirm it exists.
# NOTE(review): bert_model is the same object wrapped by bert_classification, so loading this
# checkpoint also replaces the weights bert_classification uses.
reloaded_model = load_model("RoBERTa_preproc_std_0epch", bert_model, gpu)

In [None]:
# Only the predictions are kept; the ids returned by prepare_submission are discarded.
_, reloaded_preds = prepare_submission(reloaded_model, gpu)

In [None]:
# Fraction of tweets predicted as class 1.
reloaded_preds.mean()

In [None]:
# NOTE(review): the ids are regenerated as 1..10000 instead of reusing the ones read from the
# test file - this assumes the file lists ids 1..10000 in order; confirm.
write_submission("RoBERTa_std_epch0.csv", np.arange(1, 10001, 1), reloaded_preds)