# Model
This notebook loads our fine-tuned model and computes predictions for the test data.

## Description
 1. RoBERTa + Dropout + Linear
 2. CrossEntropy Loss
 3. Finetuning RoBERTa
 4. AdamW optimizer, i.e. Adam with decoupled weight decay (https://arxiv.org/abs/1711.05101)
 5. Cosine schedule
 6. Preprocessing ('standard' + 'extended')

## Notes
GPU required

## Credits
Some ideas were taken from https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

## Set up

In [1]:
# !pip install transformers
# !pip install wordsegment
# !pip install nltk

In [2]:
import transformers
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW, get_cosine_schedule_with_warmup

In [None]:
# Switch the transformers logger to INFO level for the rest of the notebook.
transformers.logging.set_verbosity_info()

In [None]:
import os
import re
from typing import Callable, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import RandomState
import pandas as pd
from sklearn import svm
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn

In [None]:
from preprocessing_v6 import *

## GPU check

In [4]:
# Stop immediately when PyTorch sees no CUDA device; the following cells address 'cuda:0'.
assert torch.cuda.is_available(), "A CUDA-enabled GPU is required to execute this notebook (in a reasonable time)"

True

In [None]:
# Show the properties PyTorch reports for the first CUDA device.
print("GPU detected:", torch.cuda.get_device_properties('cuda:0'))

In [None]:
# Device handle passed to load_model and prepare_submission further down.
gpu = torch.device('cuda:0')

## Load components

In [None]:
# Tokenizer of the "roberta-base" checkpoint; it is handed to the preprocessing
# pipeline (apply_preprocessing) and to SentimentDataset for encoding.
bert_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# NOTE(review): this sets the attribute after the tokenizer has been constructed;
# it may not reach the underlying tokenizer implementation -- confirm it has the
# intended effect for the tokenizer class returned by AutoTokenizer.
bert_tokenizer.add_prefix_space = False

In [None]:
def apply_preprocessing(tweet):
    """Return ``tweet`` after running it through the extended preprocessing pipeline.

    The tweet is split on single spaces, the resulting tokens are handed to
    ``process_sentence`` together with ``extended_pipeline(bert_tokenizer)``, and
    the processed tokens are joined back with single spaces.
    """
    raw_tokens = tweet.split(" ")
    pipeline = extended_pipeline(bert_tokenizer)
    processed_tokens = process_sentence(raw_tokens, pipeline)
    return " ".join(processed_tokens)

In [None]:
# Test preprocessing
sample_sentence = "that's a #verybad sentence <user> <url> youre gonna love it. lemme know what u think :-/"
print("Testing preprocessing & tokenizer...")
print("Original sentence:", sample_sentence)
# tokenize() needs the text to split (calling it without an argument raises a
# TypeError); feed it the preprocessed sample so that both the preprocessing
# pipeline and the tokenizer are exercised.
print("Processed sentence:", bert_tokenizer.tokenize(apply_preprocessing(sample_sentence)))

In [None]:
# Pretrained "roberta-base" with a sequence-classification head. It is wrapped by
# RobertaSimple inside load_model, which then loads the fine-tuned state dict into it.
bert_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

In [None]:
class RobertaSimple(nn.Module):
    """Thin ``nn.Module`` wrapper around a sequence-classification model.

    The wrapped model is registered as the sub-module ``model``; state-dict keys
    of this wrapper are therefore prefixed with that attribute name, so it must
    not be renamed if existing checkpoints are to keep loading.
    """

    def __init__(
            self,
            bert_model
    ):
        """Store ``bert_model`` as the wrapped sub-module."""
        super().__init__()
        self.model = bert_model

    def forward(self, input_ids, input_attention, labels=None):
        """Forward the batch to the wrapped model and return its output unchanged.

        Args:
            input_ids: token ids, passed on as ``input_ids``.
            input_attention: attention mask, passed on as ``attention_mask``.
            labels: optional targets, passed on as ``labels``. Defaults to ``None``
                so that inference callers do not have to fabricate labels.

        Returns:
            Whatever the wrapped model returns for these keyword arguments.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=input_attention,
            labels=labels,
        )

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that preprocesses and tokenizes one sentence per item.

    Args:
        chunks: indexable collection of raw sentences.
        labels: indexable collection of integer values, one per sentence. They are
            returned under the ``'labels'`` key as ``torch.long`` tensors.
        tokenizer: object exposing ``encode_plus`` with the keyword arguments used
            in ``__getitem__``.
        max_len: length every encoded sentence is padded or truncated to.
    """

    def __init__(self, chunks, labels, tokenizer, max_len):
        self.chunks = chunks
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of sentences; works for any sized container, not only arrays."""
        return len(self.chunks)

    def __getitem__(self, item):
        """Return the encoded sentence at position ``item``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened to
            1-D) and ``'labels'`` (a ``torch.long`` tensor).
        """
        sentence = self.chunks[item]
        label = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            apply_preprocessing(sentence),
            add_special_tokens=True,
            max_length=self.max_len,
            # Without explicit truncation an over-long sentence keeps its full
            # length, so items would no longer share one size and could not be
            # stacked into a batch.
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis of size 1; drop it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Initialize random state (for reproducibility)
# NOTE(review): `rng` is not referenced by any other cell visible in this notebook.
rng = RandomState(124)

## Define parameters

In [None]:
# Max number of tokens in each tweet (passed to SentimentDataset as max_len)
MAX_LENGTH = 200
# Batch size
BATCH_SIZE = 32
# Number of training epochs
# NOTE(review): no training happens in the cells of this notebook -- presumably kept
# for parity with the training notebook; confirm whether it is still needed.
EPOCHS = 2

## Load fine-tuned model

In [1]:
def load_model(filename, bert_model, device):
    """Wrap ``bert_model`` in ``RobertaSimple`` and restore weights from a checkpoint.

    Args:
        filename: checkpoint path without the ".pth" extension (it is appended here).
        bert_model: base model to wrap.
        device: device the model is moved to and the checkpoint tensors are mapped to.

    Returns:
        The wrapped model, switched to evaluation mode.
    """
    model = RobertaSimple(bert_model)
    model = model.to(device)
    # map_location makes loading independent of the device the checkpoint was
    # saved from (e.g. a different GPU index).
    # NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
    state_dict = torch.load(filename + ".pth", map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    print("Model loaded")
    return model

In [None]:
# Halt here only while the fine-tuned checkpoint is missing from the working
# directory, so that a full re-run works once the file has been downloaded.
assert os.path.isfile("RoBERTa_preproc_std_0epch.pth"), "Download model"

In [None]:
# Wrap the base model, move it to the GPU and load the weights stored in
# "RoBERTa_preproc_std_0epch.pth" (load_model appends the ".pth" extension).
reloaded_model = load_model("RoBERTa_preproc_std_0epch", bert_model, gpu)

## Predict

In [None]:
# Download the test data from the shared OneDrive link and save it as test_data.txt,
# the default filename read by prepare_submission.
!wget -O test_data.txt https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDR5Q3hoWXM4T2FJd1JLenc_ZT1hSXh0/root/content

In [None]:
def predict(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and collect the predicted class per sample.

    Args:
        model: callable accepting ``input_ids``, ``input_attention`` and ``labels``
            keyword arguments and returning an object with a ``logits`` attribute.
        data_loader: iterable of batches (dicts) with the keys ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``. The ``"labels"`` entry is treated
            as the sample identifiers and returned unchanged.
        device: device the inputs are moved to before calling the model.

    Returns:
        Tuple ``(ids, predictions)`` of 1-D numpy arrays in loader order; both are
        empty when the loader yields no batch.
    """
    model.eval()

    batch_ids = []
    batch_preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # The identifiers are only passed through, so they never need to
            # be moved to the compute device.
            ids = batch["labels"]

            outputs = model(
                input_ids=input_ids,
                input_attention=attention_mask,
                # Placeholder targets: the loss they produce is ignored.
                labels=torch.zeros(ids.shape[0], dtype=torch.long, device=device),
            )

            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(outputs.logits, dim=1)

            batch_ids.append(ids.cpu())
            batch_preds.append(preds.cpu())

    if not batch_ids:
        # np.concatenate rejects an empty list; return empty results instead.
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)

    return np.concatenate(batch_ids), np.concatenate(batch_preds)

In [None]:
def prepare_submission(model, device, filename="test_data.txt"):
    """Read the test file, build a data loader over it and return the predictions.

    Each non-blank line of ``filename`` is expected to look like ``<id>,<text>``;
    only the first comma separates the id from the text.

    Args:
        model: model handed to ``predict``.
        device: device handed to ``predict``.
        filename: path of the test file.

    Returns:
        The result of ``predict`` for the loader: ``(ids, predicted_labels)``.

    Raises:
        ValueError: if a line has no comma or its id is not an integer.
        AssertionError: if the file does not contain exactly 10000 samples.
    """
    print("Loading file...")
    unk_ids = []
    unk_data = []
    with open(filename, "r") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line carries no sample.
                continue
            raw_id, comma, text = line.partition(",")
            if not comma:
                raise ValueError(f"Malformed line (expected '<id>,<text>'): {line!r}")
            unk_ids.append(int(raw_id))
            # The text keeps its trailing newline, exactly as read from the file.
            unk_data.append(text)

    # Sanity check
    assert len(unk_data) == 10000, f"Expected 10000 samples, found {len(unk_data)}"

    print("Content:", unk_ids[:2], unk_data[:2])

    print("Create dataloader...")
    # The ids travel through the dataset's label slot so that predict can return
    # them next to the predictions.
    dataset = SentimentDataset(
        np.array(unk_data),
        np.array(unk_ids),
        tokenizer=bert_tokenizer,
        max_len=MAX_LENGTH
    )

    # Build the loader explicitly rather than through a `get_loader` helper, which
    # no cell of this notebook defines. Order is kept (no shuffling).
    d_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    print("Generating predictions...")
    return predict(model, d_loader, device)

In [None]:
# Run inference on the downloaded test file with the reloaded model.
submission_idxs, submission_labels = prepare_submission(reloaded_model, gpu)
# Bare last expression: the notebook displays both arrays as the cell output.
submission_idxs, submission_labels

In [None]:
def write_submission(filename, idxs, labels):
    """Save predictions as a two-column CSV with the header ``Id,Prediction``.

    Args:
        filename: path of the CSV file to write.
        idxs: 1-D numpy array of sample ids.
        labels: 1-D numpy array of predicted labels; each value ``x`` is written
            as ``2 * x - 1`` (so 0 becomes -1 and 1 stays 1).
    """
    # Convert to -1, 1
    signed_column = (labels * 2 - 1).astype(int)[:, np.newaxis]
    id_column = idxs.astype(int)[:, np.newaxis]
    # One row per sample: [id, prediction]
    submission_content = np.concatenate((id_column, signed_column), axis=1).astype(int)
    print(submission_content)
    np.savetxt(
        filename,
        submission_content,
        fmt='%d',
        delimiter=',',
        header="Id,Prediction",
        comments="",
    )
In [None]:
# Filename of predictions
PREDICTIONS_FILENAME = "sub_reloaded.csv"
# Write the ids and predicted labels computed above as a CSV file.
write_submission(PREDICTIONS_FILENAME, submission_idxs, submission_labels)