## BERT for sentiment analysis

This notebook contains the code for fine-tuning a pre-trained BERT to predict the level of humour in texts. <br>
It also contains code for running models that do not use pre-trained representations, for comparison. <br>
Examples include a CNN and logistic regression with manual feature engineering.


In [None]:
# Download word embedding if needed

# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

! pip install torch pandas numpy sklearn wget transformers


In [None]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab)
from google.colab import drive
drive.mount('/content/drive') 

In [None]:
# Get the training, development and test sets.
# The FunLines extension of the training set is optional; its download is commented out below.
# NOTE: these Dropbox links have most likely expired by now.

!wget -O train.csv https://www.dropbox.com/s/3hm0nf7p5yvy4dc/train.csv?dl=0
# !wget -O train_funlines.csv https://www.dropbox.com/s/e8v4mor8sx00g0u/train_funlines.csv?dl=0
!wget -O dev.csv https://www.dropbox.com/s/7k5j43hq8ad5ghj/dev.csv?dl=0
!wget -O test.csv https://www.dropbox.com/s/ogqwx15uhfrinbd/test.csv?dl=0

In [None]:
# Setting up

import re
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, random_split, DataLoader
from transformers import AutoTokenizer, BertModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs


# Make sure the Google Drive output folder exists.
# NOTE(review): the BERT training loop in this notebook saves its checkpoint under
# '/content/drive/MyDrive/nlp_cw/', not under this folder - confirm which one is intended.
if not os.path.exists('/content/drive/MyDrive/nlp/'):
    os.makedirs('/content/drive/MyDrive/nlp/')
    

In [None]:
# Setting random seed and device
SEED = 1

# Seed PyTorch's CPU and CUDA generators and request deterministic cuDNN kernels.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# NumPy seeding is commented out, so anything drawn through np.random
# (e.g. the random hyperparameter search) is NOT reproducible between runs.
# np.random.seed(SEED)


# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [None]:
# Load data: the train and dev CSV files fetched by the download cell

train_df = pd.read_csv("train.csv")
dev_df = pd.read_csv("dev.csv")
print(train_df.shape)
print(dev_df.shape)
# Optional: append the FunLines data to the training set (disabled;
# train_funlines_df is not loaded anywhere in this notebook).
# train_df = pd.concat([train_df, train_funlines_df])
# train_df.index = range(len(train_df))
# print(train_df.shape)


In [None]:
# Define the Dataset class

class Task1Dataset(Dataset):
    """Map-style dataset pairing each feature row with its regression label.

    Args:
        train_data: indexable container of features (tensor, list of lists, ...).
        labels: indexable container of labels with the same length; may be a
            pandas Series, a list, an array or a tensor.
    """

    def __init__(self, train_data, labels):
        self.x_train = train_data
        self.y_train = labels

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, item):
        labels = self.y_train
        # Samplers hand out *positions*. `series[item]` on a pandas Series is a
        # label lookup, which raises KeyError (or silently returns another row)
        # whenever the index is not exactly 0..n-1, so use .iloc when available.
        label = labels.iloc[item] if hasattr(labels, "iloc") else labels[item]
        return self.x_train[item], label

In [None]:
# Define training loop

def train(train_iter, dev_iter, model, embedding_model, number_epoch, scheduler,
          early_stop_rmse=0.536,
          checkpoint_path='/content/drive/MyDrive/nlp_cw/approach1_model.pth'):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch.

    Relies on the module-level names `device`, `optimizer` and `loss_fn`, and on
    the helpers `get_bert_embeddings`, `model_performance` and `eval`.

    Args:
        train_iter: iterable of (token ids, target) batches used for training.
        dev_iter: iterable of (token ids, target) batches used for validation.
        model: network fed with the BERT hidden states; must expose
            `batch_size`, `hidden` and `init_hidden()`.
        embedding_model: pre-trained BERT passed to get_bert_embeddings.
        number_epoch: maximum number of epochs.
        scheduler: learning-rate scheduler, stepped once per epoch.
        early_stop_rmse: training stops (and the model is saved) as soon as the
            validation RMSE drops below this value.
        checkpoint_path: where the model is saved on early stopping.

    Returns:
        (validation loss, validation RMSE) of the last evaluated epoch;
        both are NaN when number_epoch is 0.
    """

    print("Training model.")

    # Defined up front so that number_epoch == 0 returns NaNs instead of
    # failing on unbound names at the return statement.
    valid_loss = valid_mse = float("nan")

    for epoch in range(number_epoch):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0    # Observations used for training so far

        for feature, target in train_iter:

            feature, target = feature.to(device), target.float().to(device)
            # 1 for real tokens, 0 where the token id is 0
            segments_tensor = torch.where(feature == 0, 0, 1).to(device)

            # Get the BERT embeddings, output has shape (batch, tokens per batch, embedding_dim)
            bert_embeddings = get_bert_embeddings(feature, segments_tensor, embedding_model).float()

            # For RNN: the last batch can be smaller, so resize the initial state every batch
            no_observations += target.shape[0]
            model.batch_size = target.shape[0]
            model.hidden = model.init_hidden()

            predictions = model(bert_embeddings).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]     # Get the SSE of this batch from loss_fn()
            epoch_sse += sse       # Get the SSE of this batch from our model_performance() function

        valid_loss, valid_mse, __, __ = eval(dev_iter, model, embedding_model)

        # print the result of this epoch
        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch+1:02} | Train Loss: {epoch_loss:.4f} | Train MSE: {epoch_mse:.4f} | Train RMSE: {epoch_mse**0.5:.4f} | '
              f'        Val. Loss: {valid_loss:.4f} | Val. MSE: {valid_mse:.4f} |  Val. RMSE: {valid_mse**0.5:.4f} |')

        # step the scheduler
        scheduler.step()

        if valid_mse**0.5 < early_stop_rmse:
            # torch.save does not create missing folders; without this the save
            # fails when the checkpoint directory was never created.
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model, checkpoint_path)
            break

    return valid_loss, valid_mse**0.5

In [None]:
# Evaluate performance on dev set

def eval(data_iter, model, embedding_model):
    """
    Score `model` on every batch of `data_iter` without updating its weights.

    Uses the module-level `device` and `loss_fn` and the helpers
    `get_bert_embeddings` and `model_performance`.
    NOTE: this function shadows Python's builtin `eval` inside the notebook.

    Returns:
        (mean loss per observation, MSE from model_performance,
         array of predictions, array of targets)
    """
    model.eval()
    total_loss, total_sse, seen = 0, 0, 0
    collected_predictions, collected_targets = [], []

    with torch.no_grad():
        for tokens, grades in data_iter:

            tokens, grades = tokens.to(device), grades.float().to(device)
            # 1 for real tokens, 0 where the token id is 0
            token_mask = torch.where(tokens == 0, 0, 1).to(device)

            # Frozen BERT hidden states that feed the model
            embeddings = get_bert_embeddings(tokens, token_mask, embedding_model).float()

            # The RNN needs a fresh initial state sized for this batch
            n_batch = grades.shape[0]
            seen = seen + n_batch
            model.batch_size = n_batch
            model.hidden = model.init_hidden()

            outputs = model(embeddings).squeeze(1)
            batch_loss = loss_fn(outputs, grades)

            outputs_np = outputs.detach().cpu().numpy()
            grades_np = grades.detach().cpu().numpy()
            batch_sse, __ = model_performance(outputs_np, grades_np)

            # loss_fn returns a batch mean, so scale it back to a batch sum
            total_loss += batch_loss.item() * n_batch
            total_sse += batch_sse
            collected_predictions.extend(outputs_np)
            collected_targets.extend(grades_np)

    mean_loss = total_loss / seen
    mean_sse = total_sse / seen
    return mean_loss, mean_sse, np.array(collected_predictions), np.array(collected_targets)
    

In [None]:
# Print the model performance

def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (optionally printing the MSE and the RMSE).

    Args:
        output: predicted values (array, list, pandas Series or CPU tensor).
        target: true values, same length as `output`.
        print_output: when True, print the MSE and RMSE rounded to 2 decimals.

    Returns:
        (sum of squared errors, mean squared error)
    """
    # Coerce both sides to ndarrays so that plain sequences work and the
    # comparison is always positional (pandas would otherwise align on index).
    output = np.asarray(output)
    target = np.asarray(target)

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)

    if print_output:
        rmse = np.sqrt(mse)
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [None]:
# BiLSTM without the embedding layer

class BiLSTM(nn.Module):
    """Bidirectional LSTM regressor that consumes pre-computed embeddings.

    Args:
        embedding_dim: size of each input embedding vector.
        hidden_dim: LSTM hidden size per direction.
        linear_dim: width of the intermediate fully connected layer.
        num_layers: number of stacked LSTM layers.
        batch_size: batch size used to shape the initial hidden state.
        device: device on which the initial hidden state is created.
    """

    def __init__(self, embedding_dim, hidden_dim, linear_dim, num_layers, batch_size, device):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.num_layers = num_layers

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim (times two, because it is bidirectional).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)

        # Two linear layers map the LSTM output to a single regression value
        self.hidden2label = nn.Linear(hidden_dim * 2, linear_dim)
        self.linear = nn.Linear(linear_dim, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return zero (h_0, c_0) tensors sized for the current `batch_size`."""
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim).
        # Use the instance's own num_layers: reading a module-level `num_layers`
        # breaks as soon as the model is built with a different depth (or the
        # global does not exist).
        shape = (2 * self.num_layers, self.batch_size, self.hidden_dim)
        return torch.zeros(shape).to(self.device), torch.zeros(shape).to(self.device)

    def forward(self, sentence):
        """Map embeddings of shape (batch_size, seq_length, embedding_dim) to (batch_size, 1)."""
        # nn.LSTM is created without batch_first, so it expects (seq, batch, dim)
        sentence = sentence.permute(1, 0, 2)

        lstm_out, self.hidden = self.lstm(sentence, self.hidden)

        # Only the output at the last time step is used.
        # NOTE(review): with inputs padded to a fixed length this step may be a
        # padding position - confirm this is intended.
        out = nn.functional.relu(self.hidden2label(lstm_out[-1]))
        out = self.linear(out)
        return out

In [None]:
# Define the preprocessing functions

def get_editted_headline(input):
    """
    Replace the target word with the edit word to get the edited headline.

    The `<.../>` tag in the "original" column is swapped for the value of the
    "edit" column; the result is stripped, lower-cased and stored in a new
    "new" column. The DataFrame is modified in place and also returned.
    """
    def substitute(row):
        # A function replacement inserts the edit word literally. Passing the
        # word as a replacement *string* would make re.sub interpret
        # backslashes and group references contained in it.
        return re.sub(r"<.+/>", lambda _match: row["edit"], row["original"])

    input["new"] = input.apply(substitute, axis=1).str.strip().str.lower()
    return input

def remove_common_words(input):
    """
    Remove common words and punctuation found in news headlines.

    Every occurrence of each entry in the list below is replaced by a single
    space in the "new" column. The DataFrame is modified in place and returned.
    """
    remove_words = [":", ",", "-", "|", "?", "!", "#", "%", "$", "live updates", "latest updates",
                    "bbc news", "us news", "world news", "fox news", "nbc news",
                    "world news", "guardian us news", "news agency", "foxnews",
                    "breitbart news", "ap news", "live news"]
    # NOTE(review): entries are applied in order, so "guardian us news" can never
    # match once "us news" has been removed - confirm whether that is intended.
    for word in remove_words:
        # regex=False: entries such as "|", "?" and "$" are regex metacharacters
        # and must be matched literally regardless of the pandas default.
        input["new"] = input["new"].str.replace(word, " ", regex=False)
    return input

def drop_zero_grade(input):
    """
    Return a copy of the frame without the rows whose "meanGrade" is exactly 0,
    re-indexed from 0 to len-1. The frame passed in is left untouched.
    """
    ungraded_labels = input[input["meanGrade"] == 0].index
    graded = input.drop(ungraded_labels)
    graded.index = range(len(graded))
    return graded

In [None]:
# Define the functions for obtaining BERT hidden states

def get_bert_tokens(text, tokenizer):
    """
    Tokenise a list of texts and return only the token ids.

    The tokenizer is asked to pad/truncate every text to 36 tokens and to
    return PyTorch tensors; the "input_ids" entry of its result is returned.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=36,
        return_tensors='pt',
    )
    return encoded["input_ids"]


def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Run the pre-trained BERT model without gradients and return the hidden
    states of its LAST layer only.

    `segments_tensor` is passed as the model's second positional argument.
    NOTE(review): callers in this notebook build it as 1 for non-zero token ids
    and 0 otherwise, i.e. it looks like an attention mask rather than segment
    ids - confirm against the BertModel.forward signature.
    The model is expected to return its per-layer hidden states at index 2.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensor)
        layer_states = outputs[2]
        hidden_states = layer_states[-1]    # only the last hidden layer is used

        # Alternative ways of pooling the layers, kept for experimentation:
        #
        # Sum the last 4 hidden states
        # hidden_states = torch.stack(output[2][-4:], dim = 0)
        # hidden_states = torch.sum(hidden_states, dim = 0)
        #
        # Average the last 4 hidden states
        # hidden_states = torch.stack(output[2][-4:], dim = 0)
        # hidden_states = torch.mean(hidden_states, dim = 0)
        #
        # Sum all hidden states
        # hidden_states = torch.stack(output[2][:], dim = 0)
        # hidden_states = torch.sum(hidden_states, dim = 0)
        #
        # Take the second last hidden state
        # hidden_states = output[2][-2]
        #
        # Concatenate the last 4 hidden states
        # (the embedding_dim given to the Bi-LSTM must then be multiplied by 4;
        #  NOTE(review): the hidden states look 3-dimensional, so `dim = 3`
        #  below is probably meant to be the last axis - confirm before use)
        # hidden_states = output[2][-1]
        # for i in range(3):
        #       hidden_states = torch.cat((hidden_states, output[2][-i-2]), dim = 3)

    return hidden_states


In [None]:
# Define the hyperparameters here

# Fraction of the training data kept for training when a validation split is carved out of it
train_proportion = 0.8

# RNN hyperparameters
num_layers = 1        # number of stacked LSTM layers
embedding_dim = 768   # Do not change, the BERT embedding dimension is fixed  
hidden_dim = 300      # LSTM hidden size per direction
linear_dim = 150      # width of the fully connected layer after the LSTM

# Training hyperparameters
epochs = 28
batch_size = 32
learning_rate = 0.00001
milestones = [8, 20]  # epochs at which MultiStepLR multiplies the learning rate by gamma
gamma = 0.2

In [None]:
# Data preprocessing: build the edited headline ("new" column), strip common
# words/punctuation and drop headlines graded 0. Shapes are printed before and
# after the drop to show how many rows were removed.

train_df = get_editted_headline(train_df)
train_df = remove_common_words(train_df)
print(train_df.shape)
train_df = drop_zero_grade(train_df)
print(train_df.shape)

dev_df = get_editted_headline(dev_df)
dev_df = remove_common_words(dev_df)
print(dev_df.shape)
dev_df = drop_zero_grade(dev_df)
print(dev_df.shape)

# Create the bert model and tokenizer
# output_hidden_states=True makes the model return every layer's hidden states,
# which get_bert_embeddings reads.
bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states = True).to(device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:
# code for optimising the network performance

# hyperparameter tuning with random search

# store the hyperparameters in lists
lr = []
batchsize = []
hiddendim = []
lineardim = []
RMSEs = []

# train n models by randomly sampling hyperparameters
# after finding the best model, zoom into the neighbourhood of its hyperparameter space
# and perform subsequent rounds of random search
# lastly, hyperparameters like epoch are also tuned individually to achieve better validation performance

num_model = 25

# Candidate values for the discrete hyperparameters
batch = np.array([4,8,16,32,64,128])
hid = np.array([20,40,60,80,100,120,140,160,180,200])
lin = np.array([20,30,40,50,60,70,80,90,100,120])

# Tokenisation and the (seeded) train/validation split are identical for every
# trial, so they are computed once instead of once per model.
train_text = list(train_df["new"])
train_tokens = get_bert_tokens(train_text, tokenizer)
print(train_tokens.shape)

train_tokens = Task1Dataset(train_tokens, train_df["meanGrade"])
train_split = round(len(train_text)*train_proportion)
val_split = len(train_text) - train_split
train_dataset, val_dataset = random_split(train_tokens, (train_split, val_split),
                                          generator = torch.Generator().manual_seed(SEED))
print(len(train_dataset))
print(len(val_dataset))

for i in range(num_model): # sample the hyperparameters for each model

    # Sampled values get their own names so that the module-level `learning_rate`
    # and `batch_size` (used when training the final model) are not overwritten
    # by whatever the last trial happened to draw.
    sampled_lr = 10**(-2*np.random.rand()-2)  #10^-4 to 10^-2, sampling from log scale
    lr.append(sampled_lr)

    sampled_batch_size = int(np.random.choice(batch))
    batchsize.append(sampled_batch_size)

    hidden = int(np.random.choice(hid))
    hiddendim.append(hidden)

    linear = int(np.random.choice(lin))
    lineardim.append(linear)

    # Create the DataLoaders
    train_loader = DataLoader(train_dataset, shuffle = True, batch_size = sampled_batch_size)
    val_loader = DataLoader(val_dataset, shuffle = False, batch_size = sampled_batch_size)

    # Create the model, loss function and optimizer
    # (train() reads `loss_fn` and `optimizer` as module-level names)
    model = BiLSTM(embedding_dim, hidden, linear, num_layers, sampled_batch_size, device).to(device)
    loss_fn = nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = sampled_lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones = milestones, gamma = gamma)

    # Start training; train() returns (validation loss, validation RMSE)
    valid_loss, rmse = train(train_loader, val_loader, model, bert, epochs, scheduler)

    # save the result
    RMSEs.append(rmse)


# view results in pandas dataframe
print(lr)
print(batchsize)
print(hiddendim)
print(lineardim)
print(RMSEs)

result = {'lr': lr, 'batch_size': batchsize, 'hidden_dim': hiddendim, 'linear_dim': lineardim, 'RMSE': RMSEs}
df = pd.DataFrame(result)
# The column is called 'RMSE'; sorting by 'RMSEs' raises KeyError.
df = df.sort_values(by = ['RMSE'], ascending = True)

print(df)

In [None]:
# Training the final model on the full training set, validating on the dev set.
# NOTE(review): this cell uses the module-level batch_size / learning_rate /
# hidden_dim / linear_dim as they stand when it runs - confirm that no earlier
# cell has rebound them to other values.

# Convert preprocessed texts into a list
train_text = list(train_df["new"])
train_tokens = get_bert_tokens(train_text, tokenizer)
print(train_tokens.shape)

dev_text = list(dev_df["new"])
dev_tokens = get_bert_tokens(dev_text, tokenizer)
print(dev_tokens.shape)


# Create dataset (token ids paired with the meanGrade labels)
train_tokens = Task1Dataset(train_tokens, train_df["meanGrade"])
dev_tokens = Task1Dataset(dev_tokens, dev_df["meanGrade"])

## Split training set into a train and validation set here for preliminary experimentations

# train_split = round(len(train_text)*train_proportion)
# val_split = len(train_text) - train_split
# train_dataset, val_dataset = random_split(train_tokens, (train_split, val_split),
#                                           generator = torch.Generator().manual_seed(SEED))
print(len(train_tokens))
print(len(dev_tokens))


# Create the DataLoaders
train_loader = DataLoader(train_tokens, shuffle = True, batch_size = batch_size)
val_loader = DataLoader(dev_tokens, shuffle = False, batch_size = batch_size)


# Create the model, loss function and optimizer
# (the training loop reads `loss_fn` and `optimizer` as module-level names)
model = BiLSTM(embedding_dim, hidden_dim, linear_dim, num_layers, batch_size, device).to(device)
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones = milestones, gamma = gamma)


# Start training
train(train_loader, val_loader, model, bert, epochs, scheduler)



In [None]:
# Get the test RMSE

# dataloading and preprocessing (same steps as for the train and dev sets,
# including dropping the headlines graded 0)
test_df = pd.read_csv("test.csv")
print(test_df.shape)
test_df = get_editted_headline(test_df)
test_df = remove_common_words(test_df)
test_df = drop_zero_grade(test_df)
print(test_df.shape)

# get BERT tokens
test_text = list(test_df["new"])
test_tokens = get_bert_tokens(test_text, tokenizer)
print(test_tokens.shape)

# create the test loader
test_tokens = Task1Dataset(test_tokens, test_df["meanGrade"])
print(len(test_tokens))
test_loader = DataLoader(test_tokens, shuffle = False, batch_size = batch_size)

# evaluation
test_loss, test_MSE, __, __ = eval(test_loader, model, bert)
print(f"The final RMSE on the test set is : {test_MSE**0.5:.4f}")


# get the evaluation for each quintile of meanGrade
# (labels=False makes qcut return the bin number 0-4 instead of an interval)
test_df["quintile"] = pd.qcut(test_df["meanGrade"], 5, labels = False)

for i in range(5):
    test_quintile = test_df[test_df["quintile"] == i]
    # re-index from 0 so that the dataset can look labels up by position
    test_quintile.index = range(len(test_quintile))

    test_text = list(test_quintile["new"])
    test_tokens = get_bert_tokens(test_text, tokenizer)
    print(test_tokens.shape)

    # create the test loader
    test_tokens = Task1Dataset(test_tokens, test_quintile["meanGrade"])
    print(len(test_tokens))
    test_loader = DataLoader(test_tokens, shuffle = False, batch_size = batch_size)

    # evaluation
    test_loss, test_MSE, __, __ = eval(test_loader, model, bert)
    print(f"The final RMSE on the {i+1} quintile of the test set is : {test_MSE**0.5:.4f}")


In [None]:
# non-contextualised word embeddings

# Define training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Fit `model` for `number_epoch` epochs, evaluating on `dev_iter` after each
    epoch and printing the train/validation metrics. Nothing is returned.

    Uses the module-level `device`, `optimizer` and `loss_fn` and the helpers
    `model_performance` and `eval`.
    """
    print("Training model.")

    for epoch in range(1, number_epoch+1):

        model.train()
        running_loss, running_sse, seen = 0, 0, 0

        for feature, target in train_iter:

            feature, target = feature.to(device), target.to(device)

            # for RNN: the hidden state has to match the size of this batch
            n_batch = target.shape[0]
            model.batch_size = n_batch
            seen = seen + n_batch
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()
            batch_loss = loss_fn(predictions, target)

            batch_sse, __ = model_performance(predictions.detach().cpu().numpy(),
                                              target.detach().cpu().numpy())

            batch_loss.backward()
            optimizer.step()

            # loss_fn returns a batch mean, so scale it back to a batch sum
            running_loss += batch_loss.item() * n_batch
            running_sse += batch_sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        train_loss = running_loss / seen
        train_mse = running_sse / seen
        train_part = f'| Epoch: {epoch:02} | Train Loss: {train_loss:.2f} | Train MSE: {train_mse:.2f} | Train RMSE: {train_mse**0.5:.2f} | '
        valid_part = f'        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |'
        print(train_part + valid_part)



# Evaluate performance on dev set
def eval(data_iter, model):
    """
    Score `model` on every batch of `data_iter` without updating its weights.

    Uses the module-level `device` and `loss_fn` and the helper
    `model_performance`.
    NOTE: this function shadows Python's builtin `eval` inside the notebook.

    Returns:
        (mean loss per observation, MSE from model_performance,
         array of predictions, array of targets)
    """
    model.eval()
    total_loss, total_sse, seen = 0, 0, 0
    collected_predictions, collected_targets = [], []

    with torch.no_grad():
        for feature, target in data_iter:

            feature, target = feature.to(device), target.to(device)

            # for RNN: the hidden state has to match the size of this batch
            n_batch = target.shape[0]
            model.batch_size = n_batch
            seen = seen + n_batch
            model.hidden = model.init_hidden()

            outputs = model(feature).squeeze(1)
            batch_loss = loss_fn(outputs, target)

            outputs_np = outputs.detach().cpu().numpy()
            target_np = target.detach().cpu().numpy()
            batch_sse, __ = model_performance(outputs_np, target_np)

            # loss_fn returns a batch mean, so scale it back to a batch sum
            total_loss += batch_loss.item() * n_batch
            total_sse += batch_sse
            collected_predictions.extend(outputs_np)
            collected_targets.extend(target_np)

    mean_loss = total_loss / seen
    mean_sse = total_sse / seen
    return mean_loss, mean_sse, np.array(collected_predictions), np.array(collected_targets)



# How print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (optionally printing the MSE and the RMSE).

    Args:
        output: predicted values (array, list, pandas Series or CPU tensor).
        target: true values, same length as `output`.
        print_output: when True, print the MSE and RMSE rounded to 2 decimals.

    Returns:
        (sum of squared errors, mean squared error)
    """
    # Coerce both sides to ndarrays so that plain sequences work and the
    # comparison is always positional (pandas would otherwise align on index).
    output = np.asarray(output)
    target = np.asarray(target)

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)

    if print_output:
        rmse = np.sqrt(mse)
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse


In [None]:
# non-contextualised word embeddings

def create_vocab(data):
    """
    Creating a corpus of all the tokens used.

    Args:
        data: iterable of sentences (strings).

    Returns:
        (vocabulary, tokenized_corpus): the distinct tokens in order of first
        appearance, and one token list per sentence.
    """
    # Simplest possible tokenisation: split on single spaces
    # (consecutive spaces therefore yield empty-string tokens).
    tokenized_corpus = [sentence.split(' ') for sentence in data]

    # dict.fromkeys de-duplicates in first-seen order with O(1) lookups;
    # testing membership against a growing list is quadratic in the vocabulary size.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))

    return vocabulary, tokenized_corpus


def collate_fn_padd(batch):
    '''
    Turn a list of (token-id sequence, label) pairs into tensors for the model.

    Sequences are right-padded with zeros up to the longest one in the batch.
    Returns a LongTensor of shape (batch, max_len) and a FloatTensor of labels.
    '''
    sequences = [features for features, _ in batch]
    targets = [label for _, label in batch]
    lengths = [len(features) for features in sequences]

    # Start from an all-zero matrix so that untouched cells act as padding
    padded = torch.zeros((len(batch), max(lengths))).long()
    for row, (features, length) in enumerate(zip(sequences, lengths)):
        padded[row, :length] = torch.LongTensor(features)

    return padded, torch.FloatTensor(targets)


class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM regressor with its own embedding table.

    Args:
        embedding_dim: size of each word vector.
        hidden_dim: LSTM hidden size per direction.
        vocab_size: number of rows of the embedding table (row 0 is padding).
        batch_size: batch size used to shape the initial hidden state.
        device: device on which the initial hidden state is created.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Word vectors go in, hidden states of size hidden_dim per direction come out
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # Forward and backward states are concatenated, hence hidden_dim * 2
        self.hidden2label = nn.Linear(hidden_dim * 2, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return zero (h_0, c_0) tensors sized for the current `batch_size`."""
        # Axes are (num_layers * num_directions, minibatch_size, hidden_dim);
        # one layer times two directions gives the leading 2.
        h_0 = torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)
        c_0 = torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)
        return h_0, c_0

    def forward(self, sentence):
        """Map a (batch, seq_len) tensor of token ids to a (batch, 1) prediction."""
        # (batch, seq, dim) -> (seq, batch, dim), the layout nn.LSTM expects by default
        vectors = self.embedding(sentence).permute(1, 0, 2)
        seq_len = len(vectors)

        # The view keeps the shape but fails when batch_size does not match the input
        lstm_input = vectors.view(seq_len, self.batch_size, self.embedding_dim)
        lstm_out, self.hidden = self.lstm(lstm_input, self.hidden)

        # Regress from the output at the last time step
        return self.hidden2label(lstm_out[-1])
  


## Approach 1 code, using functions defined above:

# We set our training data and test data
test_df = pd.read_csv("test.csv")
test_df = get_editted_headline(test_df)
test_df = remove_common_words(test_df)
test_df = drop_zero_grade(test_df)

train_df = get_editted_headline(train_df)
train_df = remove_common_words(train_df)
train_df = drop_zero_grade(train_df)

training_data = train_df['new']
test_data = test_df['new']

# Creating word vectors
training_vocab, training_tokenized_corpus = create_vocab(training_data)
test_vocab, test_tokenized_corpus = create_vocab(test_data)

# Creating joint vocab from test and train:
joint_vocab, joint_tokenized_corpus = create_vocab(pd.concat([training_data, test_data]))

print("Vocab created.")

# Set copy for O(1) membership tests while scanning the (large) GloVe file
joint_vocab_lookup = set(joint_vocab)

# We create representations for our tokens
wvecs = [] # word vectors
word2idx = [] # word2index
idx2word = []

# This is a large file, so it is streamed line by line instead of being read in one go.
with codecs.open('glove.6B.100d.txt', 'r','utf-8') as f:
    index = 1   # index 0 is reserved for padding
    for line in f:
        fields = line.strip().split()
        # Ignore the first line - first line typically contains vocab, dimensionality
        if len(fields) > 3:
            word = fields[0]
            if word in joint_vocab_lookup:
                wvecs.append(list(map(float, fields[1:])))
                word2idx.append((word, index))
                idx2word.append((index, word))
                index += 1

wvecs = np.array(wvecs)
word2idx = dict(word2idx)
idx2word = dict(idx2word)

vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in training_tokenized_corpus]

# To avoid any sentences being empty (if no words match to our word embeddings)
vectorized_seqs = [x if len(x) > 0 else [0] for x in vectorized_seqs]

# Word indices run from 1 to len(word2idx) and 0 is the padding index, so the
# embedding table needs one more row than there are words; otherwise the
# highest index is out of range.
INPUT_DIM = len(word2idx) + 1
EMBEDDING_DIM = 100
BATCH_SIZE = 32

model = BiLSTM(EMBEDDING_DIM, 50, INPUT_DIM, BATCH_SIZE, device)
print("Model initialised.")

model.to(device)
# We provide the model with our embeddings.
# Row 0 (padding) is all zeros and word i goes to row i, matching word2idx.
embedding_matrix = np.vstack([np.zeros((1, EMBEDDING_DIM)), wvecs.reshape(-1, EMBEDDING_DIM)])
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))

feature = vectorized_seqs

# 'feature' is a list of lists, each containing embedding IDs for word tokens
train_and_dev = Task1Dataset(feature, train_df['meanGrade'])

train_examples = round(len(train_and_dev)*train_proportion)
dev_examples = len(train_and_dev) - train_examples

train_dataset, dev_dataset = random_split(train_and_dev,
                                           (train_examples,
                                            dev_examples))

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

optimizer = torch.optim.Adam(model.parameters())

train(train_loader, dev_loader, model, epochs)


In [None]:
# fine tune a pre-trained BERT

class BertRegressor(nn.Module):
    """Pre-trained BERT followed by a small feed-forward regression head.

    Args:
        freeze_bert: when True, the BERT weights are excluded from gradient
            updates and only the regression head is trained.
    """

    def __init__(self, freeze_bert = False):
        super().__init__()

        bert_dim, head_dim, out_dim = 768, 120, 1

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # 768 -> 120 -> 1 with ReLU and dropout in between
        self.regressor = nn.Sequential(
            nn.Linear(bert_dim, head_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(head_dim, out_dim)
        )

        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input, segment_tensor):
        """Return one prediction per row of `input` (shape (batch, 1))."""
        # segment_tensor is handed to BERT as its second positional argument.
        # NOTE(review): the caller builds it as a non-padding mask - confirm
        # that this is the argument BertModel expects in that position.
        bert_output = self.bert(input, segment_tensor)

        # First element of the BERT output, restricted to the first token of each sequence
        first_token_state = bert_output[0][:, 0, :]

        return self.regressor(first_token_state)

In [None]:
# Instantiate the fine-tunable model (BERT weights are trainable by default)
model = BertRegressor().to(device)
# Convert preprocessed texts into a list
train_text = list(train_df["new"])
train_tokens = get_bert_tokens(train_text, tokenizer)
print(train_tokens.shape)

dev_text = list(dev_df["new"])
dev_tokens = get_bert_tokens(dev_text, tokenizer)
print(dev_tokens.shape)


In [None]:
# Create dataset (token ids paired with the meanGrade labels)
train_tokens = Task1Dataset(train_tokens, train_df["meanGrade"])
dev_tokens = Task1Dataset(dev_tokens, dev_df["meanGrade"])
# train_split = round(len(train_text)*train_proportion)
# val_split = len(train_text) - train_split
# train_dataset, val_dataset = random_split(train_tokens, (train_split, val_split),
#                                           generator = torch.Generator().manual_seed(SEED))
print(len(train_tokens))
print(len(dev_tokens))

# Change the number of epochs here
epochs = 8

# NOTE(review): learning_rate and batch_size are the module-level values as they
# stand when this cell runs - confirm no earlier cell has rebound them.
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

# Create the DataLoaders
# (val_loader is created but not used by the loop below: no validation pass is run)
train_loader = DataLoader(train_tokens, shuffle = True, batch_size = batch_size)
val_loader = DataLoader(dev_tokens, shuffle = False, batch_size = batch_size)

for epoch in range(epochs):

    model.train()
    epoch_loss = 0
    epoch_sse = 0
    no_observations = 0

    for feature, target in train_loader:

        feature, target = feature.to(device), target.float().to(device)
        no_observations += target.shape[0]
        # 1 for real tokens, 0 where the token id is 0
        segments_tensor = torch.where(feature == 0, 0, 1).to(device)
        out = model(feature, segments_tensor)

        optimizer.zero_grad()
        # the model outputs shape (batch, 1), so give the targets the same shape
        target = target.reshape((target.shape[0],1))
        loss = loss_fn(out, target).float()
        # print(loss.item())
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()*target.shape[0]     # Get the SSE of this batch from loss_fn()

    # print the result of this epoch (RMSE is the square root of the mean MSE loss)
    epoch_loss = epoch_loss / no_observations
    print(f'| Epoch: {epoch+1:02} | Train Loss: {epoch_loss:.4f} | Train RMSE: {epoch_loss**0.5:.4f}')

## Use no pre-trained representation

In [None]:
train_and_dev = train_df['edit']

# Hold out part of the training data as a dev set; only the edit word is used as input
training_data, dev_data, training_y, dev_y = train_test_split(train_df['edit'], train_df['meanGrade'],
                                                                        test_size=(1-train_proportion),
                                                                        random_state=42)

# We train a Tf-idf model
count_vect = CountVectorizer(stop_words='english')
train_counts = count_vect.fit_transform(training_data)
transformer = TfidfTransformer().fit(train_counts)
train_counts = transformer.transform(train_counts)
regression_model = LinearRegression().fit(train_counts, training_y)

# Train predictions
predicted_train = regression_model.predict(train_counts)

# Validate the model on dev using the SAME Tf-idf weights it was trained with.
# Refitting the IDF on train+dev would feed the regressor features on a different
# scale from the ones it was fitted on, and would leak dev statistics.
test_counts = count_vect.transform(dev_data)

test_counts = transformer.transform(test_counts)

# Dev predictions
predicted = regression_model.predict(test_counts)

# We run the evaluation:
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, training_y, True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_y, True)

In [None]:
# Baseline for the task: always predict the mean grade of the training split.
# Built with NumPy so that it combines cleanly with the pandas targets
# (a torch tensor mixed with a pandas Series relies on implicit conversions).
pred_baseline = np.full(len(dev_y), np.mean(training_y))
print("\nBaseline performance:")
sse, mse = model_performance(pred_baseline, dev_y, True)

In [None]:
# Reload the three splits from disk and run the same preprocessing helpers
# (defined earlier in the notebook) over each of them, in the same order.
train_df = pd.read_csv("train.csv")
dev_df = pd.read_csv("dev.csv")
test_df = pd.read_csv("test.csv")
# Presumably substitutes the edit word into the original headline and stores the
# result in the `new` column that the tokenisation cell reads - confirm in helper.
train_df = get_editted_headline(train_df)
dev_df = get_editted_headline(dev_df)
test_df = get_editted_headline(test_df)
train_df = remove_common_words(train_df)
dev_df = remove_common_words(dev_df)
test_df = remove_common_words(test_df)
# NOTE(review): rows are also dropped from dev and test here - confirm that
# filtering the evaluation splits is intended.
train_df = drop_zero_grade(train_df)
dev_df = drop_zero_grade(dev_df)
test_df = drop_zero_grade(test_df)

In [None]:
# Wrap each sentence in start/stop markers and split it into tokens
def get_tokenized_corpus(data):
  """Return one token list per sentence in `data`.

  Every sentence is wrapped as "<s> ... </s>" and split on single spaces, so
  consecutive spaces in the input produce empty-string tokens.
  """
  return [("<s> " + text + " </s>").split(' ') for text in data]


In [None]:
# Tokenise the edited headlines (`new` column) of every split, then dump them
# for a visual sanity check.
tokenized_train, tokenized_dev, tokenized_test = (
    get_tokenized_corpus(frame["new"]) for frame in (train_df, dev_df, test_df)
)
print(tokenized_train, tokenized_dev, tokenized_test, sep="\n")

In [None]:
# Vocabulary built from the training data; maps words to indices and back.
class Vocabulary:
  """Two-way mapping between words and integer indices.

  Index 0 is reserved for '<pad>' and index 1 for '<unk>'. Words that were
  never added resolve to the '<unk>' index on lookup.
  """

  def __init__(self):
    self._word2idx = {}   # word -> index
    self.idx2word = []    # index -> word
    # Reserved entries: padding first (index 0), then the unknown-word marker.
    for reserved in ('<pad>', '<unk>'):
      self.add_word(reserved)
    self._unk_idx = self._word2idx['<unk>']

  def word2idx(self, word):
    """Return the index of `word`, or the '<unk>' index if it is not known."""
    return self._word2idx.get(word, self._unk_idx)

  def add_word(self, word):
    """Register `word` under the next free index; known words are left alone."""
    if word in self._word2idx:
      return
    self._word2idx[word] = len(self.idx2word)
    self.idx2word.append(word)

  def build_from_data(self, data):
    """Add every token of every tokenised sentence in `data`."""
    for tokens in data:
      for token in tokens:
        self.add_word(token)

  def convert_idxs_to_words(self, idxs):
    """Return the words for `idxs` joined by single spaces."""
    return ' '.join([self.idx2word[i] for i in idxs])

  def convert_words_to_idxs(self, words):
    """Return the list of indices for `words` (unknown words -> '<unk>')."""
    return list(map(self.word2idx, words))

  def __len__(self):
    return len(self.idx2word)

  def __repr__(self):
    return "Vocabulary with {} items".format(len(self))

In [None]:
# Create the vocabulary and print it before/after building so the effect of
# build_from_data is visible in the cell output.
vocab = Vocabulary()
print(vocab.idx2word)  # only the reserved '<pad>' / '<unk>' entries at this point
# Only the training tokens are added; dev/test words not seen here map to '<unk>'.
vocab.build_from_data(tokenized_train)
print(vocab.idx2word)

In [None]:
# Map tokenised sentences to index lists using the notebook-level `vocab`
def convert_data_to_idx(data):
  """Return, for each token list in `data`, the matching list of vocabulary indices."""
  return [vocab.convert_words_to_idxs(tokens) for tokens in data]

In [None]:
# Convert every tokenised split to vocabulary indices, then print them.
train_idxs, test_idxs, dev_idxs = (
    convert_data_to_idx(corpus)
    for corpus in (tokenized_train, tokenized_test, tokenized_dev)
)
print(train_idxs, test_idxs, dev_idxs, sep="\n")

In [None]:
# Pad every sentence of one split with zeros up to that split's longest sentence.
def collate_fn_padd(data):
  """Return a (len(data), longest sentence) tensor of indices on `device`.

  Positions past the end of a sentence stay 0. The buffer comes from
  torch.zeros with its default dtype, so the indices are stored as floats.
  """
  lengths = [len(sentence) for sentence in data]
  padded = torch.zeros((len(data), max(lengths)))

  for row, (sentence, length) in enumerate(zip(data, lengths)):
    padded[row, :length] = torch.LongTensor(sentence)

  return padded.to(device)


In [None]:
# Pad each split and show the resulting tensor shapes.
train_idxs_padded, test_idxs_padded, dev_idxs_padded = (
    collate_fn_padd(split).to(device)
    for split in (train_idxs, test_idxs, dev_idxs)
)

print(
    train_idxs_padded.size(),
    test_idxs_padded.size(),
    dev_idxs_padded.size(),
    sep="\n",
)

In [None]:
#Neural network with embedding layer
class FFNN(nn.Module):
    """Feed-forward regressor over the mean of a sentence's word embeddings.

    Args:
        embedding_dim: width of each word embedding.
        hidden_dim: width of the single hidden layer.
        vocab_size: number of rows in the embedding table; index 0 is padding.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(FFNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one prediction per row of the index tensor `x`, shape (batch, 1)."""
        embedded = self.embedding(x)
        # Average over the non-padding tokens only. The count is clamped to 1
        # so that a row made entirely of padding yields zeros instead of 0/0.
        sent_lens = x.ne(0).sum(1, keepdim=True).clamp(min=1)
        averaged = embedded.sum(1) / sent_lens
        out = self.fc1(averaged)
        out = self.relu1(out)
        out = self.fc2(out)
        return out

In [None]:
# Define training loop

def train(train_iter, dev_iter, model, number_epoch, optimizer, loss_fn,
          rmse_threshold=0.540,
          save_path='/content/drive/MyDrive/nlp_cw/approach1_model.pth'):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch.

    Args:
        train_iter: iterable of (feature, target) training batches.
        dev_iter: iterable of (feature, target) batches passed to eval().
        model: module being trained; called as model(feature).
        number_epoch: maximum number of epochs to run.
        optimizer: optimizer stepped once per training batch.
        loss_fn: loss called as loss_fn(predictions, target).
        rmse_threshold: stop training and save the model as soon as the
            validation RMSE falls below this value.
        save_path: file the model is written to when the threshold is reached.
    """

    print("Training model.")

    for epoch in range(number_epoch):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0    # Observations used for training so far

        for feature, target in train_iter:

            feature, target = feature.to(device).long(), target.float().to(device)

            no_observations += target.shape[0]

            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]     # Get the SSE of this batch from loss_fn()
            epoch_sse += sse       # Get the SSE of this batch from our model_performance() function

        valid_loss, valid_mse, __, __ = eval(dev_iter, model, loss_fn)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch+1:02} | Train Loss: {epoch_loss:.4f} | Train MSE: {epoch_mse:.4f} | Train RMSE: {epoch_mse**0.5:.4f} | \
        Val. Loss: {valid_loss:.4f} | Val. MSE: {valid_mse:.4f} |  Val. RMSE: {valid_mse**0.5:.4f} |')

        # Early stop: keep the first model that is good enough on the dev set.
        if valid_mse**0.5 < rmse_threshold:
            # The setup cell only creates a different Drive folder, so make
            # sure the target directory exists before torch.save writes to it.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save(model, save_path)
            break


In [None]:
# Evaluate performance on dev set

def eval(data_iter, model, loss_fn):
    """
    Score `model` on every batch of `data_iter` without tracking gradients.

    Returns a 4-tuple: the observation-weighted mean of loss_fn, the mean
    squared error derived from model_performance(), and NumPy arrays holding
    all predictions and all targets in iteration order.
    """
    model.eval()
    loss_total, sse_total, seen = 0, 0, 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for batch_x, batch_y in data_iter:
            batch_x = batch_x.to(device).long()
            batch_y = batch_y.float().to(device)

            n_batch = batch_y.shape[0]
            seen += n_batch

            outputs = model(batch_x).squeeze(1)
            batch_loss = loss_fn(outputs, batch_y)

            # NumPy copies for the metric helper and for the returned arrays
            pred_np = outputs.detach().cpu().numpy()
            target_np = batch_y.detach().cpu().numpy()
            batch_sse, __ = model_performance(pred_np, target_np)

            # loss_fn value is scaled by the batch size before accumulating,
            # so dividing by `seen` below weights batches by their size
            loss_total += batch_loss.item() * n_batch
            sse_total += batch_sse
            collected_preds.extend(pred_np)
            collected_targets.extend(target_np)

    return (loss_total / seen, sse_total / seen,
            np.array(collected_preds), np.array(collected_targets))
    

In [None]:
# Hyper-parameters for the FFNN run
batch_size = 64
epochs = 13
# NOTE(review): the embedding width is set to the vocabulary size - confirm this is intended
embedding_dim = len(vocab)
hidden_dim = 20
learning_rate = 2e-5

# Wrap the padded index tensors and their grades in datasets
train_tokens = Task1Dataset(train_idxs_padded, train_df["meanGrade"])
dev_tokens = Task1Dataset(dev_idxs_padded, dev_df["meanGrade"])
print(len(train_tokens), len(dev_tokens), sep="\n")

# Batch iterators: only the training data is shuffled
train_loader = DataLoader(train_tokens, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dev_tokens, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer
model = FFNN(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(vocab),
).to(device)
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Run the training loop
train(
    train_iter=train_loader,
    dev_iter=val_loader,
    model=model,
    number_epoch=epochs,
    optimizer=optimizer,
    loss_fn=loss_fn,
)



In [None]:
# Get the test RMSE of whichever `model` is currently bound (the FFNN when the
# notebook is run top to bottom).

test_tokens = Task1Dataset(test_idxs_padded, test_df["meanGrade"])

print(len(test_tokens))
# No shuffling; reuses the batch size of the preceding training cell
test_loader = DataLoader(test_tokens, shuffle = False, batch_size = batch_size)

# evaluation: only the model_performance()-based MSE is reported
test_loss, test_MSE, __, __ = eval(test_loader, model, loss_fn)
print(f'| MSE: {test_MSE:.2f} | RMSE: {test_MSE**0.5:.2f} |')


In [None]:
#CNN model
class CNN(nn.Module):
  """Single convolution layer over word embeddings with max-over-time pooling.

  Args:
    vocab_size: number of rows in the embedding table; index 0 is padding.
    embedding_dim: width of each word embedding.
    out_channels: number of convolution filters.
    window_size: number of consecutive tokens each filter spans.
    dropout: dropout probability applied to the pooled features.
  """

  def __init__(self, vocab_size, embedding_dim, out_channels, window_size, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
    # Each filter covers `window_size` tokens across the full embedding width
    self.conv = nn.Conv2d(1, out_channels, (window_size, embedding_dim))
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(out_channels, 1)

  def forward(self, x):
    """Return one prediction per row of the index tensor `x`, shape (batch, 1)."""
    # Add a channel axis so the embeddings can go through Conv2d
    token_vectors = self.embedding(x).unsqueeze(1)
    # The kernel spans the whole embedding width, so the last axis has size 1
    activations = nn.functional.relu(self.conv(token_vectors).squeeze(3))
    # Pool over every window position at once
    n_positions = activations.shape[2]
    strongest = nn.functional.max_pool1d(activations, n_positions).squeeze(2)
    return self.fc(self.dropout(strongest))

In [None]:
# Hyper-parameters for the CNN run with a one-token window
batch_size = 64
epochs = 15
# NOTE(review): the embedding width is set to the vocabulary size - confirm this is intended
embedding_dim = len(vocab)
hidden_dim = 15   # not passed to the CNN below
learning_rate = 3e-5
N_OUT_CHANNELS = 100
WINDOW_SIZE = 1
DROPOUT = 0.6

# Wrap the padded index tensors and their grades in datasets
train_tokens = Task1Dataset(train_idxs_padded, train_df["meanGrade"])
dev_tokens = Task1Dataset(dev_idxs_padded, dev_df["meanGrade"])
print(len(train_tokens), len(dev_tokens), sep="\n")

# Batch iterators: only the training data is shuffled
train_loader = DataLoader(train_tokens, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dev_tokens, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer
model = CNN(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    out_channels=N_OUT_CHANNELS,
    window_size=WINDOW_SIZE,
    dropout=DROPOUT,
).to(device)
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Run the training loop
train(
    train_iter=train_loader,
    dev_iter=val_loader,
    model=model,
    number_epoch=epochs,
    optimizer=optimizer,
    loss_fn=loss_fn,
)

In [None]:
# Get the test RMSE of whichever `model` is currently bound (the window-1 CNN
# when the notebook is run top to bottom).

test_tokens = Task1Dataset(test_idxs_padded, test_df["meanGrade"])

print(len(test_tokens))
# No shuffling; reuses the batch size of the preceding training cell
test_loader = DataLoader(test_tokens, shuffle = False, batch_size = batch_size)

# evaluation: only the model_performance()-based MSE is reported
test_loss, test_MSE, __, __ = eval(test_loader, model, loss_fn)
print(f'| MSE: {test_MSE:.2f} | RMSE: {test_MSE**0.5:.2f} |')


In [None]:
# Hyper-parameters for the CNN run with a two-token window
batch_size = 64
epochs = 8
# NOTE(review): the embedding width is set to the vocabulary size - confirm this is intended
embedding_dim = len(vocab)
hidden_dim = 15   # not passed to the CNN below
learning_rate = 1e-5
N_OUT_CHANNELS = 100
WINDOW_SIZE = 2
DROPOUT = 0.4

# Wrap the padded index tensors and their grades in datasets
train_tokens = Task1Dataset(train_idxs_padded, train_df["meanGrade"])
dev_tokens = Task1Dataset(dev_idxs_padded, dev_df["meanGrade"])
print(len(train_tokens), len(dev_tokens), sep="\n")

# Batch iterators: only the training data is shuffled
train_loader = DataLoader(train_tokens, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dev_tokens, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer
model = CNN(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    out_channels=N_OUT_CHANNELS,
    window_size=WINDOW_SIZE,
    dropout=DROPOUT,
).to(device)
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Run the training loop
train(
    train_iter=train_loader,
    dev_iter=val_loader,
    model=model,
    number_epoch=epochs,
    optimizer=optimizer,
    loss_fn=loss_fn,
)

In [None]:
# Get the test RMSE of whichever `model` is currently bound (the window-2 CNN
# when the notebook is run top to bottom).

test_tokens = Task1Dataset(test_idxs_padded, test_df["meanGrade"])

print(len(test_tokens))
# No shuffling; reuses the batch size of the preceding training cell
test_loader = DataLoader(test_tokens, shuffle = False, batch_size = batch_size)

# evaluation: only the model_performance()-based MSE is reported
test_loss, test_MSE, __, __ = eval(test_loader, model, loss_fn)
print(f'| MSE: {test_MSE:.2f} | RMSE: {test_MSE**0.5:.2f} |')


In [None]:
# bidirectional LSTM model
class BiLSTM(nn.Module):
    """Bidirectional LSTM regressor over word embeddings.

    Args:
        embedding_dim: width of each word embedding.
        hidden_dim: hidden size of each LSTM direction.
        vocab_size: number of rows in the embedding table; index 0 is padding.
        batch_size: batch size used to build the initial hidden state.
        device: device the hidden-state tensors are created on.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim (per direction).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # Linear layer mapping the concatenated forward/backward outputs to one score
        self.hidden2label = nn.Linear(hidden_dim * 2, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a zero (h_0, c_0) pair sized for `self.batch_size`."""
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        shape = (2, self.batch_size, self.hidden_dim)
        return torch.zeros(shape).to(self.device), \
               torch.zeros(shape).to(self.device)

    def forward(self, sentence):
        """Return one prediction per row of the index tensor `sentence`, shape (batch, 1)."""
        # nn.LSTM defaults to (seq_len, batch, features), so swap the first two axes
        embedded = self.embedding(sentence).permute(1, 0, 2)

        # Derive the batch size from the input instead of trusting the caller
        # to have patched `batch_size`/`hidden`; a final, smaller batch would
        # otherwise fail with a shape mismatch.
        actual_batch = embedded.shape[1]
        if self.hidden[0].shape[1] != actual_batch:
            self.batch_size = actual_batch
            self.hidden = self.init_hidden()

        lstm_out, hidden = self.lstm(embedded, self.hidden)
        # Store the state without its autograd history so a later call that
        # reuses it does not backpropagate through this batch's graph again.
        self.hidden = tuple(state.detach() for state in hidden)

        # NOTE(review): the last time step is used as the sentence summary; with
        # zero-padded inputs that step can be padding - confirm this is intended.
        out = self.hidden2label(lstm_out[-1])
        return out

In [None]:
# Define training loop

def train(train_iter, dev_iter, model, number_epoch, optimizer, loss_fn,
          rmse_threshold=0.540,
          save_path='/content/drive/MyDrive/nlp_cw/approach1_model.pth'):
    """
    Training loop for the BiLSTM model, which calls on eval to evaluate after each epoch.

    Before every batch the model's `batch_size` is set to the size of that
    batch and its hidden state is reset through `model.init_hidden()`.

    Args:
        train_iter: iterable of (feature, target) training batches.
        dev_iter: iterable of (feature, target) batches passed to eval().
        model: module being trained; must expose `batch_size`, `hidden` and
            `init_hidden()`.
        number_epoch: maximum number of epochs to run.
        optimizer: optimizer stepped once per training batch.
        loss_fn: loss called as loss_fn(predictions, target).
        rmse_threshold: stop training and save the model as soon as the
            validation RMSE falls below this value.
        save_path: file the model is written to when the threshold is reached.
    """

    print("Training model.")

    for epoch in range(number_epoch):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0    # Observations used for training so far

        for feature, target in train_iter:

            feature, target = feature.to(device).long(), target.float().to(device)

            no_observations += target.shape[0]
            # Size the recurrent state for this batch and start it from zeros
            model.batch_size = target.shape[0]
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]     # Get the SSE of this batch from loss_fn()
            epoch_sse += sse       # Get the SSE of this batch from our model_performance() function

        valid_loss, valid_mse, __, __ = eval(dev_iter, model, loss_fn)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch+1:02} | Train Loss: {epoch_loss:.4f} | Train MSE: {epoch_mse:.4f} | Train RMSE: {epoch_mse**0.5:.4f} | \
        Val. Loss: {valid_loss:.4f} | Val. MSE: {valid_mse:.4f} |  Val. RMSE: {valid_mse**0.5:.4f} |')

        # Early stop: keep the first model that is good enough on the dev set.
        if valid_mse**0.5 < rmse_threshold:
            # The setup cell only creates a different Drive folder, so make
            # sure the target directory exists before torch.save writes to it.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save(model, save_path)
            break


In [None]:
# Evaluate performance on dev set

def eval(data_iter, model, loss_fn):
    """
    Score the recurrent `model` on every batch of `data_iter` without tracking gradients.

    The model's `batch_size` and hidden state are reset before each batch.
    Returns a 4-tuple: the observation-weighted mean of loss_fn, the mean
    squared error derived from model_performance(), and NumPy arrays holding
    all predictions and all targets in iteration order.
    """
    model.eval()
    loss_total, sse_total, seen = 0, 0, 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for batch_x, batch_y in data_iter:
            batch_x = batch_x.to(device).long()
            batch_y = batch_y.float().to(device)

            n_batch = batch_y.shape[0]
            seen += n_batch
            # Size the recurrent state for this batch and start it from zeros
            model.batch_size = n_batch
            model.hidden = model.init_hidden()

            outputs = model(batch_x).squeeze(1)
            batch_loss = loss_fn(outputs, batch_y)

            # NumPy copies for the metric helper and for the returned arrays
            pred_np = outputs.detach().cpu().numpy()
            target_np = batch_y.detach().cpu().numpy()
            batch_sse, __ = model_performance(pred_np, target_np)

            # loss_fn value is scaled by the batch size before accumulating,
            # so dividing by `seen` below weights batches by their size
            loss_total += batch_loss.item() * n_batch
            sse_total += batch_sse
            collected_preds.extend(pred_np)
            collected_targets.extend(target_np)

    return (loss_total / seen, sse_total / seen,
            np.array(collected_preds), np.array(collected_targets))

In [None]:
# Hyper-parameters for the BiLSTM run
batch_size = 64
epochs = 10
# NOTE(review): the embedding width is set to the vocabulary size - confirm this is intended
embedding_dim = len(vocab)
hidden_dim = 10
learning_rate = 8e-5

# Wrap the padded index tensors and their grades in datasets
train_tokens = Task1Dataset(train_idxs_padded, train_df["meanGrade"])
dev_tokens = Task1Dataset(dev_idxs_padded, dev_df["meanGrade"])
print(len(train_tokens), len(dev_tokens), sep="\n")

# Batch iterators: only the training data is shuffled
train_loader = DataLoader(train_tokens, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dev_tokens, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer
model = BiLSTM(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(vocab),
    batch_size=batch_size,
    device=device,
).to(device)
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Run the training loop
train(
    train_iter=train_loader,
    dev_iter=val_loader,
    model=model,
    number_epoch=epochs,
    optimizer=optimizer,
    loss_fn=loss_fn,
)



In [None]:
# Get the test RMSE of whichever `model` is currently bound (the BiLSTM when
# the notebook is run top to bottom).

test_tokens = Task1Dataset(test_idxs_padded, test_df["meanGrade"])

print(len(test_tokens))
# No shuffling; reuses the batch size of the preceding training cell
test_loader = DataLoader(test_tokens, shuffle = False, batch_size = batch_size)

# evaluation: only the model_performance()-based MSE is reported
test_loss, test_MSE, __, __ = eval(test_loader, model, loss_fn)
print(f'| MSE: {test_MSE:.2f} | RMSE: {test_MSE**0.5:.2f} |')


In [None]:
# Pad every split to one shared width so train, dev and test feature
# matrices have the same number of columns.
def collate_fn_padd(data, max_len=31):
  """Return a (len(data), max_len) tensor of zero-padded indices on `device`.

  Args:
    data: list of index lists, one per sentence.
    max_len: number of columns of the result. Shorter sentences are
      right-padded with 0; longer ones are truncated to their first
      `max_len` indices instead of raising a shape error.

  The buffer comes from torch.zeros with its default dtype, so the indices
  are stored as floats.
  """
  padded = torch.zeros((len(data), max_len))

  for row, sentence in enumerate(data):
    kept = sentence[:max_len]
    padded[row, :len(kept)] = torch.LongTensor(kept)

  return padded.to(device)

In [None]:
# Re-pad each split with the redefined collate function and show the resulting shapes.
train_idxs_padded, test_idxs_padded, dev_idxs_padded = (
    collate_fn_padd(split).to(device)
    for split in (train_idxs, test_idxs, dev_idxs)
)

print(
    train_idxs_padded.size(),
    test_idxs_padded.size(),
    dev_idxs_padded.size(),
    sep="\n",
)

In [None]:
# Linear regression model (not logistic regression) fitted directly on the
# padded token-index matrix.
# NOTE(review): the features are raw vocabulary indices used as numbers - confirm this is intended.
regression_model = LinearRegression().fit(train_idxs_padded.cpu().numpy(), train_df["meanGrade"])

# Train predictions
predicted_train = regression_model.predict(train_idxs_padded.cpu().numpy())


# Dev predictions
predicted = regression_model.predict(dev_idxs_padded.cpu().numpy())

# We run the evaluation:
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, train_df["meanGrade"], True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_df["meanGrade"], True)

In [None]:
# Get the test RMSE of the linear regression fitted in the previous cell
predicted_test = regression_model.predict(test_idxs_padded.cpu().numpy())

print(len(predicted_test))  # number of test predictions

# evaluation
print("\nTest performance:")
sse, mse = model_performance(predicted_test, test_df["meanGrade"], True)

In [None]:
# Random forest regression model fitted directly on the padded token-index matrix.
# Imported here because the notebook's setup cell does not import it.
from sklearn.ensemble import RandomForestRegressor

# Seeded with the notebook-wide SEED so repeated runs build the same forest.
rfc = RandomForestRegressor(n_estimators=30, max_depth=5, random_state=SEED)
rfc = rfc.fit(train_idxs_padded.cpu().numpy(), train_df["meanGrade"])
predicted_train = rfc.predict(train_idxs_padded.cpu().numpy())
predicted = rfc.predict(dev_idxs_padded.cpu().numpy())

# We run the evaluation:
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, train_df["meanGrade"], True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_df["meanGrade"], True)

In [None]:
# Get the test RMSE of the random forest fitted in the previous cell
predicted_test = rfc.predict(test_idxs_padded.cpu().numpy())

print(len(predicted_test))  # number of test predictions

# evaluation
print("\nTest performance:")
sse, mse = model_performance(predicted_test, test_df["meanGrade"], True)
