# **NLP SemEval 2021 | Team - 13 | Task - 1**

**Install Requirements**

In [1]:
!pip install wget
!pip install gdown
!pip install syllables
!pip -q install transformers

# Below are required when using all handcarfted features
# !pip install sqlalchemy
# !pip3 install wordfreq

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=629db2e753b8dfc7819a4a862ab7b68534ae0fbac969fac2abc35b025bf25b38
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting syllables
  Downloading https://files.pythonhosted.org/packages/16/d9/81a31f640ccf405fdfd0eae8eebfc2579b438804dbf34dc03cad3e76169a/syllables-0.1.0-py2.py3-none-any.whl
Installing collected packages: syllables
Successfully installed syllables-0.1.0
[K     |████████████████████████████████| 2.2MB 5.4MB/s 
[K     |████████████████████████████████| 3.3MB 17.5MB/s 
[K     |███████████████████

**Download Dataset**

In [2]:
%%bash
rm -r sample_data

mkdir train
cd train
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_single_train.tsv?dl=1' -O 'lcp_single_train.tsv'
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_multi_train.tsv?dl=1' -O 'lcp_multi_train.tsv'
cd ..

mkdir trial
cd trial
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/trial/lcp_single_trial.tsv' -O 'lcp_single_trial.tsv'
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/trial/lcp_multi_trial.tsv' -O 'lcp_multi_trial.tsv'
cd ..

mkdir test
cd test
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test/lcp_single_test.tsv?dl=1' -O 'lcp_single_test.tsv'
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test/lcp_multi_test.tsv?dl=1' -O 'lcp_multi_test.tsv'
cd ..

mkdir test-labels
cd test-labels
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_single_test.tsv?dl=1' -O 'lcp_single_test.tsv'
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_multi_test.tsv?dl=1' -O 'lcp_multi_test.tsv'
cd ..

mkdir score
mkdir glove-embeddings
mkdir saves
# cd glove-embeddings
# wget -q 'http://nlp.stanford.edu/data/glove.6B.zip' -O 'glove.6B.zip'
# unzip 'glove.6B.zip'
# wget -q 'http://nlp.stanford.edu/data/glove.840B.300d.zip' -O 'glove.840B.300d.zip'
# unzip 'glove.840B.300d.zip'
# cd ..
# rm -r mrcDB
# mkdir mrcDB
# cd mrcDB
# git clone 'https://github.com/samzhang111/mrc-psycholinguistics'
# cd mrc-psycholinguistics
# python extract.py
# cd ../..


**Import necessaty packages**

In [3]:
import time
import csv
import wget
import os
import copy
import sqlite3
import syllables
import pytz
from collections import defaultdict
import glob
import sys

import numpy as np
import pandas as pd
import spacy
import torch
from torch import nn
import sklearn
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm, trange
from transformers import BertTokenizer
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, random_split, DataLoader
from torch.utils.data import ConcatDataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

from datetime import datetime
IST = pytz.timezone('Asia/Kolkata')

# from wordfreq import word_frequency

# !python -m spacy download en_core_web_lg
# # !python -m spacy link en_core_web_lg en
# nlp = spacy.load('en')

**Variables used ahead can be changed here**

In [20]:
# Variables
random_seed = 42
random_state = 42
batch_size = 32
emb_dim = 300
epochs = 5

# Directory to save trained models
model_save_dir = '/content/saves/'

# True if you want to use already processed glove embeddings stored in drive
# False to generate glove embeddings using 'glove.840B.300d.zip' file (Time Consuming)
use_pre_processed_glove_emvedding = True

# Below are preprocessed glove embessing urls | Enter below urls if "use_pre_processed_glove_emvedding = True" to use preprocessed word embeddings
preprocessed_glove_embedding_url_test_single = 'https://drive.google.com/uc?id=1tr6wO5JuOJKo0pVsNdjjcR_4xXwIiimk'
preprocessed_glove_embedding_url_test_multi = 'https://drive.google.com/uc?id=1BbPkLw6Obq8g7jdhMDyn-fcougUcjjE-'
preprocessed_glove_embedding_url_train_multi = 'https://drive.google.com/uc?id=1WeJmYV0yIWPvLp2diaMFvPP1-zfP5nZb'

# Enter below urls/path if "use_pre_processed_glove_emvedding = False" i.e. extract glove embeddings from original corpus (Time Consuming)
glove_embedding_url = 'http://nlp.stanford.edu/data/glove.840B.300d.zip' 
glove_embedding_download_path = '/content/glove-embeddings/glove.840B.300d.txt' 
glove_embedding_save_path = '/content/glove-embeddings/preprocessed_glove_embeddings.csv'#do not change

In [21]:
# Initialization
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: ", device)
print("Device Name: ",torch.cuda.get_device_name())

Device:  cuda:0
Device Name:  Tesla K80


**Load train and test data from csv files**
> Combine both training datasets of single and multi word tokens






In [6]:
# read data
train_data_single =  pd.read_csv('train/lcp_single_train.tsv',  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
train_data_multi =  pd.read_csv('train/lcp_multi_train.tsv',  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
test_data_single = pd.read_csv('test-labels/lcp_single_test.tsv',  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
test_data_multi = pd.read_csv('test-labels/lcp_multi_test.tsv',  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

train_data = train_data_single
train_data = train_data.append(train_data_multi, ignore_index=True)

pandas reads "null" word tokens as null values, converting them back to "null" string

In [7]:
def remove_dataset_errors(data):
    for i, token in enumerate(list(data.token)):
        if isinstance(token, float):
            data.token.iloc[i]="null"
            # print(i,token)
    return data

train_data = remove_dataset_errors(train_data)
test_data_single = remove_dataset_errors(test_data_single)
test_data_multi = remove_dataset_errors(test_data_multi)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


**Dataset Class is used to do all the preprocessings required on datasets**

> Commented part of the code was used to generate features that were used during experimentation phase but removed from the final code



In [8]:
class Dataset():
    def __init__(self, data, split, batch_size=32, max_len=128):
        self.split = split                                          # current dataset type
        self.data = data                                            # complete data
        self.max_len = max_len                                      # max sentence len for Bert Tokenizer
        self.batch_size = batch_size                                

        self.tokens = list(data.token)
        self.sentences = list(data.sentence)
        self.complexity = torch.Tensor(list(data.complexity))

        # Glove word embedding for tokens
        self.token_embedding = torch.FloatTensor(self.word_glove_embeddings(data.token).values)

        # Bert Sentence to vector id conversion with attention mask
        self.tokenised_sent_id, self.sent_attention_mask = self.BertSentTokenizer()

        # Hand-Crafted Features
        self.token_len = self.token_sizes(data.token)                                                       # length of token
        self.syllable_count = self.syllableCount(data.token)                                                # syllable count in token
        self.vowel_count = self.vowelCount(data.token)                                                      # vowel count in token
        self.token_type = self.get_token_type()                                                             # token_type = 1 for single word tokens and 0 for multi word tokens
        # self.imageability, self.concreteness = self.get_Imageability_and_Concreteness(data.token)         
        # self.term_freq_doc = self.wordFrequencyInDoc()
        # self.term_freq_wiki = self.wordFrequencyWiki(data.token)
        # self.pos_tag = self.tokenPOSTag(data.token, data.sentence)

        self.handcrafted_features = torch.FloatTensor(                              # Handcrafted features combined together
                                        np.hstack((
                                            self.token_len,
                                            self.syllable_count,
                                            self.vowel_count,
                                            self.token_type
                                            # self.imageability,
                                            # self.concreteness,
                                            # self.term_freq_doc,
                                            # self.term_freq_wiki,
                                            # self.pos_tag,
                                            ))
                                    )

    
    def get_token_type(self):
        token_type = []
        for token in self.tokens:
            if len(token.split(' '))>1:
                token_type.append([0])
            else:
                token_type.append([1])
        
        return np.array(token_type)

    def vowelCount(self, tokens):   
        vowel_count = []
        vowels = ['a','e','i','o','u']

        for token in tokens:
            count = len([1 for c in token if c in vowels])
            vowel_count.append(count)
        
        max_vowel_count=max(vowel_count)
        vowel_count = [[v/max_vowel_count] for v in vowel_count]
        return np.array(vowel_count)

    def syllableCount(self, tokens):
        syllable_count = []
        
        for token in tokens:
            syllable_count.append(syllables.estimate(token))
        
        max_syllable_count = max(syllable_count)
        syllable_count = [[sc/max_syllable_count] for sc in syllable_count]
        
        return np.array(syllable_count)

    def token_sizes(self, tokens):
        max_token_len = 0
        # token length, word frequency, 
        for token in tokens:
            max_token_len = max(max_token_len, len(token))

        tokenSizes = []
        for token in tokens:
            tokenSizes.append( [len(token)/max_token_len] )

        return np.array(tokenSizes)

    def BertSentTokenizer(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        tokenised_sent_id = []
        sent_attention_mask =[]

        for sent, token in zip(self.sentences, self.tokens):
            inp = str(token).lower() + ' [SEP] ' +  str(sent).lower() 
            out = tokenizer.encode_plus(inp, add_special_tokens=True, max_length=self.max_len, padding='max_length', 
                                            return_attention_mask = True, return_tensors = 'pt', truncation = True)
            tokenised_sent_id.append(out['input_ids'])
            sent_attention_mask.append(out['attention_mask'])

        tokenised_sent_id = torch.cat(tokenised_sent_id, dim=0)
        sent_attention_mask = torch.cat(sent_attention_mask, dim=0)

        return tokenised_sent_id, sent_attention_mask

    def generate_word_glove_embeddings(self,tokens):
        print("Filtering token Embeddings ...")
        embedding_dict={}
        with open(glove_embedding_download_path, 'r', encoding="utf-8") as f:
            for line in f:
                try:
                    cols = line.split()
                    token = cols[0]
                    vector = np.asarray(cols[1:],"float32")
                    embedding_dict[token] = vector
                except:
                    #print(line)
                    flag=1
        
        glove_embeddings=[]
        for multi_word in tokens:
            if len(multi_word.split(' '))==1:
                if token in embedding_dict.keys():
                    glove_embeddings.append(embedding_dict[token])
                else:
                    glove_embeddings.append(np.asarray([0]*300))
                    # print(token)
            else:
                words = multi_word.split(' ')
                if words[0] in embedding_dict.keys():
                    e1=embedding_dict[words[0]]
                else:
                    e1=np.asarray([0]*300)
                
                if words[1] in embedding_dict.keys():
                    e2=embedding_dict[words[1]]
                else:
                    e2=np.asarray([0]*300)
                
                glove_embeddings.append((e1+e2)/2)

        return pd.DataFrame(np.array(glove_embeddings))

    def word_glove_embeddings(self,tokens):
        if use_pre_processed_glove_emvedding == True:

            if self.split=='single test':
                out='/content/glove-embeddings/preprocessed_glove_embeddings_test_single.csv'
                if not os.path.isfile(out):
                    cmd = "gdown '{url}' -O '{out}'".format(url=preprocessed_glove_embedding_url_test_single, out=out)
                    !eval {cmd}

            elif self.split=='multi test':
                out='/content/glove-embeddings/preprocessed_glove_embeddings_test_multi.csv'
                if not os.path.isfile(out):
                    cmd = "gdown '{url}' -O '{out}'".format(url=preprocessed_glove_embedding_url_test_multi, out=out)
                    !eval {cmd}

            elif self.split=='train':
                out='/content/glove-embeddings/preprocessed_glove_embeddings_train_multi.csv'
                if not os.path.isfile(out):
                    cmd = "gdown '{url}' -O '{out}'".format(url=preprocessed_glove_embedding_url_train_multi, out=out)
                    !eval {cmd}
        else:     
            if not os.path.isfile('/content/glove-embeddings/'+glove_embedding_url.split('/')[-1]):
                print("Downloading glove dataset ... (This may take few minutes)")
                wget.download(url=glove_embedding_url, out='/content/glove-embeddings/'+glove_embedding_url.split('/')[-1])
                print("Download Complete")
                cmd = "unzip '{input}' -d '{out}'".format(input='/content/glove-embeddings/'+glove_embedding_url.split('/')[-1], out='/content/glove-embeddings/')
                !eval {cmd}

            glove_embedding = self.generate_word_glove_embeddings(tokens)
            if self.split=='single test':
                glove_embedding.to_csv(glove_embedding_save_path[:-4]+"_test_single.csv")
            elif self.split=='multi test':
                glove_embedding.to_csv(glove_embedding_save_path[:-4]+"_test_multi.csv")
            elif self.split=='train':
                glove_embedding.to_csv(glove_embedding_save_path[:-4]+"_train_multi.csv")
        
        if self.split=='single test':
             embedding = pd.read_csv('/content/glove-embeddings/preprocessed_glove_embeddings_test_single.csv', encoding='utf-8')
        elif self.split=='multi test':
            embedding = pd.read_csv('/content/glove-embeddings/preprocessed_glove_embeddings_test_multi.csv', encoding='utf-8')
        elif self.split=='train':
            embedding = pd.read_csv('/content/glove-embeddings/preprocessed_glove_embeddings_train_multi.csv', encoding='utf-8')
        
        embedding = embedding.loc[:, ~embedding.columns.str.contains('^Unnamed')]

        return embedding

    # def tokenPOSTag(self, tokens, sentences):
    #     tag_dict=['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE']
    #     pos_tag=[]
    #     for token, sent in zip(list(tokens),list(sentences)):
    #         try:
    #             doc = nlp(sent)
    #             f=1
    #             for word in doc:
    #                 if word.text == token:
    #                     pos_tag.append([tag_dict.index(word.pos_)])
    #                     f=0
    #                     break
    #             if f==1:
    #                 pos_tag.append([8])
    #         except:
    #             pos_tag.append([8])
    #             print(token, sent)

    #     return np.array(pos_tag)

    # def wordFrequencyWiki(self, tokens):
    #     term_freq=[]
    #     for token in tokens:
    #         term_freq.append([word_frequency(token,'en')])
        
    #     return np.array(term_freq)

    # def wordFrequencyInDoc(self):
    #     wordFreqNorm = []
    #     for token in self.data.token:
    #         wordFreqNorm.append([combinedTermValues.wordFreqNormDict[token]])

    #     return np.array(wordFreqNorm)
        
    # def get_Imageability_and_Concreteness(self,tokens):
        # connection = sqlite3.connect('/content/mrcDB/mrc-psycholinguistics/mrc2.db')
        
        # imageability, concreteness = [], []

        # for token in tokens:
        #     token=token.upper()
        #     cursor = connection.cursor()
        #     cursor.execute("SELECT imag,conc FROM 'word' WHERE word = '" + token + "'")
            
        #     try:
        #         val = cursor.fetchall()[0][0]
        #         if val==0:
        #             imageability.append([-1,0])
        #         else:
        #             imageability.append([val,1])
        #     except:
        #         imageability.append([-1,0])

        #     try:
        #         val = cursor.fetchall()[0][1]
        #         if val==0:
        #             concreteness.append([-1,0])
        #         else:
        #             concreteness.append([val,1])
        #     except:
        #         concreteness.append([-1,0])

        # connection.close()
        # return np.array(imageability), np.array(concreteness)


In [9]:
dataset={
    'single test':Dataset(data=test_data_single, split='single test'),
    'multi test':Dataset(data=test_data_multi, split='multi test'),
    'train':Dataset(data=train_data, split='train'),
}

Downloading...
From: https://drive.google.com/uc?id=1tr6wO5JuOJKo0pVsNdjjcR_4xXwIiimk
To: /content/glove-embeddings/preprocessed_glove_embeddings_test_single.csv
0.00B [00:00, ?B/s]2.38MB [00:00, 74.8MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Downloading...
From: https://drive.google.com/uc?id=1BbPkLw6Obq8g7jdhMDyn-fcougUcjjE-
To: /content/glove-embeddings/preprocessed_glove_embeddings_test_multi.csv
100% 571k/571k [00:00<00:00, 75.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WeJmYV0yIWPvLp2diaMFvPP1-zfP5nZb
To: /content/glove-embeddings/preprocessed_glove_embeddings_train_multi.csv
24.5MB [00:00, 149MB/s]


**Creating dataloaders using Datasets**

In [10]:
def get_dataloaders(dataset, single_size):
    dataloaders={}

    # Breaking train dataset into train and validation equally from single and muilti word dataset with split = 0.2
    train=dataset['train']
    tokenised_sent_id, sent_attention_mask, complexity, token_embedding, handcrafted_features = sklearn.utils.shuffle(train.tokenised_sent_id[:single_size], train.sent_attention_mask[:single_size], train.complexity[:single_size], train.token_embedding[:single_size], train.handcrafted_features[:single_size], random_state=random_state)
    train_ids, val_ids, _, _ = train_test_split(range(tokenised_sent_id.shape[0]), tokenised_sent_id, test_size = 0.2, random_state = random_state)
    train_data_single = TensorDataset(tokenised_sent_id[train_ids], sent_attention_mask[train_ids], complexity[train_ids], token_embedding[train_ids], handcrafted_features[train_ids])
    val_data_single = TensorDataset(tokenised_sent_id[val_ids], sent_attention_mask[val_ids], complexity[val_ids], token_embedding[val_ids], handcrafted_features[val_ids])

    tokenised_sent_id, sent_attention_mask, complexity, token_embedding, handcrafted_features = sklearn.utils.shuffle(train.tokenised_sent_id[single_size:], train.sent_attention_mask[single_size:], train.complexity[single_size:], train.token_embedding[single_size:], train.handcrafted_features[single_size:], random_state=random_state)
    train_ids, val_ids, _, _ = train_test_split(range(tokenised_sent_id.shape[0]), tokenised_sent_id, test_size = 0.2, random_state = random_state)
    train_data_multi = TensorDataset(tokenised_sent_id[train_ids], sent_attention_mask[train_ids], complexity[train_ids], token_embedding[train_ids], handcrafted_features[train_ids])
    val_data_multi = TensorDataset(tokenised_sent_id[val_ids], sent_attention_mask[val_ids], complexity[val_ids], token_embedding[val_ids], handcrafted_features[val_ids])

    train_data = ConcatDataset([train_data_single, train_data_multi])
    val_data = ConcatDataset([val_data_single, val_data_multi])

    # Train Dataloader
    train_sampler = RandomSampler(train_data)
    dataloaders['train'] = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Validation Dataloader
    val_sampler = SequentialSampler(val_data)
    dataloaders['val'] = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    # Single word token Test Dataloader
    test=dataset['single test']
    test_data = TensorDataset(test.tokenised_sent_id, test.sent_attention_mask, test.complexity, test.token_embedding, test.handcrafted_features)
    test_sampler = SequentialSampler(test_data) 
    dataloaders['single test'] = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # Multi word token Test Dataloader
    test=dataset['multi test']
    test_data = TensorDataset(test.tokenised_sent_id, test.sent_attention_mask, test.complexity, test.token_embedding, test.handcrafted_features)
    test_sampler = SequentialSampler(test_data)
    dataloaders['multi test'] = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    return dataloaders


In [11]:
dataloader = get_dataloaders(dataset, len(train_data_single.id))

**Our Model**


> LSTM -> For Glove word embeddings

> BERT -> For Sentence embeddings

> Fully connected layed to combine handcrafted features and generate probability







In [12]:
class Model(torch.nn.Module):
    def __init__(self, input_dim, n_hidden=100, n_layers=2, dropout=0.25, lr=0.001):
        super(Model, self).__init__()

        self.dropout = dropout
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # LSTM 
        self.lstm =  nn.LSTM(emb_dim, n_hidden, n_layers, dropout=dropout, batch_first=True, bidirectional = True)
        self.linear1 = nn.Linear(self.n_hidden * 2, 500, bias = True)

        # BERT for Sentence
        self.sent_embd = BertModel.from_pretrained('bert-base-uncased',  output_hidden_states = True)
        self.l1 = nn.Linear(input_dim, 500, bias = True)
        self.dropout = nn.Dropout(0.25)
        self.relu = nn.ReLU()

        # Word + Sentence + Handcrafted
        self.linear = nn.Sequential(
                      nn.Linear(1004, 400),
                      nn.Dropout(0.25),
                      nn.ReLU(),
                      nn.Linear(400, 100),
                      nn.Dropout(0.25),
                      nn.ReLU(),
                      nn.Linear(100, 1),
        )
        self.sigmoid = nn.Sigmoid()


    
    def init_hidden(self, batch_size):

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
        hidden = ((torch.zeros(2 * self.n_layers, batch_size, self.n_hidden)).to(device),
                  (torch.zeros(2 * self.n_layers, batch_size, self.n_hidden)).to(device))

        return hidden

    def forward(self, tokenised_sent_id, sent_attention_mask, token_embedding, handcrafted_features, batch_size):
        # LSTM
        hidden = self.init_hidden(batch_size)
        lstm_out, hidden = self.lstm(token_embedding.unsqueeze(dim=1), hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = self.linear1(lstm_out)
        y_pred_word = self.relu(lstm_out)
        y_pred_word = y_pred_word.squeeze(dim=1)

        # BERT
        sent_embed = self.sent_embd(tokenised_sent_id,sent_attention_mask)[1]
        sent_embed = self.dropout(sent_embed)
        sent_embed = self.l1(sent_embed)
        y_pred = self.relu(sent_embed)
        y_pred = y_pred.view(y_pred.shape[0], -1)

        # Combine
        # print(y_pred_word.shape)
        comb = torch.cat([y_pred, y_pred_word, handcrafted_features], dim = 1)
        final = self.linear(comb)
        final = final.view(final.shape[0])
        out = self.sigmoid(final)

        return out

**Functions to save and load model metrics**

In [13]:
def save_metrics(model_save_dir, epochs, model, optimizer, L1):

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'epochs': epochs+1,
                  'L1': L1}
    
    file_name = model_save_dir+"saved_model_epoch_"+str(epochs)+".pt"
    torch.save(state_dict, file_name)
    print(f'Model saved to ==> {file_name}')

def load_metrics(load_path, model, optimizer):
    try: 
        state_dict = torch.load(load_path, map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        print(f'Model loaded from <== {load_path}')
    except: 
        if load_path!='':
            print('pretrained model load failed!!! \n Continiing training without loading ...')
        state_dict = {}

    
    return state_dict.get('epochs', 0), state_dict.get('L1', 1000)

In [14]:
model = Model(768).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




# **Train Model**

In [22]:
# Function to evaluate model on validation dataset
def evaluate(test_dataloader, model):
    model.eval()
    total_eval_accuracy=0

    total_loss = 0
    criterion = nn.L1Loss()

    y_preds = np.array([])
    y_test = np.array([])

    for batch in test_dataloader:
        tokenised_sent_id = batch[0].to(device)
        sent_attention_mask = batch[1].to(device)
        complexity_label = batch[2].to(device)
        token_embedding = batch[3].to(device)
        handcrafted_features = batch[4].to(device)

        with torch.no_grad():
            ypred = model(tokenised_sent_id, sent_attention_mask, token_embedding, handcrafted_features, batch_size = batch[0].to(device).shape[0])        

        ypred = ypred.to('cpu').numpy()
        complexity_label = complexity_label.to('cpu').numpy()

        y_preds = np.hstack((y_preds, ypred))
        y_test = np.hstack((y_test, complexity_label))

    corr, _ = pearsonr(y_preds, y_test)
    loss = np.mean(np.abs(y_preds-y_test))
    return loss, y_preds, y_test, corr
 
def train(training_dataloader, validation_dataloader, model, model_save_dir, load_pretrained_model_path='', epochs = 4):
    optimizer = AdamW(model.parameters(), lr=2e-5, eps = 1e-8)
    criterion = nn.L1Loss()

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(training_dataloader) * epochs)
    
    best_model = copy.deepcopy(model)

    cur_epoch, best_l1 = load_metrics(load_pretrained_model_path, model, optimizer)                 # Used to continue training from a checkpoint (load_pretrained_model_path must be provided)
    best_epoch = -1

    for epoch_i in range(0, epochs):
        print("Epoch: ",epoch_i,"/",epochs)
        total_train_loss = 0
        model.train()

        for step, batch in tqdm(enumerate(training_dataloader), total=230):
            tokenised_sent_id = batch[0].to(device)
            sent_attention_mask = batch[1].to(device)
            complexity_label = batch[2].to(device)
            token_embedding = batch[3].to(device)
            handcrafted_features = batch[4].to(device)

            outputs = model(tokenised_sent_id, sent_attention_mask, token_embedding, handcrafted_features, batch_size = batch[0].to(device).shape[0])

            loss = criterion(outputs, complexity_label)
 
            if step%50 == 0:
                print("Current Loss - ", loss.item())
 
            total_train_loss += loss
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()


        print(f'Total Train Loss = {total_train_loss}')
        print('#############  Validation  ###########')
        l1_loss, _, _, corr = evaluate(validation_dataloader, model)
        print("  L1 loss: ",l1_loss)
        print("  Pearson correlation: ",corr)
        
        if l1_loss < best_l1:
            best_l1 = l1_loss
            best_epoch = epoch_i
        save_metrics(model_save_dir, epoch_i, model, optimizer, l1_loss)
        print()

    return best_epoch

In [23]:
best_epoch = train(dataloader['train'], dataloader['val'], model, model_save_dir, epochs=epochs)

Epoch:  0 / 5


HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))

Current Loss -  0.15128256380558014
Current Loss -  0.09272909164428711
Current Loss -  0.0920928344130516
Current Loss -  0.08367624133825302
Current Loss -  0.06419429183006287

Total Train Loss = 23.4163875579834
#############  Validation  ###########
  L1 loss:  0.07568500482385485
  Pearson correlation:  0.7412422966988244
Model saved to ==> /content/saves/saved_model_epoch_0.pt

Epoch:  1 / 5


HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))

Current Loss -  0.09652426838874817
Current Loss -  0.07925502955913544
Current Loss -  0.08344821631908417
Current Loss -  0.06920888274908066
Current Loss -  0.060986779630184174

Total Train Loss = 16.690052032470703
#############  Validation  ###########
  L1 loss:  0.0687123005241399
  Pearson correlation:  0.7949601610017077
Model saved to ==> /content/saves/saved_model_epoch_1.pt

Epoch:  2 / 5


HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))

Current Loss -  0.05589332804083824
Current Loss -  0.06873182207345963
Current Loss -  0.0522407665848732
Current Loss -  0.05804141238331795
Current Loss -  0.060518693178892136

Total Train Loss = 15.018747329711914
#############  Validation  ###########
  L1 loss:  0.06565339433178347
  Pearson correlation:  0.8109830148762929
Model saved to ==> /content/saves/saved_model_epoch_2.pt

Epoch:  3 / 5


HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))

Current Loss -  0.06554285436868668
Current Loss -  0.0519709587097168
Current Loss -  0.05277273431420326
Current Loss -  0.05769840627908707
Current Loss -  0.06296104192733765

Total Train Loss = 13.87000846862793
#############  Validation  ###########
  L1 loss:  0.06517783281680807
  Pearson correlation:  0.8163811650153832
Model saved to ==> /content/saves/saved_model_epoch_3.pt

Epoch:  4 / 5


HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))

Current Loss -  0.07119307667016983
Current Loss -  0.05450516939163208
Current Loss -  0.06415815651416779
Current Loss -  0.04879383742809296
Current Loss -  0.056053049862384796

Total Train Loss = 13.030644416809082
#############  Validation  ###########
  L1 loss:  0.06377465297581063
  Pearson correlation:  0.8206857287634116
Model saved to ==> /content/saves/saved_model_epoch_4.pt



In [24]:
print("Best Epoch -> ",best_epoch)

Best Epoch ->  4


# **TEST**

**Function to generate predictions on test dataset**

In [25]:
def get_preds(test_dataloader, model):
    model.eval()
    y_preds = []
    total_loss = 0

    for batch in test_dataloader:
        tokenised_sent_id = batch[0].to(device)
        sent_attention_mask = batch[1].to(device)
        complexity_label = batch[2].to(device)
        token_embedding = batch[3].to(device)
        handcrafted_features = batch[4].to(device)

        with torch.no_grad():
            ypred = model(tokenised_sent_id, sent_attention_mask, token_embedding, handcrafted_features, batch_size = batch[0].to(device).shape[0])      
        
        ypred = ypred.to('cpu').numpy()
        y_preds = y_preds + list(ypred)

    return np.array(y_preds)

In [26]:
model = Model(768).to(device)

###**Function to generate scores on predicted labels** 
(reference -> https://github.com/MMU-TDMLab/CompLex/blob/master/evaluate.py)

Note: *Generated* Scores will be saved inside score directory (output format -> ```score/scores_<split_type>_epoch_<epoch_no>.txt```)

In [32]:
def generate_scores(split, filename, epoch_no):
    submission_dict = {}
    with open(filename, 'r') as sf:
        csv_reader = csv.reader(sf)
        for item in csv_reader:
            try:
                submission_dict[item[0]] = float(item[1])
            except:
                # print(item)
                pass

    #Read reference:
    gold_dict = {}
    if split=='single test':
        path='test-labels/lcp_single_test.tsv'
    else:
        path='test-labels/lcp_multi_test.tsv'

    reference_file = pd.read_csv(path,  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
    for id, complexity in zip(list(reference_file.id), list(reference_file.complexity)):
        gold_dict[id] = float(complexity)

    #Produce vectors with reference labels and predictions:
    gold = []
    predicted = []
    
    for key, label in gold_dict.items():
        gold.append(label)
        predicted.append(submission_dict[key])
    
    #Calculate scores:
    pearson_score = pearsonr(gold, predicted)[0]
    spearman_score = spearmanr(gold, predicted)[0]
    mae_score = mean_absolute_error(gold, predicted)
    mse_score = mean_squared_error(gold, predicted)
    rsq_score = r2_score(gold, predicted)

    result = ''
    result += "pearson:{0}\n".format(pearson_score)
    result += "spearman:{0}\n".format(spearman_score)
    result += "mae:{0}\n".format(mae_score)
    result += "mse:{0}\n".format(mse_score)
    result += "r2:{0}".format(rsq_score)

    print(result)
    with open('score/scores_'+split+'_epoch_'+str(epoch_no)+'.txt', 'w') as output_file:
        output_file.write(result)

**Function to generate test results and save predicted labels**

In [35]:
def generate_test_result(model, epoch_no, split):
    print('\n#### Result for {split} at epoch {epoch}'.format(split=split, epoch=epoch_no))

    state_dict = torch.load(f = '/content/saves/saved_model_epoch_'+str(epoch_no)+'.pt', map_location=device)
    model.load_state_dict(state_dict['model_state_dict'])

    preds = get_preds(dataloader[split], model)

    results= pd.DataFrame()
    results['id'] = list(dataset[split].data.id)
    results['complexity'] = list(preds)
    save_file = 'predictions_'+split+'_epoch_'+str(epoch_no)+'.csv'
    results.to_csv(save_file, header=False, index=False)

    generate_scores(split, save_file, epoch_no)

In [38]:
generate_test_result(model, epoch_no=best_epoch, split='single test')
generate_test_result(model, epoch_no=best_epoch, split='multi test')


#### Result for single test at epoch 4
pearson:0.7466255355101618
spearman:0.7112309325758975
mae:0.0655800635379018
mse:0.007282427964185199
r2:0.550052886911859

#### Result for multi test at epoch 4
pearson:0.8143426997868701
spearman:0.8180721397194806
mae:0.07184100362025027
mse:0.00815950547889697
r2:0.6619521559897441


**Move Result to Google Drive** (Optional)

In [39]:
# from google.colab import drive
# # drive.mount('/content/gdrive/')
# drive.mount('/content/drive/',force_remount=True)


# def save_to_drive(files, folder):
#     for file in files:
#         cmd = "cp -r '{file1}' '/content/drive/MyDrive/NLP_SemEval21/{time}/{file2}'".format(file1=file[0], time=folder, file2=file[1])
#         print(cmd)
#         !eval {cmd}
        

In [40]:
# folder = datetime.now(IST).strftime("%d-%m-%Y_%H:%M:%S")

In [41]:
# cmd="mkdir -p '/content/drive/MyDrive/NLP_SemEval21/{time}'".format(time=folder)
# !eval {cmd}
# cmd="mkdir -p '/content/drive/MyDrive/NLP_SemEval21/{time}/{fold}'".format(time=folder,fold="saves")
# !eval {cmd}
# cmd="mkdir -p '/content/drive/MyDrive/NLP_SemEval21/{time}/{fold}'".format(time=folder,fold="saves")
# !eval {cmd}
# files=[
#     ('/content/score',''),
#     ('/content/saves/saved_model_epoch_4.pt','saves/saved_model_epoch_4.pt'),
#     ('/content/saves/saved_model_epoch_2.pt','saves/saved_model_epoch_2.pt'),
#     ('predictions*.csv','')
# ]
# save_to_drive(files, folder)