In [1]:
!pip install spacy-syllables
!python -m spacy download en_core_web_sm
!pip3 install wordfreq

2021-04-11 15:13:05.724542: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from wordfreq import word_frequency
from scipy import stats
import csv
import spacy
from spacy_syllables import SpacySyllables
import os
import random

In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2021-04-11 15:13:17--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-11 15:13:17--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-11 15:13:17--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2021

In [4]:
!unzip glove*.zip

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [5]:
# https://www.kaggle.com/bminixhofer/deterministic-neural-networks-using-pytorch
# Seed all rngs for deterministic results
def seed_all(seed = 0):
  random.seed(0)
  os.environ['PYTHONHASHSEED'] = str(seed)
  torch.manual_seed(seed)
  np.random.seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True

In [6]:
seed_all(0)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("syllables", after='tagger') # Add the syllable tagger pipe

<spacy_syllables.SpacySyllables at 0x7f2d3b095210>

In [9]:
SINGLE_TRAIN_DATAPATH = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_single_train.tsv"
SINGLE_TEST_DATAPATH = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_single_test.tsv"

MULTI_TRAIN_DATAPATH = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_multi_train.tsv"
MULTI_TEST_DATAPATH = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_multi_test.tsv"

In [10]:
def get_data_frames():
  df_train_single = pd.read_csv(SINGLE_TRAIN_DATAPATH, sep='\t', quotechar="'", quoting=csv.QUOTE_NONE)
  df_test_single = pd.read_csv(SINGLE_TEST_DATAPATH, sep='\t', quotechar="'", quoting=csv.QUOTE_NONE)

  df_train_multi = pd.read_csv(MULTI_TRAIN_DATAPATH, sep='\t', quotechar="'", quoting=csv.QUOTE_NONE)
  df_test_multi = pd.read_csv(MULTI_TEST_DATAPATH, sep='\t', quotechar="'", quoting=csv.QUOTE_NONE)

  return df_train_single, df_test_single, df_train_multi, df_test_multi

In [11]:
df_train_single, df_test_single, df_train_multi, df_test_multi = get_data_frames()

Features used 

* Word Embedding [GloVe 50 dimensional embeddings](http://nlp.stanford.edu/data/glove.6B.zip)
* Length of word
* Syllable count [PyPy](https://pypi.org/project/syllables/)
* Word Frequency [PyPy](https://pypi.org/project/wordfreq/)
* POS tag [Spacy](https://spacy.io/usage/linguistic-features#pos-tagging)

[Reference](https://www.aclweb.org/anthology/W18-0508.pdf)


In [12]:
single_tokens_train_raw = df_train_single["token"].astype(str).to_list()
single_tokens_test_raw = df_test_single["token"].astype(str).to_list()

y_single_train = df_train_single["complexity"].astype(np.float32).to_numpy()
y_single_test = df_test_single["complexity"].astype(np.float32).to_numpy()

multi_tokens_train_raw = df_train_multi["token"].astype(str).to_list()
multi_tokens_test_raw = df_test_multi["token"].astype(str).to_list()

y_multi_train = df_train_multi["complexity"].astype(np.float32).to_numpy()
y_multi_test = df_test_multi["complexity"].astype(np.float32).to_numpy()

sent_train_single_raw = df_train_single["sentence"].to_list()
sent_test_single_raw = df_test_single["sentence"].to_list()

sent_train_multi_raw = df_train_multi["sentence"].to_list()
sent_test_multi_raw = df_test_multi["sentence"].to_list()

In [13]:
EMBEDDING_DIM = 50

def get_embeddings():
  embedding_index = {}
  with open('glove.6B.{}d.txt'.format(EMBEDDING_DIM), 'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      token = values[0]
      embedding_index[token] = np.asarray(values[1:], dtype='float32')
  return embedding_index

In [14]:
embedding_index = get_embeddings()
print('Token count in embeddings: {}'.format(len(embedding_index)))

Token count in embeddings: 400000


biLSTM to predict target probability

In [15]:
HIDDEN_DIM = 10

In [16]:
def prepare_sequence(seq, to_ix):
  seq = seq.split()
  idxs = [to_ix[w.lower()] if w.lower() in to_ix else len(to_ix) for w in seq]
  idxs = torch.tensor(idxs)
  idxs = nn.functional.one_hot(idxs, num_classes=len(to_ix))
  idxs = torch.tensor(idxs, dtype=torch.float32)
  return idxs


def map_token_to_idx():
  word_to_ix = {}
  word_to_ix_multi = {}
  for sent in sent_train_single_raw:
    sent = sent.split()
    for word in sent:
      word = word.lower()
      if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)

  for sent in sent_train_multi_raw:
    sent = sent.split()
    for word in sent:
      word = word.lower()
      if word not in word_to_ix_multi:
        word_to_ix_multi[word] = len(word_to_ix_multi)
  
  return word_to_ix, word_to_ix_multi

In [17]:
word_to_ix, word_to_ix_multi = map_token_to_idx()
print('SWE vocab size: {}\nMWE vocab size: {}'.format(len(word_to_ix), len(word_to_ix_multi)))

SWE vocab size: 24350
MWE vocab size: 9699


biLSTM to calculate token probability given context

In [18]:
class biLSTM(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
    super(biLSTM, self).__init__()
    self.hidden_dim = hidden_dim
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
    self.hidden2tag = nn.Linear(2 * hidden_dim, output_size)

  def prepare_embedding(self, sentence):
    embeddings = []
    for word in sentence:
      word = word.lower()
      if word in embedding_index:
        embeddings.extend(embedding_index[word])
      else:
        embeddings.extend(np.random.random(EMBEDDING_DIM).tolist())
    embeddings = torch.tensor(embeddings, dtype=torch.float32, device=device)
    return embeddings

  def forward(self, sentence):
    sentence = sentence.split()
    embeds = self.prepare_embedding(sentence)
    lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
    tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
    tag_scores = F.softmax(tag_space, dim=1)
    return tag_scores

biLSTM model for single word targets

In [19]:
model = biLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(word_to_ix))

In [20]:
print('Training biLSTM on single target expressions')
# Train the model for 10 epochs
model = biLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(word_to_ix))
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
for epoch in range(10):
  loss_sum = 0
  for sentence in sent_train_single_raw:
    model.zero_grad()
    targets = prepare_sequence(sentence, word_to_ix)
    tag_scores = model(sentence)
    loss = loss_function(tag_scores, targets)
    loss_sum += loss
    loss.backward()
    optimizer.step()
  print('Epoch: {} Loss: {}'.format(epoch, loss_sum.item()))

Training biLSTM on single target expressions


  


Epoch: 0 Loss: 0.2653784155845642
Epoch: 1 Loss: 0.22749324142932892
Epoch: 2 Loss: 0.21301747858524323
Epoch: 3 Loss: 0.20228148996829987
Epoch: 4 Loss: 0.19432274997234344
Epoch: 5 Loss: 0.1892627626657486
Epoch: 6 Loss: 0.18443353474140167
Epoch: 7 Loss: 0.1822148710489273
Epoch: 8 Loss: 0.1814003884792328
Epoch: 9 Loss: 0.178586944937706


biLSTM model for multi word targets

In [21]:
model_multi = biLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix_multi), len(word_to_ix_multi))

In [22]:
print('Training biLSTM on multi target expressions')
model_multi = biLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix_multi), len(word_to_ix_multi))
loss_function = nn.MSELoss()
optimizer = optim.Adam(model_multi.parameters(), lr=0.01)
for epoch in range(10):
  loss_sum = 0
  for sentence in sent_train_multi_raw:
    model_multi.zero_grad()
    targets = prepare_sequence(sentence, word_to_ix_multi)
    tag_scores = model_multi(sentence)
    loss = loss_function(tag_scores, targets)
    loss_sum += loss
    loss.backward()
    optimizer.step()
  print('Epoch: {} Loss: {}'.format(epoch, loss_sum.item()))

Training biLSTM on multi target expressions


  


Epoch: 0 Loss: 0.1351313591003418
Epoch: 1 Loss: 0.11546913534402847
Epoch: 2 Loss: 0.10997606068849564
Epoch: 3 Loss: 0.10428568720817566
Epoch: 4 Loss: 0.09884557127952576
Epoch: 5 Loss: 0.09597007930278778
Epoch: 6 Loss: 0.09406128525733948
Epoch: 7 Loss: 0.09307729452848434
Epoch: 8 Loss: 0.09183567017316818
Epoch: 9 Loss: 0.09023026376962662


In [23]:
def prepare_features_single_word(tokens, sentences):
  features = []
  for idx, word in enumerate(tokens):
    word = word.lower()
    feature = []

    # Word length
    feature.append(len(word))
    doc = nlp(word)

    # Syllable count and word frequency in the corpus
    # Spacy tokenizes the input sentence
    # In this case we would have only one token, the target word
    for token in doc:
      feature.append(token._.syllables_count)
      feature.append(word_frequency(word, 'en'))

    # Probability of target word `word` in the sentence estimated from by `model`
    if word in word_to_ix:
      # Output scores for each of the word in the sentence
      out = model(sentences[idx])
      pos = -1
      for itr, token in enumerate(sentences[idx].split()):
        if token.lower() == word:
          pos = itr
          break
      id_pos = word_to_ix[word] # word to id mapping
      feature.append(float(out[pos][id_pos]))
    else:
      # `word` not in vocabulary, so cannot predict probability in context
      feature.append(0.0)

    features.append(feature)

    if (idx + 1) % 500 == 0:
      print('Prepared features for {} single target word sentences'.format(idx + 1))
  return features

In [24]:
def prepare_features_multi_word(tokens, sentences):
  features = []
  for idx, word in enumerate(tokens):
    word = word.lower()
    feature = []
    doc = nlp(word)
    word = word.split(' ')
    assert(len(word) == 2)

    # MWE length = sum(length of individual words)
    feature.append(len(word[0]) + len(word[1]))

    syllables = 0
    probability = 1
    embedding = np.zeros(EMBEDDING_DIM)

    # Syllable count and word frequency in the corpus
    # Spacy tokenizes the input sentence
    # In this case we would have two tokens

    for token in doc:
      word_ = token.text
      syllables += token._.syllables_count
      probability *= word_frequency(word_, 'en')

      # GloVE embedding current `word_` of the MWE
      if word_ in embedding_index:
        embedding = embedding + embedding_index[word_]
      else:
        # `word_` not in the GloVE corpus, take a random embedding
        embedding = embedding + np.random.random(EMBEDDING_DIM)

    # Average embedding of the two tokens in the MWE
    embedding = embedding / 2
    feature.append(syllables)
    feature.append(probability)

    # Product of probabilities of constituent words in the MWE
    if word[0] in word_to_ix_multi and word[1] in word_to_ix_multi:
      # Output scores for each of the word in the sentence
      out = model_multi(sentences[idx])
      pos0, pos1 = -1, -1
      for itr, token in enumerate(sentences[idx].split()):
        if token.lower() == word[0]:
          pos0 = itr
          pos1 = itr + 1
          break
      id_pos0 = word_to_ix_multi[word[0]]
      id_pos1 = word_to_ix_multi[word[1]]
      feature.append(float(out[pos0][id_pos0] * out[pos1][id_pos1]))
    else:
      # Either of the constituent words of the MWE not in vocabulary \
      # So cannot predict probability in context
      feature.append(0.0)

    features.append(feature)

    if (idx + 1) % 500 == 0:
      print('Prepared features for {} multi target word sentences'.format(idx + 1))

  return features

In [25]:
print('+++ Generating Train features for Single word expressions +++')
features_train_single = prepare_features_single_word(single_tokens_train_raw, sent_train_single_raw)
print('+++ Generating Test features for Single word expressions +++')
features_test_single = prepare_features_single_word(single_tokens_test_raw, sent_test_single_raw)

+++ Generating Train features for Single word expressions +++
Prepared features for 500 single target word sentences
Prepared features for 1000 single target word sentences
Prepared features for 1500 single target word sentences
Prepared features for 2000 single target word sentences
Prepared features for 2500 single target word sentences
Prepared features for 3000 single target word sentences
Prepared features for 3500 single target word sentences
Prepared features for 4000 single target word sentences
Prepared features for 4500 single target word sentences
Prepared features for 5000 single target word sentences
Prepared features for 5500 single target word sentences
Prepared features for 6000 single target word sentences
Prepared features for 6500 single target word sentences
Prepared features for 7000 single target word sentences
Prepared features for 7500 single target word sentences
+++ Generating Test features for Single word expressions +++
Prepared features for 500 single targe

In [26]:
print('+++ Generating Train features for Multi word expressions +++')
features_train_multi = prepare_features_multi_word(multi_tokens_train_raw, sent_train_multi_raw)
print('+++ Generating Test features for Multi word expressions +++')
features_test_multi = prepare_features_multi_word(multi_tokens_test_raw, sent_test_multi_raw)

+++ Generating Train features for Multi word expressions +++
Prepared features for 500 multi target word sentences
Prepared features for 1000 multi target word sentences
Prepared features for 1500 multi target word sentences
+++ Generating Test features for Multi word expressions +++


In [27]:
# Convert all features to torch.tensor to enable use in PyTorch models
X_train_single_tensor = torch.tensor(features_train_single, dtype=torch.float32, device=device)
X_test_single_tensor = torch.tensor(features_test_single, dtype=torch.float32, device=device)
X_train_multi_tensor = torch.tensor(features_train_multi, dtype=torch.float32, device=device)
X_test_multi_tensor = torch.tensor(features_test_multi, dtype=torch.float32, device=device)

In [28]:
# Reshape all output complexity scores to single dimension vectors
y_single_train = y_single_train.reshape(y_single_train.shape[0], -1)
y_single_test = y_single_test.reshape(y_single_test.shape[0], -1)
y_multi_train = y_multi_train.reshape(y_multi_train.shape[0], -1)
y_multi_test = y_multi_test.reshape(y_multi_test.shape[0], -1)

In [29]:
# Convert all target outputs to torch.tensor to enable use in PyTorch models
Y_train_single_tensor = torch.tensor(y_single_train, dtype=torch.float32, device=device)
Y_test_single_tensor = torch.tensor(y_single_test, dtype=torch.float32, device=device)
Y_train_multi_tensor = torch.tensor(y_multi_train, dtype=torch.float32, device=device)
Y_test_multi_tensor = torch.tensor(y_multi_test, dtype=torch.float32, device=device)

In [30]:
# Ensure each sample from test and train for single word expression is taken
print(X_train_single_tensor.shape)
print(X_test_single_tensor.shape)
print(Y_train_single_tensor.shape)
print(Y_test_single_tensor.shape)

torch.Size([7662, 4])
torch.Size([917, 4])
torch.Size([7662, 1])
torch.Size([917, 1])


In [31]:
# Ensure each sample from test and train for multi word expression is taken
print(X_train_multi_tensor.shape)
print(X_test_multi_tensor.shape)
print(Y_train_multi_tensor.shape)
print(Y_test_multi_tensor.shape)

torch.Size([1517, 4])
torch.Size([184, 4])
torch.Size([1517, 1])
torch.Size([184, 1])


In [32]:
def convert_tensor_to_np(y):
  if device == torch.device("cuda"):
    y = y.cpu()
  y = y.detach().numpy()
  return y

In [33]:
from copy import deepcopy

In [34]:
# Evaluate the metrics upon which the model would be evaluated
def evaluate_metrics(labels, predicted):
  vx, vy = [], []
  if torch.is_tensor(labels):
    vx = labels.clone()
    vx = convert_tensor_to_np(vx)
  else:
    vx = deepcopy(labels)
  if torch.is_tensor(predicted):
    vy = predicted.clone()
    vy = convert_tensor_to_np(vy)
  else:
    vy = deepcopy(predicted)

  pearsonR = np.corrcoef(vx.T, vy.T)[0, 1]
  spearmanRho = stats.spearmanr(vx, vy)
  MSE = np.mean((vx - vy) ** 2)
  MAE = np.mean(np.absolute(vx - vy))
  RSquared = pearsonR ** 2

  print("Peason's R: {}".format(pearsonR))
  print("Spearman's rho: {}".format(spearmanRho))
  print("R Squared: {}".format(RSquared))
  print("MSE: {}".format(MSE))
  print("MAE: {}".format(MAE))

## Neural Network

* $N$ input sentences  

* d (=EMBEDDING_DIM) word embedding

* $I$ = Word Embedding matrix ($N \times d$)

* $W_1, W_2, W_3, W_4 := (d \times 256), (256 \times 128), (128 \times 64), (64 \times 1)$

* Equations

  * $o_1 = tanh(I \times W_1 + b_1)$

  * $o_2 = tanh(o_1 \times W_2 + b_2)$

  * $o_3 = tanh(o_2 \times W_3 + b_3)$

  * $o_4 = \sigma(o_3 \times W_4)$


In [35]:
class NN(nn.Module):
  def __init__(self, embedding_dim):
    super(NN, self).__init__()
    self.linear1 = nn.Linear(embedding_dim, 128, bias=True)
    self.linear2 = nn.Linear(128, 256, bias=True)
    self.linear3 = nn.Linear(256, 64, bias=True)
    self.linear4 = nn.Linear(64, 1)

  def forward(self, input):
    out = torch.tanh(self.linear1(input))
    out = torch.tanh(self.linear2(out))
    out = torch.tanh(self.linear3(out))
    out = torch.sigmoid(self.linear4(out))
    return out

In [36]:
loss_function = nn.MSELoss()

In [37]:
embedding_dim = X_train_single_tensor.shape[1]
model_NN = NN(embedding_dim)
model_NN.to(device)

NN(
  (linear1): Linear(in_features=4, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=64, bias=True)
  (linear4): Linear(in_features=64, out_features=1, bias=True)
)

In [38]:
print('Training NN on single target expressions...')
model_NN = NN(embedding_dim)
model_NN.to(device)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model_NN.parameters(), lr=0.002)
for epoch in range(30):
  optimizer.zero_grad()
  out = model_NN(X_train_single_tensor)
  loss = loss_function(out, Y_train_single_tensor)
  loss.backward()
  optimizer.step()
  print("Epoch {} : {}".format(epoch + 1, loss.item()))

Training NN on single target expressions...
Epoch 1 : 0.04156719148159027
Epoch 2 : 0.03742283955216408
Epoch 3 : 0.0448017381131649
Epoch 4 : 0.03419572860002518
Epoch 5 : 0.019205523654818535
Epoch 6 : 0.024642007425427437
Epoch 7 : 0.029018502682447433
Epoch 8 : 0.019712666049599648
Epoch 9 : 0.01783045008778572
Epoch 10 : 0.021568909287452698
Epoch 11 : 0.023631364107131958
Epoch 12 : 0.022505754604935646
Epoch 13 : 0.019629282876849174
Epoch 14 : 0.01747821271419525
Epoch 15 : 0.017897097393870354
Epoch 16 : 0.01982138119637966
Epoch 17 : 0.020360076799988747
Epoch 18 : 0.018984435126185417
Epoch 19 : 0.017557309940457344
Epoch 20 : 0.01733284443616867
Epoch 21 : 0.01798972673714161
Epoch 22 : 0.01864529773592949
Epoch 23 : 0.01875516213476658
Epoch 24 : 0.018305545672774315
Epoch 25 : 0.01765372045338154
Epoch 26 : 0.01725248247385025
Epoch 27 : 0.017331808805465698
Epoch 28 : 0.017708735540509224
Epoch 29 : 0.017957434058189392
Epoch 30 : 0.017834357917308807


In [39]:
out_NN = model_NN(X_test_single_tensor)
evaluate_metrics(out_NN, Y_test_single_tensor)

Peason's R: 0.255279427729366
Spearman's rho: SpearmanrResult(correlation=0.09160778046822918, pvalue=0.005501141655991219)
R Squared: 0.06516758622183261
MSE: 0.01610724814236164
MAE: 0.100863516330719


In [40]:
embedding_dim = X_train_multi_tensor.shape[1]
model_NN_multi = NN(embedding_dim)
model_NN_multi.to(device)

NN(
  (linear1): Linear(in_features=4, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=64, bias=True)
  (linear4): Linear(in_features=64, out_features=1, bias=True)
)

In [41]:
print('Training NN on multi target expressions...')
model_NN_multi = NN(embedding_dim)
model_NN_multi.to(device)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model_NN_multi.parameters(), lr=0.002)
for epoch in range(30):
  optimizer.zero_grad()
  out = model_NN_multi(X_train_multi_tensor)
  loss = loss_function(out, Y_train_multi_tensor)
  loss.backward()
  optimizer.step()
  print("Epoch {} : {}".format(epoch + 1, loss.item()))

Training NN on multi target expressions...
Epoch 1 : 0.029958447441458702
Epoch 2 : 0.07558488845825195
Epoch 3 : 0.04377608373761177
Epoch 4 : 0.02466835454106331
Epoch 5 : 0.03941410779953003
Epoch 6 : 0.033858101814985275
Epoch 7 : 0.02456013672053814
Epoch 8 : 0.02601884864270687
Epoch 9 : 0.030937250703573227
Epoch 10 : 0.030795661732554436
Epoch 11 : 0.026925483718514442
Epoch 12 : 0.023892369121313095
Epoch 13 : 0.024261007085442543
Epoch 14 : 0.026419680565595627
Epoch 15 : 0.02721245028078556
Epoch 16 : 0.025941576808691025
Epoch 17 : 0.02418563887476921
Epoch 18 : 0.02347237803041935
Epoch 19 : 0.023981263861060143
Epoch 20 : 0.0248402189463377
Epoch 21 : 0.02515379525721073
Epoch 22 : 0.024688439443707466
Epoch 23 : 0.023858550935983658
Epoch 24 : 0.023283232003450394
Epoch 25 : 0.02329162321984768
Epoch 26 : 0.023695172742009163
Epoch 27 : 0.023999052122235298
Epoch 28 : 0.023872535675764084
Epoch 29 : 0.023417780175805092
Epoch 30 : 0.023007284849882126


In [42]:
out_NN_multi = model_NN_multi(X_test_multi_tensor)
evaluate_metrics(out_NN_multi, Y_test_multi_tensor)

Peason's R: 0.3035891713215042
Spearman's rho: SpearmanrResult(correlation=0.29376409033713474, pvalue=5.180762380401474e-05)
R Squared: 0.09216638494367763
MSE: 0.022757170721888542
MAE: 0.12122879922389984


## Machine Learning Methods

* Linear Regression

* Support Vector Regressor

In [43]:
X_train_single_np = np.array(features_train_single)
X_test_single_np = np.array(features_test_single)
Y_train_single_np = np.array(y_single_train.reshape(y_single_train.shape[0], -1))
Y_test_single_np = np.array(y_single_test.reshape(y_single_test.shape[0], -1))

In [44]:
print(X_train_single_np.shape)
print(X_test_single_np.shape)
print(Y_train_single_np.shape)
print(Y_test_single_np.shape)

(7662, 4)
(917, 4)
(7662, 1)
(917, 1)


In [45]:
X_train_multi_np = np.array(features_train_multi)
X_test_multi_np = np.array(features_test_multi)
Y_train_multi_np = np.array(y_multi_train.reshape(y_multi_train.shape[0], -1))
Y_test_multi_np = np.array(y_multi_test.reshape(y_multi_test.shape[0], -1))

In [46]:
print(X_train_multi_np.shape)
print(X_test_multi_np.shape)
print(Y_train_multi_np.shape)
print(Y_test_multi_np.shape)

(1517, 4)
(184, 4)
(1517, 1)
(184, 1)


### Linear Regression

In [47]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [48]:
from sklearn.linear_model import LinearRegression

In [49]:
def evaluateLinearRegression(X_train, Y_train, X_test, Y_test):
  reg = make_pipeline(StandardScaler(), LinearRegression())
  reg.fit(X_train, Y_train)
  out = reg.predict(X_test)
  out = out.reshape((out.shape[0], 1))
  evaluate_metrics(out, Y_test)
  return out

In [50]:
print('Linear Regression for Single word expressions')
out_LR = evaluateLinearRegression(X_train_single_np, Y_train_single_np, X_test_single_np, Y_test_single_np)

Linear Regression for Single word expressions
Peason's R: 0.25215877405576104
Spearman's rho: SpearmanrResult(correlation=0.5123028932445765, pvalue=1.6738377269910443e-62)
R Squared: 0.06358404733330435
MSE: 0.01601122468731589
MAE: 0.09477343728946037


In [51]:
print('Linear Regression for Multi word expressions')
out_LR_multi = evaluateLinearRegression(X_train_multi_np, Y_train_multi_np, X_test_multi_np, Y_test_multi_np)

Linear Regression for Multi word expressions
Peason's R: 0.33362246167053294
Spearman's rho: SpearmanrResult(correlation=0.3569627919764681, pvalue=6.560849830391453e-07)
R Squared: 0.11130394693110622
MSE: 0.021454264898172603
MAE: 0.11666720338424778


### Support Vector Regressor

* Radial basis function
* C = 0.05
* epsilon = 0.01

In [52]:
from sklearn.svm import SVR

In [53]:
def evaluateSVR(X_train, Y_train, X_test, Y_test):
  svr = make_pipeline(StandardScaler(), SVR(C=0.05, epsilon=0.01))
  svr.fit(X_train, Y_train.reshape(-1))
  out = svr.predict(X_test)
  out = out.reshape((out.shape[0], 1))
  evaluate_metrics(out, Y_test)
  return out

In [54]:
print('SVR for Single word expressions')
out_svr = evaluateSVR(X_train_single_np, Y_train_single_np, X_test_single_np, Y_test_single_np)

SVR for Single word expressions
Peason's R: 0.5991510970546696
Spearman's rho: SpearmanrResult(correlation=0.6132253992379839, pvalue=8.226304990972235e-96)
R Squared: 0.35898203710181414
MSE: 0.010451324849359353
MAE: 0.07663730477750667


In [55]:
print('SVR for Multi word expressions')
out_svr_multi = evaluateSVR(X_train_multi_np, Y_train_multi_np, X_test_multi_np, Y_test_multi_np)

SVR for Multi word expressions
Peason's R: 0.44558775336090417
Spearman's rho: SpearmanrResult(correlation=0.43843646812241605, pvalue=4.823255574297711e-10)
R Squared: 0.19854844594521798
MSE: 0.019483109791224158
MAE: 0.10993192395291536


In [56]:
single_ids = df_test_single["id"].astype(str).to_list()
multi_ids = df_test_multi["id"].astype(str).to_list()

In [57]:
out_ensemble = []

for idx in range(len(out_NN)):
  score = 0
  score += float(out_NN[idx])
  score += float(out_LR[idx])
  score += float(out_svr[idx])
  score /= 3
  out_ensemble.append(score)
out_ensemble = np.array(out_ensemble)
out_ensemble = out_ensemble.reshape((out_ensemble.shape[0], 1))

In [58]:
evaluate_metrics(out_ensemble, Y_test_single_np)

Peason's R: 0.5342335105464795
Spearman's rho: SpearmanrResult(correlation=0.5977430202357636, pvalue=6.4351742398284515e-90)
R Squared: 0.2854054437908154
MSE: 0.012517453367202091
MAE: 0.08572586829832668


In [59]:
out_ensemble_multi = []

for idx in range(len(out_NN_multi)):
  score = 0
  score += float(out_NN_multi[idx])
  score += float(out_LR_multi[idx])
  score += float(out_svr_multi[idx])
  score /= 3
  out_ensemble_multi.append(score)
out_ensemble_multi = np.array(out_ensemble_multi)
out_ensemble_multi = out_ensemble_multi.reshape((out_ensemble_multi.shape[0], 1))

In [60]:
evaluate_metrics(out_ensemble_multi, Y_test_multi_np)

Peason's R: 0.4020044519530314
Spearman's rho: SpearmanrResult(correlation=0.4060615878947246, pvalue=1.0723742486350019e-08)
R Squared: 0.16160757939005715
MSE: 0.020633678652037416
MAE: 0.11451810403223067
