# Demo
---

## Global Config

In [42]:
# Directory of the SentEval checkout; it is put on sys.path before `import senteval`.
PATH_TO_SENTEVAL = 'SentEval'
# Directory handed to SentEval as `task_path` (where it looks for task data).
PATH_TO_DATA = 'SentEval/data'

## Import Libraries

In [62]:
import sys
import torch
import logging
import numpy as np

In [47]:
from trainer import Trainer
from models.embedding_encoder import EmbeddingEncoder

In [48]:
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval

# Baseline: Embedding Average Encoder

In [49]:
from configs.baseline import config
config

{'exp_name': 'baseline',
 'epochs': 20,
 'encoder': 'EmbeddingEncoder',
 'batch_size': 128,
 'learning_rate': 0.001,
 'seed': 42,
 'debug': False,
 'device': 'cpu',
 'num_workers': 4,
 'valid_freq': 1000,
 'save_freq': 2000,
 'test_checkpoint': 'best-model.pt'}

In [50]:
# Build the trainer from the baseline config, then run its test routine and
# print whatever it returns.
trainer = Trainer(config) 
print(trainer.test())

INFO:trainer:--------------------------------------------------
INFO:trainer:EXPERIMENT: baseline
INFO:trainer:--------------------------------------------------
INFO:trainer:Setting seed: 42
INFO:trainer:Loading data ...
INFO:trainer:Using device: cpu
INFO:trainer:Loading checkpoint from checkpoints/baseline/best-model.pt


[Test]	Accuracy: 0.777 Total Loss: 1.528


In [51]:
# Switch the model to evaluation mode. As the last expression of the cell, the
# call's return value (the model itself) is displayed as the cell output.
trainer.model.eval()

Classifier(
  (encoder): EmbeddingEncoder(
    (emb): Embedding(33672, 300)
  )
  (layers): Sequential(
    (0): Linear(in_features=1200, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [52]:
def prepare(params, samples):
    """SentEval `prepare` hook: expose the trained vocabulary and encoder.

    Stores the notebook-level trainer's vocabulary and its model's encoder on
    `params` so that `batcher` can read them back as `params.vocab` and
    `params.encoder`. `samples` is accepted to satisfy the hook signature but
    is not used. Returns None.
    """
    params.vocab = trainer.vocab
    encoder = trainer.model.encoder
    params.encoder = encoder

In [53]:
# Toy pre-tokenized, lowercased sentence used to sanity-check the vocabulary
# and the encoder in the next few cells.
sentence = ['this', 'is', 'an', 'apple', '.']

In [54]:
# Map every word to its vocabulary index. The double brackets add a leading
# batch dimension, so the tensor has shape [1, number of tokens].
tokenlist = torch.tensor([[trainer.vocab.stoi[word] for word in sentence]])
tokenlist

tensor([[ 220,    6,   20, 1507,    3]])

In [55]:
# Round-trip check: map the indices back to words through the vocabulary.
[trainer.vocab.itos[token] for token in tokenlist[0]]

['this', 'is', 'an', 'apple', '.']

In [56]:
# Smoke-test the encoder on a single sentence and show the embedding shape.
# The second tuple element holds the per-sentence token counts (assumed to be
# the usual (token_ids, lengths) convention -- confirm against the encoder).
# `tokenlist` has shape [1, num_tokens], so len(tokenlist) would be the batch
# size (1), not the sentence length; use the size of the token dimension.
trainer.model.encoder((tokenlist, [tokenlist.shape[1]])).shape

torch.Size([1, 300])

In [57]:
def batcher(params, batch):
    """SentEval `batcher` hook: turn a batch of sentences into embeddings.

    Args:
        params: SentEval parameter object. `params.vocab` (with a `stoi`
            word -> index lookup) and `params.encoder` (a callable taking a
            `(token_ids, lengths)` tuple) must have been set by `prepare`.
        batch: list of sentences, each a list of word strings
            (tokenized and lowercased).

    Returns:
        numpy array with one row per sentence:
        [batch size, embedding dimensionality].
    """
    # If a sentence is empty, a single '.' is used as its only token so the
    # encoder never receives an empty sequence. Change this to a NULL token
    # if your model defines one.
    batch = [sent if sent != [] else ['.'] for sent in batch]
    embeddings = []

    for sent in batch:
        # Shape [1, len(sent)]: a batch that contains just this sentence.
        sent_tokens = torch.tensor([[params.vocab.stoi[word] for word in sent]])
        # The length must be the number of tokens in the sentence.
        # len(sent_tokens) is the batch dimension and therefore always 1,
        # which would report every sentence as one token long.
        x = (sent_tokens, [len(sent)])
        with torch.no_grad():
            sent_vec = params.encoder(x)
        embeddings.append(sent_vec.detach().numpy())

    # Stack the per-sentence vectors into [batch size, embedding dimensionality].
    return np.vstack(embeddings)

In [65]:
# SentEval settings: where the task data lives, whether to use the PyTorch
# classifiers, the number of cross-validation folds, and the configuration of
# the classifier trained on top of the sentence embeddings (see the SentEval
# README for the meaning of each key).
params = {
    'task_path': PATH_TO_DATA,
    'usepytorch': True,
    'kfold': 2,
    'classifier': {
        'nhid': 0,
        'optim': 'adam',
        'batch_size': 128,
        'tenacity': 2,
        'epoch_size': 4,
    },
}

In [66]:
# Configure root logging so SentEval's progress messages are shown with a timestamp.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
# Build the evaluation engine from the settings dict and the two hooks.
# The settings dict in this notebook is named `params`; `params_senteval` is
# never defined here, so using it raises NameError on a fresh kernel.
se = senteval.engine.SE(params, batcher, prepare)

In [67]:
# NLP tasks on which the embedding model is going to be evaluated; these are
# the tasks used in https://arxiv.org/abs/1802.05883.
# SICKRelatedness (SICK-R) is left out because it needs torch CUDA to work
# (even when using logistic regression); STS14 (semantic textual similarity)
# is a similar type of semantic task.
transfer_tasks = [
    'MR',
    'CR',
    'MPQA',
    'SUBJ',
    'SST2',
    'TREC',
    'MRPC',
    'SICKEntailment',
    'STS14',
]

In [None]:
# Run the evaluation on every task in `transfer_tasks`. SentEval prints the
# results as it goes and returns a dictionary with the scores, which is
# printed again here.
results = se.eval(transfer_tasks)
print(results)

# Unidirectional LSTM

In [69]:
from configs.uni_lstm import config
config

{'exp_name': 'uni_lstm',
 'epochs': 20,
 'encoder': 'UniLSTM',
 'batch_size': 128,
 'hidden_dim': 2048,
 'num_layers': 1,
 'learning_rate': 0.001,
 'seed': 42,
 'debug': False,
 'device': 'cpu',
 'num_workers': 4,
 'valid_freq': 1000,
 'save_freq': 4000,
 'test_checkpoint': 'best-model.pt'}

In [70]:
# Rebind `trainer` to a trainer built from the uni_lstm config, then run its
# test routine and print whatever it returns.
trainer = Trainer(config) 
print(trainer.test())

INFO:trainer:--------------------------------------------------
INFO:trainer:EXPERIMENT: uni_lstm
INFO:trainer:--------------------------------------------------
INFO:trainer:Setting seed: 42
INFO:trainer:Loading data ...
INFO:trainer:Using device: cpu
INFO:trainer:Loading checkpoint from checkpoints/uni_lstm/best-model.pt


[Test]	Accuracy: 0.812 Total Loss: 0.517


In [71]:
# Switch the model to evaluation mode. As the last expression of the cell, the
# call's return value (the model itself) is displayed as the cell output.
trainer.model.eval()

Classifier(
  (encoder): UniLSTM(
    (emb): Embedding(33672, 300)
    (lstm): LSTM(300, 2048, batch_first=True)
  )
  (layers): Sequential(
    (0): Linear(in_features=8192, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [72]:
def prepare(params, samples):
    """SentEval `prepare` hook: expose the trained vocabulary and encoder.

    Stores the notebook-level trainer's vocabulary and its model's encoder on
    `params` so that `batcher` can read them back as `params.vocab` and
    `params.encoder`. `samples` is accepted to satisfy the hook signature but
    is not used. Returns None.
    """
    params.vocab = trainer.vocab
    encoder = trainer.model.encoder
    params.encoder = encoder

In [73]:
# Toy pre-tokenized, lowercased sentence used to sanity-check the vocabulary
# and the encoder in the next few cells.
sentence = ['this', 'is', 'an', 'apple', '.']

In [74]:
# Map every word to its vocabulary index. The double brackets add a leading
# batch dimension, so the tensor has shape [1, number of tokens].
tokenlist = torch.tensor([[trainer.vocab.stoi[word] for word in sentence]])
tokenlist

tensor([[ 220,    6,   20, 1507,    3]])

In [75]:
# Round-trip check: map the indices back to words through the vocabulary.
[trainer.vocab.itos[token] for token in tokenlist[0]]

['this', 'is', 'an', 'apple', '.']

In [76]:
# Smoke-test the encoder on a single sentence and show the embedding shape.
# The second tuple element holds the per-sentence token counts (assumed to be
# the usual (token_ids, lengths) convention -- confirm against the encoder).
# `tokenlist` has shape [1, num_tokens], so len(tokenlist) would be the batch
# size (1), not the sentence length; use the size of the token dimension.
trainer.model.encoder((tokenlist, [tokenlist.shape[1]])).shape

torch.Size([2048])

In [77]:
def batcher(params, batch):
    """SentEval `batcher` hook: turn a batch of sentences into embeddings.

    Args:
        params: SentEval parameter object. `params.vocab` (with a `stoi`
            word -> index lookup) and `params.encoder` (a callable taking a
            `(token_ids, lengths)` tuple) must have been set by `prepare`.
        batch: list of sentences, each a list of word strings
            (tokenized and lowercased).

    Returns:
        numpy array with one row per sentence:
        [batch size, embedding dimensionality].
    """
    # If a sentence is empty, a single '.' is used as its only token so the
    # encoder never receives an empty sequence. Change this to a NULL token
    # if your model defines one.
    batch = [sent if sent != [] else ['.'] for sent in batch]
    embeddings = []

    for sent in batch:
        # Shape [1, len(sent)]: a batch that contains just this sentence.
        sent_tokens = torch.tensor([[params.vocab.stoi[word] for word in sent]])
        # The length must be the number of tokens in the sentence.
        # len(sent_tokens) is the batch dimension and therefore always 1,
        # which would report every sentence as one token long.
        x = (sent_tokens, [len(sent)])
        with torch.no_grad():
            sent_vec = params.encoder(x)
        embeddings.append(sent_vec.detach().numpy())

    # Stack the per-sentence vectors into [batch size, embedding dimensionality].
    return np.vstack(embeddings)

In [78]:
# SentEval settings: where the task data lives, whether to use the PyTorch
# classifiers, the number of cross-validation folds, and the configuration of
# the classifier trained on top of the sentence embeddings (see the SentEval
# README for the meaning of each key).
params = {
    'task_path': PATH_TO_DATA,
    'usepytorch': True,
    'kfold': 2,
    'classifier': {
        'nhid': 0,
        'optim': 'adam',
        'batch_size': 128,
        'tenacity': 2,
        'epoch_size': 4,
    },
}

In [79]:
# Configure root logging so SentEval's progress messages are shown with a timestamp.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
# Build the evaluation engine from the settings dict and the two hooks.
# The settings dict in this notebook is named `params`; `params_senteval` is
# never defined here, so using it raises NameError on a fresh kernel.
se = senteval.engine.SE(params, batcher, prepare)

In [80]:
# NLP tasks on which the embedding model is going to be evaluated; these are
# the tasks used in https://arxiv.org/abs/1802.05883.
# SICKRelatedness (SICK-R) is left out because it needs torch CUDA to work
# (even when using logistic regression); STS14 (semantic textual similarity)
# is a similar type of semantic task.
transfer_tasks = [
    'MR',
    'CR',
    'MPQA',
    'SUBJ',
    'SST2',
    'TREC',
    'MRPC',
    'SICKEntailment',
    'STS14',
]

In [None]:
# Run the evaluation on every task in `transfer_tasks`. SentEval prints the
# results as it goes and returns a dictionary with the scores, which is
# printed again here.
results = se.eval(transfer_tasks)
print(results)