In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
# !pip install /content/gdrive/Shareddrives/520_Project/en_vectors_web_lg-2.1.0.tar.gz

Mounted at /content/gdrive


In [2]:
# !pip install datasets
import gensim
import pandas as pd
import argparse
import numpy as np
from collections import Counter
# from datasets import load_dataset
import os
import torch
import pickle
import re
import time
import copy
import math
from torch.utils.data import DataLoader, Dataset
import torch.optim as optimizer 
import torch.nn.functional as F
from torch import nn
from sklearn.metrics import accuracy_score
# import en_vectors_web_lg

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = "sans-serif"
plt.rcParams['font.sans-serif'] = ['Times New Roman']
sns.set_style("whitegrid")
sns.set_style({'font.family':'serif', 'font.serif':'Times New Roman'})
sns.set(font_scale=1.2)


In [13]:
data_dir = '/content/gdrive/MyDrive/530project/Liar-Plus'
def clean_text(w):
  if type(w)==float:
    return " "
  return re.sub(
          r"([.,'!?\"()*#:;])",
          '',
          w.lower()
          ).replace('/', ' ')
def preprocessing(train=False, eval=False, test=False):
  cols = ['id','label','statement','subject','speaker','job_title','state_info',
          'party_affiliation','barely_true_counts','false_counts',
          'half_true_counts','mostly_true_counts','pants_on_fire_counts',
          'context','justification']
  label_dict = {"false" : 0, "half-true" : 1, "mostly-true" : 2, "true": 3, "barely-true" : 4, "pants-fire" : 5 } 
  def get_label(x):
    if x not in label_dict:
      return 1
    return label_dict[x]
  if train:
    dst_path = os.path.join(data_dir,'train2.tsv')
  if eval:
    dst_path = os.path.join(data_dir,'val2.tsv')
  if test:
    dst_path = os.path.join(data_dir,'test2.tsv')
  current_dataset = pd.read_csv(dst_path, sep='\t', header = None, names=cols)
  current_dataset['label'] = current_dataset['label'].apply(lambda x: get_label(x))
  current_dataset.reset_index(drop=True,inplace=True)
  return current_dataset

def get_word2vec_embedding(statements, data_dir):
  token_file = os.path.join(data_dir,'token_to_ix_w2v.pkl')
  w2v_file = os.path.join(data_dir,'train_w2v.npy')

  if os.path.exists(w2v_file) and os.path.exists(token_file):
        print("Loading train language files")
        return pickle.load(open(token_file, "rb")), np.load(w2v_file)

  token2ix = {'PAD': 0, 'UNK': 1}
  for s in statements:
    s = clean_text(s).split()
    for word in s:
      if word not in token2ix:
        token2ix[word] = len(token2ix)
  ix2token = {token2ix[k]: k for k in token2ix.keys()}
  w2v_path = '/content/gdrive/MyDrive/530project/GoogleNews-vectors-negative300.bin.gz'
  w2vmodel = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
  pretrained_emb = torch.randn([len(token2ix),300])
  for i in range(len(token2ix)):
    word = ix2token[i]
    if word in w2vmodel:
      vec = w2vmodel[word]
      pretrained_emb[i, :] = torch.from_numpy(vec)
  np.save(w2v_file, pretrained_emb)
  pickle.dump(token2ix, open(token_file, "wb"))
  return token2ix, pretrained_emb

def get_glove_embedding(reviews, data_dir):
  token_file = os.path.join(data_dir,'token_to_ix_glove.pkl')
  glove_file = os.path.join(data_dir,'train_glove.npy')
  if os.path.exists(glove_file) and os.path.exists(token_file):
        print("Loading saved embedding")
        return pickle.load(open(token_file, "rb")), np.load(glove_file)
  all_reviews = {}
  for idx, s in enumerate(reviews):
    all_reviews[idx] = clean_text(s).split()

  from collections import defaultdict
  token_to_ix = defaultdict(int)
  token_to_ix['UNK'] = 0
  token_to_ix['SS'] = 1

  spacy_tool = en_vectors_web_lg.load()
  pretrained_emb = []
  pretrained_emb.append(spacy_tool('UNK').vector)
  pretrained_emb.append(spacy_tool('SS').vector)
  
  for k, v in all_reviews.items():
      for word in v:
          if word not in token_to_ix:
              token_to_ix[word] = len(token_to_ix)
              pretrained_emb.append(spacy_tool(word).vector)

  pretrained_emb = np.array(pretrained_emb)
  np.save(glove_file, pretrained_emb)
  pickle.dump(token_to_ix, open(token_file, "wb"))
  return token_to_ix, pretrained_emb

def embed_text(x, max_len, token2ix):
  ques_ix = np.zeros(max_len, np.int64)
  x = clean_text(x).split()
  for ix, word in enumerate(x):
    if word in token2ix:
      ques_ix[ix] = token2ix[word]
    else:
      ques_ix[ix] = 1
    if ix + 1 == max_len:
      break
  return ques_ix

def category_from_output(output):
  res = []
  for i in output:
    top_n, top_i = i.topk(1)
    category_i = top_i[0].item()
    res.append(category_i)
  return res

def get_meta_embed(dst, meta_cols):
  all_text = []
  for i in range(len(dst)):
    cur = ''
    for c in meta_cols:
      try:
        cur += str(dst[c][i]) + ' SS '
      except:
        print(c,i)
        return
    all_text.append(cur)
  token2ix, pretrained_emb = get_glove_embedding(all_text, data_dir)
  lengths = [len(x.split()) for x in all_text]
  max_len = int(np.percentile(lengths,90))
  return token2ix,pretrained_emb, max_len
def embed_meta(dst, meta_cols, token2ix, max_len):
  all_features, all_text = [], []
  for i in range(len(dst)):
    cur = ''
    for c in meta_cols:
      try:
        cur += str(dst[c][i]) + ' SS '
      except:
        print(c,i)
        return
    all_text.append(cur)
  for t in all_text:
    all_features.append(embed_text(t, max_len, token2ix))
  return np.array(all_features)

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate_model(best_model, data_iter):
  best_model.eval()
  y_pred, y_true = [], []
  for batch_idx, (text, label, meta) in enumerate(data_iter):
        text, label, meta = text.to(device), label.to(device), meta.to(device)
        output = best_model(text, meta)
        categories = category_from_output(output)
        loss = criteon(output,label)

        y_pred += categories
        y_true += label.tolist()

  acc = accuracy_score(y_pred,y_true)#, f1_score(y_pred,y_true)
  print('acc: ', acc)

In [4]:
meta_cols = ['subject','speaker','job_title','state_info','party_affiliation','context','justification']
class liar_dataset(Dataset):
  def __init__(self, dst,max_len,token2ix, token2ix_meta, max_len_meta):
    self.embed_statement = np.array([embed_text(i,max_len,token2ix) for i in list(dst['statement'])])
    self.label = np.array(dst['label'])
    self.embed_meta = np.array(embed_meta(dst,meta_cols, token2ix_meta, max_len_meta))
  def __getitem__(self, index):
    return self.embed_statement[index],\
          self.label[index],\
          self.embed_meta[index]
  def __len__(self):
    return len(self.label)

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EMBEDDING_DIM = 300
BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCH = 11

train_dataset = preprocessing(train=True)
statements = train_dataset['statement']
token2ix, pretrained_emb = get_word2vec_embedding(statements, data_dir)
print(pretrained_emb.shape) # (len(vocab), embedding_dim)
lengths = [len(x.split()) for x in statements if type(x)!=float]
max_len = int(np.percentile(lengths,90))

dev_dataset = preprocessing(eval=True)
test_dataset = preprocessing(test=True)

token2ix_meta, pretrained_emb_meta, max_len_meta = get_meta_embed(train_dataset, meta_cols)

train_dst = liar_dataset(train_dataset,max_len,token2ix, token2ix_meta, max_len_meta)
train_data_iter = DataLoader(train_dst, batch_size=BATCH_SIZE, shuffle=True)
dev_dst = liar_dataset(dev_dataset ,max_len,token2ix, token2ix_meta, max_len_meta)
dev_data_iter = DataLoader(dev_dst, batch_size=BATCH_SIZE, shuffle=True)
test_dst = liar_dataset(test_dataset,max_len,token2ix, token2ix_meta,max_len_meta)
test_data_iter = DataLoader(test_dst, batch_size=BATCH_SIZE, shuffle=True)

Loading train language files
(13930, 300)
Loading saved embedding


In [6]:
def train(epoch,train_data_iter,dev_data_iter,opt,criteon, net, device):
  def timeSince(since):
      now = time.time()
      s = now - since
      m = math.floor(s / 60)
      s -= m * 60
      return '%dm %ds' % (m, s)
  train_losses, dev_losses, dev_acc_list = [], [], []
  best_model, best_val_acc = None, float('-inf')
  cnt_step = 0
  current_loss = 0
  plot_every = 2
  dev_every = 2
  print('train len:',len(train_data_iter),'dev len:',len(dev_data_iter))
  print('learning_rate',LEARNING_RATE,'n_iters',epoch, 'batch size', BATCH_SIZE, 'optim','Adam', 'lr_scheduler',None, 'device',device)
  start = time.time()
  for e in range(epoch): 
    print('Epoch', e)
    net.train()
    for batch_idx, (text, label, meta) in enumerate(train_data_iter):
      # if text.shape[0]!=BATCH_SIZE:
        # continue
      text, label, meta = text.to(device), label.to(device), meta.to(device)
      output = net(text,meta)
      loss = criteon(output,label)
      current_loss += loss
      cnt_step += 1
      opt.zero_grad()
      loss.backward()
      opt.step()
    if e==0:
      print(time.time()-start)
    if e % plot_every == 0:
      tmp_loss = current_loss.item() / cnt_step
      train_losses.append(tmp_loss)
      current_loss, cnt_step = 0, 0
      print('%d %d%% (%s) %.4f ' % (e, e / EPOCH * 100, timeSince(start), tmp_loss))
    if e % dev_every ==0:
      net.eval()
      eval_loss = 0
      y_pred, y_true = [], []
      cnt_eval_step = 0
      for batch_idx, (text, label, meta) in enumerate(dev_data_iter):
        text, label, meta = text.to(device), label.to(device), meta.to(device)
        output = net(text, meta)
        categories = category_from_output(output)
        loss = criteon(output,label)
        eval_loss += loss
        cnt_eval_step += 1

        y_pred += categories
        y_true += label.tolist()
      # print(cnt_eval_step, eval_loss, len(dev_data_iter))
      dev_losses.append(eval_loss.item()/cnt_eval_step)
      acc = accuracy_score(y_pred,y_true)
      dev_acc_list.append(acc)
      if acc>best_val_acc:
        best_val_acc = acc
        best_model = copy.deepcopy(net)
      print('%d %d%% (%s) %.4f %s %s %.4f' % (e, e / EPOCH * 100, timeSince(start), eval_loss.item()/cnt_eval_step, categories[:4], label.tolist()[:4], acc))
  print('best_val_acc',best_val_acc)
  return train_losses, dev_losses, dev_acc_list, best_model # best_model

In [None]:
class BiLSTM_Attention(nn.Module):
    def __init__(self, token_size, pretrained_emb, token_size_meta, pretrained_emb_meta, 
                 hidden_dim=64, n_layers=2,dropout = 0.5):
        super(BiLSTM_Attention, self).__init__()
        print('hidden_dim',hidden_dim, 'n_layers',n_layers, 'dropout',dropout)

        self.embedding = nn.Embedding(num_embeddings=token_size,
                                      embedding_dim=300)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb).type(torch.float))

        self.meta_embedding = nn.Embedding(num_embeddings=token_size_meta,
                                      embedding_dim=300)
        self.meta_embedding.weight.data.copy_(torch.from_numpy(pretrained_emb_meta).type(torch.float))

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.rnn = nn.LSTM(300, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=dropout)
        self.dropout = nn.Dropout(0.5)

        self.w_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, hidden_dim * 2))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, 1))

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)
        self.fc = nn.Linear(hidden_dim * 2, 6)
        
    def attention_net(self, x):       #x:[batch, seq_len, hidden_dim*2]
        u = torch.tanh(torch.matmul(x, self.w_omega))         #[batch, seq_len, hidden_dim*2]
        att = torch.matmul(u, self.u_omega)                   #[batch, seq_len, 1]
        att_score = F.softmax(att, dim=1)
        scored_x = x * att_score                              #[batch, seq_len, hidden_dim*2]
        context = torch.sum(scored_x, dim=1)                  #[batch, hidden_dim*2]
        # print('context',context.shape)
        return context

    def forward(self, x, meta):
        # print(x.shape,meta.shape)
        embedding = self.dropout(self.embedding(x))       #[seq_len, batch, embedding_dim]

        meta = self.meta_embedding(meta)
        embedding = torch.cat((embedding,meta), dim = 1)

        embedding = torch.transpose(embedding,0,1)
        output, (final_hidden_state, final_cell_state) = self.rnn(embedding) # [28, 64, 128]
        output = output.permute(1, 0, 2)                  #[batch, seq_len, hidden_dim*2]
        attn_output = self.attention_net(output) # [64, 128]
        logit = self.fc(attn_output)
        return logit
net = BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta).to(device)
criteon = nn.CrossEntropyLoss().to(device)
for batch_idx, (text, label, meta) in enumerate(dev_data_iter):
    text, label, meta = text.to(device), label.to(device), meta.to(device)
    # print(text.shape,meta.shape)
    output = net(text,meta)
    loss = criteon(output,label)
    break

hidden_dim 64 n_layers 2 dropout 0.5


In [None]:
LEARNING_RATE = 0.0005
criteon = nn.CrossEntropyLoss().to(device)
for h in [32,48,64,96,128]:
  net = BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta, hidden_dim=h).to(device)
  opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE,weight_decay=1e-4)
  train_losses, dev_losses, dev_acc_list, best_model = train(16,train_data_iter,dev_data_iter,opt,criteon, net, device)
for l in [1,3]:
  net = BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta, n_layers=l).to(device)
  opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE,weight_decay=1e-4)
  train_losses, dev_losses, dev_acc_list, best_model = train(16,train_data_iter,dev_data_iter,opt,criteon, net, device)
for do in [0.3,0.8]:
  net = BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta, dropout1 = do).to(device)
  opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE,weight_decay=1e-4)
  train_losses, dev_losses, dev_acc_list, best_model = train(16,train_data_iter,dev_data_iter,opt,criteon, net, device)

hidden_dim 32 n_layers 2 dropout1 0.5 dropout2 0.8 hidden2 64 filter2 64 n_layers2 2
train len: 161 dev len: 21
learning_rate 0.0005 n_iters 16 batch size 64 optim Adam lr_scheduler None device cpu
Epoch 0
69.33374762535095
0 0% (1m 10s) 1.7614 
0 0% (1m 12s) 1.7606 [1, 1, 1, 1] [4, 1, 1, 2] 0.1931
Epoch 1
Epoch 2
2 18% (3m 31s) 1.7560 
2 18% (3m 34s) 1.7538 [0, 0, 0, 0] [1, 1, 2, 4] 0.2188
Epoch 3
Epoch 4
4 36% (5m 50s) 1.7313 
4 36% (5m 53s) 1.7002 [2, 2, 0, 1] [2, 3, 5, 2] 0.2492
Epoch 5
Epoch 6
6 54% (8m 11s) 1.6300 
6 54% (8m 14s) 1.7066 [2, 0, 0, 1] [2, 4, 2, 5] 0.2687
Epoch 7
Epoch 8
8 72% (10m 32s) 1.4493 
8 72% (10m 35s) 1.9092 [0, 1, 2, 0] [2, 4, 0, 5] 0.2695
Epoch 9
Epoch 10
10 90% (12m 54s) 1.2460 
10 90% (12m 57s) 2.1153 [1, 1, 0, 0] [2, 1, 1, 3] 0.2648
Epoch 11
Epoch 12
12 109% (15m 16s) 1.0610 
12 109% (15m 20s) 2.3848 [4, 1, 1, 2] [5, 4, 4, 2] 0.2710
Epoch 13
Epoch 14
14 127% (17m 40s) 0.8959 
14 127% (17m 43s) 2.6214 [1, 4, 1, 2] [5, 1, 2, 2] 0.2718
Epoch 15
best_val_a

  "num_layers={}".format(dropout, num_layers))


75.2760682106018
0 0% (1m 15s) 1.7585 
0 0% (1m 18s) 1.7528 [1, 1, 1, 1] [1, 1, 3, 0] 0.2040
Epoch 1
Epoch 2
2 18% (3m 48s) 1.7461 
2 18% (3m 51s) 1.7296 [0, 2, 2, 0] [4, 3, 0, 4] 0.2344
Epoch 3
Epoch 4
4 36% (6m 21s) 1.7032 
4 36% (6m 24s) 1.7033 [1, 1, 0, 0] [4, 1, 1, 1] 0.2352
Epoch 5
Epoch 6
6 54% (8m 53s) 1.5925 
6 54% (8m 56s) 1.7262 [1, 1, 2, 4] [0, 4, 4, 0] 0.2780
Epoch 7
Epoch 8
8 72% (11m 27s) 1.3331 
8 72% (11m 30s) 2.1538 [3, 1, 2, 3] [1, 2, 1, 5] 0.2438
Epoch 9
Epoch 10
10 90% (14m 3s) 1.0455 
10 90% (14m 6s) 2.5491 [1, 3, 0, 2] [5, 2, 3, 3] 0.2391
Epoch 11
Epoch 12
12 109% (16m 39s) 0.7393 
12 109% (16m 42s) 3.0363 [4, 1, 4, 1] [1, 1, 1, 2] 0.2438
Epoch 13
Epoch 14
14 127% (19m 15s) 0.4877 
14 127% (19m 19s) 3.6198 [1, 4, 0, 0] [2, 5, 1, 5] 0.2453
Epoch 15
best_val_acc 0.2780373831775701
hidden_dim 64 n_layers 3 dropout1 0.5 dropout2 0.8 hidden2 64 filter2 64 n_layers2 2
train len: 161 dev len: 21
learning_rate 0.0005 n_iters 16 batch size 64 optim Adam lr_scheduler None 

In [None]:
for optimizer in []
for BATCH_SIZE in []

In [None]:
LEARNING_RATE = 0.0002
criteon = nn.CrossEntropyLoss().to(device)
net = BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta, hidden_dim=48, n_layers = 2, dropout = 0.3).to(device)
opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE,weight_decay=1e-4)
train_losses, dev_losses, dev_acc_list, best_model = train(21,train_data_iter,dev_data_iter,opt,criteon, net, device)

hidden_dim 48 n_layers 2 dropout 0.3
train len: 161 dev len: 21
learning_rate 0.0002 n_iters 21 batch size 64 optim Adam lr_scheduler None device cpu
Epoch 0
87.85314655303955
0 0% (1m 28s) 1.7631 
0 0% (1m 32s) 1.7597 [1, 1, 1, 1] [1, 4, 4, 2] 0.1939
Epoch 1
Epoch 2
2 18% (4m 26s) 1.7564 
2 18% (4m 30s) 1.7665 [1, 1, 1, 1] [5, 4, 0, 0] 0.1931
Epoch 3
Epoch 4
4 36% (7m 23s) 1.7527 
4 36% (7m 26s) 1.7516 [1, 0, 0, 0] [2, 0, 5, 2] 0.2407
Epoch 5
Epoch 6
6 54% (10m 21s) 1.7348 
6 54% (10m 24s) 1.7039 [2, 0, 0, 1] [3, 5, 1, 2] 0.2430
Epoch 7
Epoch 8
8 72% (13m 19s) 1.6678 
8 72% (13m 23s) 1.6670 [2, 0, 1, 2] [1, 4, 2, 2] 0.2835
Epoch 9
Epoch 10
10 90% (16m 19s) 1.5817 
10 90% (16m 22s) 1.7217 [2, 0, 0, 0] [2, 1, 3, 4] 0.2718
Epoch 11
Epoch 12
12 109% (19m 18s) 1.4580 
12 109% (19m 22s) 1.8012 [0, 0, 2, 4] [0, 1, 1, 4] 0.2850
Epoch 13
Epoch 14
14 127% (22m 19s) 1.2956 
14 127% (22m 22s) 1.9835 [4, 5, 1, 2] [1, 5, 4, 3] 0.2492
Epoch 15
Epoch 16
16 145% (25m 20s) 1.1644 
16 145% (25m 24s) 2.2

In [None]:
evaluate_model(best_model, dev_data_iter) # 0.2850467289719626
evaluate_model(best_model, test_data_iter) # 0.27624309392265195

acc:  0.2850467289719626
acc:  0.27624309392265195


In [None]:
# torch.save(best_model.state_dict(), os.path.join(data_dir,'liar_BiLSTM_Attention-acc2850-2762.pth'))

In [None]:
import warnings
warnings.filterwarnings("ignore")
fig, ((ax1, ax2))= plt.subplots(1,2,figsize = (15,5))
x_axis = [i*2 for i in range(len(dev_losses))]
sns.lineplot(x_axis, dev_acc_list, ax = ax1)
ax1.set_ylabel('Accuracy')
ax1.set_xlabel("Number of Iterations")
sns.lineplot(x_axis, train_losses, ax = ax2, label = 'train loss')
sns.lineplot(x_axis, dev_losses, ax = ax2, label = 'dev loss')
ax2.set_ylabel("Loss")
ax2.set_xlabel("Number of Iterations")
ax2.legend()
plt.tight_layout()
plt.show()

## SEPARATE

In [7]:
class sep_BiLSTM_Attention(nn.Module):
    def __init__(self, token_size, pretrained_emb, token_size_meta, pretrained_emb_meta, 
                 hidden_dim=16, n_layers=1,dropout1 = 0.8, dropout2 = 0.8,
                 hidden2 = 16, filter2 = 32, n_layers2=1):
        super(sep_BiLSTM_Attention, self).__init__()
        print('hidden_dim',hidden_dim, 'n_layers',n_layers, 'dropout1',dropout1, 'dropout2',dropout2,
                 'hidden2',hidden2, 'filter2',filter2, 'n_layers2',n_layers2)

        self.embedding = nn.Embedding(num_embeddings=token_size,
                                      embedding_dim=300)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb).type(torch.float))

        self.meta_embedding = nn.Embedding(num_embeddings=token_size_meta,
                                      embedding_dim=300)
        self.meta_embedding.weight.data.copy_(torch.from_numpy(pretrained_emb_meta).type(torch.float))

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.rnn = nn.LSTM(300, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=dropout1)
        self.dropout = nn.Dropout(0.5)

        self.w_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, hidden_dim * 2))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, 1))

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)
        self.fc = nn.Linear(hidden_dim * 2, 6)
        
        #separate hybrid
        self.conv_unit = nn.Sequential(
            torch.nn.Conv1d(in_channels=300, out_channels=filter2, kernel_size=3),
            # torch.nn.MaxPool1d(kernel_size=self.ksizes[2]),
        )
        self.lstm_meta = nn.LSTM(300, hidden2, num_layers=n_layers2, bidirectional=True, dropout=dropout2)
        

    def attention_net(self, x):       #x:[batch, seq_len, hidden_dim*2]
        u = torch.tanh(torch.matmul(x, self.w_omega))         #[batch, seq_len, hidden_dim*2]
        att = torch.matmul(u, self.u_omega)                   #[batch, seq_len, 1]
        att_score = F.softmax(att, dim=1)
        scored_x = x * att_score                              #[batch, seq_len, hidden_dim*2]
        context = torch.sum(scored_x, dim=1)                  #[batch, hidden_dim*2]
        # print('context',context.shape)
        return context

    def forward(self, x, meta):
      
        # print(x.shape,meta.shape)
        embedding = self.dropout(self.embedding(x))       #[seq_len, batch, embedding_dim]
        embedding = torch.transpose(embedding,0,1)
        # output: [seq_len, batch, hidden_dim*2]     hidden/cell: [n_layers*2, batch, hidden_dim]

        output, (final_hidden_state, final_cell_state) = self.rnn(embedding) # [28, 64, 128]
        output = output.permute(1, 0, 2)                  #[batch, seq_len, hidden_dim*2]
        attn_output = self.attention_net(output) # [64, 128]
        
        meta = self.dropout(self.meta_embedding(meta)) # [4, len, 300] (4=bsz)

        # meta = torch.transpose(meta,1,2)
        # meta = self.conv_unit(meta) # x1: [4, 128, len_a]
        # meta = torch.transpose(meta,1,2)

        meta, (_,_) = self.lstm_meta(meta)
        # meta = meta.squeeze(-1) # x: [4, 128, len_d]
        meta = meta[:,:,-1]
        x = torch.cat((attn_output,meta), dim=1)
        fc = nn.Linear(x.shape[1], 6)
        x = fc(x)
        return x
net = sep_BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta).to(device)
criteon = nn.CrossEntropyLoss().to(device)
for batch_idx, (text, label, meta) in enumerate(dev_data_iter):
    text, label, meta = text.to(device), label.to(device), meta.to(device)
    # print(text.shape,meta.shape)
    output = net(text,meta)
    loss = criteon(output,label)
    break

hidden_dim 16 n_layers 1 dropout1 0.8 dropout2 0.8 hidden2 16 filter2 32 n_layers2 1


  "num_layers={}".format(dropout, num_layers))


In [19]:
LEARNING_RATE = 0.0005
criteon = nn.CrossEntropyLoss().to(device)
net = sep_BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta).to(device)
opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE,weight_decay=1e-4)
train_losses, dev_losses, dev_acc_list, best_model = train(21,train_data_iter,dev_data_iter,opt,criteon, net, device)

hidden_dim 16 n_layers 1 dropout1 0.8 dropout2 0.8 hidden2 16 filter2 32 n_layers2 1
train len: 161 dev len: 21
learning_rate 0.0005 n_iters 21 batch size 64 optim Adam lr_scheduler None device cpu
Epoch 0


  "num_layers={}".format(dropout, num_layers))


51.013997316360474
0 0% (0m 51s) 1.7960 
0 0% (0m 52s) 1.7904 [5, 5, 5, 5] [1, 0, 5, 0] 0.1799
Epoch 1
Epoch 2
2 18% (2m 33s) 1.7943 
2 18% (2m 34s) 1.7967 [2, 1, 1, 5] [3, 0, 3, 4] 0.1745
Epoch 3
Epoch 4
4 36% (4m 17s) 1.7928 
4 36% (4m 18s) 1.7916 [4, 4, 3, 3] [2, 3, 4, 3] 0.1682
Epoch 5
Epoch 6
6 54% (5m 54s) 1.7925 
6 54% (5m 55s) 1.7926 [3, 3, 3, 2] [1, 1, 2, 1] 0.1713
Epoch 7
Epoch 8
8 72% (7m 35s) 1.7927 
8 72% (7m 37s) 1.7970 [5, 5, 5, 5] [4, 3, 0, 2] 0.1589
Epoch 9
Epoch 10
10 90% (10m 50s) 1.7930 
10 90% (11m 10s) 1.7901 [2, 2, 2, 2] [1, 4, 2, 4] 0.1908
Epoch 11
Epoch 12
12 109% (16m 0s) 1.7922 
12 109% (16m 24s) 1.7908 [1, 4, 1, 4] [1, 5, 5, 2] 0.1776
Epoch 13
Epoch 14
14 127% (21m 26s) 1.7928 
14 127% (21m 51s) 1.7938 [4, 4, 4, 4] [5, 4, 0, 1] 0.1589
Epoch 15
Epoch 16
16 145% (27m 1s) 1.7923 
16 145% (27m 26s) 1.7936 [2, 1, 1, 2] [4, 5, 3, 0] 0.1612
Epoch 17
Epoch 18
18 163% (32m 37s) 1.7928 
18 163% (33m 2s) 1.7932 [3, 3, 3, 0] [2, 0, 3, 3] 0.1534
Epoch 19
Epoch 20
20 181%

In [20]:
b = copy.deepcopy(best_model)

In [21]:
evaluate_model(best_model, dev_data_iter) # 0.19080996884735202
evaluate_model(best_model, test_data_iter) #0.18389897395422258

acc:  0.16121495327102803
acc:  0.18389897395422258


## concat then attention

In [26]:
class attlast_BiLSTM_Attention(nn.Module):
    def __init__(self, token_size, pretrained_emb, token_size_meta, pretrained_emb_meta, 
                 hidden_dim=64, n_layers=2,dropout1 = 0.3, dropout2 = 0.8,
                 hidden2 = 64, filter2 = 64, n_layers2=2):
        super(attlast_BiLSTM_Attention, self).__init__()
        print('hidden_dim',hidden_dim, 'n_layers',n_layers, 'dropout1',dropout1, 'dropout2',dropout2,
                 'hidden2',hidden2, 'filter2',filter2, 'n_layers2',n_layers2)

        self.embedding = nn.Embedding(num_embeddings=token_size,
                                      embedding_dim=300)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb).type(torch.float))

        self.meta_embedding = nn.Embedding(num_embeddings=token_size_meta,
                                      embedding_dim=300)
        self.meta_embedding.weight.data.copy_(torch.from_numpy(pretrained_emb_meta).type(torch.float))

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.rnn = nn.LSTM(300, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=dropout1)
        self.dropout = nn.Dropout(0.5)

        self.w_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, hidden_dim * 2))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, 1))

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)
        self.fc = nn.Linear(hidden_dim * 2, 6)
        
        #separate hybrid
        self.conv_unit = nn.Sequential(
            torch.nn.Conv1d(in_channels=300, out_channels=filter2, kernel_size=3),
            # torch.nn.MaxPool1d(kernel_size=self.ksizes[2]),
        )
        self.lstm_meta = nn.LSTM(132, hidden2, num_layers=n_layers2, bidirectional=True, dropout=dropout2)
        

    def attention_net(self, x):       #x:[batch, seq_len, hidden_dim*2]
        u = torch.tanh(torch.matmul(x, self.w_omega))         #[batch, seq_len, hidden_dim*2]
        att = torch.matmul(u, self.u_omega)                   #[batch, seq_len, 1]
        att_score = F.softmax(att, dim=1)
        scored_x = x * att_score                              #[batch, seq_len, hidden_dim*2]
        context = torch.sum(scored_x, dim=1)                  #[batch, hidden_dim*2]
        # print('context',context.shape)
        return context

    def forward(self, x, meta):
        # print(x.shape,meta.shape)
        embedding = self.dropout(self.embedding(x))       #[seq_len, batch, embedding_dim]
        embedding = torch.transpose(embedding,0,1)
        # output: [seq_len, batch, hidden_dim*2]     hidden/cell: [n_layers*2, batch, hidden_dim]

        output, (final_hidden_state, final_cell_state) = self.rnn(embedding) # [28, 64, 128]
        output = output.permute(1, 0, 2)                  #[batch, seq_len, hidden_dim*2]

        meta = self.meta_embedding(meta) # [4, len, 300] (4=bsz)
        meta = torch.transpose(meta,1,2)
        # meta = self.conv_unit(meta) # x1: [4, 128, len_a]
        meta, (_,_) = self.lstm_meta(meta)
        # meta = meta.squeeze(-1) # x: [4, 128, len_d]
        # meta = meta[:,:,-1]
        # meta = meta.permute(0,1, 2)

        output = torch.cat((output,meta),dim=1)

        attn_output = self.attention_net(output) # [64, 128]
        
        x = attn_output
        # x = torch.cat((attn_output,meta), dim=1)
        fc = nn.Linear(x.shape[1], 6)
        x = fc(x)
        return x
net = attlast_BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta).to(device)
criteon = nn.CrossEntropyLoss().to(device)
for batch_idx, (text, label, meta) in enumerate(dev_data_iter):
    text, label, meta = text.to(device), label.to(device), meta.to(device)
    # print(text.shape,meta.shape)
    output = net(text,meta)
    loss = criteon(output,label)
    break

hidden_dim 64 n_layers 2 dropout1 0.3 dropout2 0.8 hidden2 64 filter2 64 n_layers2 2


In [29]:
LEARNING_RATE = 0.0002
criteon = nn.CrossEntropyLoss().to(device)
net = attlast_BiLSTM_Attention(len(token2ix), pretrained_emb, len(token2ix_meta), pretrained_emb_meta).to(device)
opt = optimizer.Adam(net.parameters(), lr=LEARNING_RATE,weight_decay=1e-4)
train_losses, dev_losses, dev_acc_list, best_model = train(21,train_data_iter,dev_data_iter,opt,criteon, net, device)

hidden_dim 64 n_layers 2 dropout1 0.3 dropout2 0.8 hidden2 64 filter2 64 n_layers2 2
train len: 161 dev len: 21
learning_rate 0.0002 n_iters 21 batch size 64 optim Adam lr_scheduler None device cpu
Epoch 0
181.9822280406952
0 0% (3m 2s) 1.7929 
0 0% (3m 14s) 1.7963 [3, 3, 3, 3] [1, 1, 3, 3] 0.1519
Epoch 1
Epoch 2
2 18% (9m 35s) 1.7934 
2 18% (9m 44s) 1.7923 [0, 0, 0, 0] [1, 0, 0, 0] 0.1597
Epoch 3
Epoch 4
4 36% (16m 4s) 1.7935 
4 36% (16m 13s) 1.7963 [3, 3, 3, 3] [0, 2, 1, 4] 0.1628
Epoch 5
Epoch 6
6 54% (22m 29s) 1.7930 
6 54% (22m 38s) 1.7942 [5, 5, 5, 5] [4, 3, 4, 3] 0.1768
Epoch 7
Epoch 8
8 72% (29m 4s) 1.7935 
8 72% (29m 13s) 1.7938 [5, 2, 2, 2] [5, 3, 2, 4] 0.1534
Epoch 9
Epoch 10
10 90% (35m 33s) 1.7922 
10 90% (35m 42s) 1.7915 [1, 0, 0, 0] [0, 1, 0, 4] 0.1495
Epoch 11
Epoch 12
12 109% (42m 8s) 1.7921 
12 109% (42m 18s) 1.7943 [5, 5, 5, 5] [4, 3, 1, 3] 0.1729
Epoch 13
Epoch 14
14 127% (48m 36s) 1.7931 
14 127% (48m 45s) 1.7944 [1, 1, 1, 1] [1, 1, 4, 0] 0.1456
Epoch 15
Epoch 16
1

In [None]:
b = copy.deepcopy(best_model)

In [None]:
best_model.eval()
evaluate_model(best_model, dev_data_iter) # 0.19080996884735202
evaluate_model(best_model, test_data_iter) #0.18389897395422258