In [2]:
import torch
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### data preprocess

In [19]:
train = pd.read_csv("hw2_data/snli_train.tsv", sep = '\t')

In [20]:
val = pd.read_csv("hw2_data/snli_val.tsv", sep = '\t')

In [6]:
PAD_IDX = 0
UNK_IDX = 1

ft_home = 'C:/Users/gong/Documents/learning/NLP/wiki-news-300d-1M.vec/'
with open(ft_home + 'wiki-news-300d-1M.vec', encoding = "utf-8") as f:
    loaded_embeddings_ft = np.zeros((999996, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    for i, line in enumerate(f):
        if(i == 0):
            continue
        s = line.split()
        loaded_embeddings_ft[(i+1), :] = np.asarray(s[1:])
        words_ft[s[0]] = i+1
        idx2words_ft[i+1] = s[0]
        ordered_words_ft.append(s[0])

In [21]:
s1 = train['sentence1'].values
s2 = train['sentence2'].values
label = train['label'].values
s1_val = val['sentence1'].values
s2_val = val['sentence2'].values
label_val = val['label'].values

In [22]:
s1 = [i.split() for i in s1]
s2 = [i.split() for i in s2]
s1_val = [i.split() for i in s1_val]
s2_val = [i.split() for i in s2_val]

In [28]:
all_words = []
for i in s1:
    all_words += i
for i in s2:
    all_words += i
max_vocab_size = 20000

def build_vocab(all_tokens):
    words_counter = Counter(all_words)
    vocab, count = zip(*words_counter.most_common(max_vocab_size))
    PAD_IDX = 0 
    UNK_IDX = 1
    vocab = [i for i in list(vocab) if i in ordered_words_ft]
    vocab_size = len(vocab)
    loaded_embeddings = np.zeros((vocab_size+2, 300))
    for i in range(vocab_size):
        loaded_embeddings[(i+2), :] = loaded_embeddings_ft[words_ft[vocab[i]],]
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return  vocab_size, loaded_embeddings, token2id, id2token
num_embeddings, loaded_embeddings, token2id, id2token = build_vocab(all_words)

In [29]:
num_embeddings

18832

In [30]:
voc_data = {}
voc_data["num_embeddings"] = num_embeddings
voc_data["loaded_embeddings"] = loaded_embeddings
voc_data["token2id"] =  token2id
voc_data["id2token"] = id2token

In [37]:
pkl.dump(voc_data, open("voc_data_2.p","wb"))

In [23]:
import pickle as pkl
voc_data = pkl.load(open("voc_data_2.p","rb"))
num_embeddings, loaded_embeddings, token2id, id2token = voc_data["num_embeddings"], voc_data["loaded_embeddings"], voc_data["token2id"], voc_data["id2token"]

In [42]:
loaded_embeddings.shape

(18834, 300)

In [25]:
PAD_IDX = 0
UNK_IDX = 1
def word_emb(x):
    return [token2id[i] if i in token2id else UNK_IDX for i in x]
#s1_val = [i.split() for i in s1_val]
#s2_val = [i.split() for i in s2_val]
s1 = list(map(word_emb, s1))
s2 = list(map(word_emb, s2))
s1_val = list(map(word_emb, s1_val))
s2_val = list(map(word_emb, s2_val))

In [26]:
x_train = {"s1": s1,
           "s2": s2}
x_val = {"s1": s1_val,
           "s2": s2_val}

In [27]:
def one_hot_label(x):
    if(x == "neutral"):
        return 0
    elif(x=="entailment"):
        return 1
    else:
        return 2
label = list(map(one_hot_label, label))
label_val = list(map(one_hot_label, label_val))

In [28]:
import torch.utils.data as Data
from torch.utils.data import Dataset, DataLoader, TensorDataset
class MyDataset(Dataset):
    def __init__(self, x, y):
        assert len(x["s1"]) == len(y)
        self.length = len(y)
        self.x = x
        self.y = y
        
    def __getitem__(self, index):
        return {
        "s1": self.x["s1"][index],
        "s2": self.x["s2"][index],
        "s1_len": len(self.x["s1"][index]),
        "s2_len": len(self.x["s2"][index])
        }, self.y[index]
    
    def __len__(self):
        return self.length

In [76]:
MAX_WORD_LENGTH = 82
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    s1_list = []
    s2_list = []
    s1_len_list = []
    s2_len_list = []
    label_list = []

    for datum in batch:
        label_list.append(datum[1])
        s1_len_list.append(min(MAX_WORD_LENGTH, datum[0]["s1_len"]))
        s2_len_list.append(min(MAX_WORD_LENGTH, datum[0]["s2_len"]))
        s1 = np.pad(np.array(datum[0]["s1"][:MAX_WORD_LENGTH]),
                                pad_width=((0,max(0, MAX_WORD_LENGTH-datum[0]["s1_len"]))),
                                mode="constant", constant_values=PAD_IDX)
        s1_list.append(s1)
        s2 = np.pad(np.array(datum[0]["s2"][:MAX_WORD_LENGTH]),
                                pad_width=((0,max(0, MAX_WORD_LENGTH-datum[0]["s2_len"]))),
                                mode="constant", constant_values=PAD_IDX)
        s2_list.append(s2)
    ind_dec_order1 = np.argsort(s1_len_list)[::-1]
    s1_list = np.array(s1_list)[ind_dec_order1]
    s1_len_list = np.array(s1_len_list)[ind_dec_order1]
    ind_dec_order2 = np.argsort(s2_len_list)[::-1]
    s2_list = np.array(s2_list)[ind_dec_order2]
    s2_len_list = np.array(s2_len_list)[ind_dec_order2]       
    x = {
        "s1": torch.from_numpy(np.array(s1_list)).long().to(device),
        "s2": torch.from_numpy(np.array(s2_list)).long().to(device),
        "s1_len": torch.LongTensor(s1_len_list).to(device),
        "s2_len": torch.LongTensor(s2_len_list).to(device),
        "s1_order": torch.from_numpy(ind_dec_order1.copy()).to(device),
        "s2_order": torch.from_numpy(ind_dec_order2.copy()).to(device)
    }
    y = torch.LongTensor(np.array(label_list)[ind_dec_order1])
    return x, y.to(device)

In [30]:
train_dataset = MyDataset(x_train, label)
val_dataset = MyDataset(x_val, label_val)

BATCH_SIZE = 64

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                batch_size=BATCH_SIZE,
                                collate_fn=vocab_collate_func,
                                shuffle=True)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                batch_size=BATCH_SIZE,
                                collate_fn=vocab_collate_func,
                                shuffle=False)

In [31]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp - (num_embeddings - 2) * emb_size

## RNN
### baseline model

In [32]:
import torch.nn as nn
import torch.nn.functional as F
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, middle_size, num_embeddings):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        #
        super(RNN, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        #self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        self.linear1 = nn.Linear(4*hidden_size, middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        
        self.init_weights()
  
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size).float()

        return hidden
    
    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return

    def forward(self, x):
        # reset hidden state
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()
        self.hidden1 = self.init_hidden(batch_size).to(device)
        self.hidden2 = self.init_hidden(batch_size).to(device)
        out1 = torch.nn.utils.rnn.pack_padded_sequence(out1, x['s1_len'], batch_first=True)
        out2 = torch.nn.utils.rnn.pack_padded_sequence(out2, x['s2_len'], batch_first=True)
        _, self.hidden1 = self.rnn(out1, self.hidden1)
        _, self.hidden2 = self.rnn(out2, self.hidden2)
        final_hidden1 = self.hidden1.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        final_hidden2 = self.hidden2.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        tmp = torch.zeros(final_hidden2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = final_hidden2[i]
        final_hidden2 = tmp[list(x['s1_order']),:]
        out = torch.cat([final_hidden1,final_hidden2], dim=1)
        out = self.linear1(out)
        #out = torch.sigmoid(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out

In [33]:
def do_train(model, dataloader, criterion, optimizer):
    model.train()
    train_loss = 0
    for data, labels in dataloader:
        optimizer.zero_grad()
        y_hat = model(data)
        loss = criterion(y_hat, labels.long())
        loss.backward()
        model.embed.weight.grad[2:,:] = torch.zeros(model.embed.weight.grad[2:,:].size()).to(device)
        optimizer.step()
        train_loss += loss.item() * len(data) / len(dataloader.dataset)
    return train_loss

def do_eval(model, dataloader):
    model.eval()
    y_ls = []
    y_hat_ls = []
    val_loss = 0
    with torch.no_grad():
        for data, labels in dataloader:
            y_hat = model(data)
            loss = criterion(y_hat, labels.long())
            y_hat_ls.append(y_hat)
            y_ls.append(labels)
            val_loss += loss.item() * len(data) / len(dataloader.dataset)
    optimizer.zero_grad()
    return val_loss, torch.cat(y_hat_ls, dim=0), torch.cat(y_ls, dim=0)

def acc(model, dataloader):
    val_loss, pred,true = do_eval(
    model = model,
    dataloader = dataloader,
    )
    return val_loss, (torch.exp(pred).max(1)[1] == true).float().mean().item()

### hidden size

In [31]:
learning_rate = 0.001
emb_size = 300
num_layers = 1
middle_size = 200
num_epochs = 10
hidden_size = 100


loss_train_t = []
loss_val_t = []
acc_val_t = []
acc_train_t = []

for hidden_size in [100, 300, 500, 700]:
    print("hidden size = {}".format(hidden_size))
    loss_train = []
    loss_val = []
    acc_val = []
    acc_train = []

    model = RNN(emb_size, hidden_size, num_layers, middle_size, num_embeddings).to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    print("Number of parameters is {}".format(get_n_params(model)))
    
    for epoch in range(num_epochs):
    #scheduler.step()
        loss = do_train(
            model=model, 
            criterion=criterion,
            dataloader=train_loader,
            optimizer=optimizer,
        )
        val_loss, val_acc = acc(model, val_loader)
        train_loss, train_acc = acc(model,train_loader)
        loss_val.append(val_loss)
        acc_val.append(val_acc)
        loss_train.append(train_loss)
        acc_train.append(train_acc)
        print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
    loss_train_t.append(loss_train)
    loss_val_t.append(loss_val)
    acc_val_t.append(acc_val)
    acc_train_t.append(acc_train)

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(loss_train_t[0])+1), loss_train_t[0], label="Training Loss")
plt.plot(range(1, len(loss_val_t[0])+1), loss_val_t[0], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[0])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 100")

plt.subplot(222)
plt.plot(range(1, len(loss_train_t[1])+1), loss_train_t[1], label="Training Loss")
plt.plot(range(1, len(loss_val_t[1])+1), loss_val_t[1], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[1])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 300")

plt.subplot(223)
plt.plot(range(1, len(loss_train_t[2])+1), loss_train_t[2], label="Training Loss")
plt.plot(range(1, len(loss_val_t[2])+1), loss_val_t[2], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[2])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 500")

plt.subplot(224)
plt.plot(range(1, len(loss_train_t[3])+1), loss_train_t[3], label="Training Loss")
plt.plot(range(1, len(loss_val_t[3])+1), loss_val_t[3], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[3])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 700")
plt.savefig("hidden_rnn_loss.pdf")
plt.show()

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(acc_train_t[0])+1), acc_train_t[0], label="Training Acc")
plt.plot(range(1, len(acc_val_t[0])+1), acc_val_t[0], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[0])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 100")

plt.subplot(222)
plt.plot(range(1, len(acc_train_t[1])+1), acc_train_t[1], label="Training Acc")
plt.plot(range(1, len(acc_val_t[1])+1), acc_val_t[1], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[1])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 300")

plt.subplot(223)
plt.plot(range(1, len(acc_train_t[2])+1), acc_train_t[2], label="Training Acc")
plt.plot(range(1, len(acc_val_t[2])+1), acc_val_t[2], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[2])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 500")

plt.subplot(224)
plt.plot(range(1, len(acc_train_t[3])+1), acc_train_t[3], label="Training Acc")
plt.plot(range(1, len(acc_val_t[3])+1), acc_val_t[3], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[3])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 700")
plt.savefig("hidden_rnn_acc.pdf")
plt.show()

hidden size = 200
Number of parameters is 1325603
Epoch: [1/10], Train Loss: 0.08003874897480018, Val Loss: 0.08503139090538026, Val Acc: 0.5940000414848328
Epoch: [2/10], Train Loss: 0.07269508834004401, Val Loss: 0.07836294972896575, Val Acc: 0.6330000162124634
Epoch: [3/10], Train Loss: 0.0667122457051277, Val Loss: 0.07433435440063477, Val Acc: 0.659000039100647
Epoch: [4/10], Train Loss: 0.061480742329359066, Val Loss: 0.07074023938179018, Val Acc: 0.6880000233650208
Epoch: [5/10], Train Loss: 0.05649339353263373, Val Loss: 0.0701916482448578, Val Acc: 0.6910000443458557
Epoch: [6/10], Train Loss: 0.052581016169786464, Val Loss: 0.06877575552463532, Val Acc: 0.7010000348091125
Epoch: [7/10], Train Loss: 0.04771245790421956, Val Loss: 0.07116731357574463, Val Acc: 0.6890000104904175
Epoch: [8/10], Train Loss: 0.041403533698618405, Val Loss: 0.07115082263946533, Val Acc: 0.6960000395774841


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/sg5722/NLP/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-31-ba78b2f0f587>", line 36, in <module>
    train_loss, train_acc = acc(model,train_loader)
  File "<ipython-input-30-4fbe3bd327bd>", line 32, in acc
    dataloader = dataloader,
  File "<ipython-input-30-4fbe3bd327bd>", line 20, in do_eval
    for data, labels in dataloader:
  File "/home/sg5722/NLP/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 314, in __next__
    batch = self.collate_fn([self.dataset[i] for i in indices])
  File "<ipython-input-15-8ed3d69c115f>", line 23, in vocab_collate_func
    mode="constant", constant_values=PAD_IDX)
  File "/home/sg5722/NLP/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 1276, in pad
    newmat = _append_const(newmat, pad_after, after_val, axis)
  File "/home/sg5722/NLP/lib/python3.6/site-package

KeyboardInterrupt: 

### ways of interacting

In [None]:
class RNN_max(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, middle_size, num_embeddings):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        #
        super(RNN_max, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        #self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        self.linear1 = nn.Linear(2*hidden_size, middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        
        self.init_weights()
  
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size).float()

        return hidden
    
    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return

    def forward(self, x):
        # reset hidden state
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()
        self.hidden1 = self.init_hidden(batch_size).to(device)
        self.hidden2 = self.init_hidden(batch_size).to(device)
        out1 = torch.nn.utils.rnn.pack_padded_sequence(out1, x['s1_len'], batch_first=True)
        out2 = torch.nn.utils.rnn.pack_padded_sequence(out2, x['s2_len'], batch_first=True)
        _, self.hidden1 = self.rnn(out1, self.hidden1)
        _, self.hidden2 = self.rnn(out2, self.hidden2)
        final_hidden1 = self.hidden1.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        final_hidden2 = self.hidden2.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        tmp = torch.zeros(final_hidden2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = final_hidden2[i]
        final_hidden2 = tmp[list(x['s1_order']),:]
        out = torch.max(final_hidden1,final_hidden2)
        out = self.linear1(out)
        #out = torch.sigmoid(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out

learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
hidden_size = 

loss_train_t = [loss_train_t[]]
loss_val_t = [loss_train_t[]]
acc_val_t = [loss_train_t[]]
acc_train_t = [loss_train_t[]]


print("maxpooling")

loss_train = []
loss_val = []
acc_val = []
acc_train = []


model = RNN_max(emb_size, hidden_size, num_layers, middle_size, num_embeddings).to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Number of parameters is {}".format(get_n_params(model)))
for epoch in range(num_epochs):
#scheduler.step()
    loss = do_train(
        model=model, 
        criterion=criterion,
        dataloader=train_loader,
        optimizer=optimizer,
    )
    val_loss, val_acc = acc(model, val_loader)
    train_loss, train_acc = acc(model,train_loader)
    loss_val.append(val_loss)
    acc_val.append(val_acc)
    loss_train.append(train_loss)
    acc_train.append(train_acc)
    print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
loss_train_t.append(loss_train)
loss_val_t.append(loss_val)
acc_val_t.append(acc_val)
acc_train_t.append(acc_train)

In [None]:
class RNN_pair_mul(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, middle_size, num_embeddings):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        #
        super(RNN_pair_mul, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        #self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        self.linear1 = nn.Linear(2*hidden_size, middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        
        self.init_weights()
  
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size).float()

        return hidden
    
    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return

    def forward(self, x):
        # reset hidden state
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()
        self.hidden1 = self.init_hidden(batch_size).to(device)
        self.hidden2 = self.init_hidden(batch_size).to(device)
        out1 = torch.nn.utils.rnn.pack_padded_sequence(out1, x['s1_len'], batch_first=True)
        out2 = torch.nn.utils.rnn.pack_padded_sequence(out2, x['s2_len'], batch_first=True)
        _, self.hidden1 = self.rnn(out1, self.hidden1)
        _, self.hidden2 = self.rnn(out2, self.hidden2)
        final_hidden1 = self.hidden1.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        final_hidden2 = self.hidden2.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        tmp = torch.zeros(final_hidden2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = final_hidden2[i]
        final_hidden2 = tmp[list(x['s1_order']),:]
        out = torch.max(final_hidden1,final_hidden2)
        out = self.linear1(out)
        #out = torch.sigmoid(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out

learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
hidden_size = 


print("pairwise multiplication")

loss_train = []
loss_val = []
acc_val = []
acc_train = []


model = RNN_pair_mul(emb_size, hidden_size, num_layers, middle_size, num_embeddings).to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Number of parameters is {}".format(get_n_params(model)))
for epoch in range(num_epochs):
#scheduler.step()
    loss = do_train(
        model=model, 
        criterion=criterion,
        dataloader=train_loader,
        optimizer=optimizer,
    )
    val_loss, val_acc = acc(model, val_loader)
    train_loss, train_acc = acc(model,train_loader)
    loss_val.append(val_loss)
    acc_val.append(val_acc)
    loss_train.append(train_loss)
    acc_train.append(train_acc)
    print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
loss_train_t.append(loss_train)
loss_val_t.append(loss_val)
acc_val_t.append(acc_val)
acc_train_t.append(acc_train)

In [None]:
plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(loss_train_t[0])+1), loss_train_t[0], label="Training Loss")
plt.plot(range(1, len(loss_val_t[0])+1), loss_val_t[0], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[0])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("concentenation")

plt.subplot(222)
plt.plot(range(1, len(loss_train_t[1])+1), loss_train_t[1], label="Training Loss")
plt.plot(range(1, len(loss_val_t[1])+1), loss_val_t[1], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[1])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("max pooling")

plt.subplot(223)
plt.plot(range(1, len(loss_train_t[2])+1), loss_train_t[2], label="Training Loss")
plt.plot(range(1, len(loss_val_t[2])+1), loss_val_t[2], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[2])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("pairwise multiplication")

plt.savefig("interacting_rnn_loss.pdf")
plt.show()

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(acc_train_t[0])+1), acc_train_t[0], label="Training Acc")
plt.plot(range(1, len(acc_val_t[0])+1), acc_val_t[0], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[0])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("concentenation")

plt.subplot(222)
plt.plot(range(1, len(acc_train_t[1])+1), acc_train_t[1], label="Training Acc")
plt.plot(range(1, len(acc_val_t[1])+1), acc_val_t[1], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[1])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("max pooling")

plt.subplot(223)
plt.plot(range(1, len(acc_train_t[2])+1), acc_train_t[2], label="Training Acc")
plt.plot(range(1, len(acc_val_t[2])+1), acc_val_t[2], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[2])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("pairwise multiplication")

plt.savefig("interacting_acc_rnn.pdf")
plt.show()

### Regularization

In [None]:
learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
hidden_size = 

loss_train_t = [loss_train_t[]]
loss_val_t = [loss_train_t[]]
acc_val_t = [loss_train_t[]]
acc_train_t = [loss_train_t[]]


print("weight decay")

loss_train = []
loss_val = []
acc_val = []
acc_train = []


model = RNN(emb_size, hidden_size, num_layers, middle_size, num_embeddings).to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-8)
print("Number of parameters is {}".format(get_n_params(model)))
for epoch in range(num_epochs):
#scheduler.step()
    loss = do_train(
        model=model, 
        criterion=criterion,
        dataloader=train_loader,
        optimizer=optimizer,
    )
    val_loss, val_acc = acc(model, val_loader)
    train_loss, train_acc = acc(model,train_loader)
    loss_val.append(val_loss)
    acc_val.append(val_acc)
    loss_train.append(train_loss)
    acc_train.append(train_acc)
    print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
loss_train_t.append(loss_train)
loss_val_t.append(loss_val)
acc_val_t.append(acc_val)
acc_train_t.append(acc_train)

In [None]:
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, middle_size, num_embeddings):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        #
        super(RNN, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        #self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        self.linear1 = nn.Linear(4*hidden_size, middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        self.dropout = nn.Dropout(p=0.5)
        
        self.init_weights()
  
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size).float()

        return hidden
    
    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return

    def forward(self, x):
        # reset hidden state
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()
        self.hidden1 = self.init_hidden(batch_size).to(device)
        self.hidden2 = self.init_hidden(batch_size).to(device)
        out1 = torch.nn.utils.rnn.pack_padded_sequence(out1, x['s1_len'], batch_first=True)
        out2 = torch.nn.utils.rnn.pack_padded_sequence(out2, x['s2_len'], batch_first=True)
        _, self.hidden1 = self.rnn(out1, self.hidden1)
        _, self.hidden2 = self.rnn(out2, self.hidden2)
        final_hidden1 = self.hidden1.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        final_hidden2 = self.hidden2.view(1, 2, batch_size, self.hidden_size).transpose(1,2).contiguous().view(batch_size, -1)
        tmp = torch.zeros(final_hidden2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = final_hidden2[i]
        final_hidden2 = tmp[list(x['s1_order']),:]
        out = torch.cat([final_hidden1,final_hidden2], dim=1)
        out = self.linear1(out)
        #out = torch.sigmoid(out)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out


learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
hidden_size = 


print("dropout")

loss_train = []
loss_val = []
acc_val = []
acc_train = []


model = RNN(emb_size, hidden_size, num_layers, middle_size, num_embeddings).to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-8)
print("Number of parameters is {}".format(get_n_params(model)))
for epoch in range(num_epochs):
#scheduler.step()
    loss = do_train(
        model=model, 
        criterion=criterion,
        dataloader=train_loader,
        optimizer=optimizer,
    )
    val_loss, val_acc = acc(model, val_loader)
    train_loss, train_acc = acc(model,train_loader)
    loss_val.append(val_loss)
    acc_val.append(val_acc)
    loss_train.append(train_loss)
    acc_train.append(train_acc)
    print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
loss_train_t.append(loss_train)
loss_val_t.append(loss_val)
acc_val_t.append(acc_val)
acc_train_t.append(acc_train)

In [None]:
plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(loss_train_t[0])+1), loss_train_t[0], label="Training Loss")
plt.plot(range(1, len(loss_val_t[0])+1), loss_val_t[0], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[0])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("no regularization")

plt.subplot(222)
plt.plot(range(1, len(loss_train_t[1])+1), loss_train_t[1], label="Training Loss")
plt.plot(range(1, len(loss_val_t[1])+1), loss_val_t[1], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[1])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("weight decay")

plt.subplot(223)
plt.plot(range(1, len(loss_train_t[2])+1), loss_train_t[2], label="Training Loss")
plt.plot(range(1, len(loss_val_t[2])+1), loss_val_t[2], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[2])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("dropout")

plt.savefig("regularization_rnn_loss.pdf")
plt.show()

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(acc_train_t[0])+1), acc_train_t[0], label="Training Acc")
plt.plot(range(1, len(acc_val_t[0])+1), acc_val_t[0], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[0])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("no regularization")

plt.subplot(222)
plt.plot(range(1, len(acc_train_t[1])+1), acc_train_t[1], label="Training Acc")
plt.plot(range(1, len(acc_val_t[1])+1), acc_val_t[1], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[1])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("weight decay")

plt.subplot(223)
plt.plot(range(1, len(acc_train_t[2])+1), acc_train_t[2], label="Training Acc")
plt.plot(range(1, len(acc_val_t[2])+1), acc_val_t[2], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[2])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("dropout")

plt.savefig("regularization_acc_rnn.pdf")
plt.show()

### multiNLI data

In [77]:
MAX_WORD_LENGTH = 82
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    s1_list = []
    s2_list = []
    s1_len_list = []
    s2_len_list = []
    label_list = []

    for datum in batch:
        label_list.append(datum[1])
        s1_len_list.append(min(MAX_WORD_LENGTH, datum[0]["s1_len"]))
        s2_len_list.append(min(MAX_WORD_LENGTH, datum[0]["s2_len"]))
        s1 = np.pad(np.array(datum[0]["s1"][:MAX_WORD_LENGTH]),
                                pad_width=((0,max(0, MAX_WORD_LENGTH-datum[0]["s1_len"]))),
                                mode="constant", constant_values=PAD_IDX)
        s1_list.append(s1)
        s2 = np.pad(np.array(datum[0]["s2"][:MAX_WORD_LENGTH]),
                                pad_width=((0,max(0, MAX_WORD_LENGTH-datum[0]["s2_len"]))),
                                mode="constant", constant_values=PAD_IDX)
        s2_list.append(s2)
    ind_dec_order1 = np.argsort(s1_len_list)[::-1]
    s1_list = np.array(s1_list)[ind_dec_order1]
    s1_len_list = np.array(s1_len_list)[ind_dec_order1]
    ind_dec_order2 = np.argsort(s2_len_list)[::-1]
    s2_list = np.array(s2_list)[ind_dec_order2]
    s2_len_list = np.array(s2_len_list)[ind_dec_order2]       
    x = {
        "s1": torch.from_numpy(np.array(s1_list)).long().to(device),
        "s2": torch.from_numpy(np.array(s2_list)).long().to(device),
        "s1_len": torch.LongTensor(s1_len_list).to(device),
        "s2_len": torch.LongTensor(s2_len_list).to(device),
        "s1_order": torch.from_numpy(ind_dec_order1.copy()).to(device),
        "s2_order": torch.from_numpy(ind_dec_order2.copy()).to(device)
    }
    y = torch.LongTensor(np.array(label_list)[ind_dec_order1])
    return x, y.to(device)

In [78]:
learning_rate = 0.001
emb_size = 300
num_layers = 1
middle_size = 200
num_epochs = 10
hidden_size = 100
model = RNN(emb_size, hidden_size, num_layers, middle_size, num_embeddings).to(device)
#criterion = torch.nn.NLLLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
#for epoch in range(num_epochs):
    #scheduler.step()
#    _ = do_train(
#            model=model, 
#            criterion=criterion,
#            dataloader=train_loader,
#            optimizer=optimizer,
#        )

In [80]:
mnli_val = pd.read_csv("hw2_data/mnli_val.tsv", sep = '\t')
PAD_IDX = 0
UNK_IDX = 1
def word_emb(x):
    return [token2id[i] if i in token2id else UNK_IDX for i in x]
genre = mnli_val['genre'].unique()
mnli_val_acc = {}
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-8)
for gen in genre:
    s1_mnli_val = mnli_val.loc[mnli_val['genre']==gen,'sentence1'].values
    s2_mnli_val = mnli_val.loc[mnli_val['genre']==gen,'sentence2'].values
    label_mnli_val = mnli_val.loc[mnli_val['genre']==gen,'label'].values
    label_mnli_val = list(map(one_hot_label, label_mnli_val))
    s1_mnli_val = [i.split() for i in s1_mnli_val]
    s2_mnli_val  = [i.split() for i in s2_mnli_val]
    s1_mnli_val = list(map(word_emb, s1_mnli_val))
    s2_mnli_val  = list(map(word_emb, s2_mnli_val))
    x_mnli_val = {"s1": s1_mnli_val,
                  "s2": s2_mnli_val}
    mnli_val_dataset = MyDataset(x_mnli_val, label_mnli_val)
     
    BATCH_SIZE = 64
 
    mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                                batch_size=BATCH_SIZE,
                                collate_fn=vocab_collate_func,
                                shuffle=False)
    _, mnli_val_acc_value = acc(model,mnli_val_loader)
    mnli_val_acc[gen] = mnli_val_acc_value

In [81]:
mnli_val_acc

{'fiction': 0.34371858835220337,
 'telephone': 0.36616915464401245,
 'slate': 0.3313373327255249,
 'government': 0.3868110179901123,
 'travel': 0.35132384300231934}

## CNN
### baseline model

In [32]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, middle_size,  num_embeddings, kernel_size, padding):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, padding=padding)

        self.linear1 = nn.Linear(hidden_size*2 , middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        
        self.init_weights()

    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return
    
    def forward(self, x):
        
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()

        out1 = self.conv1(out1.transpose(1, 2)).transpose(1, 2)
        out2 = self.conv1(out2.transpose(1, 2)).transpose(1, 2)
        
        out1 = F.relu(out1.contiguous().view(-1, out1.size(-1))).view(batch_size, seq_len, out1.size(-1))
        out2 = F.relu(out2.contiguous().view(-1, out2.size(-1))).view(batch_size, seq_len, out2.size(-1))
        

        out1 = self.conv2(out1.transpose(1, 2)).transpose(1, 2)
        out2 = self.conv2(out2.transpose(1, 2)).transpose(1, 2)
        
        out1 = F.relu(out1.contiguous().view(-1, out1.size(-1))).view(batch_size, seq_len, out1.size(-1))
        out2 = F.relu(out2.contiguous().view(-1, out2.size(-1))).view(batch_size, seq_len, out2.size(-1))
        
        out1 = torch.max(out1, dim=1)[0]
        out2 = torch.max(out2, dim=1)[0]
        
        tmp = torch.zeros(out2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = out2[i]
        out2 = tmp[list(x['s1_order']),:]
        
        out = torch.cat([out1, out2], dim=1)
        
        out = self.linear1(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out

### hidden size

In [None]:
learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
kernel_size = 3
padding = 1

loss_train_t = []
loss_val_t = []
acc_val_t = []
acc_train_t = []

for hidden_size in [100, 300, 500, 700]:
    print("hidden size = {}".format(hidden_size))
    loss_train = []
    loss_val = []
    acc_val = []
    acc_train = []


    model = CNN(emb_size, hidden_size, middle_size, num_embeddings, kernel_size, padding).to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    print("Number of parameters is {}".format(get_n_params(model)))
    for epoch in range(num_epochs):
    #scheduler.step()
        loss = do_train(
            model=model, 
            criterion=criterion,
            dataloader=train_loader,
            optimizer=optimizer,
        )
        val_loss, val_acc = acc(model, val_loader)
        train_loss, train_acc = acc(model,train_loader)
        loss_val.append(val_loss)
        acc_val.append(val_acc)
        loss_train.append(train_loss)
        acc_train.append(train_acc)
        print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
    loss_train_t.append(loss_train)
    loss_val_t.append(loss_val)
    acc_val_t.append(acc_val)
    acc_train_t.append(acc_train)


plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(loss_train_t[0])+1), loss_train_t[0], label="Training Loss")
plt.plot(range(1, len(loss_val_t[0])+1), loss_val_t[0], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[0])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 100")

plt.subplot(222)
plt.plot(range(1, len(loss_train_t[1])+1), loss_train_t[1], label="Training Loss")
plt.plot(range(1, len(loss_val_t[1])+1), loss_val_t[1], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[1])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 300")

plt.subplot(223)
plt.plot(range(1, len(loss_train_t[2])+1), loss_train_t[2], label="Training Loss")
plt.plot(range(1, len(loss_val_t[2])+1), loss_val_t[2], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[2])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 500")

plt.subplot(224)
plt.plot(range(1, len(loss_train_t[3])+1), loss_train_t[3], label="Training Loss")
plt.plot(range(1, len(loss_val_t[3])+1), loss_val_t[3], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[3])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("hidden size = 700")
plt.savefig("hidden_cnn_loss.pdf")
plt.show()

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(acc_train_t[0])+1), acc_train_t[0], label="Training Acc")
plt.plot(range(1, len(acc_val_t[0])+1), acc_val_t[0], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[0])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 100")

plt.subplot(222)
plt.plot(range(1, len(acc_train_t[1])+1), acc_train_t[1], label="Training Acc")
plt.plot(range(1, len(acc_val_t[1])+1), acc_val_t[1], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[1])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 300")

plt.subplot(223)
plt.plot(range(1, len(acc_train_t[2])+1), acc_train_t[2], label="Training Acc")
plt.plot(range(1, len(acc_val_t[2])+1), acc_val_t[2], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[2])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 500")

plt.subplot(224)
plt.plot(range(1, len(acc_train_t[3])+1), acc_train_t[3], label="Training Acc")
plt.plot(range(1, len(acc_val_t[3])+1), acc_val_t[3], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[3])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("hidden size = 700")
plt.savefig("hidden_cnn_acc.pdf")
plt.show()

hidden size = 300
Number of parameters is 602203
Epoch: [1/15], Train Loss: 0.07387643548607828, Val Loss: 0.08019021928310394, Val Acc: 0.6130000352859497
Epoch: [2/15], Train Loss: 0.06615986651301378, Val Loss: 0.07435415089130401, Val Acc: 0.6580000519752502
Epoch: [3/15], Train Loss: 0.05936328333020223, Val Loss: 0.07112922763824463, Val Acc: 0.6960000395774841
Epoch: [4/15], Train Loss: 0.05185062176048753, Val Loss: 0.0706097549200058, Val Acc: 0.6880000233650208
Epoch: [5/15], Train Loss: 0.04721668162107466, Val Loss: 0.07306521856784819, Val Acc: 0.6880000233650208
Epoch: [6/15], Train Loss: 0.03967039755642419, Val Loss: 0.07561740696430207, Val Acc: 0.6790000200271606
Epoch: [7/15], Train Loss: 0.03209091636121273, Val Loss: 0.0810650485754013, Val Acc: 0.6860000491142273
Epoch: [8/15], Train Loss: 0.026664241744279896, Val Loss: 0.08933041548728943, Val Acc: 0.6630000472068787
Epoch: [9/15], Train Loss: 0.02188171897262333, Val Loss: 0.09915066361427308, Val Acc: 0.683000

### kernel size

In [None]:
learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
kernel_size = 3
padding = 1
hidden_size = 

loss_train_t = [loss_train_t[]]
loss_val_t = [loss_train_t[]]
acc_val_t = [loss_train_t[]]
acc_train_t = [loss_train_t[]]

for kernel_size in [3, 7, 11, 15]:
    print("kernel size = {}".format(kernel_size))
    padding = int((kernel_size-1)/2)
    loss_train = []
    loss_val = []
    acc_val = []
    acc_train = []


    model = CNN(emb_size, hidden_size, middle_size, num_embeddings, kernel_size, padding).to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    print("Number of parameters is {}".format(get_n_params(model)))
    for epoch in range(num_epochs):
    #scheduler.step()
        loss = do_train(
            model=model, 
            criterion=criterion,
            dataloader=train_loader,
            optimizer=optimizer,
        )
        val_loss, val_acc = acc(model, val_loader)
        train_loss, train_acc = acc(model,train_loader)
        loss_val.append(val_loss)
        acc_val.append(val_acc)
        loss_train.append(train_loss)
        acc_train.append(train_acc)
        print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
    loss_train_t.append(loss_train)
    loss_val_t.append(loss_val)
    acc_val_t.append(acc_val)
    acc_train_t.append(acc_train)


plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(loss_train_t[0])+1), loss_train_t[0], label="Training Loss")
plt.plot(range(1, len(loss_val_t[0])+1), loss_val_t[0], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[0])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("kernel size = 3")

plt.subplot(222)
plt.plot(range(1, len(loss_train_t[1])+1), loss_train_t[1], label="Training Loss")
plt.plot(range(1, len(loss_val_t[1])+1), loss_val_t[1], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[1])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("kernel size = 7")

plt.subplot(223)
plt.plot(range(1, len(loss_train_t[2])+1), loss_train_t[2], label="Training Loss")
plt.plot(range(1, len(loss_val_t[2])+1), loss_val_t[2], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[2])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("kernel size = 11")

plt.subplot(224)
plt.plot(range(1, len(loss_train_t[3])+1), loss_train_t[3], label="Training Loss")
plt.plot(range(1, len(loss_val_t[3])+1), loss_val_t[3], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[3])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("kernel size = 15")
plt.savefig("kernel_cnn_loss.pdf")
plt.show()

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(acc_train_t[0])+1), acc_train_t[0], label="Training Acc")
plt.plot(range(1, len(acc_val_t[0])+1), acc_val_t[0], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[0])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("kernel size = 3")

plt.subplot(222)
plt.plot(range(1, len(acc_train_t[1])+1), acc_train_t[1], label="Training Acc")
plt.plot(range(1, len(acc_val_t[1])+1), acc_val_t[1], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[1])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("kernel size = 7")

plt.subplot(223)
plt.plot(range(1, len(acc_train_t[2])+1), acc_train_t[2], label="Training Acc")
plt.plot(range(1, len(acc_val_t[2])+1), acc_val_t[2], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[2])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("kernel size = 11")

plt.subplot(224)
plt.plot(range(1, len(acc_train_t[3])+1), acc_train_t[3], label="Training Acc")
plt.plot(range(1, len(acc_val_t[3])+1), acc_val_t[3], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[3])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("kernel size = 15")
plt.savefig("kernel_cnn_acc.pdf")
plt.show()

### ways of interaction

In [23]:
class CNN_max(nn.Module):
    def __init__(self, emb_size, hidden_size, middle_size,  num_embeddings, kernel_size, padding):

        super(CNN_max, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, padding=padding)

        self.linear1 = nn.Linear(hidden_size , middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        
        self.init_weights()

    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return
    
    def forward(self, x):
        
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()

        out1 = self.conv1(out1.transpose(1, 2)).transpose(1, 2)
        out2 = self.conv1(out2.transpose(1, 2)).transpose(1, 2)
        
        out1 = F.relu(out1.contiguous().view(-1, out1.size(-1))).view(batch_size, seq_len, out1.size(-1))
        out2 = F.relu(out2.contiguous().view(-1, out2.size(-1))).view(batch_size, seq_len, out2.size(-1))
        

        out1 = self.conv2(out1.transpose(1, 2)).transpose(1, 2)
        out2 = self.conv2(out2.transpose(1, 2)).transpose(1, 2)
        
        out1 = F.relu(out1.contiguous().view(-1, out1.size(-1))).view(batch_size, seq_len, out1.size(-1))
        out2 = F.relu(out2.contiguous().view(-1, out2.size(-1))).view(batch_size, seq_len, out2.size(-1))
        
        out1 = torch.max(out1, dim=1)[0]
        out2 = torch.max(out2, dim=1)[0]
        
        tmp = torch.zeros(out2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = out2[i]
        out2 = tmp[list(x['s1_order']),:]
        
        out = torch.max(out1, out2)
        
        out = self.linear1(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out

learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
kernel_size = 3
padding = 1
hidden_size = 

loss_train_t = [loss_train_t[]]
loss_val_t = [loss_train_t[]]
acc_val_t = [loss_train_t[]]
acc_train_t = [loss_train_t[]]


print("maxpooling")

loss_train = []
loss_val = []
acc_val = []
acc_train = []


model = CNN_max(emb_size, hidden_size, middle_size, num_embeddings, kernel_size, padding).to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Number of parameters is {}".format(get_n_params(model)))
for epoch in range(num_epochs):
#scheduler.step()
    loss = do_train(
        model=model, 
        criterion=criterion,
        dataloader=train_loader,
        optimizer=optimizer,
    )
    val_loss, val_acc = acc(model, val_loader)
    train_loss, train_acc = acc(model,train_loader)
    loss_val.append(val_loss)
    acc_val.append(val_acc)
    loss_train.append(train_loss)
    acc_train.append(train_acc)
    print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
loss_train_t.append(loss_train)
loss_val_t.append(loss_val)
acc_val_t.append(acc_val)
acc_train_t.append(acc_train)

SyntaxError: invalid syntax (<ipython-input-23-d2fd37800a7a>, line 63)

In [None]:
class CNN_pair_mul(nn.Module):
    def __init__(self, emb_size, hidden_size, middle_size,  num_embeddings, kernel_size, padding):

        super(CNN_pair_mul, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embed = nn.Embedding(num_embeddings, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, padding=padding)

        self.linear1 = nn.Linear(hidden_size , middle_size)
        self.linear2 = nn.Linear(middle_size, 3)
        
        self.init_weights()

    def init_weights(self):
        loaded_embeddings[:2, :] = np.random.randn(2,300)
        self.embed.weight = nn.Parameter(torch.from_numpy(loaded_embeddings).float())
        return
    
    def forward(self, x):
        
        out1 = self.embed(x['s1'])
        out2 = self.embed(x['s2'])
        batch_size, seq_len = x['s1'].size()

        out1 = self.conv1(out1.transpose(1, 2)).transpose(1, 2)
        out2 = self.conv1(out2.transpose(1, 2)).transpose(1, 2)
        
        out1 = F.relu(out1.contiguous().view(-1, out1.size(-1))).view(batch_size, seq_len, out1.size(-1))
        out2 = F.relu(out2.contiguous().view(-1, out2.size(-1))).view(batch_size, seq_len, out2.size(-1))
        

        out1 = self.conv2(out1.transpose(1, 2)).transpose(1, 2)
        out2 = self.conv2(out2.transpose(1, 2)).transpose(1, 2)
        
        out1 = F.relu(out1.contiguous().view(-1, out1.size(-1))).view(batch_size, seq_len, out1.size(-1))
        out2 = F.relu(out2.contiguous().view(-1, out2.size(-1))).view(batch_size, seq_len, out2.size(-1))
        
        out1 = torch.max(out1, dim=1)[0]
        out2 = torch.max(out2, dim=1)[0]
        
        tmp = torch.zeros(out2.size()).to(device)
        for i in range(batch_size):
            tmp[x['s2_order'][i],:] = out2[i]
        out2 = tmp[list(x['s1_order']),:]
        
        out = out1 * out2
        
        out = self.linear1(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1)
        return out

learning_rate = 0.001
emb_size = 300
middle_size = 200
num_epochs =15
kernel_size = 3
padding = 1
hidden_size = 


print("pairwise multiplication")

loss_train = []
loss_val = []
acc_val = []
acc_train = []


model = CNN_pair_mul(emb_size, hidden_size, middle_size, num_embeddings, kernel_size, padding).to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Number of parameters is {}".format(get_n_params(model)))
for epoch in range(num_epochs):
#scheduler.step()
    loss = do_train(
        model=model, 
        criterion=criterion,
        dataloader=train_loader,
        optimizer=optimizer,
    )
    val_loss, val_acc = acc(model, val_loader)
    train_loss, train_acc = acc(model,train_loader)
    loss_val.append(val_loss)
    acc_val.append(val_acc)
    loss_train.append(train_loss)
    acc_train.append(train_acc)
    print('Epoch: [{}/{}], Train Loss: {}, Val Loss: {}, Val Acc: {}'.format( 
                           epoch+1, num_epochs, train_loss, val_loss, val_acc))
        
loss_train_t.append(loss_train)
loss_val_t.append(loss_val)
acc_val_t.append(acc_val)
acc_train_t.append(acc_train)

In [None]:
plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(loss_train_t[0])+1), loss_train_t[0], label="Training Loss")
plt.plot(range(1, len(loss_val_t[0])+1), loss_val_t[0], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[0])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("concentenation")

plt.subplot(222)
plt.plot(range(1, len(loss_train_t[1])+1), loss_train_t[1], label="Training Loss")
plt.plot(range(1, len(loss_val_t[1])+1), loss_val_t[1], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[1])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("max pooling")

plt.subplot(223)
plt.plot(range(1, len(loss_train_t[2])+1), loss_train_t[2], label="Training Loss")
plt.plot(range(1, len(loss_val_t[2])+1), loss_val_t[2], label="Validation Loss")
plt.xticks(range(1, len(loss_val_t[2])+1))
plt.ylim(0, 0.2)
plt.legend()
plt.title("pairwise multiplication")

plt.savefig("interacting_cnn_loss.pdf")
plt.show()

plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.plot(range(1, len(acc_train_t[0])+1), acc_train_t[0], label="Training Acc")
plt.plot(range(1, len(acc_val_t[0])+1), acc_val_t[0], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[0])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("concentenation")

plt.subplot(222)
plt.plot(range(1, len(acc_train_t[1])+1), acc_train_t[1], label="Training Acc")
plt.plot(range(1, len(acc_val_t[1])+1), acc_val_t[1], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[1])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("max pooling")

plt.subplot(223)
plt.plot(range(1, len(acc_train_t[2])+1), acc_train_t[2], label="Training Acc")
plt.plot(range(1, len(acc_val_t[2])+1), acc_val_t[2], label="Validation Acc")
plt.xticks(range(1, len(acc_val_t[2])+1))
plt.ylim(0, 1)
plt.legend()
plt.title("pairwise multiplication")

plt.savefig("interacting_cnn_acc.pdf")
plt.show()