In [59]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb

In [2]:
train_data = open('hw2_data/snli_train.tsv').read().split('\n')

In [4]:
train_data_df = pd.DataFrame.from_csv('hw2_data/snli_train.tsv', sep='\t', index_col=None)
test_data_df = pd.DataFrame.from_csv('hw2_data/snli_val.tsv', sep='\t', index_col=None)
train_data_df

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction
5,A female laying on her stomach in the water ou...,There is a women outdoors,entailment
6,Children eat at a long table with black chairs .,Kids at a short table with red chairs .,contradiction
7,A person rides a motorcycle quickly .,The man is racing his motorcycle in a race .,neutral
8,Woman riding a red bicycle down a city street ...,person riding a bike,entailment
9,"Two young women dancers , one brunette and one...",Two women are wearing costumes,entailment


In [5]:
train_data_df['token1'] = [x.split() for x in train_data_df.sentence1]
train_data_df['token2'] = [x.split() for x in train_data_df.sentence2]

In [6]:
test_data_df['token1'] = [x.split() for x in test_data_df.sentence1]
test_data_df['token2'] = [x.split() for x in test_data_df.sentence2]

In [7]:
words_to_load = 50000
PAD_IDX = 0
UNK_IDX = 1
import numpy as np
# reserve the 1st 2nd token for padding and <UNK> respectively
with open('wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    ordered_words_ft.extend(['<pad>', '<unk>'])
    loaded_embeddings_ft[0,:] = np.zeros(300)
    loaded_embeddings_ft[1,:] = np.random.normal(size = 300)
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+2
        idx2words_ft[i] = s[0]
        ordered_words_ft.append(s[0])
    words_ft['<pad>'] = 0
    words_ft['<unk>'] = 1

In [8]:
words_ft

{'999994': 2,
 ',': 3,
 'the': 4,
 '.': 5,
 'and': 6,
 'of': 7,
 'to': 8,
 'in': 9,
 'a': 10,
 '"': 11,
 ':': 12,
 ')': 13,
 'that': 14,
 '(': 15,
 'is': 16,
 'for': 17,
 'on': 18,
 '*': 19,
 'with': 20,
 'as': 21,
 'it': 22,
 'The': 23,
 'or': 24,
 'was': 25,
 "'": 26,
 "'s": 27,
 'by': 28,
 'from': 29,
 'at': 30,
 'I': 31,
 'this': 32,
 'you': 33,
 '/': 34,
 'are': 35,
 '=': 36,
 'not': 37,
 '-': 38,
 'have': 39,
 '?': 40,
 'be': 41,
 'which': 42,
 ';': 43,
 'all': 44,
 'his': 45,
 'has': 46,
 'one': 47,
 'their': 48,
 'about': 49,
 'but': 50,
 'an': 51,
 '|': 52,
 'said': 53,
 'more': 54,
 'page': 55,
 'he': 56,
 'your': 57,
 'will': 58,
 'its': 59,
 'so': 60,
 'were': 61,
 'had': 62,
 'also': 63,
 'only': 64,
 'if': 65,
 'time': 66,
 'some': 67,
 'people': 68,
 'like': 69,
 'who': 70,
 'them': 71,
 'other': 72,
 'they': 73,
 'when': 74,
 'Wikipedia': 75,
 'article': 76,
 'what': 77,
 '#': 78,
 'just': 79,
 '!': 80,
 'any': 81,
 'after': 82,
 'there': 83,
 'would': 84,
 'can': 85,
 

In [9]:
loaded_embeddings_ft

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -1.25525490e+00,  -1.06705833e+00,   1.37764632e+00, ...,
          3.30495049e-01,   7.41800195e-01,   1.19833225e+00],
       [  3.00000000e+02,   3.00000000e+02,   3.00000000e+02, ...,
          3.00000000e+02,   3.00000000e+02,   3.00000000e+02],
       ..., 
       [ -1.88900000e-01,  -1.19400000e-01,  -1.56700000e-01, ...,
         -1.83300000e-01,  -2.76000000e-02,  -2.48500000e-01],
       [ -9.17000000e-02,  -1.52800000e-01,  -7.59000000e-02, ...,
         -3.60000000e-03,   2.48000000e-02,   3.47000000e-02],
       [ -1.86000000e-01,  -7.20000000e-02,  -2.73000000e-02, ...,
         -1.55800000e-01,  -7.49000000e-02,   2.58400000e-01]])

In [10]:
ordered_words_ft

['<pad>',
 '<unk>',
 '999994',
 ',',
 'the',
 '.',
 'and',
 'of',
 'to',
 'in',
 'a',
 '"',
 ':',
 ')',
 'that',
 '(',
 'is',
 'for',
 'on',
 '*',
 'with',
 'as',
 'it',
 'The',
 'or',
 'was',
 "'",
 "'s",
 'by',
 'from',
 'at',
 'I',
 'this',
 'you',
 '/',
 'are',
 '=',
 'not',
 '-',
 'have',
 '?',
 'be',
 'which',
 ';',
 'all',
 'his',
 'has',
 'one',
 'their',
 'about',
 'but',
 'an',
 '|',
 'said',
 'more',
 'page',
 'he',
 'your',
 'will',
 'its',
 'so',
 'were',
 'had',
 'also',
 'only',
 'if',
 'time',
 'some',
 'people',
 'like',
 'who',
 'them',
 'other',
 'they',
 'when',
 'Wikipedia',
 'article',
 'what',
 '#',
 'just',
 '!',
 'any',
 'after',
 'there',
 'would',
 'can',
 'In',
 'her',
 'talk',
 'use',
 'then',
 'into',
 'up',
 '...',
 'we',
 'over',
 'my',
 'out',
 'here',
 'now',
 'because',
 'do',
 'work',
 'than',
 'no',
 'UTC',
 'me',
 'A',
 'two',
 'our',
 'been',
 'new',
 'where',
 '–',
 'first',
 'such',
 'made',
 '--',
 'If',
 "'t",
 'both',
 'before',
 'way',
 '1',

In [25]:
# convert tokens in df to word vectors 
label_dic = {'neutral':0, 'contradiction':1, 'entailment':2}
def word_to_vec(word_list):
    return [loaded_embeddings_ft[words_ft[x]] if x in ordered_words_ft else loaded_embeddings_ft[1] for x in word_list]
def word_to_id(word_list):
    return [words_ft[x] if x in ordered_words_ft else UNK_IDX for x in word_list]
def convert_data(df):
    df['token1_x'] = df['token1'].apply(lambda x: word_to_vec(x))
    df['token2_x'] = df['token2'].apply(lambda x: word_to_vec(x))
    df['y'] = df['label'].apply(lambda x: label_dic[x])
    return df

In [13]:
train_data_df2 = convert_data(train_data_df)
test_data_df2 = convert_data(test_data_df)

In [18]:
train_list_pre = train_data_df['token1'].values.tolist()
train_list_hypo = train_data_df['token2'].values.tolist()
test_list_pre = test_data_df['token1'].values.tolist()
test_list_hypo = test_data_df['token2'].values.tolist()

In [26]:
def list2id(sent_list):
    return [word_to_id(x) for x in sent_list]

In [28]:
train_id_pre = list2id(train_list_pre)
train_id_hypo = list2id(train_list_hypo)
test_id_pre = list2id(test_list_pre)
test_id_hypo = list2id(test_list_hypo)

In [138]:
#save list to pickle
import pickle as pkl
pkl.dump(train_id_pre, open("train_id_pre.p", "wb"))
pkl.dump(train_id_hypo, open("train_id_hypo.p", "wb"))
pkl.dump(test_id_pre, open("test_id_pre.p", "wb"))
pkl.dump(test_id_hypo, open("test_id_hypo.p", "wb"))
pkl.dump(list(train_data_df2.y.values), open("train_y.p", "wb"))
pkl.dump(list(test_data_df2.y.values), open("test_y.p", "wb"))

In [139]:
def read_tokenizing_data():
    train_id_pre = pkl.load(open("train_id_pre.p", 'rb'))
    train_id_hypo = pkl.load(open("train_id_hypo.p", 'rb'))
    test_id_pre = pkl.load(open("test_id_pre.p", 'rb'))
    test_id_hypo = pkl.load(open("test_id_hypo.p", 'rb'))
    train_y = pkl.load(open("train_y.p", 'rb'))
    test_y = pkl.load(open("test_y.p", 'rb'))
    return train_id_pre, train_id_hypo, test_id_pre, test_id_hypo, train_y, test_y

In [140]:
train_id_pre, train_id_hypo, test_id_pre, test_id_hypo, train_y, test_y = read_tokenizing_data()

In [36]:
train_tuple = list(zip(train_id_pre, train_id_hypo, train_y))
test_tuple = list(zip(test_id_pre, test_id_hypo, test_y))

In [69]:
# top 1% length
MAX_LENGTH = sorted(len(x) for x in train_id_pre + train_id_hypo)[-int(len(train_id_pre + train_id_hypo)*0.01)]
BATCH_SIZE = 32

In [76]:
# build DataLoader
class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_tuple):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_pre_list, self.data_hypo_list, self.target_list = zip(*data_tuple)
        assert (len(self.data_pre_list) == len(self.target_list))
        assert (len(self.data_hypo_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        char_idx1 = self.data_pre_list[key][:MAX_LENGTH]
        char_idx2 = self.data_hypo_list[key][:MAX_LENGTH]
        label = self.target_list[key]
        return [char_idx1, char_idx2, len(char_idx1), len(char_idx2), label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list_pre = []
    data_list_hypo = []
    label_list = []
    length_list_pre = []
    length_list_hypo = []

    for datum in batch:
        label_list.append(datum[4])
        length_list_pre.append(datum[2])
        length_list_hypo.append(datum[3])
    # padding
    for datum in batch:
        padded_vec_pre = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        padded_vec_hypo = np.pad(np.array(datum[1]),
                                pad_width = ((0, MAX_LENGTH-datum[3])), 
                                mode = "constant", constant_values= 0)
        data_list_pre.append(padded_vec_pre)
        data_list_hypo.append(padded_vec_hypo)
#     ind_dec_order_pre = np.argsort(length_list_pre)[::-1]
#     ind_dec_order_hypo = np.argsort(length_list_hypo)[::-1]
#     data_list_pre = np.array(data_list_pre)[ind_dec_order_pre]
#     data_list_hypo = np.array(data_list_pre)[ind_dec_order_hypo]
#     length_list_pre = np.array(length_list_pre)[ind_dec_order_pre]
#     length_list_hypo = np.array(length_list_hypo)[ind_dec_order_hypo]
#     label_list = np.array(label_list)[ind_dec_order]
    return [torch.from_numpy(np.array(data_list_pre)), torch.from_numpy(np.array(data_list_hypo)),
            torch.LongTensor(length_list_pre), torch.LongTensor(length_list_hypo), torch.LongTensor(label_list)]


In [77]:
train_dataset = SNLIDataset(train_tuple)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = SNLIDataset(test_tuple)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

In [135]:
#CNN
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft).float(), freeze=True)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2, stride=2)
        self.linear1 = nn.Linear(100*2, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x1, x2, length1, length2):
        batch_size, seq_len1 = x1.size()
        _, seq_len2 = x2.size()

        embed1 = self.embedding(x1)
        hidden1 = self.conv1(embed1.transpose(1,2)).transpose(1, 2)
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size, seq_len1, hidden1.size(-1))

        hidden1 = self.conv2(hidden1.transpose(1,2)).transpose(1,2)
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size, seq_len1, hidden1.size(-1))
        hidden1 = self.pool(hidden1)
        hidden1 = torch.sum(hidden1, dim=1)
        
        embed2 = self.embedding(x2)
        hidden2 = self.conv1(embed2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size, seq_len2, hidden2.size(-1))
        hidden2 = self.conv2(hidden2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size, seq_len2, hidden2.size(-1))
        hidden2 = self.pool(hidden2)
        hidden2 = torch.sum(hidden2, dim=1)
    
        hidden = torch.cat((hidden1, hidden2), dim = 1)
        hidden = self.linear1(hidden)
        hidden = F.relu(hidden)
        logits = self.linear2(hidden)
        
        return logits

In [136]:
def test_model_cnn(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, data2, length1, length2, labels in loader:
        data1_batch, data2_batch, length1_batch, length2_batch, label_batch = data1, data2, length1, length2, labels
        outputs = F.softmax(model(data1_batch,data2_batch, length1_batch, length2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


model = CNN(emb_size=300, hidden_size=200, num_layers=2, num_classes=3)

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

cnn_train_loss_list = []
cnn_val_acc_list = []
for epoch in range(num_epochs):
    for i, (data1, data2, length1, length2, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data1, data2, length1, length2)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()
        cnn_train_loss_list.append(loss.item())
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model_cnn(val_loader, model)
            cnn_val_acc_list.append(val_acc)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

Epoch: [1/10], Step: [101/3125], Validation Acc: 40.2
Epoch: [1/10], Step: [201/3125], Validation Acc: 53.0
Epoch: [1/10], Step: [301/3125], Validation Acc: 54.3
Epoch: [1/10], Step: [401/3125], Validation Acc: 56.7
Epoch: [1/10], Step: [501/3125], Validation Acc: 57.4
Epoch: [1/10], Step: [601/3125], Validation Acc: 59.8
Epoch: [1/10], Step: [701/3125], Validation Acc: 57.1
Epoch: [1/10], Step: [801/3125], Validation Acc: 60.5
Epoch: [1/10], Step: [901/3125], Validation Acc: 57.6
Epoch: [1/10], Step: [1001/3125], Validation Acc: 60.4
Epoch: [1/10], Step: [1101/3125], Validation Acc: 60.6
Epoch: [1/10], Step: [1201/3125], Validation Acc: 59.4
Epoch: [1/10], Step: [1301/3125], Validation Acc: 61.2
Epoch: [1/10], Step: [1401/3125], Validation Acc: 61.6
Epoch: [1/10], Step: [1501/3125], Validation Acc: 61.1
Epoch: [1/10], Step: [1601/3125], Validation Acc: 63.4
Epoch: [1/10], Step: [1701/3125], Validation Acc: 61.8
Epoch: [1/10], Step: [1801/3125], Validation Acc: 62.6
Epoch: [1/10], Step

Epoch: [5/10], Step: [2701/3125], Validation Acc: 68.5
Epoch: [5/10], Step: [2801/3125], Validation Acc: 69.4
Epoch: [5/10], Step: [2901/3125], Validation Acc: 68.7
Epoch: [5/10], Step: [3001/3125], Validation Acc: 68.8
Epoch: [5/10], Step: [3101/3125], Validation Acc: 69.9
Epoch: [6/10], Step: [101/3125], Validation Acc: 70.8
Epoch: [6/10], Step: [201/3125], Validation Acc: 70.1
Epoch: [6/10], Step: [301/3125], Validation Acc: 70.1
Epoch: [6/10], Step: [401/3125], Validation Acc: 69.3
Epoch: [6/10], Step: [501/3125], Validation Acc: 68.8
Epoch: [6/10], Step: [601/3125], Validation Acc: 70.0
Epoch: [6/10], Step: [701/3125], Validation Acc: 70.2
Epoch: [6/10], Step: [801/3125], Validation Acc: 69.0
Epoch: [6/10], Step: [901/3125], Validation Acc: 70.0
Epoch: [6/10], Step: [1001/3125], Validation Acc: 70.4
Epoch: [6/10], Step: [1101/3125], Validation Acc: 69.8
Epoch: [6/10], Step: [1201/3125], Validation Acc: 69.6
Epoch: [6/10], Step: [1301/3125], Validation Acc: 71.0
Epoch: [6/10], Step

Epoch: [10/10], Step: [2201/3125], Validation Acc: 69.6
Epoch: [10/10], Step: [2301/3125], Validation Acc: 70.1
Epoch: [10/10], Step: [2401/3125], Validation Acc: 68.7
Epoch: [10/10], Step: [2501/3125], Validation Acc: 69.0
Epoch: [10/10], Step: [2601/3125], Validation Acc: 69.2
Epoch: [10/10], Step: [2701/3125], Validation Acc: 68.5
Epoch: [10/10], Step: [2801/3125], Validation Acc: 69.1
Epoch: [10/10], Step: [2901/3125], Validation Acc: 70.0
Epoch: [10/10], Step: [3001/3125], Validation Acc: 68.8
Epoch: [10/10], Step: [3101/3125], Validation Acc: 69.7


In [194]:
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, linear_size):
        super(RNN, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft).float(), freeze=True)
        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, batch_first=True)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=1, batch_first=False,bidirectional=True)
        self.linear1 = nn.Linear(hidden_size*2, linear_size)
        self.linear2 = nn.Linear(linear_size, num_classes)
        
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        if torch.cuda.is_available and torch.has_cudnn:
            hidden = torch.randn(self.num_layers, batch_size, self.hidden_size).cuda()
        else:
            hidden = torch.randn(self.num_layers, batch_size, self.hidden_size)
        return hidden
    
    def forward(self, x1, x2, length1, length2):
        # reset hidden state

        batch_size, seq_len1 = x1.size()
        _, seq_len2 = x2.size()

        self.hidden1 = self.init_hidden(batch_size)
        self.hidden2 = self.init_hidden(batch_size)
        _, idx_sort1 = torch.sort(length1, dim=0, descending=True)
        _, idx_sort2 = torch.sort(length2, dim=0, descending=True)
        
        #unsort ?
        _, idx_unsort1 = torch.sort(idx_sort1, dim=0)
        _, idx_unsort2 = torch.sort(idx_sort2, dim=0)

        # get embedding of characters
        embed1 = self.embedding(x1)
        embed2 = self.embedding(x2)
        
        # sort embedding
        embed1 = embed1.index_select(0, idx_sort1)
        embed2 = embed2.index_select(0, idx_sort2)
        sorted_len1 = length1[idx_sort1]
        sorted_len2 = length2[idx_sort2]
        
        # pack padded sequence
        embed1 = torch.nn.utils.rnn.pack_padded_sequence(embed1, sorted_len1.numpy(), batch_first=True)
        embed2 = torch.nn.utils.rnn.pack_padded_sequence(embed2, sorted_len2.numpy(), batch_first=True)
        # fprop though RNN
        rnn_out1, self.hidden1 = self.rnn(embed1, self.hidden1)
        rnn_out2, self.hidden2 = self.rnn(embed2, self.hidden2)
        
        # sum hidden activations of RNN across time
        rnn_out1 = torch.sum(self.hidden1, dim=0)
        rnn_out2 = torch.sum(self.hidden2, dim=0)
        
        #unsort
        output1 = rnn_out1.index_select(0, idx_unsort1)
        output2 = rnn_out2.index_select(0, idx_unsort2)
        
        #concat
        hidden3 = torch.cat((output1, output2), dim = 1)
        
        hidden3 = self.linear1(hidden3)
        hidden3 = F.relu(hidden3)
        logits = self.linear2(hidden3)

        return logits

In [None]:
def test_model_rnn(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, data2, length1, length2, labels in loader:
        data1_batch, data2_batch, length1_batch, length2_batch, label_batch = data1, data2, length1, length2, labels
        outputs = F.softmax(model(data1_batch,data2_batch, length1_batch, length2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

if torch.cuda.is_available and torch.has_cudnn:
    model = RNN(emb_size=300, hidden_size=200, num_layers=2, num_classes=3, linear_size=200).cuda()
else:
    model = RNN(emb_size=300, hidden_size=200, num_layers=2, num_classes=3, linear_size=200)

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

rnn_train_loss_list = []
rnn_val_acc_list = []
for epoch in range(num_epochs):
    for i, (data1, data2, length1, length2, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data1, data2, length1, length2)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()
        rnn_train_loss_list.append(loss.item())
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model_rnn(val_loader, model)
            rnn_val_acc_list.append(val_acc)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

Epoch: [1/10], Step: [101/3125], Validation Acc: 38.8
Epoch: [1/10], Step: [201/3125], Validation Acc: 46.8
Epoch: [1/10], Step: [301/3125], Validation Acc: 50.4
Epoch: [1/10], Step: [401/3125], Validation Acc: 47.5
Epoch: [1/10], Step: [501/3125], Validation Acc: 52.1
Epoch: [1/10], Step: [601/3125], Validation Acc: 51.4
Epoch: [1/10], Step: [701/3125], Validation Acc: 53.5
Epoch: [1/10], Step: [801/3125], Validation Acc: 52.8
Epoch: [1/10], Step: [901/3125], Validation Acc: 55.6
Epoch: [1/10], Step: [1001/3125], Validation Acc: 54.4
Epoch: [1/10], Step: [1101/3125], Validation Acc: 55.7
Epoch: [1/10], Step: [1201/3125], Validation Acc: 54.5
Epoch: [1/10], Step: [1301/3125], Validation Acc: 54.9
Epoch: [1/10], Step: [1401/3125], Validation Acc: 54.0
Epoch: [1/10], Step: [1501/3125], Validation Acc: 55.1
Epoch: [1/10], Step: [1601/3125], Validation Acc: 53.5
Epoch: [1/10], Step: [1701/3125], Validation Acc: 53.5
Epoch: [1/10], Step: [1801/3125], Validation Acc: 53.7
Epoch: [1/10], Step

Epoch: [5/10], Step: [2701/3125], Validation Acc: 65.4
Epoch: [5/10], Step: [2801/3125], Validation Acc: 64.0
Epoch: [5/10], Step: [2901/3125], Validation Acc: 63.1
Epoch: [5/10], Step: [3001/3125], Validation Acc: 63.7
Epoch: [5/10], Step: [3101/3125], Validation Acc: 63.5
Epoch: [6/10], Step: [101/3125], Validation Acc: 63.9
Epoch: [6/10], Step: [201/3125], Validation Acc: 63.0
Epoch: [6/10], Step: [301/3125], Validation Acc: 64.8
Epoch: [6/10], Step: [401/3125], Validation Acc: 62.5
Epoch: [6/10], Step: [501/3125], Validation Acc: 65.2
Epoch: [6/10], Step: [601/3125], Validation Acc: 65.3
Epoch: [6/10], Step: [701/3125], Validation Acc: 65.3
Epoch: [6/10], Step: [801/3125], Validation Acc: 65.9
Epoch: [6/10], Step: [901/3125], Validation Acc: 64.4
Epoch: [6/10], Step: [1001/3125], Validation Acc: 65.6
Epoch: [6/10], Step: [1101/3125], Validation Acc: 64.0
Epoch: [6/10], Step: [1201/3125], Validation Acc: 64.7
Epoch: [6/10], Step: [1301/3125], Validation Acc: 64.4
Epoch: [6/10], Step