In [1]:
import torch
import numpy as np
import pandas as pd
pd.set_option("display.width", 380)
pd.set_option('max_colwidth', 100)

from IPython.display import display

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the sentences

In [2]:
df = pd.read_csv("./sick_train/SICK_train.txt", sep="\t")
df = df.drop(['relatedness_score'], axis=1)

In [3]:
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

In [4]:
from gensim import corpora

In [5]:
class SickDataset(Dataset):
    endOfSentence   = '</s>'
    startOfSentence = '<s>'
    separator2Sentences = '<sep>'
    
    tokens = [startOfSentence, separator2Sentences, endOfSentence]
    
    def join_sentence(self, row):
        """
        Create a new sentence (<s> + s_A + <sep> + s_B + </s>)
        """
        sentence_a = row['sentence_A'].split(" ")
        sentence_b = row['sentence_B'].split(" ")
        return np.concatenate((
            [self.startOfSentence],
            sentence_a,
            [self.separator2Sentences],
            sentence_b,
            [self.endOfSentence]
        ))
    
    def series_text_2_labelID(self, series, keep_n=1000):
        """
        Convert text Label into label id
        """
        return series.map({"NEUTRAL": 0, "ENTAILMENT": 1, "CONTRADICTION": 2})
    
    def series_2_dict(self, series, keep_n):
        """
        Convert document (a list of words) into a list of indexes
        """
        dictionary = corpora.Dictionary(series)
        dictionary.filter_extremes(
            no_below=1,
            no_above=1,
            keep_n=keep_n,
            keep_tokens=self.tokens)
        return dictionary
    
    
    def __init__(self, df, vocabulary_size):
        # Label text as ids
        df["entailment_id"] = self.series_text_2_labelID(df['entailment_judgment'])
        
        # Add <s>,</s>,<sep> tokens to the vocabulary
        df['sentence_AB'] = df.apply(self.join_sentence, axis=1)
        
        # Create the Dictionary
        self.dictionary = self.series_2_dict(df['sentence_AB'], vocabulary_size)
        
        # sentence of words -> array of idx
        # Adds unknown to the voc, Dictionary size vocabulary_size+1
        df["word_idx"] = df["sentence_AB"].apply(
            lambda x: np.array(self.dictionary.doc2idx(x, unknown_word_index=vocabulary_size))
        )
        
        self.df = df
        
    def getRef(self, index):
        return df['sentence_AB'][index]
        
    def __getitem__(self, index):
        return (
            df['word_idx'][index],
            df['entailment_id'][index])
    
    def __len__(self):
        return len(self.df)

vocabulary_size = 1000
sick_dataset = SickDataset(df, vocabulary_size)
sick_dataset.df.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,entailment_judgment,entailment_id,sentence_AB,word_idx
0,1,A group of kids is playing in a yard and an old man is standing in the background,A group of boys in a yard is playing and a man is standing in the background,NEUTRAL,0,"[<s>, A, group, of, kids, is, playing, in, a, yard, and, an, old, man, is, standing, in, the, ba...","[1, 3, 9, 14, 12, 11, 16, 10, 4, 19, 6, 5, 15, 13, 11, 17, 10, 18, 7, 2, 3, 9, 14, 8, 10, 4, 19,..."
1,2,A group of children is playing in the house and there is no man standing in the background,A group of kids is playing in a yard and an old man is standing in the background,NEUTRAL,0,"[<s>, A, group, of, children, is, playing, in, the, house, and, there, is, no, man, standing, in...","[1, 3, 9, 14, 20, 11, 16, 10, 18, 21, 6, 23, 11, 22, 13, 17, 10, 18, 7, 2, 3, 9, 14, 12, 11, 16,..."
2,3,The young boys are playing outdoors and the man is smiling nearby,The kids are playing outdoors near a man with a smile,ENTAILMENT,1,"[<s>, The, young, boys, are, playing, outdoors, and, the, man, is, smiling, nearby, <sep>, The, ...","[1, 24, 31, 8, 25, 16, 27, 6, 18, 13, 11, 29, 1000, 2, 24, 12, 25, 16, 27, 26, 4, 13, 30, 4, 28, 0]"
3,5,The kids are playing outdoors near a man with a smile,A group of kids is playing in a yard and an old man is standing in the background,NEUTRAL,0,"[<s>, The, kids, are, playing, outdoors, near, a, man, with, a, smile, <sep>, A, group, of, kids...","[1, 24, 12, 25, 16, 27, 26, 4, 13, 30, 4, 28, 2, 3, 9, 14, 12, 11, 16, 10, 4, 19, 6, 5, 15, 13, ..."
4,9,The young boys are playing outdoors and the man is smiling nearby,A group of kids is playing in a yard and an old man is standing in the background,NEUTRAL,0,"[<s>, The, young, boys, are, playing, outdoors, and, the, man, is, smiling, nearby, <sep>, A, gr...","[1, 24, 31, 8, 25, 16, 27, 6, 18, 13, 11, 29, 1000, 2, 3, 9, 14, 12, 11, 16, 10, 4, 19, 6, 5, 15..."


In [6]:
pd.DataFrame(list(zip(sick_dataset.getRef(2), sick_dataset[2][0]))).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,<s>,The,young,boys,are,playing,outdoors,and,the,man,...,are,playing,outdoors,near,a,man,with,a,smile,</s>
1,1,24,31,8,25,16,27,6,18,13,...,25,16,27,26,4,13,30,4,28,0


In [7]:
from torch import nn

In [8]:
class RNNClassifier(nn.Module):
    # Our model

    def __init__(self, input_voc_size, embedding_size, hidden_size):
        super(RNNClassifier, self).__init__()
        
        self.input_voc_size = input_voc_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        
        self.num_classes = 3
        
        # Add the unknown token
        self.embedding = nn.Embedding(input_voc_size+1, embedding_size)
        self.rnn = nn.RNN(
              input_size=embedding_size,
              hidden_size=hidden_size,
              batch_first=True)
        self.fc = nn.Linear(hidden_size, self.num_classes)
        
    def forward(self, x, verbose=False):
        # Initialize hidden and cell states
        # (num_layers * num_directions, batch, hidden_size)
        h_0 = torch.zeros(1, 1, self.hidden_size)

        if verbose:
            print("  input", x.size())
        emb = self.embedding(x)
        emb = emb.view(1, x.size(0), -1)
        if verbose:
            print("  embedding", emb.size())

        # Propagate embedding through RNN
        # Input: (batch, seq_len, embedding_size)
        # h_0: (num_layers * num_directions, batch, hidden_size)
        out, hidden = self.rnn(emb, h_0)
        if verbose:
            print("  rnn_out", out.size())
        return self.fc(hidden)
    
rnn = RNNClassifier(vocabulary_size, 200, 5)
print(rnn)

# Set loss and optimizer function
# CrossEntropyLoss = LogSoftmax + NLLLoss
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.1)

RNNClassifier(
  (embedding): Embedding(1001, 200)
  (rnn): RNN(200, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=3, bias=True)
)


In [9]:
inputEx = torch.tensor(sick_dataset[2][0])
print(inputEx)
rnn(inputEx, verbose=True)

tensor([   1,   24,   31,    8,   25,   16,   27,    6,   18,   13,   11,   29,
        1000,    2,   24,   12,   25,   16,   27,   26,    4,   13,   30,    4,
          28,    0])
  input torch.Size([26])
  embedding torch.Size([1, 26, 200])
  rnn_out torch.Size([1, 26, 5])


tensor([[[-0.0831, -0.5896, -0.6353]]], grad_fn=<ThAddBackward>)

In [10]:
from torch.utils.data import DataLoader

In [11]:
# TODO, use this insted
train_loader = DataLoader(dataset=sick_dataset,
                          batch_size=1, shuffle=True)

In [12]:
# Train the model
rnn = rnn
total_loss = 0
for epoch in range(1): # need to use DataLoader for more epochs
    for i, (idx_sentence, target) in enumerate(sick_dataset):
        idx_sentence = torch.tensor(idx_sentence)
        target = torch.tensor([np.long(target)])
        
        output = rnn(idx_sentence)

        loss = criterion(output[0], target)
        total_loss += loss.data[0]

        rnn.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 201 == 0:
            print('Train Epoch: {} [{:5}/{} ({:.0f}%)]\tLoss: {:.2f}'.format(
                epoch,  i , len(sick_dataset),
                100. * i  / len(train_loader),
                total_loss / i ))

print("Learning finished!")

  if sys.path[0] == '':




KeyError: 4500