In [1]:
from torch import nn, ones
from torch.autograd import Variable
from torchvision import models
from torch.nn.init import kaiming_normal
from torch import np
import torch
import torch.nn.functional as F
import random
import numpy as np

# Playing with embeddings - simple classifier

## Encoding functions

In [2]:
# Full label vocabulary: the two sequence delimiters followed by the 17 tags.
vocab = ['<BEGIN>','<STOP>','clear', 'cloudy', 'haze','partly_cloudy',
    'agriculture','artisinal_mine','bare_ground','blooming',
    'blow_down','conventional_mine','cultivation','habitation',
    'primary','road','selective_logging','slash_burn','water'
    ]

# Label -> integer index, following the order of `vocab`.
word_to_ix = { word: i for i, word in enumerate(vocab) }
print(word_to_ix)
# Label -> one-hot row. The identity matrix is sized from the vocabulary
# (instead of a hard-coded 19) and built once rather than once per label.
one_hot_mapping = dict(zip(vocab, np.eye(len(vocab))))

{'<BEGIN>': 0, '<STOP>': 1, 'clear': 2, 'cloudy': 3, 'haze': 4, 'partly_cloudy': 5, 'agriculture': 6, 'artisinal_mine': 7, 'bare_ground': 8, 'blooming': 9, 'blow_down': 10, 'conventional_mine': 11, 'cultivation': 12, 'habitation': 13, 'primary': 14, 'road': 15, 'selective_logging': 16, 'slash_burn': 17, 'water': 18}


In [3]:
def label_to_target(word_to_ix, label):
    """Encode label strings as a 1-D LongTensor Variable of vocabulary indices.

    Args:
        word_to_ix: mapping from label string to integer index.
        label: iterable of label strings; each must be a key of ``word_to_ix``.

    Returns:
        ``Variable`` wrapping a ``torch.LongTensor`` holding one index per
        label, in input order.

    Raises:
        KeyError: if a label is not present in ``word_to_ix``.
    """
    indices = [word_to_ix[word] for word in label]
    return Variable(torch.LongTensor(indices))

## Decoding functions

In [4]:
# Inverse of word_to_ix: integer index -> label string, used when decoding predictions.
ix_to_word = {v: k for k, v in word_to_ix.items()}

In [5]:
def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a single flat list."""
    return [item for sublist in l for item in sublist]

def predictions_to_label(ix_to_word, predictions):
    """Decode a batch of class scores into label strings.

    Args:
        ix_to_word: mapping from integer class index to label string.
        predictions: 2-D score tensor/Variable of shape (batch, num_classes).

    Returns:
        List with one label per row: the label whose score is highest.

    Raises:
        KeyError: if a predicted index is missing from ``ix_to_word``.
    """
    # Softmax is monotonic within a row, so the arg-max of the raw scores is
    # the arg-max of the probabilities; skipping it also avoids depending on
    # F.softmax's implicit-dimension behaviour.
    _, preds = torch.max(predictions.data, 1)
    # Depending on the torch version the indices come back shaped (batch, 1)
    # or (batch,); view(-1) normalises both to a flat sequence of ints.
    return [ix_to_word[ix] for ix in preds.contiguous().view(-1).tolist()]

## Batch creation

In [6]:
def gen_batch(n, vocab):
    """Sample ``n`` labels from ``vocab`` uniformly at random, with replacement.

    Args:
        n: number of labels to draw.
        vocab: non-empty sequence of labels to draw from.

    Returns:
        List of ``n`` labels (empty when ``n`` is 0).
    """
    return [random.choice(vocab) for _ in range(n)]

## Model Definition

In [7]:
class EmbeddingIdentity(nn.Module):
    """Toy model for testing weight sharing.

    A label is embedded, passed through an LSTM as a one-step sequence and
    decoded back to vocabulary scores by a linear layer whose weight matrix
    is the embedding matrix itself.

    Args:
        vocab: list of label strings; list position defines each label's index.
        repr_dim: embedding size, also used as the LSTM hidden size.
        num_rnn_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab, repr_dim, num_rnn_layers):
        super(EmbeddingIdentity, self).__init__()
        vocab_size = len(vocab)
        self.label_to_ix = dict((label, i) for i, label in enumerate(vocab))
        self.embeds = nn.Embedding(vocab_size, repr_dim)
        self.rnn = nn.LSTM(
            input_size=repr_dim,
            hidden_size=repr_dim,
            num_layers=num_rnn_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(repr_dim, vocab_size)

        # Weight tying: the decoder reuses the embedding matrix. Both are
        # (len(vocab), repr_dim), so the Parameter can be shared directly.
        self.fc.weight = self.embeds.weight

    def toVariable(self, x):
        """Map an iterable of label strings to a 1-D LongTensor Variable of indices."""
        indices = [self.label_to_ix[lbl] for lbl in x]
        return Variable(torch.LongTensor(indices))

    def forward(self, x, hidden=None):
        """Score every vocabulary entry for each input label.

        Args:
            x: list of label strings (the batch).
            hidden: optional initial LSTM state, passed straight to ``nn.LSTM``.

        Returns:
            Scores of shape (batch, len(vocab)). The final LSTM state is
            computed but not returned.
        """
        ids = self.toVariable(x)                    # (batch,)
        steps = self.embeds(ids).unsqueeze(1)       # (batch, 1, repr_dim): one time step each
        rnn_out, hidden = self.rnn(steps, hidden)   # (batch, 1, repr_dim)
        flat = rnn_out.contiguous().squeeze(1)      # (batch, repr_dim)
        return self.fc(flat)                        # (batch, len(vocab))
        

## Playground

In [8]:
# Build the model on the full vocab with 5-dimensional embeddings and a 2-layer LSTM.
model = EmbeddingIdentity(vocab,5, 2)

In [9]:
# Sanity check before any training: score a single label and decode the arg-max.
output = model(["slash_burn"])
predictions_to_label(ix_to_word, output)

['blow_down']

In [10]:
# Draw 10 random labels; they serve as both the model input and the target.
batch = gen_batch(10, vocab)

In [11]:
# Decoded predictions for the random batch (model not trained yet).
output = model(batch)
predictions_to_label(ix_to_word, output)

['blow_down',
 'road',
 'road',
 'road',
 'road',
 'road',
 'road',
 'blow_down',
 'road',
 'blow_down']

In [12]:
# The target is the input batch itself, encoded as class indices.
target = label_to_target(word_to_ix,batch)
batch

['conventional_mine',
 'clear',
 'partly_cloudy',
 'bare_ground',
 'partly_cloudy',
 'clear',
 'bare_ground',
 'road',
 'cultivation',
 'blow_down']

In [13]:
# Cross-entropy between the raw scores and the target indices.
loss = F.cross_entropy(output, target)

In [14]:
# Display the loss only; the backward pass is left disabled in this cell.
# loss.backward()
loss

Variable containing:
 2.8663
[torch.FloatTensor of size 1]

# Wrapping up for multi epoch training

In [15]:
# Loss and optimizer used to train the EmbeddingIdentity model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [16]:
# Train for 500 steps on freshly sampled labels. The target is the encoded
# input itself, so the network is fitted to reproduce its input label.
for epoch in range(500):
    X = gen_batch(500, vocab)           # 500 random labels per step
    y = label_to_target(word_to_ix,X)   # the same labels as class indices
    y_pred = model(X)
    loss = criterion(y_pred, y)
    # loss.data[0] pulls the scalar out of the 1-element loss tensor.
    print(epoch, loss.data[0])
    # Order matters: clear stale gradients, back-propagate, then apply the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 2.956784963607788
1 2.948615550994873
2 2.9271082878112793
3 2.9263296127319336
4 2.920184373855591
5 2.911189079284668
6 2.908919334411621
7 2.898547410964966
8 2.8742473125457764
9 2.8789680004119873
10 2.832608222961426
11 2.818241596221924
12 2.8162808418273926
13 2.827798366546631
14 2.7939224243164062
15 2.749051094055176
16 2.751620054244995
17 2.716921329498291
18 2.683095693588257
19 2.679623603820801
20 2.6578989028930664
21 2.6117377281188965
22 2.588899612426758
23 2.5576703548431396
24 2.558809995651245
25 2.5392045974731445
26 2.4743964672088623
27 2.456502676010132
28 2.4482438564300537
29 2.4007177352905273
30 2.37410044670105
31 2.3194262981414795
32 2.295959711074829
33 2.2708818912506104
34 2.2550437450408936
35 2.195267915725708
36 2.178271770477295
37 2.1516690254211426
38 2.1237802505493164
39 2.0647482872009277
40 2.022016763687134
41 1.9893934726715088
42 1.9478552341461182
43 1.9277807474136353
44 1.9139608144760132
45 1.8647056818008423
46 1.8194890022277832

363 0.06316755712032318
364 0.06349195539951324
365 0.06464570015668869
366 0.06357749551534653
367 0.062733955681324
368 0.06362716853618622
369 0.06219123676419258
370 0.06315423548221588
371 0.062006816267967224
372 0.061410821974277496
373 0.06166284903883934
374 0.06015673279762268
375 0.06161753460764885
376 0.06003640592098236
377 0.061790332198143005
378 0.06077992916107178
379 0.059719160199165344
380 0.060314472764730453
381 0.05991772189736366
382 0.0600455217063427
383 0.059739623218774796
384 0.060548458248376846
385 0.05752124264836311
386 0.0569709874689579
387 0.05664600431919098
388 0.056815046817064285
389 0.05699184909462929
390 0.05662521347403526
391 0.056607685983181
392 0.05597366392612457
393 0.057010751217603683
394 0.055294740945100784
395 0.056630298495292664
396 0.054974328726530075
397 0.05528252571821213
398 0.05522580444812775
399 0.05457230284810066
400 0.05341123789548874
401 0.05369950458407402
402 0.053978804498910904
403 0.053411927074193954
404 0.05

In [17]:
# New random batch of 30 labels for checking the trained model.
batch2 = gen_batch(30, vocab)

In [18]:
# Scores from the trained model for batch2.
output2 = model(batch2)

In [19]:
# Show the inputs so they can be compared with the decoded predictions.
batch2

['<STOP>',
 'haze',
 'conventional_mine',
 'conventional_mine',
 'bare_ground',
 'artisinal_mine',
 'bare_ground',
 'habitation',
 'cultivation',
 'blow_down',
 'selective_logging',
 '<STOP>',
 'blooming',
 'primary',
 'road',
 'blow_down',
 'blooming',
 '<STOP>',
 'partly_cloudy',
 '<BEGIN>',
 'blooming',
 'blow_down',
 'cloudy',
 'bare_ground',
 'primary',
 'primary',
 'cultivation',
 'agriculture',
 'partly_cloudy',
 'artisinal_mine']

In [20]:
# Decoded predictions for batch2; they should match the labels in batch2.
predictions_to_label(ix_to_word, output2)

['<STOP>',
 'haze',
 'conventional_mine',
 'conventional_mine',
 'bare_ground',
 'artisinal_mine',
 'bare_ground',
 'habitation',
 'cultivation',
 'blow_down',
 'selective_logging',
 '<STOP>',
 'blooming',
 'primary',
 'road',
 'blow_down',
 'blooming',
 '<STOP>',
 'partly_cloudy',
 '<BEGIN>',
 'blooming',
 'blow_down',
 'cloudy',
 'bare_ground',
 'primary',
 'primary',
 'cultivation',
 'agriculture',
 'partly_cloudy',
 'artisinal_mine']

# Sequence prediction

In [21]:
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [22]:
# Tags that may appear inside a sequence: the same labels as `vocab`, in the
# same order, minus the '<BEGIN>' / '<STOP>' delimiters. Keep the two lists in sync.
seq_vocab = ['clear', 'cloudy', 'haze','partly_cloudy',
    'agriculture','artisinal_mine','bare_ground','blooming',
    'blow_down','conventional_mine','cultivation','habitation',
    'primary','road','selective_logging','slash_burn','water'
    ]


In [23]:
def gen_batch_sequences(n, seq_vocab, max_tags=16):
    """Generate ``n`` random tag sequences delimited by '<BEGIN>' and '<STOP>'.

    For each sequence, between 1 and ``max_tags`` tags are drawn from
    ``seq_vocab`` with replacement; repeated tags are then dropped (first
    occurrence kept), so a sequence may hold fewer tags than were drawn.

    Args:
        n: number of sequences to generate.
        seq_vocab: non-empty sequence of tags to draw from.
        max_tags: upper bound (inclusive) on the number of draws per sequence.
            Defaults to 16, the value that used to be hard-coded.

    Returns:
        List of ``n`` lists of strings, each starting with '<BEGIN>' and
        ending with '<STOP>'.
    """
    batch = []
    for _ in range(n):
        n_draws = random.randint(1, max_tags)
        seq = ['<BEGIN>']
        seq.extend(random.choice(seq_vocab) for _draw in range(n_draws))
        seq.append('<STOP>')
        # dict.fromkeys removes duplicates while preserving first-seen order.
        batch.append(list(dict.fromkeys(seq)))
    return batch

In [24]:
# Ten random, variable-length tag sequences to experiment with.
batch_seq = gen_batch_sequences(10, seq_vocab)

In [25]:
# Inspect the generated sequences; their lengths differ.
batch_seq

[['<BEGIN>', 'clear', 'artisinal_mine', 'road', '<STOP>'],
 ['<BEGIN>',
  'primary',
  'blooming',
  'cloudy',
  'blow_down',
  'cultivation',
  'slash_burn',
  'road',
  '<STOP>'],
 ['<BEGIN>', 'blow_down', 'water', '<STOP>'],
 ['<BEGIN>',
  'primary',
  'blooming',
  'haze',
  'blow_down',
  'road',
  'cloudy',
  'habitation',
  'conventional_mine',
  '<STOP>'],
 ['<BEGIN>',
  'conventional_mine',
  'road',
  'water',
  'cultivation',
  'agriculture',
  'haze',
  'primary',
  'slash_burn',
  '<STOP>'],
 ['<BEGIN>', 'haze', 'road', '<STOP>'],
 ['<BEGIN>',
  'partly_cloudy',
  'cloudy',
  'slash_burn',
  'blooming',
  'road',
  'cultivation',
  'artisinal_mine',
  'clear',
  'blow_down',
  'water',
  '<STOP>'],
 ['<BEGIN>',
  'artisinal_mine',
  'blow_down',
  'conventional_mine',
  'cloudy',
  'slash_burn',
  'water',
  '<STOP>'],
 ['<BEGIN>',
  'partly_cloudy',
  'road',
  'selective_logging',
  'primary',
  'habitation',
  'clear',
  '<STOP>'],
 ['<BEGIN>', 'cultivation', 'primary',

In [26]:
def seqlabels_to_target(word_to_ix, list_seq):
    """Encode each label sequence as its own 1-D LongTensor of vocabulary indices.

    Args:
        word_to_ix: mapping from label string to integer index.
        list_seq: iterable of label sequences (each an iterable of strings).

    Returns:
        List of ``torch.LongTensor``, one per input sequence, in input order.
        Tensors are not padded, so their lengths may differ.

    Raises:
        KeyError: if a label is not present in ``word_to_ix``.
    """
    encoded = []
    for seq in list_seq:
        indices = [word_to_ix[label] for label in seq]
        encoded.append(torch.LongTensor(indices))
    return encoded

In [27]:
# Stand-alone embedding table for experimenting with sequences.
# NOTE: padding_idx=0 is also the index of '<BEGIN>' in word_to_ix, so the
# '<BEGIN>' token is embedded as the all-zero padding vector.
embeds = nn.Embedding(len(vocab), 5, padding_idx=0)

In [28]:
# One LongTensor of indices per sequence (unpadded, so lengths differ).
seq_tensors = seqlabels_to_target(word_to_ix, batch_seq)

In [29]:
# Indices of the second sequence, wrapped in a Variable for display.
Variable(seq_tensors[1])

Variable containing:
  0
 14
  9
  3
 10
 12
 17
 15
  1
[torch.LongTensor of size 9]

In [30]:
# Embedding lookup: one 5-dimensional row per index in the sequence.
embeds(Variable(seq_tensors[1]))

Variable containing:
 0.0000  0.0000  0.0000  0.0000  0.0000
-0.5254 -0.1480  0.9374 -1.9755 -0.1513
-0.5471 -1.0386  0.7887 -0.1657 -0.9696
 1.4302  1.0649  2.6553  1.7606  0.7829
 1.0784  1.1442  0.7852 -0.6828 -0.9015
 0.1928  0.2651 -0.3466 -0.9127  0.5378
 0.1592 -0.4425  0.0053  1.4737 -0.8462
-1.5880  0.7203 -0.0336  0.7798 -0.4408
-0.0589  1.1673  1.1468 -0.3766  0.6624
[torch.FloatTensor of size 9x5]

In [31]:
# Indices of the fourth sequence; its length differs from the second one.
Variable(seq_tensors[3])

Variable containing:
  0
 14
  9
  4
 10
 15
  3
 13
 11
  1
[torch.LongTensor of size 10]

In [32]:
# Same lookup for the fourth sequence: the row count follows the sequence length.
embeds(Variable(seq_tensors[3]))

Variable containing:
 0.0000  0.0000  0.0000  0.0000  0.0000
-0.5254 -0.1480  0.9374 -1.9755 -0.1513
-0.5471 -1.0386  0.7887 -0.1657 -0.9696
 2.2629 -0.9940  0.0123 -0.3606  0.8853
 1.0784  1.1442  0.7852 -0.6828 -0.9015
-1.5880  0.7203 -0.0336  0.7798 -0.4408
 1.4302  1.0649  2.6553  1.7606  0.7829
-1.4367 -0.9439 -0.1380 -1.5372  1.1991
 0.7725  1.8602 -0.2360 -0.3035  0.3017
-0.0589  1.1673  1.1468 -0.3766  0.6624
[torch.FloatTensor of size 10x5]

Working with variable-size input looks painful: it requires a custom data loader. I will therefore write one directly for the Amazon dataset.

In [33]:
from torch.utils.data.dataset import Dataset
from torchvision import transforms
import pandas as pd
import os

from torch import np, from_numpy # Numpy like wrapper

class TagsDataset(Dataset):
    """Dataset wrapping target labels for Kaggle - Planet Amazon from Space competition.

    Each item is the tag list of one CSV row, encoded as vocabulary indices
    and wrapped in '<BEGIN>' ... '<STOP>'.

    Arguments:
        csv_path: path to a CSV file with a ``tags`` column holding
            whitespace-separated labels.
        vocab_mapping: dict mapping every label, plus '<BEGIN>' and '<STOP>',
            to its integer index.
    """

    def __init__(self, csv_path, vocab_mapping):
        self.df = pd.read_csv(csv_path)
        self.vocab_mapping = vocab_mapping

        # One list of label strings per row.
        self.tags = self.df['tags'].str.split()

    def __getitem__(self, index):
        """Return ``(input, target)`` for row ``index``; both are the same tensor.

        The tensor is a 1-D LongTensor: '<BEGIN>', the row's tags, '<STOP>'.

        Raises:
            KeyError: if a tag is missing from ``vocab_mapping``.
        """
        vocab = self.vocab_mapping
        indices = [vocab['<BEGIN>']]
        indices.extend(vocab[tag] for tag in self.tags[index])
        indices.append(vocab['<STOP>'])

        # Indices are integers: build a LongTensor rather than the float
        # tensor that torch.Tensor(...) would create.
        tags = torch.LongTensor(indices)

        return tags, tags

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.df.index)

    def collate_fn(self, data):
        """Creates mini-batch tensors for tags with variable size

        Args:
            data: list of tuple (input, target).
                - input: torch tensor of shape (?); variable length.
                - target: torch tensor of same shape (?); variable length.
        Returns:
            inputs: torch tensor of shape (batch_size, padded_length).
            targets: torch tensor of shape (batch_size, padded_length);
                the same tensor object as ``inputs``.
            lengths: list; valid length for each padded tags, in descending order.
        """
        # Order by target length (descending) on a copy, so the caller's
        # list is left untouched.
        ordered = sorted(data, key=lambda pair: len(pair[1]), reverse=True)
        _, tags = zip(*ordered)

        # Merge tags (from tuple of 1D tensor to 2D tensor), padding with 0.
        # Use `lengths` to tell padding apart from a real index 0.
        lengths = [len(tag) for tag in tags]
        targets = torch.zeros(len(tags), max(lengths)).long()
        for i, tag in enumerate(tags):
            end = lengths[i]
            targets[i, :end] = tag[:end]
        return targets, targets, lengths


In [34]:
# Tag sequences from the training CSV, encoded with the word_to_ix vocabulary.
X_train = TagsDataset('./data/train.csv',word_to_ix)

In [35]:
# Shuffled batches of 100 sequences; the dataset's own collate_fn pads each
# batch and returns (inputs, targets, lengths).
train_loader = torch.utils.data.DataLoader(dataset=X_train, 
                                              batch_size=100,
                                              shuffle=True,
                                              num_workers=1,
                                              collate_fn=X_train.collate_fn)

In [82]:
class SeqPred(nn.Module):
    """ Testing weight sharing + Variable Length sequence

    Embedding -> LSTM -> linear decoder, where the decoder reuses the
    embedding matrix as its weight.

    Args:
        vocab_size: number of distinct tag indices.
        embed_dim: embedding size, also used as the LSTM hidden size.
        num_rnn_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, num_rnn_layers):
        super(SeqPred, self).__init__()

        self.vocab_size = vocab_size
        self.embeds = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, embed_dim, num_rnn_layers, batch_first = True)
        self.fc = nn.Linear(embed_dim, vocab_size)

        # link embedding and decoding weight (both are vocab_size x embed_dim)
        self.fc.weight = self.embeds.weight

    def forward(self, tags, lengths, hidden=None):
        """Score the vocabulary at every non-padded position of a padded batch.

        Args:
            tags: (batch, padded_length) LongTensor/Variable of tag indices.
            lengths: valid length of each row, as required by
                ``pack_padded_sequence`` (sorted in descending order).
            hidden: optional initial LSTM state.

        Returns:
            (scores, hidden): ``scores`` has one row per non-padded position,
            in packed-sequence order, with ``vocab_size`` columns; ``hidden``
            is the final LSTM state.
        """
        embed = self.embeds(tags)
        packed = pack_padded_sequence(embed, lengths, batch_first=True)
        out, hidden = self.rnn(packed, hidden)
        # The LSTM returns a PackedSequence; decode its flat `data` field
        # (named access instead of the positional out[0]).
        out = self.fc(out.data)
        return out, hidden

    def genTags(self, inputs, states, max_len=None):
        """Greedily generate tag indices, feeding each prediction back as input.

        Decoding always runs for ``max_len`` steps; it does not stop early
        when a '<STOP>' index is produced.

        Args:
            inputs: (batch, 1) LongTensor/Variable holding the start index.
            states: initial LSTM state ``(h, c)``.
            max_len: number of decoding steps; defaults to ``vocab_size``.

        Returns:
            Predicted indices of shape (batch, max_len) with singleton
            dimensions squeezed out, e.g. (max_len,) for a batch of one.
        """
        if max_len is None:
            max_len = self.vocab_size
        tag_ids = []
        inputs = self.embeds(inputs)                     # (batch_size, 1, embed_dim)
        for _ in range(max_len):                         # maximum sampling length
            hiddens, states = self.rnn(inputs, states)   # (batch_size, 1, hidden_size)
            outputs = self.fc(hiddens.squeeze(1))        # (batch_size, vocab_size)
            # max(1) yields (batch_size, 1) or (batch_size,) indices depending
            # on the torch version; force the column shape the next step needs.
            predicted = outputs.max(1)[1].view(-1, 1)
            tag_ids.append(predicted)
            inputs = self.embeds(predicted)              # (batch_size, 1, embed_dim)
        tag_ids = torch.cat(tag_ids, 1)                  # (batch_size, max_len)
        return tag_ids.squeeze()

In [83]:
# Rebinds `model`, `criterion` and `optimizer` for the sequence model:
# 19 vocabulary entries, 5-dimensional embeddings, 2 LSTM layers.
model = SeqPred(19, 5, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [84]:
# Single pass over train_loader; `epoch` is only used in the progress message.
# NOTE(review): the saved run of this cell ended with
# "RuntimeError: expected a Variable argument, but got tuple"; the failing line
# is not part of the saved output, so the cause still needs to be confirmed.
epoch =0
for batch_idx, (data, target, lengths) in enumerate(train_loader):
    # collate_fn returns the same padded tensor as both input and target.
    data = Variable(data)
    target = Variable(target)
    # Pack the padded targets and keep the flat data so they line up row for
    # row with the packed scores returned by the model.
    targets = pack_padded_sequence(target, lengths, batch_first=True)[0]
    
    model.zero_grad()
    
    outputs, _ = model(data,lengths)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    
    if batch_idx % 100 == 0:
        # len(train_loader) * len(data) is batches x current batch size, i.e.
        # only an estimate of the dataset size when the last batch is smaller.
        print('Train Epoch: {:03d} [{:05d}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader) * len(data),
            100. * batch_idx / len(train_loader), loss.data[0]))

RuntimeError: expected a Variable argument, but got tuple

In [52]:
# Set initial states
# Zero initial (h, c) LSTM state shaped (2, 1, 5): 2 layers, batch of 1,
# hidden size 5, matching SeqPred(19, 5, 2).
state = (Variable(torch.zeros(2, 1, 5)),
             Variable(torch.zeros(2, 1, 5)))

In [72]:
# Random start index: a uniform sample in [0, 1) scaled by 19 and truncated to
# an integer in 0..18, shaped (1, 1).
# NOTE(review): volatile=True presumably marks the Variable as inference-only
# in the torch version used here; confirm before porting to newer releases.
start = Variable(torch.rand(1, 1).mul(19).long(), volatile=True)
start

Variable containing:
 12
[torch.LongTensor of size 1x1]

In [73]:
# Start token fixed to index 0, which is '<BEGIN>' in word_to_ix; shape (1, 1).
start0 = Variable(torch.zeros(1, 1).long(), volatile=True)
start0

Variable containing:
 0
[torch.LongTensor of size 1x1]

In [75]:
# Greedy decoding from index 0 with the zero initial state.
model.genTags(start0,state)

Variable containing:
 0
 0
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
 6
[torch.LongTensor of size 19]