In [2]:
import json
import torch
from rnn.dataset import Dictionary, MyDataset
import pandas as pd
import random
import torch.autograd as autograd
import torch.nn as nn
import torch.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [15]:
class Const:
    """Tunable constants for vocabulary building, sample truncation and batching."""

    # Batch sizes handed to the training / validation DataLoaders.
    BATCH_SIZE = 32
    VAL_BATCH_SIZE = 32

    # Passed to MyDataset as its length argument.
    MAX_SAMPLE_LENGTH = 30

    # Passed to Dictionary as `max_vocab_size`.
    MAX_VOCAB_SIZE = 10000
const = Const()

# Load dataset, transform word to id
# NOTE(review): the extension is spelled `.cvs` (not `.csv`) — confirm it matches the file on disk.
all_samples = pd.read_csv('./data/lstm.cvs').to_dict('records')
all_text = []
for sample in all_samples:
    if isinstance(sample['texts'], str):
        # Whitespace tokenisation: one token per single-space-separated chunk.
        sample['processed_text'] = sample['texts'].split(' ')
    else:
        # Non-string value (e.g. NaN for an empty cell): treat the empty tweet
        # as a single placeholder token so every sample has at least one word.
        sample['processed_text'] = ['null']
    all_text.extend(sample['processed_text'])
dictionary = Dictionary(all_text, max_vocab_size=const.MAX_VOCAB_SIZE)

# Attach the id sequence for each sample's tokens.
for sample in all_samples:
    sample['processed_id'] = [dictionary.to_idx(word) for word in sample['processed_text']]

# train & test split (80% / 20%)
# NOTE(review): no random seed is set in the visible cells, so the split differs between runs.
random.shuffle(all_samples)
split_idx = int(len(all_samples) * 0.8)
train_samples = all_samples[:split_idx]
test_samples = all_samples[split_idx:]

train_dataset = MyDataset(train_samples, dictionary.to_idx('_unk'), const.MAX_SAMPLE_LENGTH)
# Validation must use the held-out samples; building it from `train_samples`
# would report training-set performance as validation performance.
val_dataset = MyDataset(test_samples, dictionary.to_idx('_unk'), const.MAX_SAMPLE_LENGTH)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=const.BATCH_SIZE, shuffle=True, num_workers=4)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=const.VAL_BATCH_SIZE, shuffle=False, num_workers=4)

In [16]:
# Pull one batch for interactive inspection. Use the builtin `next()`:
# the Python 2 style `.next()` method is not available on current
# DataLoader iterators.
sample = next(iter(train_dataloader))

In [80]:
class LSTMClassifier(nn.Module):
    """Recurrent two-class classifier over sequences of word ids.

    Word ids are embedded, run through an LSTM / GRU / plain RNN, and the final
    hidden state of the last layer is (optionally concatenated with two scalar
    features and then) projected to two logits.

    Args:
        max_vocab_size: number of rows in the embedding table.
        embed_dim: embedding size, also the RNN input size.
        hidden_dim: RNN hidden size.
        dropout_rate: dropout applied to the final hidden state before the
            output layer, and between stacked RNN layers when `num_layers > 1`.
        num_layers: number of stacked recurrent layers.
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        pretrained_embeddings: optional numpy array copied into the embedding
            weights.
        concat_feature: when truthy, `retweet_count` and `favorite_count` are
            appended to the hidden state before the output layer.

    Raises:
        ValueError: if `rnn_type` is not one of the supported options.
    """

    def __init__(self, max_vocab_size, embed_dim, hidden_dim,
                 dropout_rate=0.5, num_layers=1, rnn_type='LSTM', pretrained_embeddings=None, concat_feature=False):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(max_vocab_size, embed_dim)
        self.rnn_type = rnn_type

        # The RNN modules apply dropout only between stacked layers; passing a
        # non-zero value with a single layer has no effect and only triggers a
        # warning, so it is disabled in that case.
        rnn_dropout = dropout_rate if num_layers > 1 else 0.0

        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(embed_dim, hidden_dim, num_layers, dropout=rnn_dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError(
                    "An invalid option for `rnn_type` was supplied, "
                    "options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']"
                )
            self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers, nonlinearity=nonlinearity, dropout=rnn_dropout)
        self.output_dropout = nn.Dropout(dropout_rate)

        # Normalise once so the output-layer width and the branch in `forward`
        # always agree (e.g. for concat_feature=0 or 1).
        self.concat_feature = bool(concat_feature)
        # 2-classes classification; two extra inputs when the scalar features
        # (retweet_count, favorite_count) are concatenated.
        num_extra_features = 2 if self.concat_feature else 0
        self.hidden2out = nn.Linear(hidden_dim + num_extra_features, 2)

        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def forward(self, sample):
        '''
        Input: sample
            key: word_id, length (and retweet_count, favorite_count when
                 concat_feature is enabled)
        Output: output
            key: logit
        '''
        # batch_size x length x embedding_size
        embeds = self.embedding(sample['word_id'])
        # enforce_sorted=False lets the batch arrive in any length order; the
        # returned hidden state is in the original batch order.
        packed_input = pack_padded_sequence(embeds, sample['length'], batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed_input)

        if self.rnn_type == 'LSTM':
            # LSTM returns (h_n, c_n); keep only the hidden state h_n.
            hidden = hidden[0]
        # Final hidden state of the last layer: batch_size x hidden_dim.
        hidden = hidden[-1]

        if self.concat_feature:
            batch_size = hidden.shape[0]
            # Cast to the hidden state's dtype so integer / float64 counts can
            # be concatenated and fed to the linear layer.
            extra_features = [
                sample[key].reshape(batch_size, 1).to(dtype=hidden.dtype)
                for key in ('retweet_count', 'favorite_count')
            ]
            hidden = torch.cat([hidden] + extra_features, dim=1)

        logit = self.hidden2out(self.output_dropout(hidden))
        return {'logit': logit}

    def cal_loss(self, sample, output):
        '''
        Input: sample (key: label), output (key: logit)
        Output: loss (scalar cross-entropy)
        '''
        return self.cross_entropy_loss(output['logit'], sample['label'])

In [87]:
# Scratch check of the final hidden state's shape.
# NOTE(review): `hidden` is not assigned at top level in any cell visible here
# (it only exists as a local inside LSTMClassifier.forward), so this cell
# presumably relies on leftover kernel state and would fail on a fresh
# Restart & Run All — confirm or remove.
hidden.shape

torch.Size([32, 128])

In [95]:
# Scratch check: append the two count features as extra columns (dim=1) to
# `hidden` and inspect the resulting shape, mirroring the concat_feature branch
# of LSTMClassifier.forward.
# NOTE(review): depends on a top-level `hidden` that no visible cell defines —
# presumably leftover kernel state; confirm or remove.
torch.cat(
    [
        hidden,
        sample['retweet_count'].reshape(hidden.shape[0], 1),
        sample['favorite_count'].reshape(hidden.shape[0], 1),
    ],
    dim=1
).shape


torch.Size([32, 130])

tensor(0.6899, grad_fn=<NllLossBackward>)

PackedSequence(data=tensor([[ 0.0405,  0.0625, -0.0175,  ..., -0.1562,  0.0019, -0.0698],
        [-0.1963,  0.1338,  0.1191,  ..., -0.0339,  0.1650, -0.0830],
        [ 0.0239, -0.0461,  0.0039,  ..., -0.2695, -0.0688, -0.2754],
        ...,
        [-0.0315,  0.2637,  0.1001,  ..., -0.1992,  0.1162, -0.0752],
        [-0.0030,  0.0762,  0.0515,  ..., -0.0654,  0.0430, -0.0286],
        [-0.3594,  0.0020, -0.0557,  ..., -0.4785,  0.0081,  0.1816]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([32, 31, 29, 27, 26, 26, 23, 20, 19, 16, 12, 10,  6,  5,  4,  3,  3,  3,
         3,  3,  3,  3,  2,  2,  2,  1]), sorted_indices=tensor([16,  2,  0, 14, 19, 15,  7, 17, 27, 13,  9, 12, 25, 24, 29, 30,  4,  5,
        22, 26, 23, 20,  3, 21, 18,  8, 10,  1, 28,  6, 31, 11]), unsorted_indices=tensor([ 2, 27,  1, 22, 16, 17, 29,  6, 25, 10, 26, 31, 11,  9,  3,  5,  0,  7,
        24,  4, 21, 23, 18, 20, 13, 12, 19,  8, 28, 14, 15, 30]))