In [1]:
# load data
import numpy as np
from data_util import DATA

# Hyper parameters forwarded to the project-local DATA helper.
# vocab_size is also read by the model cells, which size their embedding
# tables as vocab_size + 2.
vocab_size = 30000
# NOTE(review): the meaning of N and topk is defined inside data_util.DATA and
# is not visible from this notebook -- confirm there before changing them.
N = 30
topk = 10

data = DATA(vocab_size=vocab_size, N=N, topk=topk)
# x, x_len and y are numpy arrays; the next cell converts them with torch.from_numpy.
x, x_len, y = data.load_data()
# Pretrained embedding matrix, copied into each model's nn.Embedding layer.
pretrained_weight = data.load_embed()

---------- load preprocessed data ----------
x: (4664, 60, 10) x_len: (4664,) y: (4664,)
---------- load saved pretrained embedding weight ----------
vocab_embed: (30002, 200)


In [18]:
import torch
import torch.utils.data as Data
from torch.utils.data.sampler import SubsetRandomSampler

# Hyper parameters
batch_size = 32     # mini-batch size of train_loader
test_ratio = 0.2    # fraction of the samples held out as the test subset
embed_size = 200    # embedding width used by the model cells (the saved pretrained matrix is printed as 200 columns wide)
hidden_size = 64    # recurrent hidden width used by the model cells

# Random seed: fix torch's CPU and CUDA generators so runs are repeatable.
seed = 2018
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Convert the numpy arrays loaded earlier into int64 (Long) tensors,
# then separate the samples into train/test subsets.
x_torch = torch.from_numpy(x).type(torch.LongTensor)
y_torch = torch.from_numpy(y).type(torch.LongTensor)
x_len_torch = torch.from_numpy(x_len).type(torch.LongTensor)

data_num = x_torch.shape[0]
split = int(data_num*test_ratio)    # number of held-out samples
indices = list(range(data_num))

# Seed numpy separately: the test indices are drawn with numpy, not torch.
np.random.seed(seed)
test_idx = np.random.choice(indices, size=split, replace=False)
# print(test_idx)  # debug: inspect the sampled test indices
# Everything that was not drawn for the test subset is used for training.
train_idx = list(set(indices) - set(test_idx))

train_sampler = SubsetRandomSampler(train_idx)
test_sampler = SubsetRandomSampler(test_idx)

# Both loaders share one dataset; the samplers restrict each loader to its subset.
torch_dataset = Data.TensorDataset(x_torch, y_torch, x_len_torch)
train_loader = torch.utils.data.DataLoader(torch_dataset, 
                batch_size=batch_size, sampler=train_sampler)
# batch_size=len(test_idx) makes the whole test subset arrive as a single batch.
test_loader = torch.utils.data.DataLoader(torch_dataset, 
                batch_size=len(test_idx), sampler=test_sampler)

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as transforms
from torch.nn import utils as nn_utils

class LSTM(nn.Module):
    """LSTM classifier that maps a sequence of token groups to two class logits.

    Every time step carries several token ids. Their embeddings are averaged
    into one vector per step, the padded batch is packed by true sequence
    length, and the final hidden state of the single LSTM layer is fed to a
    linear layer with two outputs.

    Args:
        num_embeddings: Number of rows of the embedding table. Defaults to the
            notebook-level ``vocab_size + 2``.
        embedding_dim: Width of each embedding vector. Defaults to the
            notebook-level ``embed_size``.
        hidden_dim: Size of the LSTM hidden state. Defaults to the
            notebook-level ``hidden_size``.
        embedding_weight: numpy array or tensor of shape
            ``(num_embeddings, embedding_dim)`` copied into the embedding
            layer. Defaults to the notebook-level ``pretrained_weight``.
    """

    def __init__(self, num_embeddings=None, embedding_dim=None, hidden_dim=None,
                 embedding_weight=None):
        super(LSTM, self).__init__()
        # Fall back to the notebook globals so that a bare ``LSTM()`` keeps working.
        if num_embeddings is None:
            num_embeddings = vocab_size + 2
        if embedding_dim is None:
            embedding_dim = embed_size
        if hidden_dim is None:
            hidden_dim = hidden_size
        if embedding_weight is None:
            embedding_weight = pretrained_weight
        if not torch.is_tensor(embedding_weight):
            embedding_weight = torch.from_numpy(embedding_weight)

        self.embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        # NOTE(review): this copy also overwrites row 0, which padding_idx=0
        # initialised to zeros -- confirm the pretrained matrix keeps that row zero.
        self.embed.weight.data.copy_(embedding_weight)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,   # rnn hidden unit
            num_layers=1,             # number of rnn layers
            batch_first=True,         # tensors are laid out as (batch, time_step, input_size)
        )

        self.out = nn.Linear(hidden_dim, 2)

    def forward(self, x, length_list):
        """Compute class logits for a padded batch.

        Args:
            x: Integer tensor of token ids with three dimensions
                ``(batch, time_step, tokens_per_step)``.
            length_list: 1-D integer tensor with the number of valid time
                steps of each sample; every entry must be at least 1.

        Returns:
            Tensor of shape ``(batch, 2)`` whose rows follow the input order.
        """
        x_embed = self.embed(x)
        # Average the token embeddings of each time step (max-pooling over the
        # same axis is the alternative that was tried in this notebook).
        x = torch.mean(x_embed, dim=2)

        # pack_padded_sequence wants the batch ordered by decreasing length:
        # sort here and keep the inverse permutation to undo it afterwards.
        _, idx_sort = torch.sort(length_list, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        # Plain Python ints are the documented form for the lengths argument.
        length_list_sort = length_list[idx_sort].tolist()
        x = x.index_select(0, idx_sort)
        x_pack = nn_utils.rnn.pack_padded_sequence(x, length_list_sort, batch_first=True)

        # No initial state is passed, so the LSTM starts from zeros.
        _, (h_n, _) = self.lstm(x_pack)
        # h_n[0] is the final hidden state of the only layer; restore input order.
        h_n = h_n[0][idx_unsort]
        return self.out(h_n)

# Build the LSTM classifier from the notebook-level settings and show its layers.
lstm = LSTM()
print(lstm)
# r_out = lstm(t1,t3)  # leftover manual forward-pass check; t1 and t3 are not defined in this notebook

LSTM(
  (embed): Embedding(30002, 200, padding_idx=0)
  (lstm): LSTM(200, 64, batch_first=True)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


In [15]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as transforms
from torch.nn import utils as nn_utils

class GRU(nn.Module):
    """GRU classifier that maps a sequence of token groups to two class logits.

    Every time step carries several token ids. Their embeddings are averaged
    into one vector per step, the padded batch is packed by true sequence
    length, and the final hidden state of the single GRU layer is fed to a
    linear layer with two outputs.

    Args:
        num_embeddings: Number of rows of the embedding table. Defaults to the
            notebook-level ``vocab_size + 2``.
        embedding_dim: Width of each embedding vector. Defaults to the
            notebook-level ``embed_size``.
        hidden_dim: Size of the GRU hidden state. Defaults to the
            notebook-level ``hidden_size``.
        embedding_weight: numpy array or tensor of shape
            ``(num_embeddings, embedding_dim)`` copied into the embedding
            layer. Defaults to the notebook-level ``pretrained_weight``.
    """

    def __init__(self, num_embeddings=None, embedding_dim=None, hidden_dim=None,
                 embedding_weight=None):
        super(GRU, self).__init__()
        # Fall back to the notebook globals so that a bare ``GRU()`` keeps working.
        if num_embeddings is None:
            num_embeddings = vocab_size + 2
        if embedding_dim is None:
            embedding_dim = embed_size
        if hidden_dim is None:
            hidden_dim = hidden_size
        if embedding_weight is None:
            embedding_weight = pretrained_weight
        if not torch.is_tensor(embedding_weight):
            embedding_weight = torch.from_numpy(embedding_weight)

        self.embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        # NOTE(review): this copy also overwrites row 0, which padding_idx=0
        # initialised to zeros -- confirm the pretrained matrix keeps that row zero.
        self.embed.weight.data.copy_(embedding_weight)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,   # rnn hidden unit
            num_layers=1,             # number of rnn layers
            batch_first=True,         # tensors are laid out as (batch, time_step, input_size)
        )

        self.out = nn.Linear(hidden_dim, 2)

    def forward(self, x, length_list):
        """Compute class logits for a padded batch.

        Args:
            x: Integer tensor of token ids with three dimensions
                ``(batch, time_step, tokens_per_step)``.
            length_list: 1-D integer tensor with the number of valid time
                steps of each sample; every entry must be at least 1.

        Returns:
            Tensor of shape ``(batch, 2)`` whose rows follow the input order.
        """
        x_embed = self.embed(x)
        # Average the token embeddings of each time step (max-pooling over the
        # same axis is the alternative that was tried in this notebook).
        x = torch.mean(x_embed, dim=2)

        # pack_padded_sequence wants the batch ordered by decreasing length:
        # sort here and keep the inverse permutation to undo it afterwards.
        _, idx_sort = torch.sort(length_list, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        # Plain Python ints are the documented form for the lengths argument.
        length_list_sort = length_list[idx_sort].tolist()
        x = x.index_select(0, idx_sort)
        x_pack = nn_utils.rnn.pack_padded_sequence(x, length_list_sort, batch_first=True)

        # No initial state is passed, so the GRU starts from zeros.
        _, h_n = self.gru(x_pack)
        # h_n[0] is the final hidden state of the only layer; restore input order.
        h_n = h_n[0][idx_unsort]
        return self.out(h_n)

# Build the GRU classifier from the notebook-level settings and show its layers.
gru = GRU()
print(gru)

GRU(
  (embed): Embedding(30002, 200, padding_idx=0)
  (gru): GRU(200, 64, batch_first=True)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


In [8]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as transforms
from torch.nn import utils as nn_utils

class RNN(nn.Module):
    """Plain (Elman) RNN classifier mapping a sequence of token groups to two logits.

    Every time step carries several token ids. Their embeddings are averaged
    into one vector per step, the padded batch is packed by true sequence
    length, and the final hidden state of the single RNN layer is fed to a
    linear layer with two outputs.

    Args:
        num_embeddings: Number of rows of the embedding table. Defaults to the
            notebook-level ``vocab_size + 2``.
        embedding_dim: Width of each embedding vector. Defaults to the
            notebook-level ``embed_size``.
        hidden_dim: Size of the RNN hidden state. Defaults to the
            notebook-level ``hidden_size``.
        embedding_weight: numpy array or tensor of shape
            ``(num_embeddings, embedding_dim)`` copied into the embedding
            layer. Defaults to the notebook-level ``pretrained_weight``.
    """

    def __init__(self, num_embeddings=None, embedding_dim=None, hidden_dim=None,
                 embedding_weight=None):
        super(RNN, self).__init__()
        # Fall back to the notebook globals so that a bare ``RNN()`` keeps working.
        if num_embeddings is None:
            num_embeddings = vocab_size + 2
        if embedding_dim is None:
            embedding_dim = embed_size
        if hidden_dim is None:
            hidden_dim = hidden_size
        if embedding_weight is None:
            embedding_weight = pretrained_weight
        if not torch.is_tensor(embedding_weight):
            embedding_weight = torch.from_numpy(embedding_weight)

        self.embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        # NOTE(review): this copy also overwrites row 0, which padding_idx=0
        # initialised to zeros -- confirm the pretrained matrix keeps that row zero.
        self.embed.weight.data.copy_(embedding_weight)
        # Vanilla recurrent layer, kept as the baseline next to the gated
        # LSTM/GRU variants defined in the sibling cells.
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,   # rnn hidden unit
            num_layers=1,             # number of rnn layers
            batch_first=True,         # tensors are laid out as (batch, time_step, input_size)
        )

        self.out = nn.Linear(hidden_dim, 2)

    def forward(self, x, length_list):
        """Compute class logits for a padded batch.

        Args:
            x: Integer tensor of token ids with three dimensions
                ``(batch, time_step, tokens_per_step)``.
            length_list: 1-D integer tensor with the number of valid time
                steps of each sample; every entry must be at least 1.

        Returns:
            Tensor of shape ``(batch, 2)`` whose rows follow the input order.
        """
        x_embed = self.embed(x)
        # Average the token embeddings of each time step (max-pooling over the
        # same axis is the alternative that was tried in this notebook).
        x = torch.mean(x_embed, dim=2)

        # pack_padded_sequence wants the batch ordered by decreasing length:
        # sort here and keep the inverse permutation to undo it afterwards.
        _, idx_sort = torch.sort(length_list, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        # Plain Python ints are the documented form for the lengths argument.
        length_list_sort = length_list[idx_sort].tolist()
        x = x.index_select(0, idx_sort)
        x_pack = nn_utils.rnn.pack_padded_sequence(x, length_list_sort, batch_first=True)

        # No initial state is passed, so the RNN starts from zeros.
        _, h_n = self.rnn(x_pack)
        # h_n[0] is the final hidden state of the only layer; restore input order.
        h_n = h_n[0][idx_unsort]
        return self.out(h_n)

# Build the plain-RNN classifier from the notebook-level settings and show its layers.
rnn = RNN()
print(rnn)

RNN(
  (embed): Embedding(30002, 200, padding_idx=0)
  (rnn): RNN(200, 64, batch_first=True)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


In [11]:
import os
import pickle
import torch
from sklearn.metrics import accuracy_score, classification_report

class Train:
    """Train a two-class model, score it periodically, and checkpoint the best weights.

    Args:
        saved_path: Directory that receives one ``<model_name>.pkl`` checkpoint
            per model; it is created on first save if it does not exist.
    """

    def __init__(self, saved_path="./save"):
        self.saved_path = saved_path

    def save_best_model(self, model, model_name, exist_max_acc, new_acc, prf):
        """Write a checkpoint when ``new_acc`` beats ``exist_max_acc``.

        Args:
            model: Module whose ``state_dict()`` is stored under ``"net"``.
            model_name: Base name of the checkpoint file (``<model_name>.pkl``).
            exist_max_acc: Best accuracy seen so far.
            new_acc: Accuracy of the current evaluation, stored under ``"accuracy"``.
            prf: Classification report text, stored under ``"prf"``.

        Returns:
            True when a checkpoint was written, False otherwise.
        """
        if not new_acc > exist_max_acc:
            return False
        # The save directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(self.saved_path, exist_ok=True)
        model_file = os.path.join(self.saved_path, "%s.pkl" % model_name)
        state = {"net": model.state_dict(), "accuracy": new_acc, "prf": prf}
        torch.save(state, model_file)
        return True

    def show_best_result(self, model_name):
        """Print the accuracy and report stored in ``<model_name>.pkl``.

        Prints a short notice instead of failing when no checkpoint exists,
        which happens when no evaluation ever improved on zero accuracy.
        """
        model_file = os.path.join(self.saved_path, "%s.pkl" % model_name)
        if not os.path.isfile(model_file):
            print("\nNo saved result for %s" % model_name)
            return
        s = torch.load(model_file)
        print("\nBest Result\nAccuracy: %.3f" % s["accuracy"])
        print(s["prf"])

    def _evaluate(self, model, loader):
        """Predict every batch of ``loader`` without tracking gradients.

        Returns:
            ``(y_true, y_pred)`` as 1-D numpy arrays covering all batches, or
            None when the loader yields no batch.
        """
        was_training = model.training
        model.eval()
        labels, preds = [], []
        try:
            with torch.no_grad():
                for x_test, y_test, x_len_test in loader:
                    test_output = model(x_test, x_len_test)
                    preds.append(torch.max(test_output, 1)[1].view(-1))
                    labels.append(y_test.view(-1))
        finally:
            # Hand the model back in the mode it was in, so training continues unchanged.
            model.train(was_training)
        if not labels:
            return None
        return torch.cat(labels).cpu().numpy(), torch.cat(preds).cpu().numpy()

    def start(self, model, model_name, EPOCH=5, lr=0.001,
              train_data=None, test_data=None, eval_every=25):
        """Train ``model`` with Adam and cross-entropy, evaluating as it goes.

        Args:
            model: Module called as ``model(batch_x, batch_len)`` returning logits.
            model_name: Name used for the checkpoint file.
            EPOCH: Number of passes over the training loader.
            lr: Adam learning rate.
            train_data: Iterable of ``(x, y, length)`` batches; defaults to the
                notebook-level ``train_loader``.
            test_data: Iterable of ``(x, y, length)`` batches; defaults to the
                notebook-level ``test_loader``.
            eval_every: Evaluate on ``test_data`` every this many training steps.
        """
        if train_data is None:
            train_data = train_loader
        if test_data is None:
            test_data = test_loader

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)   # optimize all model parameters
        loss_func = nn.CrossEntropyLoss()                         # the target label is not one-hotted
        exist_max_acc = 0

        # training and testing
        for epoch in range(EPOCH):
            for step, (b_x, b_y, b_len) in enumerate(train_data):
                output = model(b_x, b_len)
                loss = loss_func(output, b_y)
                optimizer.zero_grad()   # clear gradients for this training step
                loss.backward()         # backpropagation, compute gradients
                optimizer.step()        # apply gradients

                if step % eval_every != 0:
                    continue

                result = self._evaluate(model, test_data)
                if result is None:
                    continue
                y_test, pred_y = result
                accuracy = float(accuracy_score(y_test, pred_y))
                # labels pins both classes, so the report still works when a
                # class is missing from both the targets and the predictions.
                prf = classification_report(y_test, pred_y, labels=[0, 1],
                                            target_names=["non-rumor", "rumor"], digits=3)

                self.save_best_model(model, model_name, exist_max_acc, accuracy, prf)
                exist_max_acc = max(exist_max_acc, accuracy)
                print("pred positive:", pred_y.sum(), "real positive:", y_test.sum(), y_test.shape)
                print('Epoch: ', epoch+1, '| Step: %5d' % step,
                      '| train loss: %.4f' % loss.item(), '| test accuracy: %.3f' % accuracy)

        self.show_best_result(model_name)
                                    
# One trainer shared by all models; checkpoints go under the default "./save" directory.
train = Train()

In [6]:
# Train the LSTM for 6 epochs; the best-scoring weights are saved as "lstm.pkl".
train.start(model=lstm, model_name="lstm", EPOCH=6)

pred positive: 608 real positive: 464 (932,)
Epoch:  1 | Step:     0 | train loss: 0.7062 | test accuracy: 0.356
pred positive: 173 real positive: 464 (932,)
Epoch:  1 | Step:    25 | train loss: 0.6708 | test accuracy: 0.606
pred positive: 427 real positive: 464 (932,)
Epoch:  1 | Step:    50 | train loss: 0.5827 | test accuracy: 0.716
pred positive: 578 real positive: 464 (932,)
Epoch:  1 | Step:    75 | train loss: 0.4706 | test accuracy: 0.745
pred positive: 395 real positive: 464 (932,)
Epoch:  1 | Step:   100 | train loss: 0.5730 | test accuracy: 0.714
pred positive: 521 real positive: 464 (932,)
Epoch:  2 | Step:     0 | train loss: 0.4357 | test accuracy: 0.750
pred positive: 536 real positive: 464 (932,)
Epoch:  2 | Step:    25 | train loss: 0.5049 | test accuracy: 0.768
pred positive: 574 real positive: 464 (932,)
Epoch:  2 | Step:    50 | train loss: 0.4452 | test accuracy: 0.766
pred positive: 573 real positive: 464 (932,)
Epoch:  2 | Step:    75 | train loss: 0.2750 | test

In [16]:
# Train the GRU for 6 epochs; the best-scoring weights are saved as "gru.pkl".
train.start(model=gru, model_name="gru", EPOCH=6)

  'precision', 'predicted', average, warn_for)


pred positive: 0 real positive: 464 (932,)
Epoch:  1 | Step:     0 | train loss: 0.6900 | test accuracy: 0.502
pred positive: 216 real positive: 464 (932,)
Epoch:  1 | Step:    25 | train loss: 0.6853 | test accuracy: 0.614
pred positive: 407 real positive: 464 (932,)
Epoch:  1 | Step:    50 | train loss: 0.4770 | test accuracy: 0.724
pred positive: 699 real positive: 464 (932,)
Epoch:  1 | Step:    75 | train loss: 0.5282 | test accuracy: 0.711
pred positive: 433 real positive: 464 (932,)
Epoch:  1 | Step:   100 | train loss: 0.3952 | test accuracy: 0.744
pred positive: 562 real positive: 464 (932,)
Epoch:  2 | Step:     0 | train loss: 0.3219 | test accuracy: 0.773
pred positive: 623 real positive: 464 (932,)
Epoch:  2 | Step:    25 | train loss: 0.5397 | test accuracy: 0.761
pred positive: 402 real positive: 464 (932,)
Epoch:  2 | Step:    50 | train loss: 0.3844 | test accuracy: 0.742
pred positive: 584 real positive: 464 (932,)
Epoch:  2 | Step:    75 | train loss: 0.3432 | test a

In [12]:
# Train the plain RNN for 6 epochs; the best-scoring weights are saved as "rnn.pkl".
train.start(model=rnn, model_name="rnn", EPOCH=6)

pred positive: 141 real positive: 464 (932,)
Epoch:  1 | Step:     0 | train loss: 0.6976 | test accuracy: 0.486
pred positive: 169 real positive: 464 (932,)
Epoch:  1 | Step:    25 | train loss: 0.7115 | test accuracy: 0.591
pred positive: 318 real positive: 464 (932,)
Epoch:  1 | Step:    50 | train loss: 0.5669 | test accuracy: 0.665
pred positive: 498 real positive: 464 (932,)
Epoch:  1 | Step:    75 | train loss: 0.7550 | test accuracy: 0.710
pred positive: 478 real positive: 464 (932,)
Epoch:  1 | Step:   100 | train loss: 0.4870 | test accuracy: 0.721
pred positive: 140 real positive: 464 (932,)
Epoch:  2 | Step:     0 | train loss: 0.7717 | test accuracy: 0.590
pred positive: 518 real positive: 464 (932,)
Epoch:  2 | Step:    25 | train loss: 0.6592 | test accuracy: 0.620
pred positive: 317 real positive: 464 (932,)
Epoch:  2 | Step:    50 | train loss: 0.5716 | test accuracy: 0.658
pred positive: 454 real positive: 464 (932,)
Epoch:  2 | Step:    75 | train loss: 0.5960 | test