In [0]:
import numpy as np
from multiprocessing import Pool
def precision_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    #print(r)
    
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))


def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max


def get_result(args):

    (y_pred, y_true)=args

    top_k = 50
    pred_topk_index = sorted(range(len(y_pred)), key=lambda i: y_pred[i], reverse=True)[:top_k]
    #print(pred_topk_index[0])
    pos_index = set([k for k, v in enumerate(y_true) if v == 1])

    r = [1 if k in pos_index else 0 for k in pred_topk_index[:top_k]]

    p_1 = precision_at_k(r, 1)
    p_3 = precision_at_k(r, 3)
    p_5 = precision_at_k(r, 5)

    ndcg_1 = ndcg_at_k(r, 1)
    ndcg_3 = ndcg_at_k(r, 3)
    ndcg_5 = ndcg_at_k(r, 5)

    return np.array([p_1, p_3, p_5, ndcg_1, ndcg_3, ndcg_5])

def evaluate(Y_tst_pred, Y_tst):
    pool = Pool(12)
    results = pool.map(get_result,zip(list(Y_tst_pred), list(Y_tst)))
    print(results[0])
    pool.terminate()
    tst_result = list(np.mean(np.array(results),0))
    print(tst_result)
    print ('\rTst Prec@1,3,5: ', tst_result[:3], ' Tst NDCG@1,3,5: ', tst_result[3:])

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

def squash_v1(x, axis):
    s_squared_norm = (x ** 2).sum(axis, keepdim=True)
    scale = torch.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
    return scale * x

def dynamic_routing(batch_size, b_ij, u_hat, input_capsule_num):
    num_iterations = 3

    for i in range(num_iterations):
        if True:
            leak = torch.zeros_like(b_ij).sum(dim=2, keepdim=True)
            leaky_logits = torch.cat((leak, b_ij),2)
            leaky_routing = F.softmax(leaky_logits, dim=2)
            c_ij = leaky_routing[:,:,1:,:].unsqueeze(4)
        else:
            c_ij = F.softmax(b_ij, dim=2).unsqueeze(4)
        v_j = squash_v1((c_ij * u_hat).sum(dim=1, keepdim=True), axis=3)
        if i < num_iterations - 1:
            b_ij = b_ij + (torch.cat([v_j] * input_capsule_num, dim=1) * u_hat).sum(3)

    poses = v_j.squeeze(1)
    activations = torch.sqrt((poses ** 2).sum(2))
    return poses, activations


def Adaptive_KDE_routing(batch_size, b_ij, u_hat):
    last_loss = 0.0
    while True:
        if False:
            leak = torch.zeros_like(b_ij).sum(dim=2, keepdim=True)
            leaky_logits = torch.cat((leak, b_ij),2)
            leaky_routing = F.softmax(leaky_logits, dim=2)
            c_ij = leaky_routing[:,:,1:,:].unsqueeze(4)
        else:
            c_ij = F.softmax(b_ij, dim=2).unsqueeze(4)
        c_ij = c_ij/c_ij.sum(dim=1, keepdim=True)
        v_j = squash_v1((c_ij * u_hat).sum(dim=1, keepdim=True), axis=3)
        dd = 1 - ((squash_v1(u_hat, axis=3)-v_j)** 2).sum(3)
        b_ij = b_ij + dd

        c_ij = c_ij.view(batch_size, c_ij.size(1), c_ij.size(2))
        dd = dd.view(batch_size, dd.size(1), dd.size(2))

        kde_loss = torch.mul(c_ij, dd).sum()/batch_size
        kde_loss = np.log(kde_loss.item())

        if abs(kde_loss - last_loss) < 0.05:
            break
        else:
            last_loss = kde_loss
    poses = v_j.squeeze(1)
    activations = torch.sqrt((poses ** 2).sum(2))
    return poses, activations


def KDE_routing(batch_size, b_ij, u_hat):
    num_iterations = 3
    for i in range(num_iterations):
        if False:
            leak = torch.zeros_like(b_ij).sum(dim=2, keepdim=True)
            leaky_logits = torch.cat((leak, b_ij),2)
            leaky_routing = F.softmax(leaky_logits, dim=2)
            c_ij = leaky_routing[:,:,1:,:].unsqueeze(4)
        else:
            c_ij = F.softmax(b_ij, dim=2).unsqueeze(4)

        c_ij = c_ij/c_ij.sum(dim=1, keepdim=True)
        v_j = squash_v1((c_ij * u_hat).sum(dim=1, keepdim=True), axis=3)

        if i < num_iterations - 1:
            dd = 1 - ((squash_v1(u_hat, axis=3)-v_j)** 2).sum(3)
            b_ij = b_ij + dd
    poses = v_j.squeeze(1)
    activations = torch.sqrt((poses ** 2).sum(2))
    return poses, activations

class FlattenCaps(nn.Module):
    def __init__(self):
        super(FlattenCaps, self).__init__()
    def forward(self, p, a):
        poses = p.view(p.size(0), p.size(2) * p.size(3) * p.size(4), -1)
        activations = a.view(a.size(0), a.size(1) * a.size(2) * a.size(3), -1)
        return poses, activations

class PrimaryCaps(nn.Module):
    def __init__(self, num_capsules, in_channels, out_channels, kernel_size, stride):
        super(PrimaryCaps, self).__init__()

        self.capsules = nn.Conv1d(in_channels, out_channels * num_capsules, kernel_size, stride)

        torch.nn.init.xavier_uniform_(self.capsules.weight)

        self.out_channels = out_channels
        self.num_capsules = num_capsules

    def forward(self, x):
        batch_size = x.size(0)
        u = self.capsules(x).view(batch_size, self.num_capsules, self.out_channels, -1, 1)
        poses = squash_v1(u, axis=1)
        activations = torch.sqrt((poses ** 2).sum(1))
        return poses, activations

class FCCaps(nn.Module):
    def __init__(self, args, output_capsule_num, input_capsule_num, in_channels, out_channels):
        super(FCCaps, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.input_capsule_num = input_capsule_num
        self.output_capsule_num = output_capsule_num

        self.W1 = nn.Parameter(torch.FloatTensor(1, input_capsule_num, output_capsule_num, out_channels, in_channels))
        torch.nn.init.xavier_uniform_(self.W1)

        self.is_AKDE = args.is_AKDE
        self.sigmoid = nn.Sigmoid()


    def forward(self, x, y, labels):
        batch_size = x.size(0)
        variable_output_capsule_num = len(labels)
        W1 = self.W1[:,:,labels,:,:]

        x = torch.stack([x] * variable_output_capsule_num, dim=2).unsqueeze(4)

        W1 = W1.repeat(batch_size, 1, 1, 1, 1)
        u_hat = torch.matmul(W1, x)

        b_ij = Variable(torch.zeros(batch_size, self.input_capsule_num, variable_output_capsule_num, 1)).cuda()

        if self.is_AKDE == True:
            poses, activations = Adaptive_KDE_routing(batch_size, b_ij, u_hat)
        else:
            #poses, activations = dynamic_routing(batch_size, b_ij, u_hat, self.input_capsule_num)
            poses, activations = KDE_routing(batch_size, b_ij, u_hat)
        return poses, activations


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def BCE_loss(x, target):
    return nn.BCELoss()(x.squeeze(2), target)

class CapsNet_Text(nn.Module):
    def __init__(self, args, w2v):
        super(CapsNet_Text, self).__init__()
        self.num_classes = args.num_classes
        self.embed = nn.Embedding(args.vocab_size, args.vec_size)
        self.embed.weight = nn.Parameter(torch.from_numpy(w2v))

        self.ngram_size = [2,4,8]

        self.convs_doc = nn.ModuleList([nn.Conv1d(args.sequence_length, 32, K, stride=2) for K in self.ngram_size])
        torch.nn.init.xavier_uniform_(self.convs_doc[0].weight)
        torch.nn.init.xavier_uniform_(self.convs_doc[1].weight)
        torch.nn.init.xavier_uniform_(self.convs_doc[2].weight)

        self.primary_capsules_doc = PrimaryCaps(num_capsules=args.dim_capsule, in_channels=32, out_channels=32, kernel_size=1, stride=1)

        self.flatten_capsules = FlattenCaps()

        self.W_doc = nn.Parameter(torch.FloatTensor(14272, args.num_compressed_capsule))
        torch.nn.init.xavier_uniform_(self.W_doc)

        self.fc_capsules_doc_child = FCCaps(args, output_capsule_num=args.num_classes, input_capsule_num=args.num_compressed_capsule,
                            	  in_channels=args.dim_capsule, out_channels=args.dim_capsule)

    def compression(self, poses, W):
        poses = torch.matmul(poses.permute(0,2,1), W).permute(0,2,1)
        activations = torch.sqrt((poses ** 2).sum(2))
        return poses, activations

    def forward(self, data, labels):
        data = self.embed(data)
        nets_doc_l = []
        for i in range(len(self.ngram_size)):
            nets = self.convs_doc[i](data)
            nets_doc_l.append(nets)
        nets_doc = torch.cat((nets_doc_l[0], nets_doc_l[1], nets_doc_l[2]), 2)
        poses_doc, activations_doc = self.primary_capsules_doc(nets_doc)
        poses, activations = self.flatten_capsules(poses_doc, activations_doc)
        poses, activations = self.compression(poses, self.W_doc)
        poses, activations = self.fc_capsules_doc_child(poses, activations, labels)
        return poses, activations


class CNN_KIM(nn.Module):

    def __init__(self, args, w2v):
        super(CNN_KIM, self).__init__()
        self.embed = nn.Embedding(args.vocab_size, args.vec_size)
        self.embed.weight = nn.Parameter(torch.from_numpy(w2v))
        self.conv13 = nn.Conv2d(1, 128, (3, args.vec_size))
        self.conv14 = nn.Conv2d(1, 128, (4, args.vec_size))
        self.conv15 = nn.Conv2d(1, 128, (5, args.vec_size))

        self.fc1 = nn.Linear(3 * 128, args.num_classes)
        self.m = nn.Sigmoid()

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def loss(self, x, target):
        return nn.BCELoss()(x, target)

    def forward(self, x):
        x = self.embed(x).unsqueeze(1)
        x1 = self.conv_and_pool(x,self.conv13)
        x2 = self.conv_and_pool(x,self.conv14)
        x3 = self.conv_and_pool(x,self.conv15)
        x = torch.cat((x1, x2, x3), 1)
        activations = self.fc1(x)
        return self.m(activations)

class XML_CNN(nn.Module):

    def __init__(self, args, w2v):
        super(XML_CNN, self).__init__()
        self.embed = nn.Embedding(args.vocab_size, args.vec_size)
        self.embed.weight = nn.Parameter(torch.from_numpy(w2v))
        self.conv13 = nn.Conv1d(500, 32, 2, stride=2)
        self.conv14 = nn.Conv1d(500, 32, 4, stride=2)
        self.conv15 = nn.Conv1d(500, 32, 8, stride=2)

        self.fc1 = nn.Linear(14272, 512)
        self.fc2 = nn.Linear(512, args.num_classes)
        self.m = nn.Sigmoid()
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        return x

    def loss(self, x, target):
        return nn.BCELoss()(x, target)

    def forward(self, x):
        x = self.embed(x)
        batch_size = x.shape[0]

        x1 = self.conv13(x).reshape(batch_size, -1)
        x2 = self.conv14(x).reshape(batch_size, -1)
        x3 = self.conv15(x).reshape(batch_size, -1)
        x = torch.cat((x1, x2, x3), 1)
        hidden = self.fc1(x)
        activations = self.fc2(hidden)
        return self.m(activations)

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
import numpy as np
import os
import re
import itertools
import scipy.sparse as sp
import  pickle
from collections import Counter
from nltk.corpus import stopwords

cachedStopWords = stopwords.words("english")


def clean_str(string):
    # remove stopwords
    # string = ' '.join([word for word in string.split() if word not in cachedStopWords])
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def pad_sentences(sentences, padding_word="<PAD/>", max_length=500):
    sequence_length = min(max(len(x) for x in sentences), max_length)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        if len(sentence) < max_length:
            num_padding = sequence_length - len(sentence)
            new_sentence = sentence + [padding_word] * num_padding
        else:
            new_sentence = sentence[:max_length]
        padded_sentences.append(new_sentence)
    return padded_sentences


def load_data_and_labels(data):
    #print(data[0])
    x_text = [clean_str(doc['text']) for doc in data]
    x_text = [s.split(" ") for s in x_text]
    labels = [doc['catgy'] for doc in data]
    # for j in range(20):
    #   print(x_text[j])
    #   print(labels[j])
    row_idx, col_idx, val_idx = [], [], []
    for i in range(len(labels)):
        l_list = list(set(labels[i])) # remove duplicate cateories to avoid double count
        for y in l_list:
            row_idx.append(i)
            col_idx.append(y)
            val_idx.append(1)
    m = max(row_idx) + 1
    n = max(col_idx) + 1
    Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n))
    return [x_text, Y, labels]


def build_vocab(sentences, vocab_size=50000):
    word_counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [x[0] for x in word_counts.most_common(vocab_size)]
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    # append <UNK/> symbol to the vocabulary
    vocabulary['<UNK/>'] = len(vocabulary)
    vocabulary_inv.append('<UNK/>')
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, vocabulary):
    x = np.array([[vocabulary[word] if word in vocabulary else vocabulary['<UNK/>'] for word in sentence] for sentence in sentences])
    #x = np.array([[vocabulary[word] if word in vocabulary else len(vocabulary) for word in sentence] for sentence in sentences])
    return x


def load_data(data_path, max_length=500, vocab_size=50000):
    # Load and preprocess data
    with open(os.path.join(data_path), 'rb') as fin:
        [train, test, vocab, catgy] = pickle.load(fin)
        print(train[0],test[0],vocab,catgy,sep='\n')

    # dirty trick to prevent errors happen when test is empty
    if len(test) == 0:
        test[:5] = train[:5]
    #print(train[0])
    #print(test[0])
    #print(vocab)
    #print(catgy)
    trn_sents, Y_trn, Y_trn_o = load_data_and_labels(train)
    tst_sents, Y_tst, Y_tst_o = load_data_and_labels(test)
    #print(catgy)
    #print(type(Y_trn[0]))
    trn_sents_padded = pad_sentences(trn_sents, max_length=max_length)
    tst_sents_padded = pad_sentences(tst_sents, max_length=max_length)
    vocabulary, vocabulary_inv = build_vocab(trn_sents_padded + tst_sents_padded, vocab_size=vocab_size)
    X_trn = build_input_data(trn_sents_padded, vocabulary)
    X_tst = build_input_data(tst_sents_padded, vocabulary)
    return X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv
    # return X_trn, Y_trn, vocabulary, vocabulary_inv


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [0]:
from os.path import join, exists, split
import os
import numpy as np

def load_word2vec(model_type, vocabulary_inv, num_features=300):
    """
    loads Word2Vec model
    Returns initial weights for embedding layer.

    inputs:
    model_type      # GoogleNews / glove
    vocabulary_inv  # dict {str:int}
    num_features    # Word vector dimensionality
    """

    model_dir = '/content/drive/My Drive/NLP-Capsule'

    if model_type == 'glove':
        model_name = join(model_dir, 'glove.6B.%dd.txt' % (num_features))
        assert(exists(model_name))
        print('Loading existing Word2Vec model (Glove.6B.%dd)' % (num_features))

        # dictionary, where key is word, value is word vectors
        embedding_model = {}
        for line in open(model_name, 'r', encoding="utf-8"):
            tmp = line.strip().split()
            word, vec = tmp[0], list(map(float, tmp[1:]))
            assert(len(vec) == num_features)
            if word not in embedding_model:
                embedding_model[word] = vec
        assert(len(embedding_model) == 400000)

    else:
        raise ValueError('Unknown pretrain model type: %s!' % (model_type))

    embedding_weights = [embedding_model[w] if w in embedding_model
                         else np.random.uniform(-0.25, 0.25, num_features)
                         for w in vocabulary_inv]
    embedding_weights = np.array(embedding_weights).astype('float32')

    return embedding_weights

In [0]:
 CUDA_VISIBLE_DEVICES=-1

In [0]:
from __future__ import division, print_function, unicode_literals
import argparse
import numpy as np
import torch
import torch.nn as nn
import os
import json
import random
import time
from torch.autograd import Variable
from torch.optim import Adam

torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
random.seed(0)

parser = argparse.ArgumentParser()

parser.add_argument('--dataset', type=str, default='eurlex_raw_text.p',
                    help='Options: eurlex_raw_text.p, rcv1_raw_text.p, wiki30k_raw_text.p',required=False)
parser.add_argument('--vocab_size', type=int, default=30001, help='vocabulary size',required=False)
parser.add_argument('--vec_size', type=int, default=300, help='embedding size',required=False)
parser.add_argument('--sequence_length', type=int, default=500, help='the length of documents',required=False)
parser.add_argument('--is_AKDE', type=bool, default=True, help='if Adaptive KDE routing is enabled',required=False)
parser.add_argument('--num_epochs', type=int, default=30, help='Number of training epochs',required=False)
parser.add_argument('--tr_batch_size', type=int, default=64, help='Batch size for training',required=False)
parser.add_argument('--learning_rate', type=float, default=1e-3, help='Learning rate for training',required=False)
parser.add_argument('--start_from', type=str, default='', help='',required=False)

parser.add_argument('--num_compressed_capsule', type=int, default=128, help='The number of compact capsules',required=False)
parser.add_argument('--dim_capsule', type=int, default=16, help='The number of dimensions for capsules',required=False)

parser.add_argument('--learning_rate_decay_start', type=int, default=0,
                    help='at what iteration to start decaying learning rate? (-1 = dont) (in epoch)',required=False)
parser.add_argument('--learning_rate_decay_every', type=int, default=20,
                    help='how many iterations thereafter to drop LR?(in epoch)',required=False)
parser.add_argument('--learning_rate_decay_rate', type=float, default=0.95,
                    help='how many iterations thereafter to drop LR?(in epoch)',required=False)



args = parser.parse_args(''.split())
params = vars(args)
print(json.dumps(params, indent = 2))
print(args.dataset)
X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = load_data('/content/drive/My Drive/NLP-Capsule/eurlex_raw_text.p',
                                                                           max_length=args.sequence_length,
                                                                           vocab_size=args.vocab_size)
Y_trn = Y_trn.toarray()
Y_tst = Y_tst.toarray()

X_trn = X_trn.astype(np.int32)
X_tst = X_tst.astype(np.int32)
Y_trn = Y_trn.astype(np.int32)
Y_tst = Y_tst.astype(np.int32)

embedding_weights = load_word2vec('glove', vocabulary_inv, args.vec_size)

args.num_classes = Y_trn.shape[1]
print(args.num_classes)

capsule_net = CapsNet_Text(args, embedding_weights)
capsule_net = nn.DataParallel(capsule_net).cuda()


def transformLabels(labels):
    label_index = list(set([l for _ in labels for l in _]))
    label_index.sort()

    variable_num_classes = len(label_index)
    target = []
    for _ in labels:
        tmp = np.zeros([variable_num_classes], dtype=np.float32)
        tmp[[label_index.index(l) for l in _]] = 1
        target.append(tmp)
    target = np.array(target)
    return label_index, target

current_lr = args.learning_rate

optimizer = Adam(capsule_net.parameters(), lr=current_lr)

def set_lr(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr

for epoch in range(args.num_epochs):
    torch.cuda.empty_cache()

    nr_trn_num = X_trn.shape[0]
    nr_batches = int(np.ceil(nr_trn_num / float(args.tr_batch_size)))

    if epoch > args.learning_rate_decay_start and args.learning_rate_decay_start >= 0:
        frac = (epoch - args.learning_rate_decay_start) // args.learning_rate_decay_every
        decay_factor = args.learning_rate_decay_rate  ** frac
        current_lr = current_lr * decay_factor
    print(current_lr)
    set_lr(optimizer, current_lr)

    capsule_net.train()
    for iteration, batch_idx in enumerate(np.random.permutation(range(nr_batches))):
        start = time.time()
        start_idx = batch_idx * args.tr_batch_size
        end_idx = min((batch_idx + 1) * args.tr_batch_size, nr_trn_num)

        X = X_trn[start_idx:end_idx]
        Y = Y_trn_o[start_idx:end_idx]
        data = Variable(torch.from_numpy(X).long()).cuda()

        batch_labels, batch_target = transformLabels(Y)
        batch_target = Variable(torch.from_numpy(batch_target).float()).cuda()
        optimizer.zero_grad()
        poses, activations = capsule_net(data, batch_labels)
        loss = BCE_loss(activations, batch_target)
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        done = time.time()
        elapsed = done - start

        print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f} {:.5f}".format(
                      iteration, nr_batches,
                      iteration * 100 / nr_batches,
                      loss.item(), elapsed),
                      end="")

    torch.cuda.empty_cache()

    if (epoch + 1) > 20:
        checkpoint_path = os.path.join('/content/drive/My Drive/NLP-Capsule', 'model-eur-akde-' + str(epoch + 1) + '.pth')
        if epoch==29:
          #torch.save(capsule_net.state_dict(), checkpoint_path)
          print("model saved to {}".format(checkpoint_path))


{
  "dataset": "eurlex_raw_text.p",
  "vocab_size": 30001,
  "vec_size": 300,
  "sequence_length": 500,
  "is_AKDE": true,
  "num_epochs": 30,
  "tr_batch_size": 64,
  "learning_rate": 0.001,
  "start_from": "",
  "num_compressed_capsule": 128,
  "dim_capsule": 16,
  "learning_rate_decay_start": 0,
  "learning_rate_decay_every": 20,
  "learning_rate_decay_rate": 0.95
}
eurlex_raw_text.p
["'decis", 'committe', 'region', 'septemb', 'public', 'access', 'document', 'committe', 'region', 'bureau', 'committe', 'region', 'regard', 'treati', 'european', 'union', 'declar', 'attach', 'final', 'act', 'thereof', 'provis', 'adopt', 'govern', 'public', 'access', 'document', 'committe', 'region', 'hereinaft', 'committe', 'measur', 'harmoni', 'code', 'conduct', 'agre', 'adopt', 'commiss', 'council', 'area', 'decemb', 'order', 'ensur', 'consist', 'continu', 'activ', 'institut', 'accord', 'articl', 'treati', 'european', 'union', 'provis', 'applic', 'document', 'held', 'committe', 'medium', 'exclud', 'do



Iteration: 181/182 (99.5%)  Loss: 0.00177 0.371890.000630249409724609
Iteration: 181/182 (99.5%)  Loss: 0.00142 0.367260.0005987369392383785
Iteration: 181/182 (99.5%)  Loss: 0.00156 0.39415model saved to /content/drive/My Drive/NLP-Capsule/model-eur-akde-30.pth


In [0]:
from __future__ import division, print_function, unicode_literals
import argparse
import numpy as np
import torch
import torch.nn as nn
import os
import json
import random
import time
from torch.autograd import Variable
from torch.optim import Adam



torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
random.seed(0)

parser = argparse.ArgumentParser()

parser.add_argument('--dataset', type=str, default='eurlex_raw_text.p',
                    help='Options: eurlex_raw_text.p, rcv1_raw_text.p, wiki30k_raw_text.p',required=False)
parser.add_argument('--vocab_size', type=int, default=30001, help='vocabulary size',required=False)
parser.add_argument('--vec_size', type=int, default=300, help='embedding size',required=False)
parser.add_argument('--sequence_length', type=int, default=500, help='the length of documents',required=False)
parser.add_argument('--is_AKDE', type=bool, default=True, help='if Adaptive KDE routing is enabled',required=False)
parser.add_argument('--num_epochs', type=int, default=30, help='Number of training epochs',required=False)
parser.add_argument('--tr_batch_size', type=int, default=256, help='Batch size for training',required=False)
parser.add_argument('--ts_batch_size', type=int, default=16, help='Batch size for training',required=False)

parser.add_argument('--learning_rate', type=float, default=1e-3, help='Learning rate for training',required=False)
parser.add_argument('--start_from', type=str, default='', help='',required=False)

parser.add_argument('--num_compressed_capsule', type=int, default=128, help='The number of compact capsules',required=False)
parser.add_argument('--dim_capsule', type=int, default=16, help='The number of dimensions for capsules')

parser.add_argument('--learning_rate_decay_start', type=int, default=0,
                    help='at what iteration to start decaying learning rate? (-1 = dont) (in epoch)',required=False)
parser.add_argument('--learning_rate_decay_every', type=int, default=20,
                    help='how many iterations thereafter to drop LR?(in epoch)',required=False)
parser.add_argument('--learning_rate_decay_rate', type=float, default=0.95,
                    help='how many iterations thereafter to drop LR?(in epoch)',required=False)

parser.add_argument('--gradient_accumulation_steps', type=int, default=8,required=False)

parser.add_argument('--re_ranking', type=int, default=200, help='The number of re-ranking size',required=False)


args = parser.parse_args(''.split())
params = vars(args)
print(json.dumps(params, indent = 2))

X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = load_data('/content/drive/My Drive/NLP-Capsule/eurlex_raw_text.p',
                                                                           max_length=args.sequence_length,
                                                                           vocab_size=args.vocab_size)
Y_trn = Y_trn.toarray()
Y_tst = Y_tst.toarray()
X_trn = X_trn.astype(np.int32)
X_tst = X_tst.astype(np.int32)
Y_trn = Y_trn.astype(np.int32)
Y_tst = Y_tst.astype(np.int32)

embedding_weights = load_word2vec('glove', vocabulary_inv, args.vec_size)

args.num_classes = Y_trn.shape[1]

capsule_net = CapsNet_Text(args, embedding_weights)
capsule_net = nn.DataParallel(capsule_net).cuda()

model_name = 'model-EUR-CNN-40.pth'
baseline = CNN_KIM(args, embedding_weights)
baseline.load_state_dict(torch.load(os.path.join('/content/drive/My Drive/NLP-Capsule', model_name)))
baseline = nn.DataParallel(baseline).cuda()
print(model_name + ' loaded')

def transformLabels(labels, total_labels):
    label_index = list(set([l for _ in total_labels for l in _]))
    label_index.sort()

    variable_num_classes = len(label_index)
    target = []
    for _ in labels:
        tmp = np.zeros([variable_num_classes], dtype=np.float32)
        tmp[[label_index.index(l) for l in _]] = 1
        target.append(tmp)
    target = np.array(target)
    return label_index, target

current_lr = args.learning_rate

optimizer = Adam(capsule_net.parameters(), lr=current_lr)

def set_lr(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr
import random
import scipy.sparse as sp
import os

for epoch in range(args.num_epochs):

    nr_trn_num = X_trn.shape[0]
    nr_batches = int(np.ceil(nr_trn_num / float(args.tr_batch_size)))

    if epoch > args.learning_rate_decay_start and args.learning_rate_decay_start >= 0:
        frac = (epoch - args.learning_rate_decay_start) // args.learning_rate_decay_every
        decay_factor = args.learning_rate_decay_rate  ** frac
        current_lr = current_lr * decay_factor
    print(current_lr)
    set_lr(optimizer, current_lr)

    capsule_net.train()
    for iteration, batch_idx in enumerate(np.random.permutation(range(nr_batches))):
        start = time.time()
        start_idx = batch_idx * args.tr_batch_size
        end_idx = min((batch_idx + 1) * args.tr_batch_size, nr_trn_num)

        X = X_trn[start_idx:end_idx]
        Y = Y_trn_o[start_idx:end_idx]

        batch_steps = int(np.ceil(len(X)) / (float(args.tr_batch_size) / float(args.gradient_accumulation_steps)))
        batch_loss = 0
        for i in range(batch_steps):
            step_size = int(float(args.tr_batch_size) // float(args.gradient_accumulation_steps))
            step_X = X[i * step_size: (i+1) * step_size]
            step_Y = Y[i * step_size: (i+1) * step_size]

            step_X = Variable(torch.from_numpy(step_X).long()).cuda()
            step_labels, step_target = transformLabels(step_Y, Y)
            step_target = Variable(torch.from_numpy(step_target).float()).cuda()            

            poses, activations = capsule_net(step_X, step_labels)
            step_loss = BCE_loss(activations, step_target)            
            step_loss = step_loss / args.gradient_accumulation_steps
            step_loss.backward()
            batch_loss += step_loss.item()
        
        optimizer.step()
        optimizer.zero_grad()
        done = time.time()
        elapsed = done - start

        print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f} {:.5f}".format(
                      iteration, nr_batches,
                      iteration * 100 / nr_batches,
                      batch_loss, elapsed),
                      end="")

    if (epoch + 1) > 20 and (epoch + 1)<30:         

        nr_tst_num = X_tst.shape[0]
        nr_batches = int(np.ceil(nr_tst_num / float(args.ts_batch_size)))

        n, k_trn = Y_trn.shape
        m, k_tst = Y_tst.shape
        print ('k_trn:', k_trn)
        print ('k_tst:', k_tst)

        capsule_net.eval()
        top_k = 50
        row_idx_list, col_idx_list, val_idx_list = [], [], []
        for batch_idx in range(nr_batches):
            start = time.time()
            start_idx = batch_idx * args.ts_batch_size
            end_idx = min((batch_idx + 1) * args.ts_batch_size, nr_tst_num)
            X = X_tst[start_idx:end_idx]
            Y = Y_tst_o[start_idx:end_idx]
            data = Variable(torch.from_numpy(X).long()).cuda()

            candidates = baseline(data)
            candidates = candidates.data.cpu().numpy()

            Y_pred = np.zeros([candidates.shape[0], args.num_classes])
            for i in range(candidates.shape[0]):
                candidate_labels = candidates[i, :].argsort()[-args.re_ranking:][::-1].tolist()
                _, activations_2nd = capsule_net(data[i, :].unsqueeze(0), candidate_labels)
                Y_pred[i, candidate_labels] = activations_2nd.squeeze(2).data.cpu().numpy()

            for i in range(Y_pred.shape[0]):
                sorted_idx = np.argpartition(-Y_pred[i, :], top_k)[:top_k]
                row_idx_list += [i + start_idx] * top_k
                col_idx_list += (sorted_idx).tolist()
                val_idx_list += Y_pred[i, sorted_idx].tolist()

            done = time.time()
            elapsed = done - start

            print("\r Epoch: {} Reranking: {} Iteration: {}/{} ({:.1f}%)  Loss: {:.5f} {:.5f}".format(
                  (epoch + 1), args.re_ranking, batch_idx, nr_batches,
                  batch_idx * 100 / nr_batches,
                  0, elapsed),
                  end="")

        m = max(row_idx_list) + 1
        n = max(k_trn, k_tst)
        #print(elapsed)
        Y_tst_pred = sp.csr_matrix((val_idx_list, (row_idx_list, col_idx_list)), shape=(m, n))

        if k_trn >= k_tst:
            Y_tst_pred = Y_tst_pred[:, :k_tst]

        evaluate(Y_tst_pred.toarray(), Y_tst)

#        checkpoint_path = os.path.join('save_new', 'model-eur-akde-' + str(epoch + 1) + '.pth')
#        torch.save(capsule_net.state_dict(), checkpoint_path)
#        print("model saved to {}".format(checkpoint_path))


{
  "dataset": "eurlex_raw_text.p",
  "vocab_size": 30001,
  "vec_size": 300,
  "sequence_length": 500,
  "is_AKDE": true,
  "num_epochs": 30,
  "tr_batch_size": 256,
  "ts_batch_size": 16,
  "learning_rate": 0.001,
  "start_from": "",
  "num_compressed_capsule": 128,
  "dim_capsule": 16,
  "learning_rate_decay_start": 0,
  "learning_rate_decay_every": 20,
  "learning_rate_decay_rate": 0.95,
  "gradient_accumulation_steps": 8,
  "re_ranking": 200
}
["'decis", 'committe', 'region', 'septemb', 'public', 'access', 'document', 'committe', 'region', 'bureau', 'committe', 'region', 'regard', 'treati', 'european', 'union', 'declar', 'attach', 'final', 'act', 'thereof', 'provis', 'adopt', 'govern', 'public', 'access', 'document', 'committe', 'region', 'hereinaft', 'committe', 'measur', 'harmoni', 'code', 'conduct', 'agre', 'adopt', 'commiss', 'council', 'area', 'decemb', 'order', 'ensur', 'consist', 'continu', 'activ', 'institut', 'accord', 'articl', 'treati', 'european', 'union', 'provis', 'a

In [9]:
from __future__ import division, print_function, unicode_literals
import argparse
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import random
import time
import os

torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
random.seed(0)

parser = argparse.ArgumentParser()

parser.add_argument('--dataset', type=str, default='eurlex_raw_text.p',
                    help='Options: eurlex_raw_text.p, rcv1_raw_text.p, wiki30k_raw_text.p',required=False)
parser.add_argument('--vocab_size', type=int, default=30001, help='vocabulary size',required=False)
parser.add_argument('--vec_size', type=int, default=300, help='embedding size',required=False)
parser.add_argument('--sequence_length', type=int, default=500, help='the length of documents',required=False)
parser.add_argument('--is_AKDE', type=bool, default=True, help='if Adaptive KDE routing is enabled',required=False)
parser.add_argument('--num_epochs', type=int, default=30, help='Number of training epochs',required=False)
parser.add_argument('--ts_batch_size', type=int, default=32, help='Batch size for training',required=False)
parser.add_argument('--learning_rate', type=float, default=1e-3, help='Learning rate for training',required=False)
parser.add_argument('--start_from', type=str, default='save', help='',required=False)

parser.add_argument('--num_compressed_capsule', type=int, default=128, help='The number of compact capsules',required=False)
parser.add_argument('--dim_capsule', type=int, default=16, help='The number of dimensions for capsules',required=False)

parser.add_argument('--re_ranking', type=int, default=200, help='The number of re-ranking size',required=False)

import json
args = parser.parse_args(''.split())
params = vars(args)
print(json.dumps(params, indent = 2))

X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = load_data('/content/drive/My Drive/NLP-Capsule/eurlex_raw_text.p',
                                                                                max_length=args.sequence_length,
                                                                                vocab_size=args.vocab_size)
Y_trn = Y_trn.toarray()
Y_tst = Y_tst.toarray()
#print(Y_tst)
X_trn = X_trn.astype(np.int32)
X_tst = X_tst.astype(np.int32)
Y_trn = Y_trn.astype(np.int32)
Y_tst = Y_tst.astype(np.int32)
print(X_tst[0],Y_tst[0])


embedding_weights = load_word2vec('glove', vocabulary_inv, args.vec_size)
args.num_classes = Y_trn.shape[1]

capsule_net = CapsNet_Text(args, embedding_weights)
capsule_net = nn.DataParallel(capsule_net).cuda()
model_name = 'model-eur-akde-29.pth'
capsule_net.load_state_dict(torch.load(os.path.join('/content/drive/My Drive/NLP-Capsule', model_name)))
print(model_name + ' loaded')


model_name = 'model-EUR-CNN-40.pth'
baseline = CNN_KIM(args, embedding_weights)
baseline.load_state_dict(torch.load(os.path.join('/content/drive/My Drive/NLP-Capsule', model_name)))
baseline = nn.DataParallel(baseline).cuda()
print(model_name + ' loaded')


nr_tst_num = X_tst.shape[0]
nr_batches = int(np.ceil(nr_tst_num / float(args.ts_batch_size)))

n, k_trn = Y_trn.shape
m, k_tst = Y_tst.shape
print ('k_trn:', k_trn)
print ('k_tst:', k_tst)

capsule_net.eval()
top_k = 50
row_idx_list, col_idx_list, val_idx_list = [], [], []
for batch_idx in range(nr_batches):
    start = time.time()
    start_idx = batch_idx * args.ts_batch_size
    end_idx = min((batch_idx + 1) * args.ts_batch_size, nr_tst_num)
    X = X_tst[start_idx:end_idx]
    Y = Y_tst_o[start_idx:end_idx]
    data = Variable(torch.from_numpy(X).long()).cuda()

    candidates = baseline(data)
    candidates = candidates.data.cpu().numpy()

    Y_pred = np.zeros([candidates.shape[0], args.num_classes])
    for i in range(candidates.shape[0]):
        candidate_labels = candidates[i, :].argsort()[-args.re_ranking:][::-1].tolist()
        _, activations_2nd = capsule_net(data[i, :].unsqueeze(0), candidate_labels)
        Y_pred[i, candidate_labels] = activations_2nd.squeeze(2).data.cpu().numpy()

    for i in range(Y_pred.shape[0]):
        sorted_idx = np.argpartition(-Y_pred[i, :], top_k)[:top_k]
        row_idx_list += [i + start_idx] * top_k
        col_idx_list += (sorted_idx).tolist()
        val_idx_list += Y_pred[i, sorted_idx].tolist()

    done = time.time()
    elapsed = done - start

    print("\r Reranking: {} Iteration: {}/{} ({:.1f}%)  Loss: {:.5f} {:.5f}".format(
          args.re_ranking, batch_idx, nr_batches,
          batch_idx * 100 / nr_batches,
          0, elapsed),
          end="")

m = max(row_idx_list) + 1
n = max(k_trn, k_tst)
print(elapsed)
Y_tst_pred = sp.csr_matrix((val_idx_list, (row_idx_list, col_idx_list)), shape=(m, n))
#print(Y_tst_pred[0])
if k_trn >= k_tst:
    Y_tst_pred = Y_tst_pred[:, :k_tst]
print(Y_tst_pred[0])
evaluate(Y_tst_pred.toarray(), Y_tst)



{
  "dataset": "eurlex_raw_text.p",
  "vocab_size": 30001,
  "vec_size": 300,
  "sequence_length": 500,
  "is_AKDE": true,
  "num_epochs": 30,
  "ts_batch_size": 32,
  "learning_rate": 0.001,
  "start_from": "save",
  "num_compressed_capsule": 128,
  "dim_capsule": 16,
  "re_ranking": 200
}
{'text': "'decis committe region septemb public access document committe region bureau committe region regard treati european union declar attach final act thereof provis adopt govern public access document committe region hereinaft committe measur harmoni code conduct agre adopt commiss council area decemb order ensur consist continu activ institut accord articl treati european union provis applic document held committe medium exclud document written person bodi institut committe principl allow public wide access committe document part greater transpar committe work subject except protect public interest individu privaci decis appli due regard provis govern protect classifi inform decid articl publ

In [10]:
Y_tst_pred[0]

<1x3946 sparse matrix of type '<class 'numpy.float64'>'
	with 50 stored elements in Compressed Sparse Row format>

In [11]:
print(baseline)

DataParallel(
  (module): CNN_KIM(
    (embed): Embedding(30001, 300)
    (conv13): Conv2d(1, 128, kernel_size=(3, 300), stride=(1, 1))
    (conv14): Conv2d(1, 128, kernel_size=(4, 300), stride=(1, 1))
    (conv15): Conv2d(1, 128, kernel_size=(5, 300), stride=(1, 1))
    (fc1): Linear(in_features=384, out_features=3954, bias=True)
    (m): Sigmoid()
  )
)


In [28]:
X_trn, Y_trn, Y_trn_o, X_tst, Y_tst, Y_tst_o, vocabulary, vocabulary_inv = load_data('/content/drive/My Drive/NLP-Capsule/eurlex_raw_text.p',
                                                                                max_length=args.sequence_length,
                                                                                vocab_size=args.vocab_size)

{'text': "'decis committe region septemb public access document committe region bureau committe region regard treati european union declar attach final act thereof provis adopt govern public access document committe region hereinaft committe measur harmoni code conduct agre adopt commiss council area decemb order ensur consist continu activ institut accord articl treati european union provis applic document held committe medium exclud document written person bodi institut committe principl allow public wide access committe document part greater transpar committe work subject except protect public interest individu privaci decis appli due regard provis govern protect classifi inform decid articl public access committe document condit laid decis committe document mean written text medium exist data held committe subject articl articl applic access committe document write secretari gener committe made suffici precis manner inform enabl document document request identifi applic ask detail 

In [29]:
print(Y_tst[0])

  (0, 300)	1
  (0, 415)	1
  (0, 442)	1
  (0, 995)	1
  (0, 1546)	1


In [30]:
n=Y_tst[0].toarray()
n=n.reshape(-1)
print(len(n))
for i in range(len(n)):
  if n[i]!=0:

    print(n[i],i)

3946
1 300
1 415
1 442
1 995
1 1546


In [0]:
for i in range(len(n)):
  

In [15]:
print(Y_tst_pred[0])

  (0, 9)	0.028301537036895752
  (0, 23)	0.012057950720191002
  (0, 46)	0.025693805888295174
  (0, 55)	0.01044171117246151
  (0, 140)	0.025068894028663635
  (0, 141)	0.026241132989525795
  (0, 203)	0.009621907025575638
  (0, 227)	0.022926144301891327
  (0, 300)	0.8182740807533264
  (0, 321)	0.013197005726397038
  (0, 341)	0.0236428901553154
  (0, 415)	0.9342229962348938
  (0, 442)	0.9777390956878662
  (0, 501)	0.04833345115184784
  (0, 522)	0.009664259850978851
  (0, 588)	0.03853737562894821
  (0, 606)	0.16200889647006989
  (0, 682)	0.05232146754860878
  (0, 710)	0.01182631403207779
  (0, 714)	0.01137586496770382
  (0, 766)	0.017075397074222565
  (0, 806)	0.07390626519918442
  (0, 897)	0.9161458611488342
  (0, 921)	0.022440925240516663
  (0, 935)	0.05164450407028198
  (0, 995)	0.9500865340232849
  (0, 1016)	0.01414217334240675
  (0, 1062)	0.011938530951738358
  (0, 1067)	0.012114441022276878
  (0, 1086)	0.9534541964530945
  (0, 1197)	0.013958916068077087
  (0, 1226)	0.014681846834719181

In [37]:
print(Y_tst_pred[0].toarray())
n1=Y_tst_pred[0].toarray().reshape(-1)
print(len(n1))
for i in range(len(n1)):
  if n1[i]!=0:

    print(n1[i],i)
    print(key_list[val_list.index(i)])

[[0. 0. 0. ... 0. 0. 0.]]
3946
0.028301537036895752 9
gatt
0.012057950720191002 23
tariff_quota
0.025693805888295174 46
quantitative_restriction
0.01044171117246151 55
community_import
0.025068894028663635 140
import_licence
0.026241132989525795 141
customs_formalities
0.009621907025575638 203
export
0.022926144301891327 227
agricultural_product
0.8182740807533264 300
wood_product
0.013197005726397038 321
spain
0.0236428901553154 341
costa_rica
0.9342229962348938 415
france
0.9777390956878662 442
state_monopoly
0.04833345115184784 501
import_policy
0.009664259850978851 522
telecommunications
0.03853737562894821 588
fixing_of_prices
0.16200889647006989 606
inter-company_cooperation
0.05232146754860878 682
singapore
0.01182631403207779 710
marketing
0.01137586496770382 714
industrial_property
0.017075397074222565 766
export_policy
0.07390626519918442 806
advertising
0.9161458611488342 897
inflammable_product
0.022440925240516663 921
derogation_from_community_law
0.05164450407028198 935
c

In [0]:
with open(os.path.join('/content/drive/My Drive/NLP-Capsule/eurlex_raw_text.p'), 'rb') as fin:
        [train, test, vocab, catgy] = pickle.load(fin)

In [38]:
key_list = list(catgy.keys()) 
val_list = list(catgy.values()) 
  
print(key_list[val_list.index(300)]) 
print(key_list[val_list.index(415)]) 

wood_product
france


In [39]:
n=Y_tst[0].toarray()
n=n.reshape(-1)
for i in range(len(n)):
  if n[i]!=0:
    print(n[i],i)
    print(key_list[val_list.index(n[i])])

1 300
pollution_control_measures
1 415
pollution_control_measures
1 442
pollution_control_measures
1 995
pollution_control_measures
1 1546
pollution_control_measures
