In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime, time
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [25]:
def zeros_padding(lst, max_len=None):
    """Pad or truncate every row of ``lst`` to a fixed width using zeros.

    Args:
        lst: Sequence of rows; each row is an iterable of numeric values and
            rows may have different lengths.
        max_len: Width of the returned matrix. When omitted, the
            notebook-level ``seq_length`` is used, which is the original
            behaviour of this helper.

    Returns:
        ``np.ndarray`` of shape ``(len(lst), max_len)`` created with
        ``np.zeros``. Rows shorter than ``max_len`` are right-padded with
        zeros; longer rows are truncated to their first ``max_len`` values.
    """
    if max_len is None:
        # Fall back to the global so existing one-argument calls keep working.
        max_len = seq_length
    result = np.zeros([len(lst), max_len])
    for i, row in enumerate(lst):
        # Clip before copying so over-long rows are not walked element by element.
        clipped = list(row)[:max_len]
        if clipped:
            result[i, :len(clipped)] = clipped
    return result
    

In [26]:
def docs_to_lines(docs):
    """Flatten a collection of documents into a single list of lines.

    Args:
        docs: Iterable of documents, each itself an iterable of lines.

    Returns:
        A new list holding every line of every document, in document order.
    """
    return [line for doc in docs for line in doc]

def to_one_of_k(int_targets, num_classes):
    """One-hot encode integer class labels.

    Args:
        int_targets: Sequence of integer class indices, one per sample.
        num_classes: Number of columns in the encoded matrix.

    Returns:
        ``np.ndarray`` of shape ``(n_samples, num_classes)`` filled with
        zeros except for a 1 at ``[i, int_targets[i]]`` in every row.
    """
    n_samples = np.array(int_targets).shape[0]
    encoded = np.zeros((n_samples, num_classes))
    row_idx = range(n_samples)
    # Pair every row with its label column and switch that cell on.
    encoded[row_idx, int_targets] = 1
    return encoded

        
def get_batches(input_x, input_y, batch_size, isShuffle =  False, n_classes=None):
    """Yield ``(x, one_hot_y)`` mini-batches of exactly ``batch_size`` rows.

    Samples that do not fill a complete batch are dropped in BOTH modes.
    Previously only the unshuffled path truncated to a whole number of
    batches, so shuffled epochs ended with a smaller, ragged batch.

    Args:
        input_x: Array-like of samples, indexable along its first axis.
        input_y: Array-like of integer class labels aligned with ``input_x``.
        batch_size: Number of samples per yielded batch.
        isShuffle: When true, samples are drawn in a fresh random order
            (``np.random.permutation``) before batching.
        n_classes: Width of the one-hot labels. Defaults to the
            notebook-level ``num_classes`` when omitted.

    Yields:
        Tuples ``(x, yy)`` where ``x`` is a slice of ``batch_size`` samples
        and ``yy`` is the matching one-hot label matrix produced by
        ``to_one_of_k``.
    """
    if n_classes is None:
        n_classes = num_classes
    n_batches = len(input_x) // batch_size
    train_size = n_batches * batch_size

    # np.asarray lets plain lists be fancy-indexed by the permutation below.
    features = np.asarray(input_x)
    labels = np.asarray(input_y)
    if isShuffle:
        # Shuffle over the full data set, then keep only whole batches, so the
        # dropped remainder differs from epoch to epoch.
        order = np.random.permutation(len(features))[:train_size]
        features = features[order]
        labels = labels[order]
    else:
        features = features[:train_size]
        labels = labels[:train_size]

    for start in range(0, train_size, batch_size):
        stop = start + batch_size
        x = features[start:stop]
        yy = to_one_of_k(labels[start:stop].astype(np.int32), n_classes)
        yield x, yy

### Embeddings and hyperparameters

In [20]:
# Base embedding matrix loaded from disk; it is later concatenated column-wise
# (axis=1) with each PubMed embedding before PCA.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
w2v_google_50d = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_google_50d.npy')

In [31]:
# Registry of PubMed embedding matrices keyed by variant name. The training
# loop trains one model per key, so only the variants loaded here are run.
embedding_pubmed = {}
# Variants below are currently disabled; uncomment a line to include it in the run.
# embedding_pubmed['retrain_2m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_2m.npy')
# embedding_pubmed['retrain_1m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_1m.npy')
# embedding_pubmed['retrain_05m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_05m.npy')
# embedding_pubmed['retrain_01m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_01m.npy')
# embedding_pubmed['retrain_005m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_005m.npy')
# embedding_pubmed['retrain_001m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_001m.npy')

# The only variant active in this run.
embedding_pubmed['retrain_3m'] = np.load('/Users/zhang/MscProject_tweak2vec/word2vecModel/pubmed/w2v_retrain_3m.npy')


In [21]:
from sklearn.decomposition import PCA

# For every loaded PubMed embedding, glue it column-wise onto the base
# embedding and project the result down to 50 dimensions with PCA.
# The reduced matrices are stored under 'concat_<variant>' keys.
pca = PCA(n_components=50)
embedding_pubmed_concat = {}
for key, pubmed_vec in embedding_pubmed.items():
    key_str = 'concat_' + key
    concat_vec = np.concatenate((w2v_google_50d, pubmed_vec), axis=1)
    # fit_transform refits the PCA on each concatenated matrix.
    reduced = pca.fit_transform(concat_vec)
    embedding_pubmed_concat[key_str] = reduced
    print(key_str, reduced.shape)

concat_pivotsfull_3m (27188, 50)
concat_pivotsfull_2m (27188, 50)
concat_pivotsfull_1m (27188, 50)
concat_pivotsfull_05m (27188, 50)
concat_pivotsfull_01m (27188, 50)
concat_pivotsfull_005m (27188, 50)
concat_pivotsfull_001m (27188, 50)


In [23]:
# Hyperparameters shared by the graph definition and the training loop.
embedding_dim = 50   # width of each embedding row fed to the graph
seq_length = 100     # padded/truncated length of every input line
num_classes = 5      # width of the one-hot labels and of the output layer
num_filters = 256  # number of kernels
kernel_size = 5      # convolution window width
# Take the vocabulary size from whichever embedding is actually loaded rather
# than from a hard-coded key: 'retrain_2m' is not loaded when its line is
# commented out, which made this cell fail with KeyError on a fresh run.
# All variants are expected to share the same number of rows.
vocab_size = len(next(iter(embedding_pubmed.values())))

hidden_dim = 128     # units in the fully connected layer

keep_prob_rate = 0.75  # dropout keep probability used while training
learning_rate = 1e-3

batch_size = 100
num_epoch = 8

# NOTE(review): the two values below are not referenced by the visible cells.
print_per_batch = 100
save_per_batch = 10

### import data

In [27]:
# Locations of the pre-processed PubMed splits.
# NOTE(review): absolute, user-specific paths — consider a configurable data directory.
file_train_x = '/Users/zhang/MscProject_tweak2vec/corpus/pubmed_train_x.npy'
file_train_y = '/Users/zhang/MscProject_tweak2vec/corpus/pubmed_train_y.npy'
file_val_x = '/Users/zhang/MscProject_tweak2vec/corpus/pubmed_dev_x.npy'
file_val_y = '/Users/zhang/MscProject_tweak2vec/corpus/pubmed_dev_y.npy'
file_test_x = '/Users/zhang/MscProject_tweak2vec/corpus/pubmed_test_x.npy'
file_test_y = '/Users/zhang/MscProject_tweak2vec/corpus/pubmed_test_y.npy'

# Each file holds documents made of lines; documents are flattened so that
# every line becomes one sample, then inputs are padded to a fixed width.
# train_y stays as integer labels because get_batches one-hot encodes each
# batch; validation/test labels are encoded once here. The label width uses
# num_classes (previously a literal 5) so it always matches the graph's
# input_y placeholder.
train_x = zeros_padding( docs_to_lines( np.load(file_train_x).tolist() ) )
train_y = docs_to_lines( np.load(file_train_y).tolist() )
val_x = zeros_padding( docs_to_lines( np.load(file_val_x).tolist() ) )
val_y = to_one_of_k(docs_to_lines( np.load(file_val_y).tolist() ), num_classes)
test_x = zeros_padding( docs_to_lines( np.load(file_test_x).tolist() ) )
test_y = to_one_of_k(docs_to_lines( np.load(file_test_y).tolist() ), num_classes)

In [28]:
# Sanity check: number of padded training lines (rows of train_x).
train_x.shape[0]

180037

### build graph

In [29]:
# Static TF1 graph: embedding lookup -> two stacked 1-D convolutions ->
# global max-pool -> batch norm -> dense classifier with dropout.
train_graph = tf.Graph()
with train_graph.as_default():
    input_x = tf.placeholder(tf.int32, [None, seq_length], name='input_x')
    input_y = tf.placeholder(tf.int32, [None, num_classes], name='input_y')
    # The embedding matrix is a placeholder, so it is fed on every run and the
    # same graph can be reused for different embedding variants.
    embedding = tf.placeholder(tf.float32, [vocab_size, embedding_dim], name='embedding')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    with tf.name_scope("embedding"):
        # embedding layer
        embedding_inputs = tf.nn.embedding_lookup(embedding, input_x)
    with tf.name_scope("CNN"):
        # CNN layer
        # NOTE(review): activity_regularizer is set on both convolutions, but
        # `loss` below is built from the cross-entropy alone, so no
        # regularization term is added here — confirm whether that is intended.
        conv1 = tf.layers.conv1d(inputs=embedding_inputs, filters=num_filters, 
                                 kernel_size=kernel_size, padding="VALID", activation=tf.nn.relu,
                                 activity_regularizer=tf.contrib.layers.l2_regularizer(0.001),)
        # global maxpooling layer (`axis` replaces the deprecated `reduction_indices`)
        # NOTE(review): pool1/bn1 are not consumed by any later op in this cell;
        # they are kept so variable creation order and names stay unchanged.
        pool1 = tf.reduce_max(conv1, axis=1)
        # NOTE(review): batch_normalization is called without `training=`; per
        # the tf.layers documentation that defaults to inference mode — confirm.
        bn1 = tf.layers.batch_normalization(pool1)
        
        # The second convolution is stacked on conv1's full sequence output.
        conv2 = tf.layers.conv1d(inputs=conv1, filters=num_filters, 
                                 kernel_size=kernel_size, padding="VALID", activation=tf.nn.relu,
                                 activity_regularizer=tf.contrib.layers.l2_regularizer(0.001),)
        pool2 = tf.reduce_max(conv2, axis=1)
        bn2 = tf.layers.batch_normalization(pool2)    
        
    with tf.name_scope("classifier"):
        # fully connected layer
        fc = tf.layers.dense(bn2, hidden_dim, name='fc1')
        fc = tf.contrib.layers.dropout(fc, keep_prob)
        fc = tf.nn.relu(fc)
        # classifier
        logits = tf.layers.dense(fc, num_classes, name='fc2')
        y_pred_class = tf.argmax(tf.nn.softmax(logits), 1) 
    with tf.name_scope("optimize"):
        # Use the _v2 op recommended by the deprecation warning; the labels come
        # from a placeholder, so no gradient can flow into them either way.
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=input_y)
        loss = tf.reduce_mean(cross_entropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    with tf.name_scope("accuracy"):
        correct_pred = tf.equal(tf.argmax(input_y, 1), y_pred_class)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



### train model

In [32]:
# Train one model per loaded embedding variant. For each variant this prints
# running training metrics, evaluates on the full validation set after every
# epoch, and saves per-epoch curves plus a confusion matrix as .npy files.
log_every = 600  # number of iterations between training-progress prints
for embed_type in embedding_pubmed.keys():

    print( embed_type + " Starting training at", datetime.datetime.now())

    train_loss_lst = []
    train_acc_lst = []
    val_loss_lst = []
    val_acc_lst = []
    # Predictions of the best validation epoch; reset per variant so a stale
    # value from a previous variant can never be reused.
    y_predict = None

    with tf.Session(graph=train_graph) as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        # NOTE(review): this checkpoint is written right after initialization,
        # i.e. before any training step, and nothing is saved later — confirm
        # whether the best-validation model was meant to be saved instead.
        save_path = saver.save(sess,'/Users/zhang/MscProject_tweak2vec/Pubmed_save/'+embed_type+'_model.ckpt')

        t_loss = 0
        t_acc = 0
        v_best_acc = 0
        iteration = 0
        train_batches_size = len(train_x) // batch_size
        val_batches_size = len(val_x) // batch_size

        for e in range(1, num_epoch+1):
            # Validation is fed in one shot below, so only training is batched.
            train_batches = get_batches(train_x, train_y, batch_size, isShuffle =  True)

            # Accumulators for the per-epoch training curves.
            epoch_loss = 0.0
            epoch_acc = 0.0
            epoch_batches = 0

            start = time.time()  
            # training
            for train_inputs, train_targets in train_batches:
                iteration += 1
                feed = {input_x: train_inputs,
                        input_y: train_targets,
                        embedding: embedding_pubmed[embed_type],
                        keep_prob: keep_prob_rate}
                train_loss, train_acc, _ = sess.run([loss, accuracy, optimizer], feed_dict=feed)
                t_loss += train_loss
                t_acc += train_acc
                epoch_loss += train_loss
                epoch_acc += train_acc
                epoch_batches += 1
                if iteration % log_every == 0:
                    end = time.time()
                    print("Epoch {}/{}".format(e, num_epoch),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(t_loss / log_every),
                          "Avg. Training acc: {:.4f}".format(t_acc / log_every),
                          "{:.4f} sec/batch".format((end - start) / log_every))      
                    t_loss = 0
                    t_acc = 0
                    start = time.time()
            # Record the epoch mean instead of the last mini-batch's value,
            # which is a noisy single-batch sample and was undefined when
            # the epoch produced no batches.
            if epoch_batches:
                train_loss_lst.append(epoch_loss / epoch_batches)
                train_acc_lst.append(epoch_acc / epoch_batches)

            # validation (dropout disabled via keep_prob = 1)
            feed = {input_x: val_x,
                    input_y: val_y,
                    embedding: embedding_pubmed[embed_type],
                    keep_prob: 1}
            val_loss, val_acc, y_pred = sess.run([loss, accuracy, y_pred_class], feed_dict=feed)
            # Keep the predictions of the best epoch; the first epoch always
            # counts so y_predict is defined whenever at least one epoch ran.
            if y_predict is None or val_acc > v_best_acc:
                v_best_acc = val_acc
                y_predict = y_pred

            end = time.time()
            print("Epoch {}/{}".format(e, num_epoch),
                  "Avg. Val. loss: {:.4f}".format(val_loss),
                  "Avg. Val. acc: {:.4f}".format(val_acc),
                  "{:.4f} sec".format(end - start),
                  "--------------------------------")
            val_loss_lst.append(val_loss)
            val_acc_lst.append(val_acc)
        # Confusion matrix of the best-validation-epoch predictions against the
        # integer validation labels (recovered from the one-hot val_y).
        y_label = np.argmax(val_y, axis=1)
        confusion_mat = tf.confusion_matrix(y_label, y_predict, num_classes)
        confusion = sess.run(confusion_mat)


    print("Finish at", datetime.datetime.now())

    np.save(embed_type+'_train_acc.npy',np.array(train_acc_lst))
    np.save(embed_type+'_train_loss.npy',np.array(train_loss_lst))
    np.save(embed_type+'_val_acc.npy',np.array(val_acc_lst))
    np.save(embed_type+'_val_loss.npy',np.array(val_loss_lst))
    np.save(embed_type+'_confusion.npy',confusion)

retrain_3m Starting training at 2018-08-02 11:09:11.393276
Epoch 1/8 Iteration: 600 Avg. Training loss: 0.7569 Avg. Training acc: 0.7063 0.2056 sec/batch
Epoch 1/8 Iteration: 1200 Avg. Training loss: 0.6325 Avg. Training acc: 0.7609 0.2005 sec/batch
Epoch 1/8 Iteration: 1800 Avg. Training loss: 0.5985 Avg. Training acc: 0.7758 0.2004 sec/batch
Epoch 1/8 Avg. Val. loss: 0.5711 Avg. Val. acc: 0.7853 30.3510 sec --------------------------------
Epoch 2/8 Iteration: 2400 Avg. Training loss: 0.5529 Avg. Training acc: 0.7942 0.1963 sec/batch
Epoch 2/8 Iteration: 3000 Avg. Training loss: 0.5504 Avg. Training acc: 0.7949 0.1960 sec/batch
Epoch 2/8 Iteration: 3600 Avg. Training loss: 0.5517 Avg. Training acc: 0.7945 0.1956 sec/batch
Epoch 2/8 Avg. Val. loss: 0.5532 Avg. Val. acc: 0.7934 24.0549 sec --------------------------------
Epoch 3/8 Iteration: 4200 Avg. Training loss: 0.5004 Avg. Training acc: 0.8153 0.1955 sec/batch
Epoch 3/8 Iteration: 4800 Avg. Training loss: 0.5045 Avg. Training acc

In [140]:
# Confusion matrix of the validation predictions kept by the training cell
# (`y_predict`), so that cell must have been run first. Rows follow the first
# argument (true labels) and columns the second (predictions).
with tf.Session(graph=train_graph) as sess:
    y_label = docs_to_lines( np.load(file_val_y).tolist() )
    # num_classes replaces the literal 5 so the matrix size tracks the config.
    confusion_mat = tf.confusion_matrix(y_label, y_predict, num_classes)
    confusion = sess.run(confusion_mat)
    print(confusion)

[[2191  159  274   95  730]
 [ 849 1017  189   39  282]
 [ 155   32 8872  720  185]
 [  81    0  696 8529  535]
 [ 424    4  230  514 3410]]
