In [1]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# checked  Chaojie Wang 2018-8-3
"""
Created on Wed Jan 10 22:41:31 2018

@author: wangchaojie
"""

import numpy as np

# Seed NumPy's global generator so that the np.random.* draws made by this
# script are reproducible.  Merely constructing np.random.RandomState(1) and
# discarding the object (as was done before) does not seed anything.
np.random.seed(1)

# Smallest value passed to the logarithm; keeps log() away from zero/negatives.
realmin = 2.2e-10


def log_max(x):
    """Return the element-wise natural log of `x` after flooring it at `realmin`."""
    floored = np.maximum(x, realmin)
    return np.log(floored)

#====================== Load data ======================#
import cPickle

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
# Pickle data is binary, so the file is opened in "rb"; the context manager
# guarantees the handle is closed as soon as loading has finished.
with open("./TREC_3k-12-6.pkl", "rb") as data_file:
    DATA = cPickle.load(data_file)

# Unpack the fields of the loaded dictionary into module-level names.
data_vab_list          = DATA['Vocabulary']
data_vab_count_list    = DATA['Vab_count']
data_vab_length        = DATA['Vab_Size']
data_label             = DATA['Label']
data_train_list        = DATA['Train_Origin']
data_train_label       = np.array(DATA['Train_Label'])
data_train_split       = DATA['Train_Word_Split']
data_train_list_index  = DATA['Train_Word2Index']
data_test_list         = DATA['Test_Origin']
data_test_label        = np.array(DATA['Test_Label'])
data_test_split        = DATA['Test_Word_Split']
data_test_list_index   = DATA['Test_Word2Index']
# Constant stored for every observed word when the sparse batch is built.
data_value             = 10

print('Load data')

#======================= Preprocess =======================#
# Flatten all training documents into parallel 1-D arrays holding one entry
# per word occurrence:
#   batch_rows       - the word's index taken from data_train_list_index
#   batch_cols       - the word's position inside its document (0 .. len-1)
#   batch_file_index - the index of the document the word belongs to
#   batch_value      - the constant data_value
# plus per-document arrays batch_len (document length) and batch_label.
#
# The per-document pieces are collected in lists and concatenated once at the
# end; re-concatenating the growing arrays on every iteration is quadratic in
# the number of documents.

# Number of skipped documents.  Nothing is skipped here, but the value is
# subtracted when the training-set size is computed later on.
delete_count = 0

len_parts        = []
row_parts        = []
col_parts        = []
file_index_parts = []
value_parts      = []
label_parts      = []

for i in range(len(data_train_list)):

    x_single = np.reshape(data_train_list_index[i], [len(data_train_list_index[i])]).astype(np.int32)
    x_len    = x_single.shape[0]

    i_index = i - delete_count

    len_parts.append(np.array([x_len]))
    row_parts.append(x_single)
    col_parts.append(np.arange(x_len))
    file_index_parts.append(np.ones_like(x_single) * i_index)
    value_parts.append(np.ones_like(x_single) * data_value)
    label_parts.append(np.array([data_train_label[i]]))

if not row_parts:
    raise ValueError("No training documents to preprocess")

batch_len        = np.concatenate(len_parts, axis=0)
batch_rows       = np.concatenate(row_parts, axis=0)
batch_cols       = np.concatenate(col_parts, axis=0)
batch_file_index = np.concatenate(file_index_parts, axis=0)
batch_value      = np.concatenate(value_parts, axis=0)
batch_label      = np.concatenate(label_parts, axis=0)

print('Preprocess finished')

# Training-set aliases of the arrays built above.
batch_len_tr        = batch_len
batch_rows_tr       = batch_rows
batch_cols_tr       = batch_cols
batch_file_index_tr = batch_file_index
batch_value_tr      = batch_value
batch_label_tr      = batch_label

#======================= Setting =======================#
# Model dimensions and sampler schedule.
#   N_train       - number of training documents
#   K1            - number of first-layer filters
#   K1_V1, K1_V2  - height / width of one document matrix (width = longest
#                   document + 2 columns of padding)
#   K1_S3, K1_S4  - height / width of one filter
#   K1_S1, K1_S2  - height / width of one feature map (V + 1 - S)
#   Iter          - total number of iterations; 'Burinin' is the first 75% of
#                   them and 'Collection' the remainder
Setting = {
    'N_train': len(data_train_list) - delete_count,
    'K1':      32,
    'K1_V1':   DATA['Vab_Size'],
    'K1_V2':   np.max(batch_len) + 2,
    'K1_S3':   DATA['Vab_Size'],
    'K1_S4':   3,
    'Iter':    200,
}

# Entries derived from the ones above.
Setting['K1_S1'] = Setting['K1_V1'] + 1 - Setting['K1_S3']
Setting['K1_S2'] = Setting['K1_V2'] + 1 - Setting['K1_S4']

Setting['Burinin']    = 0.75 * Setting['Iter']
Setting['Collection'] = Setting['Iter'] - Setting['Burinin']

#======================= SuperParams =======================#
# Prior hyper-parameters.  Following the original annotations, gamma0/c0 are
# grouped under "r", a0/b0 under "p", e0/f0 under "c", and eta under "Phi".
SuperParams = {
    'gamma0': 0.1,
    'c0':     0.1,
    'a0':     0.1,
    'b0':     0.1,
    'e0':     0.1,
    'f0':     0.1,
    'eta':    0.05,
}

#======================= Tensorflow Initial =======================#
# Initial Graph
import tensorflow as tf

# Graph computing X_1 = conv2d_transpose(Theta_1, Phi_1), fed through
# placeholders so the current parameter values can be supplied on each run.

# Filter: H * W * Outchannel * Inchannel
phi_shape   = [Setting['K1_S3'], Setting['K1_S4'], 1, Setting['K1']]
# Input: N * H * W * Inchannel
theta_shape = [1, Setting['K1_S1'], Setting['K1_S2'], Setting['K1']]
# Output: N * H * W * Outchannel
x_shape     = [1, Setting['K1_V1'], Setting['K1_V2'], 1]

Phi_1   = tf.placeholder(tf.float32, shape=phi_shape)
Theta_1 = tf.placeholder(tf.float32, shape=theta_shape)
X_1     = tf.nn.conv2d_transpose(
    Theta_1,
    Phi_1,
    output_shape=x_shape,
    strides=[1, 1, 1, 1],
    padding='VALID',
)

# Open the session used for all later evaluations of X_1.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

print('Tensorflow initial finished')

#====================== CUDA Initial ======================#
# Note: do not run any other CUDA-using operation (such as TensorFlow) in the middle of the CUDA initialisation below!
import pycuda.curandom as curandom
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule

# Compile the CUDA kernel `Multi_Sampler`.
#
# One thread handles one entry `idx` of the flattened word arrays
# (row_index / column_index / page_index / value_index).  For a word at
# position (v1, v2) of document n the thread enumerates every combination of
# filter i, feature-map cell (k1, k2) and filter cell (k3, k4) with
# k1 + k3 == v1 and k2 + k4 == v2, weights it by W[n, i, k1, k2] * D[i, k3, k4],
# normalises the weights to sum to one (uniform if they are all zero) and
# atomically adds `value * weight` to both augmentation arrays.
#
# Arguments:
#   para               int[6]: K1, K1_K1, K1_K2, K1_K3, K1_K4, word_total
#   word_aug_stack,
#   MultRate_stack     per-thread scratch; thread idx uses the K1 * K1_K4
#                      floats starting at idx * K1_K4 * K1
#   Params_W1_nk1      W, indexed row-major as [n][i][k1][k2]
#   Params_D1_k1       D, indexed row-major as [i][k3][k4]
#   Params_W1_nk1_Aug,
#   Params_D1_k1_Aug   accumulators with the same layouts as W and D
#
# NOTE(review): the scratch slice only has room for K1 * K1_K4 entries per
# word, so the number of valid k1 values per word must be 1 (which holds when
# K1_K1 == 1) -- confirm before using other dimensions.
#
# The valid k1/k2 ranges are contiguous, so the indices are computed
# arithmetically instead of being stored in per-thread `new int[]` arrays:
# device-side `new` is slow and returns NULL when the device heap is
# exhausted, which was never checked.
mod = SourceModule("""

#include <stdio.h>
__global__ void Multi_Sampler(int* para, float *word_aug_stack, float *MultRate_stack, int *row_index, int *column_index, int *page_index, float *value_index, float *Params_W1_nk1, float *Params_D1_k1, float *Params_W1_nk1_Aug, float *Params_D1_k1_Aug)
{
    int K1         = para[0];
    int K1_K1      = para[1];
    int K1_K2      = para[2];
    int K1_K3      = para[3];
    int K1_K4      = para[4];
    int word_total = para[5];

    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int idx = iy * blockDim.x * gridDim.x + ix;

    if ((idx < word_total))
    {
        int v1 = row_index[idx];                 // row_index
        int v2 = column_index[idx];              // col_index
        int n  = page_index[idx];                // file_index
        float value = value_index[idx];

        // Valid k1 range: 0 <= k1 <= K1_K1 - 1 and 0 <= v1 - k1 <= K1_K3 - 1.
        int word_k1_min = ((v1 - K1_K3 + 1) > 0) ? (v1 - K1_K3 + 1) : 0;
        int word_k1_max = (v1 > K1_K1 - 1) ? (K1_K1 - 1) : v1;
        int l_word_k1   = word_k1_max - word_k1_min + 1;

        // Valid k2 range: 0 <= k2 <= K1_K2 - 1 and 0 <= v2 - k2 <= K1_K4 - 1.
        int word_k2_min = ((v2 - K1_K4 + 1) > 0) ? (v2 - K1_K4 + 1) : 0;
        int word_k2_max = (v2 > K1_K2 - 1) ? (K1_K2 - 1) : v2;
        int l_word_k2   = word_k2_max - word_k2_min + 1;

        float MultRate_sum = 0;
        int stack_start = idx * K1_K4 * K1;

        // Pass 1: unnormalised weight of every (i, k1, k2) combination.
        for (int i = 0; i < K1; i++)
        {
            for (int k = 0; k < (l_word_k1); k++)
            {
                for (int j = 0; j < (l_word_k2); j++)
                {
                    int k1 = word_k1_min + k;
                    int k2 = word_k2_min + j;
                    int k3 = v1 - k1;
                    int k4 = v2 - k2;

                    int temp_a = (n) * K1 * K1_K1 * K1_K2 + (i) * K1_K1 * K1_K2 + k1 * K1_K2 + (k2);
                    int temp_b = (i) * K1_K3 * K1_K4 + k3 * K1_K4 + (k4);
                    int temp_c = stack_start + i*l_word_k1*l_word_k2 + k*l_word_k2 + j;

                    MultRate_stack[temp_c] = Params_W1_nk1[temp_a] * Params_D1_k1[temp_b];
                    MultRate_sum = MultRate_sum + MultRate_stack[temp_c];
                }
            }
        }

        // When every weight is zero the value is spread uniformly.
        bool all_zero = (MultRate_sum == 0);

        // Pass 2: normalise and scatter `value` into both accumulators.
        for (int i = 0; i < K1; i++)
        {
            for (int k = 0; k < (l_word_k1); k++)
            {
                for (int j = 0; j < (l_word_k2); j++)
                {
                    int k1 = word_k1_min + k;
                    int k2 = word_k2_min + j;
                    int k3 = v1 - k1;
                    int k4 = v2 - k2;

                    int temp_a = (n) * K1 * K1_K1 * K1_K2 + (i) * K1_K1 * K1_K2 + k1 * K1_K2 + (k2);
                    int temp_b = (i) * K1_K3 * K1_K4 + k3 * K1_K4 + (k4);
                    int temp_c = stack_start + i*l_word_k1*l_word_k2 + k*l_word_k2 + j;

                    if (all_zero)
                        MultRate_stack[temp_c] = 1.0 / (K1 * l_word_k1 * l_word_k2);
                    else
                        MultRate_stack[temp_c] = MultRate_stack[temp_c] / MultRate_sum;

                    word_aug_stack[temp_c] = MultRate_stack[temp_c] * value;

                    atomicAdd(&Params_W1_nk1_Aug[temp_a], word_aug_stack[temp_c]);
                    atomicAdd(&Params_D1_k1_Aug[temp_b], word_aug_stack[temp_c]);
                }
            }
        }
    }

}
 """)
print("CUDA initial finish")

Load data
Preprocess finished
Couldn't import dot_parser, loading of dot files will not be possible.
Tensorflow initial finished
CUDA initial finish


In [2]:
#======================= Initial Params =======================#
import PGBN_sampler 
from scipy.special import gamma
Params = {}

# D: one K1_S3 x K1_S4 filter per k1, each normalised to sum to one.
Params['D1_k1'] = np.random.rand(Setting['K1'], Setting['K1_S3'], Setting['K1_S4'])
for k1 in range(Setting['K1']):
    Params['D1_k1'][k1, :, :] = Params['D1_k1'][k1, :, :] / np.sum(Params['D1_k1'][k1, :, :])
# W: one K1_S1 x K1_S2 feature map per (document, k1); "Pooling" sums each map.
Params['W1_nk1'] = np.random.rand(Setting['N_train'], Setting['K1'], Setting['K1_S1'], Setting['K1_S2'])
Params['W1_nk1_Pooling'] = np.sum(np.sum(Params['W1_nk1'], axis=3), axis=2)

Params['c2_n']   = 1 * np.ones([Setting['N_train']])
Params['p2_n']   = 1 / (1 + Params['c2_n'])

Params['Gamma']  = np.ones([Setting['K1'], 1]) / Setting['K1']

# Collection
W_train = np.zeros([Setting['N_train'], Setting['K1']])

# CUDA function
fuc = mod.get_function("Multi_Sampler")

import time
Iter_time = []   # cumulative wall-clock time after each iteration
Iter_lh   = []   # average log-likelihood, recorded every 50 iterations

#====================== Kernel inputs that never change ======================#
# Built once instead of on every iteration.
X_rows       = np.array(batch_rows, dtype = 'int32')
X_cols       = np.array(batch_cols, dtype = 'int32') + 1  # padding
X_file_index = np.array(batch_file_index, dtype = 'int32')
X_value      = np.array(batch_value, dtype = 'float32')

word_total     = len(X_rows)
# Scratch buffers for the kernel.  They are passed with drv.In, so nothing is
# copied back and the host copies stay zero; the kernel writes each slot
# before reading it.
word_aug_stack = np.zeros((Setting['K1']*Setting['K1_S4']*word_total),dtype=np.float32)
MultRate_stack = np.zeros((Setting['K1']*Setting['K1_S4']*word_total),dtype=np.float32)
Batch_Para     = np.array([Setting['K1'], Setting['K1_S1'], Setting['K1_S2'], Setting['K1_S3'], Setting['K1_S4'], word_total], dtype=np.int32)

# Launch geometry: one thread per word.  `//` keeps grid_y an integer
# regardless of the division semantics in effect.
block_x = 128
grid_x  = 128
grid_y  = word_total // (block_x * grid_x) + 1

#========================== Gibbs ==========================#
for t in range(Setting['Iter']):
    
    start_time = time.time()
    
    #========================== 1st layer Augmentation ==========================#
    W1_nk1     = np.array(Params['W1_nk1'], dtype = 'float32', order='C')
    D1_k1      = np.array(Params['D1_k1'], dtype = 'float32', order='C')
    # Accumulators must start from zero on every iteration.
    W1_nk1_Aug = np.zeros(W1_nk1.shape, dtype = 'float32', order='C')
    D1_k1_Aug  = np.zeros(D1_k1.shape, dtype = 'float32', order='C')
    
    fuc(drv.In(Batch_Para), drv.In(word_aug_stack), drv.In(MultRate_stack), drv.In(X_rows), drv.In(X_cols), drv.In(X_file_index), drv.In(X_value), drv.In(W1_nk1), drv.In(D1_k1), drv.InOut(W1_nk1_Aug), drv.InOut(D1_k1_Aug), grid =(grid_x, grid_y, 1)  ,block=(block_x,1,1))   # usually at most 512 parallel threads per block

    Params['W1_nk1_Aug'] = np.array(W1_nk1_Aug, dtype='float64') # N*K1*S1*S2
    Params['D1_k1_Aug']  = np.array(D1_k1_Aug, dtype='float64')  # K1*S3*S4
    Params['W1_nk1_Aug_Pooling'] = np.sum(np.sum(Params['W1_nk1_Aug'], axis=3), axis=2) # N*K1
    
    #====================== Parameters Update ======================#
    # Update D: smoothed, normalised augmentation counts of each filter.
    for k1 in range(Setting['K1']):
        X_k1_34 = Params['D1_k1_Aug'][k1, :, :] 
        D1_k1_s = (X_k1_34 + SuperParams['eta']) / np.sum(X_k1_34 + SuperParams['eta'])
        Params['D1_k1'][k1, :, :] = D1_k1_s

    # Update c_j,p_j
    # One gamma draw per document.  Without `size` a single scalar draw was
    # shared by all documents, making every c2_n a rescaling of the same
    # random number.
    Params['c2_n']     = np.random.gamma(SuperParams['e0'] + np.sum(Params['Gamma']), size=Setting['N_train'])
    Params['c2_n']     = Params['c2_n'] / (SuperParams['f0'] + np.sum(Params['W1_nk1_Pooling'], axis=1))
    Params['p2_n']     = 1 / (Params['c2_n'] + 1)
    
    # Update w_j
    W_k1_sn = np.random.gamma(Params['W1_nk1_Aug_Pooling'].T + Params['Gamma']) / (1 + Params['c2_n']) # K1*N
    Params['W1_nk1_Pooling'] = np.transpose(W_k1_sn)  # N*K1
    
    # Spread each pooled weight over its feature map in proportion to the
    # augmentation counts (0.0001 guards against division by zero).
    # NOTE(review): only row 0 of each feature map is updated, which covers the
    # whole map only when K1_S1 == 1 -- confirm if the dimensions change.
    for k1 in range(Setting['K1']):
        Params['W1_nk1'][:, k1, 0, :] = (Params['W1_nk1_Aug'][:,k1,0,:] / (Params['W1_nk1_Aug_Pooling'][:, k1:k1+1] + 0.0001)) * Params['W1_nk1_Pooling'][:, k1:k1+1]
 
    end_time = time.time()

    if t == 0:
        Iter_time.append(end_time - start_time)
    else:
        Iter_time.append(end_time - start_time + Iter_time[-1])
    
    print("epoch " + str(t) + " takes " + str(end_time - start_time) + " seconds")
    
    #====================== Likelihood ======================#
    if t % 50 == 0:
        
        likelyhood = 0
        start_time = time.time()
        # Dense copy of the data: N x V1 x V2, columns shifted by the padding.
        Orgin_X = np.zeros([Setting['N_train'], Setting['K1_V1'], Setting['K1_V2']])
        # A tuple of index arrays is required here; a *list* of arrays is a
        # deprecated spelling that newer NumPy no longer treats as a
        # multi-dimensional index.
        Orgin_X[batch_file_index, batch_rows, batch_cols+1] = batch_value

        # The filters are the same for every document, so reshape them once.
        Phi_tmp = np.transpose(np.reshape(Params['D1_k1'],[Setting['K1'], Setting['K1_S3'], Setting['K1_S4'], 1]),[1,2,3,0]).astype(np.float32)

        for i in range(Setting['N_train']):
            
            Theta_tmp = np.transpose(Params['W1_nk1'][i:i+1,:,:,:], [0,2,3,1])
            PhiTheta_1= sess.run(X_1, feed_dict={Phi_1:Phi_tmp, Theta_1:Theta_tmp.astype(np.float32)})

            # x * log(rate) - rate - log(Gamma(x + 1)), summed over all cells.
            likelyhood = likelyhood + np.sum(Orgin_X[i,:,:] * log_max(PhiTheta_1[0,:,:,0]) - PhiTheta_1[0,:,:,0] - log_max(gamma(Orgin_X[i,:,:] + 1)))  
        end_time = time.time()
        print("Likelihood " + str(likelyhood / Setting['N_train']) + " takes " + str(end_time - start_time) + " seconds")
        Iter_lh.append(likelyhood / Setting['N_train'])
        
print("Train phase finished")

epoch 0 takes 0.214260101318 seconds
Likelihood -518.5410512004162 takes 23.6906311512 seconds
epoch 1 takes 0.184551954269 seconds
epoch 2 takes 0.205935001373 seconds
epoch 3 takes 0.188102960587 seconds
epoch 4 takes 0.184844017029 seconds
epoch 5 takes 0.192376852036 seconds
epoch 6 takes 0.201794147491 seconds
epoch 7 takes 0.195115089417 seconds
epoch 8 takes 0.194356918335 seconds
epoch 9 takes 0.194139003754 seconds
epoch 10 takes 0.190070152283 seconds
epoch 11 takes 0.215881824493 seconds
epoch 12 takes 0.201840877533 seconds
epoch 13 takes 0.201107978821 seconds
epoch 14 takes 0.207763195038 seconds
epoch 15 takes 0.204153060913 seconds
epoch 16 takes 0.205536842346 seconds
epoch 17 takes 0.210997104645 seconds
epoch 18 takes 0.198394060135 seconds
epoch 19 takes 0.21150302887 seconds
epoch 20 takes 0.19393491745 seconds
epoch 21 takes 0.211939811707 seconds
epoch 22 takes 0.212212085724 seconds
epoch 23 takes 0.2281229496 seconds
epoch 24 takes 0.230623960495 seconds
epoch 