# Collaborative Deep Learning (CDL) with TensorFlow

### Import modules

In [1]:
import numpy as np
import pickle
import tensorflow as tf
import time
#init random seed
np.random.seed(5)

  from ._conv import register_converters as _register_converters


## 1. data preprocess

#### Build the item information matrix of citeulike-a as a bag of words

In [2]:
# Build the bag-of-words item matrix: one row per line of mult.dat, one
# column per line of vocabulary.dat.

# vocabulary_size = 8000 for citeulike-a (one vocabulary entry per line)
with open(r"ctrsr_datasets/citeulike-a/vocabulary.dat") as vocabulary_file:
    vocabulary_size = sum(1 for _ in vocabulary_file)

# Read mult.dat a single time; the same lines give both the item count
# (item_size = 16980 for citeulike-a) and the matrix contents.
with open(r"ctrsr_datasets/citeulike-a/mult.dat") as item_info_file:
    sentences = item_info_file.readlines()
item_size = len(sentences)

# item_infomation_matrix has shape (item_size, vocabulary_size) = (16980, 8000)
item_infomation_matrix = np.zeros((item_size, vocabulary_size))

for index, sentence in enumerate(sentences):
    # The first token of every line is skipped (presumably a per-item word
    # count -- confirm against the dataset description); each remaining token
    # is "vocabulary_index:number".
    # split() without an argument tolerates repeated spaces and blank lines,
    # which split(" ") turned into empty tokens that failed the unpack below.
    for word in sentence.split()[1:]:
        vocabulary_index, number = word.split(":")
        item_infomation_matrix[index, int(vocabulary_index)] = float(number)

#### Build the rating matrix of citeulike-a

In [3]:
# Build the binary user x item rating matrix from users.dat.

# Read users.dat a single time; one line per user
# (user_size = 5551 for citeulike-a).
with open(r"ctrsr_datasets/citeulike-a/users.dat") as rating_file:
    lines = rating_file.readlines()
user_size = len(lines)

#initialize rating_matrix (5551 , 16980)
import numpy as np
rating_matrix = np.zeros((user_size , item_size))

for index, line in enumerate(lines):
    # NOTE(review): every token on the line -- including the first -- is used
    # as an item id, whereas the mult.dat parsing skips its first token.
    # Confirm that users.dat has no leading count field.
    # split() without an argument keeps a blank line from reaching int("").
    for item in line.split():
        rating_matrix[index, int(item)] = 1

#### save matrix by pickle

In [4]:
# Write both matrices to pickle files in the working directory so later
# runs can skip the text parsing.
for _pickle_path, _matrix in (
        (r'item_infomation_matrix.pickle', item_infomation_matrix),
        (r'rating_matrix.pickle', rating_matrix)):
    with open(_pickle_path, 'wb') as handle:
        pickle.dump(_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### load matrix from pickle 

In [5]:
# Restore the item information matrix from its pickle file.
with open(r'item_infomation_matrix.pickle', 'rb') as item_pickle:
    item_infomation_matrix = pickle.load(item_pickle)

# Restore the rating matrix from its pickle file.
with open(r'rating_matrix.pickle', 'rb') as rating_pickle:
    rating_matrix = pickle.load(rating_pickle)

## 2. build model

#### masking noise 

In [6]:
# SDAE input corruption: the autoencoder is trained to reconstruct the item
# information from a copy to which masking noise has been applied.
def mask(corruption_level ,size):
    """Draw a 0/1 masking-noise array.

    Each entry is an independent Bernoulli draw from ``np.random`` that is 1
    (keep) with probability ``1 - corruption_level`` and 0 (drop) otherwise.

    corruption_level: probability of an entry being zeroed, in [0, 1].
    size: output shape, passed straight to ``np.random.binomial`` (an int or
        a sequence of ints of any length; previously only the first two
        entries were used, so 1-D shapes raised IndexError).

    Returns an integer ndarray of the requested shape.
    """
    return np.random.binomial(1, 1 - corruption_level, size)

def add_noise(x , corruption_level ):
    """Return ``x`` multiplied element-wise by a fresh masking-noise array.

    The noise array comes from ``mask(corruption_level, x.shape)``. The
    product is a new object; the ``x`` passed in is not modified.
    """
    keep = mask(corruption_level , x.shape)
    return x * keep

In [7]:
class CDL():
    """CDL recommender written against the TensorFlow 1.x graph API.

    Item content vectors go through a two-layer sigmoid encoder and a
    two-layer sigmoid decoder (the SDAE part). The encoder output is tied to
    a per-item latent matrix ``V`` which, together with a per-user latent
    matrix ``U``, reconstructs the rating matrix under per-entry confidence
    weights.

    Usage: ``CDL(R, X)``, then ``build_model()``, then ``train_model()``.

    NOTE: the constructor modifies ``rating_matrix`` in place (only ``P``
    sampled positives per user are kept), so pass a copy when the full
    matrix is still needed afterwards.
    """

    def __init__(self , rating_matrix , item_infomation_matrix):
        """Store hyper-parameters, create the SDAE variables and the targets.

        rating_matrix: 2-D array, users x items; entries > 0 are positives.
            Mutated in place (see class docstring).
        item_infomation_matrix: 2-D array, items x content features.
        """
        # model hyper-parameters
        self.n_input = item_infomation_matrix.shape[1]
        self.n_hidden1 = 200
        self.n_hidden2 = 50
        # latent dimension of U and V; build_model subtracts the encoder
        # output (width n_hidden2) from rows of V, so k must equal n_hidden2
        self.k = 50

        self.lambda_w = 0.1   # weight of the SDAE weight/bias L2 term
        self.lambda_n = 10    # weight of the SDAE reconstruction term
        self.lambda_u = 1     # weight of the L2 term on U
        self.lambda_v = 10    # weight of the term tying V to the encoder output

        # Dropout rate. build_model replaces ``self.drop_ratio`` with a
        # placeholder, so the numeric value is also kept in
        # ``self.drop_ratio_value``, which is what train_model feeds.
        self.drop_ratio = 0.1
        self.drop_ratio_value = self.drop_ratio
        # probability of zeroing an input entry when building the noisy
        # SDAE input in train_model
        self.corruption_level = 0.3
        self.learning_rate = 0.01
        self.epochs = 200
        self.batch_size = 256

        self.a = 1      # confidence for entries with a positive rating
        self.b =0.01    # confidence for all other entries
        self.P = 1      # number of positives sampled per user for training

        self.num_u = rating_matrix.shape[0]
        self.num_v = rating_matrix.shape[1]

        self.Weights = {
            'w1' : tf.Variable(tf.truncated_normal( [self.n_input , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),
            'w2' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),
            'w3' : tf.Variable(tf.truncated_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev= tf.truediv(1.0,self.lambda_w))),
            'w4' : tf.Variable(tf.truncated_normal( [self.n_hidden1 , self.n_input] , mean=0.0,  stddev= tf.truediv(1.0,self.lambda_w)))
        }
        self.Biases = {
            'b1' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),
            'b2' : tf.Variable( tf.zeros(shape=self.n_hidden2) ),
            'b3' : tf.Variable( tf.zeros(shape=self.n_hidden1) ),
            'b4' : tf.Variable( tf.zeros(shape=self.n_input) ),
        }

        self.item_infomation_matrix = item_infomation_matrix

        # Same object as the argument: the sampling below edits it in place.
        self.rating_matrix = rating_matrix
        self._keep_p_ratings_per_user(self.rating_matrix, self.P)

        self.confidence = self._build_confidence(self.rating_matrix, self.a, self.b)

    @staticmethod
    def _keep_p_ratings_per_user(rating_matrix, p):
        """Reduce each row, in place, to ``p`` sampled positive entries.

        For every row the column indices with a value > 0 form the pool;
        ``p`` indices are drawn from it with ``np.random.choice`` (its
        default, i.e. with replacement), the row is zeroed and the drawn
        columns are set to 1. Rows without any positive entry are left
        untouched: ``np.random.choice`` raises ValueError on an empty pool.

        Returns the same ``rating_matrix`` object.
        """
        for i in range(rating_matrix.shape[0]):
            positives = np.where(rating_matrix[i, :] > 0)[0]
            if positives.size == 0:
                continue
            kept = np.random.choice(positives, p)
            rating_matrix[i, :].fill(0)
            rating_matrix[i, kept] = 1
        return rating_matrix

    @staticmethod
    def _build_confidence(rating_matrix, a, b):
        """Return an ndarray shaped like ``rating_matrix``: ``a`` where the
        rating is > 0 and ``b`` everywhere else."""
        return np.where(rating_matrix > 0, float(a), float(b))

    def encoder(self , x , drop_ratio):
        """Map ``x`` through two sigmoid layers (w1/b1, then w2/b2), applying
        dropout with keep probability ``1 - drop_ratio`` after each layer."""
        w1 = self.Weights['w1']
        b1 = self.Biases['b1']
        L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )
        L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )

        w2 = self.Weights['w2']
        b2 = self.Biases['b2']
        L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )
        L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)

        return L2

    def decoder(self , x , drop_ratio):
        """Map ``x`` through two sigmoid layers (w3/b3, then w4/b4), applying
        dropout with keep probability ``1 - drop_ratio`` after each layer,
        the output layer included."""
        w3 = self.Weights['w3']
        b3 = self.Biases['b3']
        L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)
        L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)

        w4 = self.Weights['w4']
        b4 = self.Biases['b4']
        L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)
        L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)

        return L4

    # Disabled variant kept for reference: a graph with only the U
    # regulariser and the confidence-weighted rating term (no SDAE terms).
#     def only_MF(self):
#         self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )
#         self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )
#         self.drop_ratio = tf.placeholder(tf.float32)
#         self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )

#         batch_size = tf.cast(tf.shape(self.R)[1], tf.int32)


#         self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) )
#         self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )

#         batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])

#         loss_1 = self.lambda_u * tf.nn.l2_loss( self.U )
#         loss_2 = tf.reduce_sum(tf.multiply(self.C ,
#                                     tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True)))
#                                 )

#         self.loss = loss_1 + loss_2
#         self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def build_model(self):
        """Create placeholders, the U/V variables, the loss and the Adam step.

        Must be called before ``train_model``. After this call
        ``self.drop_ratio`` is a placeholder; the numeric rate is available
        as ``self.drop_ratio_value``.
        """
        # X_0 is the encoder input and X_c the reconstruction target;
        # train_model feeds the noisy matrix to X_0 and the clean one to X_c.
        self.X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        # Confidence and rating columns for the items of the current batch.
        self.C = tf.placeholder(tf.float32 , shape=(self.num_u,None) )
        self.R = tf.placeholder(tf.float32 , shape=(self.num_u,None) )
        # Pick up the numeric dropout rate (including a value assigned after
        # construction) before the attribute is reused for the placeholder.
        if isinstance(self.drop_ratio, (int, float, np.floating)):
            self.drop_ratio_value = float(self.drop_ratio)
        self.drop_ratio = tf.placeholder(tf.float32)
        self.model_batch_data_idx = tf.placeholder( tf.int32 , shape=None )
        #SDAE item factor
        V_sdae = self.encoder( self.X_0 , self.drop_ratio )

        #SDAE output
        sdae_output = self.decoder( V_sdae , self.drop_ratio )

        batch_size = tf.cast(tf.shape(self.X_0)[0], tf.int32)

        self.V = tf.Variable( tf.zeros(shape=[self.num_v, self.k], dtype=tf.float32 ) )
        self.U = tf.Variable( tf.zeros(shape=[self.num_u, self.k], dtype=tf.float32 ) )

        # Rows of V that belong to the items in the current batch.
        batch_V = tf.reshape(tf.gather(self.V, self.model_batch_data_idx), shape=[batch_size, self.k])

        # loss_1: L2 on U; loss_2: L2 on SDAE weights and biases;
        # loss_3: distance between V rows and the encoder output;
        # loss_4: SDAE reconstruction error; loss_5: confidence-weighted
        # squared error between R and U * batch_V^T.
        loss_1 = self.lambda_u * tf.nn.l2_loss( self.U )
        loss_2 = self.lambda_w * 1/2 * tf.reduce_sum([tf.nn.l2_loss(w)+tf.nn.l2_loss(b) for w,b in zip(self.Weights.values() , self.Biases.values())])
        loss_3 = self.lambda_v * tf.nn.l2_loss(batch_V - V_sdae)
        loss_4 = self.lambda_n * tf.nn.l2_loss(sdae_output - self.X_c)

        loss_5 = tf.reduce_sum(tf.multiply(self.C ,
                                    tf.square(self.R - tf.matmul(self.U , batch_V , transpose_b=True)))
                                )

        self.loss = loss_1 + loss_2 + loss_3 + loss_4 + loss_5
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def train_model(self):
        """Run ``self.epochs`` passes of mini-batch training over the items.

        Opens a new ``tf.Session`` stored in ``self.sess`` (it is left open),
        prints the summed batch loss once per epoch, and returns the dense
        score matrix ``U * V^T`` (users x items) as evaluated by the session.
        """
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        start_time = time.time()

        # The item order and the noisy copy of the content matrix are drawn
        # once here and reused unchanged in every epoch.
        random_idx = np.random.permutation(self.num_v)

        self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , self.corruption_level)

        for epoch in range(self.epochs):
            batch_cost = 0
            for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):

                batch_idx = random_idx[i:i+self.batch_size]
                _ , loss = self.sess.run([self.optimizer, self.loss] ,
                                            feed_dict={self.X_0 : self.item_infomation_matrix_noise[batch_idx,:] ,
                                                       self.X_c : self.item_infomation_matrix[batch_idx,:] ,
                                                       self.R : self.rating_matrix[: , batch_idx],
                                                       self.C : self.confidence[: , batch_idx],
                                                       self.drop_ratio : self.drop_ratio_value ,
                                                       self.model_batch_data_idx  : batch_idx })
                batch_cost = batch_cost + loss

            print ("Training //", "Epoch %d //" % (epoch+1), " Total cost = {:.2f}".format(batch_cost), "Elapsed time : %d sec" % (time.time() - start_time))

        return self.sess.run((tf.matmul(self.U, self.V, transpose_b=True)))

#### train model

In [8]:
# Train on a copy: CDL.__init__ rewrites the rating matrix it is given in
# place, and the full rating_matrix is still needed for evaluation.
R_train = rating_matrix.copy()
cdl = CDL(R_train , item_infomation_matrix)
cdl.build_model()
# R is the dense users x items score matrix (U times V transposed) returned
# by train_model.
R = cdl.train_model()

Training // Epoch 1 //  Total cost = 699676798.00 Elapsed time : 13 sec
Training // Epoch 2 //  Total cost = 509001361.50 Elapsed time : 23 sec
Training // Epoch 3 //  Total cost = 426090375.00 Elapsed time : 32 sec
Training // Epoch 4 //  Total cost = 402229449.00 Elapsed time : 40 sec
Training // Epoch 5 //  Total cost = 385809588.50 Elapsed time : 50 sec
Training // Epoch 6 //  Total cost = 370855234.50 Elapsed time : 59 sec
Training // Epoch 7 //  Total cost = 356779552.50 Elapsed time : 68 sec
Training // Epoch 8 //  Total cost = 343552683.50 Elapsed time : 77 sec
Training // Epoch 9 //  Total cost = 330934537.50 Elapsed time : 85 sec
Training // Epoch 10 //  Total cost = 318868322.50 Elapsed time : 94 sec
Training // Epoch 11 //  Total cost = 307315671.75 Elapsed time : 102 sec
Training // Epoch 12 //  Total cost = 296283195.75 Elapsed time : 112 sec
Training // Epoch 13 //  Total cost = 285720805.75 Elapsed time : 121 sec
Training // Epoch 14 //  Total cost = 275675537.25 Elapse

Training // Epoch 113 //  Total cost = 26229822.77 Elapsed time : 999 sec
Training // Epoch 114 //  Total cost = 25965280.67 Elapsed time : 1008 sec
Training // Epoch 115 //  Total cost = 25713353.86 Elapsed time : 1017 sec
Training // Epoch 116 //  Total cost = 25468158.66 Elapsed time : 1025 sec
Training // Epoch 117 //  Total cost = 25241022.56 Elapsed time : 1033 sec
Training // Epoch 118 //  Total cost = 25019519.48 Elapsed time : 1042 sec
Training // Epoch 119 //  Total cost = 24811663.63 Elapsed time : 1050 sec
Training // Epoch 120 //  Total cost = 24609974.30 Elapsed time : 1058 sec
Training // Epoch 121 //  Total cost = 24420480.23 Elapsed time : 1066 sec
Training // Epoch 122 //  Total cost = 24235782.49 Elapsed time : 1074 sec
Training // Epoch 123 //  Total cost = 24066398.74 Elapsed time : 1082 sec
Training // Epoch 124 //  Total cost = 23899705.29 Elapsed time : 1091 sec
Training // Epoch 125 //  Total cost = 23745025.42 Elapsed time : 1099 sec
Training // Epoch 126 //  

#### evaluation

In [9]:
# Recall@300: for every user, the fraction of that user's positive items in
# rating_matrix that appear among the 300 highest-scored items in R.
# all_cnt accumulates the per-user fractions.
# NOTE(review): rating_matrix is the full matrix, so the positives sampled
# for training are counted as hits too -- confirm this is intended.
TOP_M = 300  # length of each user's recommendation list
all_cnt = 0
for user in range(rating_matrix.shape[0]):
    true_items = np.flatnonzero(np.ravel(rating_matrix[user, :]) > 0)
    if true_items.size == 0:
        # A user without positives would divide by zero; such a user adds
        # nothing to the running total.
        continue
    scores = np.ravel(R[user, :])
    # Stable descending order: equal scores keep ascending item index, the
    # same order sorted(..., reverse=True) produced.
    top_items = np.argsort(-scores, kind="mergesort")[:TOP_M]
    cnt_hit = np.intersect1d(top_items, true_items).size
    all_cnt = all_cnt + cnt_hit / true_items.size

In [10]:
# An accuracy of about 0.085 should not be considered very low, because each
# user's list is ranked over all items (16980).
# The printed value is all_cnt divided by the number of users.
print("accuracy : %.3f"%(all_cnt/rating_matrix.shape[0]))

accuracy : 0.081
