# CDL

## 1. data preprocess

#### build item information matrix of citeulike-a by bag of word

In [14]:
#find vocabulary_size = 8000
with open(r"ctrsr_datasets/citeulike-a/vocabulary.dat") as vocabulary_file:
    vocabulary_size = len(vocabulary_file.readlines())
    
#find item_size = 16980
with open(r"ctrsr_datasets/citeulike-a/mult.dat") as item_info_file:
    item_size = len(item_info_file.readlines())

#initialize item_infomation_matrix (16980 , 8000)
import numpy as np
item_infomation_matrix = np.zeros((item_size , vocabulary_size))

#build item_infomation_matrix
with open(r"ctrsr_datasets/citeulike-a/mult.dat") as item_info_file:
    sentences = item_info_file.readlines()
    
    for index,sentence in enumerate(sentences):
        words = sentence.strip().split(" ")[1:]
        for word in words:
            vocabulary_index , number = word.split(":")
            item_infomation_matrix[index][int(vocabulary_index)] =number
        

#### build rating matrix citeulike-a

In [15]:
#find user_size = 5551
with open(r"ctrsr_datasets/citeulike-a/users.dat") as rating_file:
    user_size = len(rating_file.readlines())

#initialize rating_matrix (5551 , 16980)
import numpy as np
rating_matrix = np.zeros((user_size , item_size))

#build rating_matrix
with open(r"ctrsr_datasets/citeulike-a/users.dat") as rating_file:
    lines = rating_file.readlines()
    for index,line in enumerate(lines):
        items = line.strip().split(" ")
        for item in items:  
            rating_matrix[index][int(item)] = 1

#### save matrix by pickle

In [16]:
import pickle

In [17]:
with open(r'item_infomation_matrix.pickle', 'wb') as handle:
    pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(r'rating_matrix.pickle', 'wb') as handle:
    pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SDAE

In [87]:
import tensorflow as tf
import pickle
import numpy as np

In [88]:
#init random seed
np.random.seed(5)

#### load matrix from pickle 

In [89]:
with open(r'item_infomation_matrix.pickle', 'rb') as handle:
    item_infomation_matrix = pickle.load(handle)  
    
with open(r'rating_matrix.pickle', 'rb') as handle2:
    rating_matrix = pickle.load(handle2)

#### autoencoder model

In [180]:
n_input = 8000
n_hidden1 = 200
n_hidden2 = 50

lambda_w = 1
lambda_j = 1

learning_rate = 0.001

In [181]:
X_0 = tf.placeholder(tf.float32 , shape=(None , n_input))
drop_ratio = tf.placeholder(tf.float32 )
X_c = tf.placeholder(tf.float32 , shape=(None , n_input))

In [182]:
def encoder(x , drop_ratio):
    w_1 = tf.Variable(tf.random_normal( [n_input , n_hidden1] , mean=0.0, stddev=1 / lambda_w ))
    b_1 = tf.Variable(tf.random_normal( [n_hidden1] , mean=0.0, stddev=1 / lambda_w ))
    L_1 = tf.nn.sigmoid(tf.matmul(x,w_1) + b_1)
    L_1 = tf.nn.dropout(L_1 , keep_prob= 1 - drop_ratio)
    
    w_2 = tf.Variable(tf.random_normal( [n_hidden1 , n_hidden2] , mean=0.0, stddev=1 / lambda_w ))
    b_2 = tf.Variable(tf.random_normal( [n_hidden2] , mean=0.0, stddev=1 / lambda_w ))
    L_2 = tf.nn.sigmoid(tf.matmul(L_1,w_2) + b_2)
    L_2 = tf.nn.dropout(L_2 , keep_prob= 1 - drop_ratio)
    
    return L_2

def decoder(x , drop_ratio):
    w_1 = tf.Variable(tf.random_normal( [n_hidden2 , n_hidden1] , mean=0.0, stddev=1 / lambda_w ))
    b_1 = tf.Variable(tf.random_normal( [n_hidden1] , mean=0.0, stddev=1 / lambda_w ))
    L_1 = tf.nn.sigmoid(tf.matmul(x,w_1) + b_1)
    L_1 = tf.nn.dropout(L_1 , keep_prob= 1 - drop_ratio)
    
    w_2 = tf.Variable(tf.random_normal( [n_hidden1 , n_input] , mean=0.0, stddev=1 / lambda_w ))
    b_2 = tf.Variable(tf.random_normal( [n_input] , mean=0.0, stddev=1 / lambda_w ))
    L_2 = tf.nn.sigmoid(tf.matmul(L_1,w_2) + b_2)
    L_2 = tf.nn.dropout(L_2 , keep_prob= 1 - drop_ratio)
    
    return L_2

def stacked_autoencoder(x , drop_ratio):
    encoder_output = encoder(x , drop_ratio)
    decoder_output = decoder(encoder_output , drop_ratio)
    #decoder_output = tf.Variable(tf.random_normal( shape=(n_input) , mean=decoder_output, stddev=1 / lambda_j ))
    return decoder_output

In [183]:
y_pred = stacked_autoencoder(X_0 , drop_ratio)

In [184]:
loss = tf.reduce_mean( tf.pow( X_c - y_pred, 2 ) )

In [185]:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

#### masking noise 

In [186]:
def mask(corruption_level ,size):
    mask = np.random.binomial(1, 1 - corruption_level, [size[0],size[1]])
    return mask

def add_noise(x , corruption_level ):
    x = x * mask(corruption_level , x.shape)
    return x

#### trainging SDAE

In [187]:
np.random.shuffle(item_infomation_matrix) #random index of train data

item_infomation_matrix_noise = add_noise(item_infomation_matrix , 0.3)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
epochs = 100
batch_size = 32
for epoch in range(epochs):
    print("%d / %d"%(epoch+1 , epochs))
    
    for i in range(0 , item_infomation_matrix.shape[0] , batch_size):
        X_train_batch = item_infomation_matrix_noise[i:i+batch_size]
        y_train_batch = item_infomation_matrix[i:i+batch_size]
        
        _ , my_loss = sess.run([optimizer, loss] , feed_dict={X_0 :X_train_batch , X_c : y_train_batch , drop_ratio : 0.1})
    print(my_loss)
    

1 / 100
0.0408685
2 / 100
0.03735322
3 / 100
0.036900964
4 / 100
0.03623298
5 / 100
0.036303572
6 / 100
0.035621922
7 / 100
0.035176165
8 / 100
0.034799315
9 / 100
0.03430466
10 / 100
0.034051135
11 / 100
0.033710763
12 / 100
0.033708513
13 / 100
0.033184234
14 / 100
0.032846563
15 / 100
0.032619923
16 / 100
0.03191311
17 / 100
0.031550966
18 / 100
0.030927423
19 / 100
0.030331863
20 / 100
0.029995788
21 / 100
0.029391833
22 / 100
0.028713318
23 / 100
0.028577851
24 / 100
0.028025744
25 / 100
0.027895307
26 / 100
0.027624547
27 / 100
0.027608208
28 / 100
0.027472727
29 / 100
0.0272853
30 / 100
0.02730599
31 / 100
0.027276788
32 / 100
0.02719413
33 / 100
0.027098432
34 / 100
0.026884759
35 / 100
0.026618829
36 / 100
0.02652392
37 / 100
0.026382416
38 / 100
0.02623517
39 / 100
0.02613476
40 / 100
0.026049793
41 / 100
0.02596328
42 / 100
0.025791943
43 / 100
0.025933292
44 / 100
0.0257954
45 / 100
0.025714237
46 / 100
0.025537478
47 / 100
0.025409702
48 / 100
0.02532146
49 / 100
0.0254084