In [147]:
import theano
import theano.tensor as T
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [118]:
def get_data(size):
    """Read a MovieLens ratings file and return its rows in random order.

    Parameters
    ----------
    size : str
        Name of the data set. Only "100k" is implemented; it is read from
        ../dat/rec/ml-100k/u.data, one whitespace-separated record per line.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per record. The last field of every
        record is dropped, and the rows are shuffled with
        np.random.permutation (global NumPy RNG).

    Raises
    ------
    ValueError
        If `size` names a data set that is not implemented.
    """
    if size != "100k":
        # Previously an unknown size fell through and returned an empty array,
        # which only failed later in the callers with an unrelated error.
        raise ValueError(
            "unsupported data set size: %r (only \"100k\" is implemented)" % (size,))

    path = "../dat/rec/ml-100k/u.data"
    print("Read movie lens 100k data set")
    ratings = []
    # `with` closes the file even if parsing a line raises.
    with open(path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line would otherwise add an empty row and break the
                # conversion to a rectangular float array below.
                continue
            # drop the last field of the record
            ratings.append(fields[0:-1])
    ratings = np.array(ratings, dtype=np.float32)
    # permute the ratings array (rows are shuffled, columns stay together)
    ratings = np.random.permutation(ratings)
    print("Loading data done")
    return ratings

In [119]:
# Load the 100k ratings once for interactive inspection; rows come back shuffled.
ratings = get_data("100k")

Read movie lens 100k data set
Loading data done


In [120]:
# Sanity check: (number of ratings, number of columns kept per rating).
ratings.shape

(100000, 3)

In [121]:
def load_split_data(data_size, test_p):
    """Load the ratings and split them randomly into a test set and a train matrix.

    Parameters
    ----------
    data_size : str
        Data set name, passed straight to get_data (of the code in this
        notebook only "100k" is implemented there). Also used as the prefix
        of the saved file names.
    test_p : float
        Fraction of the ratings, between 0 and 1, held out as the test set.

    Returns
    -------
    tuple
        (test_ratings, train_M, nb_users, nb_movies, n_train) where
        test_ratings is an array of rows `user id | item id | rating`,
        train_M is a float32 array with nb_movies rows and nb_users columns
        (missing entries are zero), and n_train is the number of training
        ratings.

    Raises
    ------
    ValueError
        If test_p is outside [0, 1].

    Side effects
    ------------
    Saves test_ratings and train_M as .npy files under ../dat/rec/ so the
    same split can be reused for further training.
    """
    if not 0.0 <= test_p <= 1.0:
        # A negative fraction would silently swap the roles of the two slices
        # below, and a fraction above 1 would leave no training data.
        raise ValueError("test_p must be between 0 and 1, got %r" % (test_p,))

    print("split data randomly")
    # Load ratings, data is already permuted in get_data
    ratings = get_data(data_size)
    # ids are 1-based, so the largest id is also the count
    nb_users = int(np.max(ratings[:, 0]))
    nb_movies = int(np.max(ratings[:, 1]))

    # split test/train set: the first test_size shuffled rows are held out
    test_size = int(len(ratings) * test_p)
    test_ratings = ratings[:test_size]
    train_ratings = ratings[test_size:]

    # train_ratings is converted into a (movie x user) matrix
    train_M = np.zeros((nb_movies, nb_users), dtype=np.float32)
    for rating in train_ratings:
        train_M[int(rating[1] - 1), int(rating[0] - 1)] = rating[2]

    # Round instead of truncating: e.g. 0.29 * 100 evaluates to
    # 28.999999999999996, which int() alone would turn into 28.
    percent = int(round(test_p * 100))
    # save test and train data in case more training is needed on this split
    prefix = "../dat/rec/" + data_size + "_" + str(percent) + "percent_"
    np.save(prefix + "test.npy", test_ratings)
    np.save(prefix + "trainM.npy", train_M)

    return test_ratings, train_M, nb_users, nb_movies, len(train_ratings)


In [122]:
# def cal_RMSE(prediction_M, test_ratings):
#     RMSE = 0
#     for rating in test_ratings:
#         RMSE += (rating[2] - prediction_M[int(rating[1] - 1), int(rating[0] - 1)])**2
#     RMSE = math.sqrt(RMSE / len(test_ratings))
#     return RMSE

In [126]:
# --- data ---------------------------------------------------------------
data_size = "100k"      # which data set load_split_data should read
test_p = 0.1            # fraction of ratings held out as the test set

# --- model --------------------------------------------------------------
nb_hunits = 10          # number of hidden units in the autoencoder
lambda_reg = 0.001      # weight of the squared-norm penalty on V and W

# --- optimisation -------------------------------------------------------
learningrate = 0.01     # step size of the gradient updates
nb_epoch = 10           # number of passes over all movies

# k receives the number of training ratings returned by load_split_data.
(test_ratings, train_M, nb_users, nb_movies, k) = load_split_data(data_size, test_p)

split data randomly
Read movie lens 100k data set
Loading data done


In [139]:
# Buffer for the reconstructed ratings: one row per movie, one column per user,
# same layout as train_M. train_auto() writes into it row by row.
prediction_M = np.zeros((nb_movies, nb_users), dtype = np.float32) # 1682 x 943 per the original note
# One slot per epoch; nothing in the visible code writes to it, because the
# RMSE call inside train_auto() is commented out.
RMSE_list = [0] * nb_epoch # 1 x 10

# Symbolic autoencoder graph: inputs, parameters, forward pass, loss, gradients.
# The compiled update function itself is built inside train_auto().
# X: one movie's rating vector (train_auto passes a row of train_M, length nb_users).
X = T.dvector("input")
# X_observed: mask with the same length as X; train_auto sets it to 1 where a rating exists.
X_observed = T.dvector("observedIndex")
# update_matrix: the mask tiled to V's shape (nb_hunits x nb_users) by train_auto.
update_matrix = T.matrix("updateIndex") # only 0 or 1
# Encoder weights V (nb_hunits x nb_users) and bias miu (nb_hunits);
# V starts from standard-normal draws, miu from zeros.
V = theano.shared(np.random.randn(nb_hunits, nb_users), name='V')
miu = theano.shared(np.zeros(nb_hunits), name='miu')
# Decoder weights W (nb_users x nb_hunits) and bias b (nb_users), initialised the same way.
W = theano.shared(np.random.randn(nb_users, nb_hunits), name='W')
b = theano.shared(np.zeros(nb_users), name='b')
# Forward pass: sigmoid hidden layer, then a linear reconstruction of the input.
z1 = T.nnet.sigmoid(V.dot(X) + miu)
z2 = W.dot(z1) + b
# Squared-norm penalty on both weight matrices, scaled by lambda_reg / (2 * nb_movies).
loss_reg = 1.0/nb_movies * lambda_reg/2 * (T.sum(T.sqr(V)) + T.sum(T.sqr(W)))
# Squared reconstruction error; multiplying by X_observed zeroes the error on
# entries whose mask is 0, so only masked-in entries contribute.
loss = T.sum(T.sqr((X - z2) * X_observed)) + loss_reg
# Gradients of the loss with respect to each parameter, in the order listed.
gV, gmiu, gW, gb = T.grad(loss, [V, miu, W, b]) # gb is (dL / db)

In [142]:
def train_auto():
    """Train the autoencoder one movie at a time and record its reconstructions.

    Compiles a Theano function that returns the reconstruction z2 for one
    movie and applies one gradient step to V, miu, W and b. The steps for V,
    W and b are multiplied by the observation mask, so their entries at
    positions where the mask is 0 are left unchanged.

    Runs nb_epoch passes; each pass visits every movie once in a fresh random
    order and stores the returned reconstruction in the module-level
    prediction_M.

    Returns
    -------
    tuple
        (train_M, prediction_M), the module-level arrays.
    """
    # Parameter updates: plain gradient descent with step size learningrate.
    sgd_updates = [
        (V, V - learningrate * gV * update_matrix),
        (miu, miu - learningrate * gmiu),
        (W, W - learningrate * gW * update_matrix.T),
        (b, b - learningrate * gb * X_observed),
    ]
    sgd_step = theano.function(
        inputs=[X, X_observed, update_matrix],
        outputs=[z2],
        updates=sgd_updates)

    for epoch in range(nb_epoch):
        print(str(epoch + 1) + " epoch")
        for movie in np.random.permutation(nb_movies):
            # ratings of this movie by every user (length nb_users)
            movie_ratings = train_M[movie, :]
            # mask: 1 wherever a positive rating exists, other entries kept as they are
            observed = np.where(movie_ratings > 0,
                                movie_ratings.dtype.type(1),
                                movie_ratings)
            # repeat the mask once per hidden unit -> (nb_hunits, nb_users), the shape of V
            mask_for_V = np.tile(observed, (nb_hunits, 1))
            reconstruction = sgd_step(movie_ratings, observed, mask_for_V)
            # sgd_step returns a one-element list; store it as this movie's row
            prediction_M[movie, :] = np.array(reconstruction)
    #         RMSE_list[j] = cal_RMSE(prediction_M, test_ratings)
    print("training complete")
    return train_M, prediction_M

In [143]:
# Run the training; train_auto returns the module-level train_M and prediction_M arrays.
train_M, prediction_M = train_auto()

1 epoch
2 epoch
3 epoch
4 epoch
5 epoch
6 epoch
7 epoch
8 epoch
9 epoch
10 epoch
training complete


In [146]:
# Inspect the reconstructed (movie x user) ratings produced by train_auto.
prediction_M

array([[ 2.8773241 ,  3.03445125,  2.54514933, ...,  3.51779461,
         3.89729404,  3.29231882],
       [ 3.27614784,  3.14767361,  2.12088227, ...,  3.67183399,
         3.76286483,  3.63309455],
       [ 3.94458914,  4.0529213 ,  3.79505682, ...,  4.49241924,
         4.48526764,  3.91896939],
       ..., 
       [ 0.42165658,  0.30078402,  1.14065456, ...,  1.9043963 ,
         1.68626499,  0.6113947 ],
       [ 1.74561405,  1.71814346,  1.93321943, ...,  1.52278507,
         3.30635905,  1.65390396],
       [ 0.26748362,  0.17551543,  1.27811289, ...,  1.88702881,
         1.68359172,  0.65674895]], dtype=float32)

In [145]:
# Inspect the training matrix for comparison; zeros mark entries with no training rating.
train_M

array([[ 5.,  4.,  0., ...,  5.,  0.,  0.],
       [ 3.,  0.,  0., ...,  0.,  0.,  5.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [None]:
# Scratch check of the tiling and masking idiom used in training:
# stack the row [2, 1] ten times, then build a copy where every 1 becomes 52.
# (np.tile with a plain integer such as 3 would repeat along the same axis instead.)
t = np.tile(np.array([2, 1]), (10, 1))
t2 = np.where(t == 1, 52, t)

In [None]:
# Show the masked copy from the scratch experiment.
t2