In [12]:
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
def get_data(size, path=None):
    """Load MovieLens ratings and return them as a row-shuffled float32 array.

    Each non-blank line of the file is split on whitespace and its last field
    is dropped, so a four-column ``u.data`` file yields three columns per row.

    Parameters
    ----------
    size : str
        Name of the data set. Only "100k" has a built-in file location.
    path : str, optional
        Explicit file to read instead of the built-in location for ``size``.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per rating, rows in random order
        (``np.random.permutation``). An empty file gives shape (0, 3).

    Raises
    ------
    ValueError
        If ``path`` is not given and ``size`` has no built-in location.
    """
    default_paths = {"100k": "../dat/rec/ml-100k/u.data"}
    if path is None:
        if size not in default_paths:
            # Fail here instead of returning an empty array that only blows up
            # later when the caller indexes its columns.
            raise ValueError(
                "unsupported data set size %r; available: %s"
                % (size, ", ".join(sorted(default_paths)))
            )
        path = default_paths[size]

    print("Read movie lens " + str(size) + " data set")
    rows = []
    # The context manager closes the file even if a line fails to parse.
    with open(path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines would otherwise produce a ragged row list.
                continue
            rows.append(fields[:-1])

    if rows:
        ratings = np.array(rows, dtype=np.float32)
    else:
        ratings = np.empty((0, 3), dtype=np.float32)
    # permute the ratings array (shuffles rows, i.e. whole ratings)
    ratings = np.random.permutation(ratings)
    print("Loading data done")
    return ratings

In [14]:
# Load the row-shuffled MovieLens 100k ratings for a quick look at the data.
ratings = get_data("100k")

Read movie lens 100k data set
Loading data done


In [15]:
# Sanity check: one row per rating; get_data drops the last field of every line.
ratings.shape

(100000, 3)

In [16]:
def load_split_data(data_size, test_p):
    """Load the ratings, split them into a test set and a training matrix, and save both.

    The ratings returned by ``get_data`` are already shuffled, so the first
    ``test_p`` share of the rows is used as the test set and the rest as the
    training set.

    Parameters
    ----------
    data_size : str
        Data set name passed to ``get_data`` (e.g. "100k"); also used in the
        names of the saved files.
    test_p : float
        Fraction of the ratings, between 0 and 1, held out as the test set.

    Returns
    -------
    test_ratings : numpy.ndarray
        Rows of user id | item id | rating.
    train_M : numpy.ndarray
        float32 matrix with nb_movies rows and nb_users columns; missing
        entries are zero.
    nb_users : int
        Largest user id found in the ratings.
    nb_movies : int
        Largest item id found in the ratings.
    nb_train : int
        Number of ratings in the training split.

    Raises
    ------
    ValueError
        If ``test_p`` is outside [0, 1].

    Side effects
    ------------
    Writes ``<data_size>_<percent>percent_test.npy`` and
    ``<data_size>_<percent>percent_trainM.npy`` under ``../dat/rec/``.
    """
    if not 0 <= test_p <= 1:
        raise ValueError("test_p must be between 0 and 1, got %r" % (test_p,))

    print("split data randomly")
    # Load ratings, data is already permuted in get_data
    ratings = get_data(data_size)
    nb_users = int(np.max(ratings[:, 0]))
    nb_movies = int(np.max(ratings[:, 1]))

    # Round instead of truncating: 100000 * 0.29 evaluates to 28999.999...,
    # which int() alone would turn into 28999.
    test_size = int(round(len(ratings) * test_p))
    test_ratings = ratings[:test_size]
    train_ratings = ratings[test_size:]

    # Scatter the training ratings into a movies x users matrix (ids are 1-based).
    train_M = np.zeros((nb_movies, nb_users), dtype=np.float32)
    for rating in train_ratings:
        train_M[int(rating[1]) - 1, int(rating[0]) - 1] = rating[2]

    # Save test and train data in case more training is needed on this split.
    # Same rounding concern as above: int(0.29 * 100) would give 28.
    percent = int(round(test_p * 100))
    prefix = "../dat/rec/" + data_size + "_" + str(percent) + "percent_"
    np.save(prefix + "test.npy", test_ratings)
    np.save(prefix + "trainM.npy", train_M)

    return test_ratings, train_M, nb_users, nb_movies, len(train_ratings)


In [17]:
# def cal_RMSE(prediction_M, test_ratings):
#     RMSE = 0
#     for rating in test_ratings:
#         RMSE += (rating[2] - prediction_M[int(rating[1] - 1), int(rating[0] - 1)])**2
#     RMSE = math.sqrt(RMSE / len(test_ratings))
#     return RMSE

In [18]:
# --- Experiment configuration ---------------------------------------------
data_size = "100k"   # data set name handed to load_split_data
test_p = 0.1         # share of the ratings held out as test set
nb_epoch = 10        # number of outer passes made by train_auto
nb_hunits = 10       # hidden units of the autoencoder
epsilon = 0.01       # learning rate
# NOTE(review): lambda_reg and reg_lambda are two separate regularisation
# constants with different values -- confirm that both are intended.
lambda_reg = 0.001
reg_lambda = 0.01

# --- Data and result buffers ------------------------------------------------
test_ratings, train_M, nb_users, nb_movies, k = load_split_data(data_size, test_p)
# One predicted rating per (movie, user) pair, filled in by train_auto.
prediction_M = np.zeros((nb_movies, nb_users), dtype=np.float32)
# One slot per epoch.
RMSE_list = [0 for _ in range(nb_epoch)]

split data randomly
Read movie lens 100k data set
Loading data done


In [19]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed without overflow.

    ``np.exp`` is only ever called on non-positive values, so large negative
    inputs no longer overflow (and no longer raise a RuntimeWarning).

    Parameters
    ----------
    x : float or numpy.ndarray
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Values in [0, 1] with the same shape (and, for float arrays, the same
        dtype) as ``x``.
    """
    z = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # For x >= 0: 1 / (1 + e^-x); for x < 0 the equivalent e^x / (1 + e^x).
    out = np.where(np.greater_equal(x, 0), 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched, matching what the plain formula returns.
    return out[()]

In [20]:
def train(X, X_observed, update_matrix, num_passes = 1000):
    """Fit a one-hidden-layer sigmoid autoencoder to ``X`` and return its reconstruction.

    A fresh network (input -> hidden -> output, sigmoid on both layers) is
    initialised on every call and trained by full-batch gradient descent on
    the squared reconstruction error of the observed entries,

        sum(((X - a2) * observed) ** 2),

    with L2 weight decay on W1 and W2 (the biases are not regularised).
    The learning rate and weight decay are read from the module-level
    ``epsilon`` and ``reg_lambda``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_visible)
        Rows to reconstruct.
    X_observed : numpy.ndarray, broadcastable to ``X``
        Non-zero where the matching entry of ``X`` is an observed value.
        Only ``!= 0`` is evaluated, so a 0/1 mask and the raw ratings both work.
    update_matrix : numpy.ndarray, shape (n_hidden, n_visible)
        Element-wise multiplier for the W2 step and, transposed, for the W1
        step. Its first dimension sets the hidden-layer size.
    num_passes : int
        Number of gradient-descent iterations.

    Returns
    -------
    numpy.ndarray
        Output activations of the last forward pass (taken before the final
        parameter update); zeros of shape (1, n_visible) if ``num_passes`` is 0.
    """
    n_visible = X.shape[1]
    n_hidden = update_matrix.shape[0]
    # Entries that take part in the reconstruction error.
    observed = np.asarray(X_observed) != 0

    W1 = np.random.randn(n_visible, n_hidden) / np.sqrt(n_visible)  # n_visible x n_hidden
    b1 = np.zeros((1, n_hidden))                                    # 1 x n_hidden
    W2 = np.random.randn(n_hidden, n_visible) / np.sqrt(n_hidden)   # n_hidden x n_visible
    b2 = np.zeros((1, n_visible))                                   # 1 x n_visible
    a2 = np.zeros((1, n_visible))
    for _ in range(num_passes):
        # Forward pass
        a1 = sigmoid(X.dot(W1) + b1)   # n_samples x n_hidden
        a2 = sigmoid(a1.dot(W2) + b2)  # n_samples x n_visible

        # Backpropagation. The error is masked so that unobserved entries
        # (stored as 0 in X) are not treated as targets, in line with the loss.
        delta2 = (a2 * (1 - a2)) * (a2 - X) * observed
        dW2 = (a1.T).dot(delta2)                       # n_hidden x n_visible
        db2 = np.sum(delta2, axis=0, keepdims=True)    # 1 x n_visible
        delta1 = (a1 * (1 - a1)) * (delta2.dot(W2.T))  # n_samples x n_hidden
        dW1 = (X.T).dot(delta1)                        # n_visible x n_hidden
        db1 = np.sum(delta1, axis=0)                   # n_hidden, broadcasts onto b1
        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update
        W1 += -epsilon * dW1 * update_matrix.T
        b1 += -epsilon * db1
        W2 += -epsilon * dW2 * update_matrix
        b2 += -epsilon * db2
    return a2

In [22]:
def train_auto():
    """Train one autoencoder per movie row and store its reconstruction.

    For each of ``nb_epoch`` epochs the movies are visited in random order;
    every row of the module-level ``train_M`` is passed to ``train`` together
    with its observed-entry mask, and the returned reconstruction is written
    into the matching row of the module-level ``prediction_M`` (in place).

    NOTE(review): ``train`` draws new random weights on every call, so a later
    epoch overwrites the earlier predictions rather than refining them --
    confirm this is intended.

    Returns
    -------
    tuple
        ``(train_M, prediction_M)``, the same module-level arrays.
    """
    for epoch in range(nb_epoch):
        print(str(epoch + 1) + " epoch")
        for i in np.random.permutation(nb_movies):
            Ri = train_M[i, :]  # ratings of movie i, one entry per user
            # 1 where a rating exists, 0 elsewhere.
            Ri_observed = (Ri > 0).astype(Ri.dtype)
            # One copy of the mask per hidden unit: nb_hunits x nb_users.
            update_m = np.tile(Ri_observed, (nb_hunits, 1))
            # Pass the row and its mask as 1 x nb_users matrices. The mask,
            # not a second copy of the ratings, is the X_observed argument.
            Ri_predicted = train(Ri[None, :], Ri_observed[None, :], update_m)
            prediction_M[i, :] = np.array(Ri_predicted)
        # Per-epoch RMSE tracking is disabled while cal_RMSE is commented out.
    print("training complete")
    return train_M, prediction_M

In [None]:
# Run the full training loop: nb_epoch passes over every movie row, each one
# calling train() with its default num_passes.
train_M, prediction_M = train_auto()

1 epoch


In [None]:
# Inspect the reconstructed ratings (movies x users).
prediction_M

In [None]:
# Inspect the training matrix (movies x users, zeros for missing ratings).
train_M

In [None]:
# np.tile(np.array([2,1]), 3) # stands for repeat
# Scratch check of the tiling/masking idiom: stack the row [2, 1] ten times
# (shape (10, 2)), then overwrite every entry equal to 1 in a copy, leaving t untouched.
t = np.tile(np.array([2,1]), [10, 1])
t2 = t.copy()
t2[t == 1] = 52

In [None]:
# Show the copy with the replaced entries.
t2