In [1]:
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler,scale
import pandas as pd

%matplotlib inline

In [2]:
def get_data(size, path=None):
    """Load MovieLens ratings from disk and return them in random order.

    Every non-blank line of the ratings file is split on whitespace, its
    last field is dropped, and the remaining fields are parsed as float32.

    Parameters
    ----------
    size : str
        Data set identifier. Only "100k" is supported.
    path : str, optional
        Location of the ratings file. Defaults to
        "../dat/rec/ml-100k/u.data".

    Returns
    -------
    numpy.ndarray
        float32 array with one row per rating line; the rows are shuffled
        with ``np.random.permutation``.

    Raises
    ------
    ValueError
        If ``size`` is not a supported data set.
    """
    if size != "100k":
        # An unknown size used to yield an empty array that only failed
        # later, far away from the actual mistake.
        raise ValueError('Unsupported data set size: %r (only "100k" is available)' % (size,))
    if path is None:
        path = "../dat/rec/ml-100k/u.data"
    print("Read movie lens 100k data set")
    ratings = []
    # The context manager closes the file even when parsing fails.
    with open(path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines would otherwise add an empty row and make
                # the list ragged.
                continue
            ratings.append(fields[:-1])
    ratings = np.array(ratings, dtype=np.float32)
    # permute the ratings array
    ratings = np.random.permutation(ratings)
    print("Loading data done")
    return ratings

In [3]:
# Load the MovieLens 100k ratings; get_data returns the rows already shuffled.
ratings = get_data("100k")

Read movie lens 100k data set
Loading data done


In [4]:
# Sanity check: (number of rating rows, fields kept per row).
ratings.shape

(100000, 3)

In [5]:
def load_split_data(data_size, test_p):
    """Load the ratings and split them randomly into a test set and a train matrix.

    The split relies on ``get_data`` returning the ratings already permuted:
    the first ``int(len(ratings) * test_p)`` rows become the test set and the
    rest is scattered into a dense movie-by-user matrix. Both parts are also
    saved under "../dat/rec/" so the same split can be reused later.

    Parameters
    ----------
    data_size : str
        Data set identifier forwarded to ``get_data``.
    test_p : float
        Fraction (between 0 and 1) of the ratings held out as the test set.

    Returns
    -------
    test_ratings : numpy.ndarray
        Rows of user id | item id | rating.
    train_M : numpy.ndarray
        float32 matrix with nb_movies rows and nb_users columns; missing
        entries are filled with zero.
    nb_users : int
        Largest user id found in the ratings.
    nb_movies : int
        Largest item id found in the ratings.
    nb_train : int
        Number of ratings placed in the train set.
    """
    print("split data randomly")
    # Load ratings, data is already permuted in get_data
    ratings = get_data(data_size)
    nb_users = int(np.max(ratings[:, 0]))
    nb_movies = int(np.max(ratings[:, 1]))
    # split test/train set
    test_size = int(len(ratings) * test_p)
    test_ratings = ratings[:test_size]
    train_ratings = ratings[test_size:]
    # Scatter the train ratings into the matrix in one indexed assignment
    # instead of a Python loop over every row; ids are 1-based.
    user_idx = (train_ratings[:, 0] - 1).astype(np.intp)
    movie_idx = (train_ratings[:, 1] - 1).astype(np.intp)
    train_M = np.zeros((nb_movies, nb_users), dtype=np.float32)
    train_M[movie_idx, user_idx] = train_ratings[:, 2]
    # save test and train data in case more training is needed on this split
    # round() instead of plain truncation: 0.29 * 100 evaluates to
    # 28.999999999999996 and would otherwise be labelled "28percent".
    percent = int(round(test_p * 100))
    prefix = "../dat/rec/" + data_size + "_" + str(percent) + "percent_"
    np.save(prefix + "test.npy", test_ratings)
    np.save(prefix + "trainM.npy", train_M)
    return test_ratings, train_M, nb_users, nb_movies, len(train_ratings)


In [6]:
# def cal_RMSE(prediction_M, test_ratings):
#     RMSE = 0
#     for rating in test_ratings:
#         RMSE += (rating[2] - prediction_M[int(rating[1] - 1), int(rating[0] - 1)])**2
#     RMSE = math.sqrt(RMSE / len(test_ratings))
#     return RMSE

In [7]:
# Hyper-parameters and data split shared by the cells below.
nb_epoch = 10  # number of training epochs
test_p = 0.1  # fraction of the ratings held out as the test set
nb_hunits = 10  # number of hidden units
# NOTE(review): lambda_reg and reg_lambda are two separate regularization
# constants with different values; confirm that both are intended.
lambda_reg = 0.001
epsilon = 0.01 #learningrate
reg_lambda = 0.01
data_size = "100k"
# Loads the ratings, saves the split to disk and builds the train matrix;
# k receives the number of train ratings.
test_ratings, train_M, nb_users, nb_movies, k = load_split_data(data_size, test_p)
prediction_M = np.zeros((nb_movies, nb_users), dtype = np.float32) # 1682 x 943
RMSE_list = [0] * nb_epoch # 1 x 10

split data randomly
Read movie lens 100k data set
Loading data done


In [8]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    The naive formula overflows in ``exp`` for large negative inputs and
    emits a RuntimeWarning. Here the exponential is only ever taken of a
    non-positive number, so it stays within [0, 1].

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Sigmoid of ``x`` with the same shape; floating-point dtypes are
        preserved.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp of a value <= 0, cannot overflow
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # [()] turns a 0-d result back into a scalar and leaves arrays untouched.
    return result[()]

In [69]:
def train(X, X_observed, update_matrix):
    """Fit a one-hidden-layer sigmoid autoencoder to X with full-batch gradient descent.

    The objective is the squared reconstruction error restricted to the
    entries flagged in ``X_observed``, plus L2 regularization of both weight
    matrices. The gradient below is that of half the masked squared error;
    the constant factor is absorbed by the learning rate.

    Hyper-parameters are read from the module-level names ``nb_hunits``,
    ``nb_epoch``, ``epsilon`` (learning rate) and ``reg_lambda``.

    Parameters
    ----------
    X : numpy.ndarray
        Input matrix, one sample per row.
    X_observed : numpy.ndarray
        Mask with the same shape as ``X``: 1 where the entry of ``X`` is an
        observed value, 0 elsewhere.
    update_matrix : numpy.ndarray
        Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Reconstruction of ``X`` from the forward pass of the last epoch
        (i.e. computed before that epoch's parameter update); all zeros if
        ``nb_epoch`` is 0.

    NOTE(review): the output layer is a sigmoid, so reconstructions lie in
    (0, 1). If X holds ratings on a larger scale the targets cannot be
    reached -- confirm whether X should be rescaled before training.
    """
    # Layer sizes follow the data instead of separate globals.
    nb_samples, nb_features = X.shape

    W1 = np.random.randn(nb_features, nb_hunits)
    b1 = np.zeros((1, nb_hunits))
    W2 = np.random.randn(nb_hunits, nb_features)
    b2 = np.zeros((1, nb_features))
    a2 = np.zeros((nb_samples, nb_features))

    for _ in range(nb_epoch):
        # Forward pass
        z1 = X.dot(W1) + b1   # (samples, hidden)
        a1 = sigmoid(z1)      # (samples, hidden)
        z2 = a1.dot(W2) + b2  # (samples, features)
        a2 = sigmoid(z2)      # (samples, features)

        # Backpropagation
        # Only observed entries contribute to the error. Without the mask
        # every missing entry (stored as 0) is treated as a real target and
        # dominates the gradient.
        error = (a2 - X) * X_observed
        delta2 = (a2 * (1 - a2)) * error               # (samples, features)
        dW2 = a1.T.dot(delta2)                         # (hidden, features)
        db2 = np.sum(delta2, axis=0, keepdims=True)    # (1, features)
        delta1 = (a1 * (1 - a1)) * delta2.dot(W2.T)    # (samples, hidden)
        dW1 = X.T.dot(delta1)                          # (features, hidden)
        db1 = np.sum(delta1, axis=0, keepdims=True)    # (1, hidden)
        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update
        W1 -= epsilon * dW1
        b1 -= epsilon * db1
        W2 -= epsilon * dW2
        b2 -= epsilon * db2
    return a2

In [70]:
def train_auto():
    """Train the autoencoder on the global ``train_M``.

    Builds the observation mask (1 wherever ``train_M`` holds a positive
    entry), hands the matrix and the mask to ``train`` and reports when
    training has finished.

    Returns
    -------
    tuple
        ``(train_M, prediction_M)``: the unchanged training matrix and the
        reconstruction returned by ``train``.
    """
    ratings = train_M[:]
    # Start from a copy so the training matrix itself is never modified.
    observed = ratings.copy()
    observed[ratings > 0] = 1
    # The mask is passed twice: as the observation mask and as the update matrix.
    reconstruction = train(ratings, observed, observed)
    print("training complete")
    return train_M, reconstruction

In [71]:
# Run the training; rebinds the globals train_M and prediction_M.
train_M, prediction_M = train_auto()

[[  6.05082152e+01   4.28619921e+01   6.02955418e+01 ...,   4.38454923e+01
    2.71365649e+01   6.37797658e+01]
 [  1.07823197e+01   7.36226555e-01   4.16202724e+00 ...,   1.44297749e+01
    3.72366606e+00   1.71945999e+01]
 [  1.55281626e+01   1.40983470e+00   1.16591041e+01 ...,   8.03604003e+00
    1.44556853e+01   1.40157659e+01]
 ..., 
 [ -3.63673108e-01   5.55906311e+00  -9.27876264e+00 ...,   1.21349813e-02
    7.44944510e-01   2.76512334e+00]
 [  7.19870652e+00   5.77269565e+00  -7.05712250e+00 ...,   3.71419063e+01
    8.69303422e+00   1.44579425e+01]
 [  1.63492770e+01   2.64801663e+01   6.76429866e+01 ...,   8.34515649e+00
    1.78900142e+01   1.15489663e+01]]
[[ -4.25472951e-03   1.39775317e+01  -2.03320291e+00 ...,   2.84760775e+00
   -7.31714964e+00  -1.02044741e+00]
 [  1.90494716e-02   3.25002904e+01  -1.30437132e-03 ...,  -8.56833124e-03
   -9.39048349e-02  -7.73324313e-03]
 [ -6.42918766e-02  -1.01425537e+00  -4.57399983e-04 ...,  -2.68079631e-02
   -1.37727653e-01  -

  from ipykernel import kernelapp as app


[[ -3.09888945e-03  -1.22484762e+00   2.95657344e+00 ...,   2.77190835e+00
    1.17693756e+00  -9.75660542e-01]
 [  1.47141099e-02   1.93452285e+00  -2.95590248e-04 ...,  -4.05398104e-03
   -1.51201895e-02  -2.30583619e-02]
 [ -2.57407286e-01   2.18449201e+00   2.97252809e-05 ...,   1.12266658e-03
   -3.58105706e-04  -7.57002550e-01]
 ..., 
 [ -5.17835069e-43   4.27079351e-14  -1.86275587e-25 ...,   4.05730897e-15
    1.23085349e-16  -7.46847701e-37]
 [ -3.01881102e-07  -1.06503800e+00   5.20907997e-07 ...,  -6.64796867e-04
   -7.25837980e-02  -4.94831583e-06]
 [ -3.67935704e-06   4.73716358e+00  -1.10176118e-06 ...,  -1.32095979e+00
   -1.91005792e-03  -9.61145683e-03]]
[[ -3.06466385e-03  -3.57548092e+00  -1.50165395e-01 ...,  -1.91549939e+00
   -1.68814655e+00  -1.09887879e+00]
 [  1.04075313e-02  -3.82389402e-01  -1.52736016e-04 ...,  -2.66231687e-03
   -9.12967505e-03  -4.18286040e-02]
 [ -1.60942375e-01  -1.27974270e+00   5.96654782e-05 ...,   8.29020205e-02
   -9.06816005e-05   

In [67]:
# Inspect the reconstructed rating matrix.
prediction_M

array([[ 0.93552249,  0.14048842,  0.11097011, ...,  0.0328798 ,
         0.1934255 ,  0.77290839],
       [ 0.93552249,  0.14048842,  0.11097011, ...,  0.0328798 ,
         0.1934255 ,  0.77290839],
       [ 0.93552249,  0.14048842,  0.11097011, ...,  0.0328798 ,
         0.1934255 ,  0.77290839],
       ..., 
       [ 0.08228944,  0.03945425,  0.02518361, ...,  0.01691384,
         0.1055227 ,  0.1257709 ],
       [ 0.34202702,  0.00873126,  0.02520701, ...,  0.00165235,
         0.19778126,  0.29909253],
       [ 0.33107055,  0.01536374,  0.02858034, ...,  0.00171951,
         0.17247391,  0.4392191 ]])

In [68]:
# Inspect the training matrix (0 marks a missing rating).
train_M

array([[ 5.,  4.,  0., ...,  5.,  0.,  0.],
       [ 3.,  0.,  0., ...,  0.,  0.,  5.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [None]:
# np.tile(np.array([2,1]), 3) # stands for repeat
# t = np.tile(np.array([2,1]), [10, 1])
# t2 = t.copy()
# t2[t == 1] = 52

In [49]:
# a = np.array([[2,1], [10, 1]])#2,2
# b = np.array([[3,4],[4, 4]])#2,2
# a * b