In [5]:
import numpy as np
import scipy.optimize as op

In [2]:
n_movies=50
n_users=50
n_features=19 # comedy,action,romance etc. applies to both movie and user

In [6]:
# movies x user rating matrix where ratings_matrix are from 1 to 10
#each column is a single users rating of all movies
# 0 value if the user has not rated the movie
ratings_matrix =np.zeros(shape=(n_movies,n_users))

In [7]:
with open("ml-100k/ua.base") as f:
    for line in f:
        l=line.split("\t")
        user_id=int(l[0])
        movie_id=int(l[1])
        rating=int(l[2])
        if (movie_id <= n_movies) and (user_id <= n_users):
            ratings_matrix[movie_id-1][user_id-1]=rating
        



In [8]:
did_rate=(ratings_matrix!=0)*1

In [9]:
def normalize_ratings_matrix(ratings_matrix, did_rate):
    num_movies = ratings_matrix.shape[0]
    
    ratings_matrix_mean = np.zeros(shape = (num_movies, 1))
    ratings_matrix_norm = np.zeros(shape = ratings_matrix.shape)
    
    for i in range(num_movies): 
        # Get all the indexes where there is a 1
        idx = np.where(did_rate[i] == 1)[0]
        #  Calculate mean rating of ith movie only from user's that gave a rating
        ratings_matrix_mean[i] = np.mean(ratings_matrix[i, idx])
        ratings_matrix_norm[i, idx] = ratings_matrix[i, idx] - ratings_matrix_mean[i]
    
    return ratings_matrix_norm, ratings_matrix_mean

In [10]:
ratings_matrix, ratings_matrix_mean = normalize_ratings_matrix(ratings_matrix, did_rate)

In [11]:

def unroll_params(X_and_theta, num_users, num_movies, num_features):
	# Retrieve the X and theta matrixes from X_and_theta, based on their dimensions (num_features, num_movies, num_movies)
	# --------------------------------------------------------------------------------------------------------------
	# Get the first 30 (10 * 3) rows in the 48 X 1 column vector
	first_30 = X_and_theta[:num_movies * num_features]
	# Reshape this column vector into a 10 X 3 matrix
	X = first_30.reshape((num_features, num_movies)).transpose()
	# Get the rest of the 18 the numbers, after the first 30
	last_18 = X_and_theta[num_movies * num_features:]
	# Reshape this column vector into a 6 X 3 matrix
	theta = last_18.reshape(num_features, num_users ).transpose()
	return X, theta


In [12]:

def calculate_cost(X_and_theta, ratings_matrix, did_rate, num_users, num_movies, num_features, reg_param):
	X, theta = unroll_params(X_and_theta, num_users, num_movies, num_features)
	
	# we multiply (element-wise) by did_rate because we only want to consider observations for which a rating was given
	cost = np.sum( (X.dot( theta.T ) * did_rate - ratings_matrix) ** 2 ) / 2
	# '**' means an element-wise power
	regularization = (reg_param / 2) * (np.sum( theta**2 ) + np.sum(X**2))
	return cost + regularization

In [13]:

def calculate_gradient(X_and_theta, ratings_matrix, did_rate, num_users, num_movies, num_features, reg_param):
	X, theta = unroll_params(X_and_theta, num_users, num_movies, num_features)
	
	# we multiply by did_rate because we only want to consider observations for which a rating was given
	difference = X.dot( theta.T ) * did_rate - ratings_matrix
	X_grad = difference.dot( theta ) + reg_param * X
	theta_grad = difference.T.dot( X ) + reg_param * theta
	
	# wrap the gradients back into a column vector 
	return np.r_[X_grad.T.flatten(), theta_grad.T.flatten()]


In [14]:
reg_param = 0
movie_features = np.random.randn( n_movies, n_features )
user_prefs = np.random.randn( n_users, n_features )
initial_X_and_theta = np.r_[movie_features.T.flatten(), user_prefs.T.flatten()]

# perform gradient descent, find the minimum cost (sum of squared errors) and optimal values of X (movie_features) and Theta (user_prefs)

minimized_cost_and_optimal_params = op.fmin_cg(calculate_cost, fprime=calculate_gradient, x0=initial_X_and_theta,
                                               args=(ratings_matrix, did_rate, n_users, n_movies, n_features, reg_param), 
                                               disp=True, full_output=True ) 


Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 81
         Function evaluations: 136
         Gradient evaluations: 136


In [15]:
#np.save("minimized_cost_and_optimal_params.npy", minimized_cost_and_optimal_params)
cost, optimal_movie_features_and_user_prefs = minimized_cost_and_optimal_params[1], minimized_cost_and_optimal_params[0]

In [16]:
movie_features, user_prefs = unroll_params(optimal_movie_features_and_user_prefs, n_users, n_movies, n_features)

In [18]:
all_predictions = movie_features.dot( user_prefs.T )
print all_predictions

[[  9.00000153e-01  -9.99998032e-02   2.04307500e+00 ...,   1.46506890e+00
   -2.10000032e+00   1.28119260e+00]
 [  1.66666816e-01  -4.32603598e-01  -4.58387332e+00 ...,  -1.86403460e+00
   -1.83333273e+00   9.08677216e-01]
 [  1.00000002e+00  -2.78041168e+00   7.61383598e+00 ...,  -2.23206929e+00
   -2.14121594e-07   1.05680365e+00]
 ..., 
 [  3.99999863e-01   2.04019759e+00  -8.06331347e-01 ...,   2.81329496e-01
   -9.10311994e-01   8.64104483e-01]
 [ -2.50000294e-01   2.34407434e+00  -2.89928155e+00 ...,  -2.68353550e+00
   -1.24999981e+00   3.11502678e+00]
 [  7.66666578e-01  -8.54601504e-02   1.00858294e+00 ...,  -2.33333718e-01
   -3.23333288e+00  -7.42969189e-01]]


In [22]:
user1_predictions = all_predictions[:, 0]# + ratings_matrix_mean
print sample_prediction

[  3.90000015e+00   3.16666682e+00   4.00000002e+00   1.89999979e+00
   3.39999968e+00   2.99999993e+00   3.21739104e+00   1.73268152e-07
   3.64705784e+00   3.00000044e+00   1.57142839e+00   3.46666591e+00
   4.33333290e+00   3.99999999e+00   3.85714305e+00   3.00000025e+00
   3.16666648e+00   4.33333293e+00   4.00000011e+00   1.39635616e+00
   1.28571434e+00   2.49999978e+00   2.71428591e+00   2.77777779e+00
   4.00000059e+00   2.00000023e+00   2.00000003e+00   3.00000020e+00
   1.57142832e+00   3.00000019e+00   1.99999998e+00   3.99999988e+00
   1.18134336e+00   3.00000021e+00   9.99999662e-01   2.99999939e+00
   3.49999978e+00   3.50000002e+00   3.33333356e+00   3.16666653e+00
   1.49999987e+00   3.99999989e+00   4.00000008e+00   3.00000052e+00
   3.66666675e+00   2.99999948e+00   3.66666705e+00   3.39999986e+00
   2.74999971e+00   3.76666658e+00]
