Adopted from https://github.com/alexvlis/movie-recommendation-system/blob/master/util.py

In [1]:
import csv
import numpy as np
import pandas as pd
import pickle

#we're looking to populate these 4 dictionaries
movieId_movieName = {}
movieId_movieCol  = {}
userId_userRow    = {}
userId_rating     = {}

#for reference
movieId_isRated   = {}

#then populate a data matrix based on the userId_rating dictionary


'''
Read Basic Movie Info
'''

path     = 'data'
filename = 'movies.csv'


dataFrame = pd.read_csv('{}/{}'.format(path,filename))

for i in range(len(dataFrame['movieId'])):
    movieId = dataFrame['movieId'][i]
    
    movieId_movieName[movieId] = dataFrame['title'][i]    
    movieId_isRated[movieId]    = 0

   
'''
Read in the ratings
'''
path     = 'data'
filename = 'ratings.csv'


dataFrame = pd.read_csv('{}/{}'.format(path,filename))

for i in range(len(dataFrame)):
    userId  = dataFrame['userId'][i]
    movieId = dataFrame['movieId'][i]
    rating  = dataFrame['rating'][i]
    
    if userId not in userId_rating.keys():
        userId_rating[userId] = [(movieId, rating)]
    else:
        userId_rating[userId].append((movieId, rating))
    
    movieId_isRated[movieId] = 1
        
print(userId_rating[1])
    

for movieId, isRated in movieId_isRated.items():
    if isRated == 0:
        del movieId_movieName[movieId]
        
userId_userRow
movieId_movieCol

i = 0
for movieId in sorted(movieId_movieName):
    movieId_movieCol[movieId] = i
    i+=1

i=0
for userId in sorted(userId_rating):
    userId_userRow[userId] = i
    i+=1

m = len(userId_userRow.keys())
n = len(movieId_movieCol.keys())
A = np.zeros((m,n))
    
print(A.shape)
for userId, ratings in userId_rating.items():
    for rating in ratings:
        movieId   = rating[0]
        score     = rating[1]
        
        if (userId in userId_userRow and movieId in movieId_movieCol):
            i = userId_userRow[userId]

            j = movieId_movieCol[movieId]
            A[i,j] = score

ratingCount = 0
for i in range(m):
    for j in range(n):
        if (A[i][j] != 0):
#             if(ratingCount < 20):
#                 print(A[i][j])
            ratingCount += 1


print('Number of ratings = {}'.format(ratingCount))
print('Total entries = {}'.format(m*n))
print('Sparsity = {}%'.format(ratingCount*100/(m*n)))

d = {'movieId_movieName': movieId_movieName,
     'movieId_movieCol' : movieId_movieCol,
     'userId_userRow'   : userId_userRow,
     'userId_rating'    : userId_rating }
pickle.dump(A, open('data/data_matrix.p', 'wb'))
pickle.dump(d, open('data/data_dicts.p', 'wb'))
print (A.shape)

[(31, 2.5), (1029, 3.0), (1061, 3.0), (1129, 2.0), (1172, 4.0), (1263, 2.0), (1287, 2.0), (1293, 2.0), (1339, 3.5), (1343, 2.0), (1371, 2.5), (1405, 1.0), (1953, 4.0), (2105, 4.0), (2150, 3.0), (2193, 2.0), (2294, 2.0), (2455, 2.5), (2968, 1.0), (3671, 3.0)]
(671, 9066)
Number of ratings = 100004
Total entries = 6083286
Sparsity = 1.6439141608663477%
(671, 9066)


In [3]:
import csv
import numpy as np
import pickle
import time

def k_cross(k = 10):

	A = load_data_matrix()
	m = A.shape[0]
	n = A.shape[1]

	print('A.shape = {}'.format(A.shape))

	prediction_matrices = []
	training_matrices   = []
	index_lists         = []
	for i in range(k):
	    A_copy = A.copy()
	    prediction_matrices.append(np.zeros((m, n)))
	    training_matrices.append(A_copy)
	    index_lists.append(np.zeros((m, n), dtype=bool))

	it    = 0
	for i in range(A.shape[0]):
	    for j in range(A.shape[1]):
	        if (A[i, j] != 0):
	            training_matrices[it%k][i, j]   = 0
	            prediction_matrices[it%k][i, j] = A[i, j]
	            index_lists[it%k][i, j] = True
	            it+=1

	return training_matrices, prediction_matrices, index_lists

def load_data_matrix(filename='data_matrix.p', path='data'):
	filepath = filename if path == '' else '{}/{}'.format(path,filename)
	A = pickle.load( open('{}'.format(filepath), 'rb'))
	return A

def get_MSE(mat1, mask, mat2=''):
	if (mat2 == ''):
		mat2 = load_data_matrix()

	A_mask    = mat2[mask]
	mat1_mask = mat1[mask]

	diff = A_mask-mat1_mask
	mse = np.dot(diff, diff)/A_mask.shape
	return mse[0]

if __name__ == '__main__':
	k = 10

	train_mats, val_mats, masks = k_cross(k=k)
	print('MSE = {}'.format(get_MSE(train_mats[0], masks[0])))

	m = train_mats[0].shape[0]
	n = train_mats[0].shape[1]
	start = time.time()
	for i in range(m):
	    for j in range(n):
	        for index in range(k):
	            if(train_mats[index][i,j] != 0 and val_mats[index][i,j] != 0):
	                print('we have a problem')
	end = time.time()
	print('you wasted {} seconds of my life'.format(end-start))

A.shape = (671, 9066)
MSE = 13.742275772422758
you wasted 40.15754175186157 seconds of my life


In [5]:
import pickle
import numpy as np
import util
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")


class CollaborativeFiltering():
	''' Collaborative Filtering Estimator '''

	@staticmethod
	def pearsonr(r_a, r_i):
		'''
		input: rating vectors of user a (active user) and user i
		output: pearsor correlation coefficient value
		'''

		# Get movies both users rated
		mask = np.logical_and(r_a>0, r_i>0)
		if mask.sum() == 0:
			return -1 # Does not actually matter what value we return

		r_a = r_a[mask]
		r_i = r_i[mask]

		# Get mean rating for each user
		r_a_bar = np.mean(r_a)
		r_i_bar = np.mean(r_i)

		# Calculate pearson correlation coefficient
		return  np.dot(r_a-r_a_bar, r_i-r_i_bar) /         \
				np.sqrt(np.dot(r_a-r_a_bar, r_a-r_a_bar) * \
						np.dot(r_i-r_i_bar, r_i-r_i_bar))  \

	@staticmethod
	def significance(r_a, r_i, thresh):
		'''
		input: rating vectors of user a (active user) and user i
		output: significance weight
		'''
		S = np.logical_and(r_a>0, r_i>0).sum()
		if S > thresh:
			return 1
		return S/thresh

	@staticmethod
	def prediction(r, w):
		'''
		input: neighborhood matrix of k rows, weight vector of neighborhood
		output: offset prediction vector for active user
		'''
		return np.dot((r.T - r.mean(axis=1)), w) / np.sum(w)


	'''*************************** Class methods ****************************'''

	def __init__(self, method="neighborhood", k=10, s=50):
		self.method = method
		self.k = k
		self.s = s

	def fit(self, A, verbose=False):
		self.verbose = verbose
		if self.verbose:
			print("Training...")

		if self.method == "neighborhood":
			return self.neighborhood_based(A)
		if self.method == "item":
			return self.neighborhood_based(A.T).T


	'''*************************** Private methods **************************'''

	def neighborhood_based(self, A):
		A_new = np.array(A) # copy A matrix

		for a, r_a in enumerate(A):
			# weight vector for active user a
			w = np.zeros(A.shape[0])
			w[a] = -1 # ignore active user

			for i, r_i in enumerate(A):
				if i == a:
					# Skip active user
					continue

				w[i] = CollaborativeFiltering.pearsonr(r_a, r_i) * \
						CollaborativeFiltering.significance(r_a, r_i, self.s)

			# Get indices of neighborhood
			K = np.argsort(w)[:self.k]
			mask = r_a==0
			A_new[a, mask] = (np.mean(r_a[r_a>0]) + CollaborativeFiltering.prediction(A[K], w[K]))[mask]

			if self.verbose:
				print("fitting item:", a, end='\r')

		if self.verbose:
			print("\nDone.")

		A_new[A_new>5] = 5.0 # clip all ratings to 5
		return np.around(A_new*2)/2 # round to nearest .5


if __name__ == "__main__":

	A = util.load_data_matrix()
	cf = CollaborativeFiltering()
	A_new = cf.fit(A, verbose=True)
	recommendations = np.argsort(A_new[1, :])[:5]

	B = pickle.load( open('{}'.format('data/data_dicts.p'), 'rb'))

	for movie_id,rating in B['userId_rating'][2]:
	   if rating == 5 :
	       print(B['movieId_movieName'][movie_id] , ", rating:" , rating )

	l = recommendations
	k_list =[]
	for movie_column in l :
	   for k, v in B['movieId_movieCol'].items():
	       if v == movie_column:
	           k_list.append(k)
	print("")
	print("Recommendations")
	for movie_id in k_list :
	   print(B['movieId_movieName'][movie_id])

Training...
fitting item: 67054 114 171 174208 322 350 396 478 488 579 582 604
Done.
Sense and Sensibility (1995) , rating: 5.0
Clueless (1995) , rating: 5.0
Apollo 13 (1995) , rating: 5.0
Circle of Friends (1995) , rating: 5.0
Like Water for Chocolate (Como agua para chocolate) (1992) , rating: 5.0
Legends of the Fall (1994) , rating: 5.0
Nightmare Before Christmas, The (1993) , rating: 5.0
Brady Bunch Movie, The (1995) , rating: 5.0
Terminator 2: Judgment Day (1991) , rating: 5.0
Dances with Wolves (1990) , rating: 5.0
Batman (1989) , rating: 5.0

Recommendations
Clerks (1994)
Shallow Grave (1994)
Naked Gun 33 1/3: The Final Insult (1994)
Santa Clause, The (1994)
Highlander III: The Sorcerer (a.k.a. Highlander: The Final Dimension) (1994)


In [None]:
import numpy as np
import time

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import MaxPooling2D
from keras import optimizers


from keras.models import load_model
from keras.callbacks import ModelCheckpoint

from keras.utils import np_utils

class Collaborative_Filtering_Neural_Net(object):

	def __init__(self, train_data, val_data, mask, num_layers=3, learn_rate=.2):

		self.train_data = train_data
		self.val_data   = val_data
		self.mask       = mask
		self.num_layers = num_layers

		self.m          = self.train_data.shape[0]
		self.n 			= self.train_data.shape[1]
		
		self.learn_rate = learn_rate

		self.construct_input()


	def construct_input(self):
		'''
		Construct training input/output from the training data matrix
		and 
		Construct validation input/output from the training/validation 
		'''
		def change_to_one_hot(value, value_range):
			one_hot_vec = np.zeros(len(value_range))
			one_hot_vec[int(value/.5)] = 1
			return one_hot_vec

		m = self.m
		n = self.n

		user_indices, movie_indices = (np.where(self.train_data > 0))
		scores = self.train_data[self.mask]

		num_train_samples = user_indices.shape[0]

		self.train_x = np.zeros((num_train_samples, m+n))
		self.train_y = np.zeros((num_train_samples, 11))

		start = time.time()

		#construct training input and output X, y
		for i in range(num_train_samples):
			u_ind = user_indices[i]
			m_ind = movie_indices[i]

			self.train_x[i, u_ind]   = 1
			self.train_x[i, m+m_ind] = 1

			score 			= self.train_data[u_ind, m_ind]
			self.train_y[i] = change_to_one_hot(score, np.arange(0,5.5,.5))

		#construct test inputs for where we need to predict values
		user_indices, movie_indices = np.where(self.mask)
		num_test_samples = user_indices.shape[0]
		self.test_x = np.zeros((num_test_samples, m+n))
		self.test_y = np.zeros((num_test_samples, 11))

		for i in range(num_test_samples):
			u_ind = user_indices[i]
			m_ind = movie_indices[i]

			self.test_x[i, u_ind]   = 1
			self.test_x[i, m+m_ind] = 1

			score 		   = self.val_data[u_ind, m_ind]
			self.test_y[i] = change_to_one_hot(score, np.arange(0,5.5,.5))

		print(time.time() - start)


	def construct_model(self, hidden_layer_pattern = 'exponential'):
		'''
		Constructs a Neural network with a given pattern.
		The pattern indicates how many neurons should exist at every layer.
		Param:
			hidden_layer_pattern - The input layer and output layer are fixed, but the rate at which the layer sizes
			decreases depends on the parameter, hidden_layer_pattern
		'''
		model = Sequential()
		input_size = self.m + self.n
		
		# add the first layer
		model.add(Dense(input_size, activation='relu', input_shape=(input_size,)))

		#one of the two model architectures tested
		if (hidden_layer_pattern == 'linear'):
			linear_decrease = int(input_size/self.num_layers)
			for i in range(self.num_layers):
				input_size = input_size - linear_decrease
				model.add(Dense(input_size, activation='relu') )

		if (hidden_layer_pattern == 'exponential'):
			exponential_decrease = int((np.exp(np.log(input_size)/(self.num_layers+2))))
			print(exponential_decrease)
			for i in range(self.num_layers):
				input_size = int(input_size/exponential_decrease);
				model.add(Dense(input_size, activation='relu') )

		print (model.output_shape)
		#one hot encoded output
		model.add(Dense(11, activation='relu'))


		# model says they optimized the log loss error

		adam = optimizers.Adam(lr=self.learn_rate, decay=.001)
		model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

		self.model = model

	def train_model(self, model_number = 0):
		'''
		Trains the model. Saves checkpoints of the model at every epoch.
		I personally just stop training when I find that the loss function has barely changed. Since it takes
		so long to perform each epoch on my computer, I just keep running a 20 epoch train, stop it when I
		have to, then train again later.
		Param:
			model_number - Just changes the filename that the model is saved to. 
						   Don't want to overwrite good save files during training, do you?

		Note: these checkpoints are 1GB each.
		'''
		# lets make checkpoints
		filepath = "nn_model_{}_lr_{}".format(model_number,self.learn_rate)
		filepath+= "_{epoch:02d}.hdf5"

		print('learn_rate = {}'.format(self.learn_rate))
		checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
		callbacks_list = [checkpoint]

		self.model.fit(self.train_x, self.train_y, batch_size=128, epochs=5, callbacks=callbacks_list, verbose=1)

	def load_model(self, filename):
		'''
		Loads the weights of an identically architectured neural net at the given filepath
		'''
		self.model.load_weights(filename)
		adam = optimizers.Adam(lr=self.learn_rate, decay=.001)
		self.model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])



	def predict_values(self, test_type='validation'):
		'''
		Predicts values based on training or validation data
		Return:
			scores
			predicted values
		'''
		# print(self.model.get_weights())
		if (test_type == 'validation'):
			scores = self.model.predict(self.test_x, verbose=True)
			return scores, self.test_y
		elif (test_type == 'training'):
			scores = self.model.predict(self.train_x, verbose=True)
			return scores, self.train_y



In [None]:
import numpy as np
import time
import util
from nn import Collaborative_Filtering_Neural_Net

def train():
	'''
	trains a neural net and saves snapshots every epoch. Runs 20 epochs or until you quit the process.
	'''
	train_mat, val_mat, masks = util.k_cross()
	A = util.load_data_matrix()

	start = time.time()
	net = Collaborative_Filtering_Neural_Net(train_mat[0], val_mat[0], masks[0])
	net.learn_rate=.1
	net.construct_model(hidden_layer_pattern = 'exponential')
	# net.load_model('nn_model_exponential_one_hot_learn_rate_.1_lr_0.1_04.hdf5')
	net.train_model(model_number='exponential_one_hot')
	print('time taken to train in seconds:', time.time() - start)

def test(model_name = '', test_type = 'validation'):
	'''
	Gets the accuracy and validation error of a model.
	This function assumes you have been saving your models
	'''
	train_mat, val_mat, masks = util.k_cross()
	A = util.load_data_matrix()

	net = Collaborative_Filtering_Neural_Net(train_mat[0], val_mat[0], masks[0])
	net.learn_rate=.1
	net.construct_model(hidden_layer_pattern = 'exponential')
	net.load_model(model_name)

	pred_scores , true_scores= net.predict_values(test_type = test_type)
	pred_scores = pred_scores.argmax(axis=1)
	true_scores    = true_scores.argmax(axis=1)

	#get Accuracy
	num_correct = np.sum(pred_scores == true_scores)
	accuracy    = num_correct/pred_scores.shape[0]*100

	#get MSE
	error = pred_scores-true_scores
	mse   = np.mean(np.power(error, 2))

	print('The {} accuracy of the model is {}%'.format(test_type, accuracy))
	print('The {} mean squared error of the model is {}'.format(test_type, mse))

if __name__ == '__main__':
	train()
	test('nn_model_exponential_one_hot_round_2_lr_0.1_08.hdf5', test_type='training')
	test('nn_model_exponential_one_hot_round_2_lr_0.1_08.hdf5', test_type='validation')

A.shape = (671, 9066)
4.032506704330444
6
(None, 45)
learn_rate = 0.1
Epoch 1/20
 1664/90003 [..............................] - ETA: 47:47 - loss: 4.7320 - acc: 0.2686