# Movie Recommendation 


## The MovieLens Dataset
Contains 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 MovieLens users.


# Data Processing

In [2]:
import pandas as pd
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating','user_emb_id'])

# Reading users file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])


print(users.shape, len(users))
print(movies.shape, len(movies))
print(ratings.shape, len(ratings))

(6040, 5) 6040
(3883, 3) 3883
(1000209, 4) 1000209


# Data Preparation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Reading users file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

## Set dataset size

In [4]:
import numpy as np
percent  = 0.01
chosen_idx = np.random.choice(len(users), replace=False, size=int(len(users)*percent))
users = users.iloc[chosen_idx]
chosen_idx = np.random.choice(len(movies), replace=False, size=int(len(movies)*percent))
movies = movies.iloc[chosen_idx]
chosen_idx = np.random.choice(len(ratings), replace=False, size=int(len(ratings)*percent))
ratings = ratings.iloc[chosen_idx]
print(ratings.shape)

(10002, 3)


## Two collaborative filtering approaches: Memory-based approach and Model-based approach

In [5]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://cdn-images-1.medium.com/max/2000/1*7uW5hLXztSu_FOmZOWpB6g.png")

# 1. Memory based approach: 
### User-based filtering and Item-based filtering

In [6]:
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances

# Fill NaN values in user_id and movie_id column with 0
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

train_data, test_data = cv.train_test_split(ratings, test_size=0.2)

# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.as_matrix(columns = ['rating'])
test_data_matrix = test_data.as_matrix(columns = ['rating'])

# Check their shape

print(test_data_matrix.shape)

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
#print(user_correlation[:4, :4])

# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
#print(item_correlation[:4, :4])

# Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    pred = np.array(pred)
    return pred



(2001, 1)


In [7]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
# RMSE on the train data
print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)))


User-based CF RMSE: 1.5748840814691625
Item-based CF RMSE: 1.5748840814691625
User-based CF RMSE: 0.0
Item-based CF RMSE: 0.0





# Deep Learning Models

The basic idea is that the actual ratings of movies for each user can be represented by a matrix, say of users on the rows and movies along the columns.  
We don’t have the full rating matrix, instead, we have a very sparse set of entries. 
But if we could factor the rating matrix into two separate matrices, say one that was Used by Latent Factors, and one that was Latent Factors by Movies, then we could find the user’s rating for any movie by taking the dot product of the User row and the Movie column.

Y = Ratings.
X1, X2 = Movies, Users.

# Deep Learning Model 1

In [32]:
import numpy as np
from keras.layers import Embedding, Reshape, Merge
from keras.models import Sequential

class CFModel(Sequential):

    # The constructor for the class
    def __init__(self, n_users, m_items, k_factors, **kwargs):
        # P is the embedding layer that creates an User by latent factors matrix.
        # If the intput is a user_id, P returns the latent factor vector for that user.
        P = Sequential()
        P.add(Embedding(n_users, k_factors, input_length=1))
        P.add(Reshape((k_factors,)))

        # Q is the embedding layer that creates a Movie by latent factors matrix.
        # If the input is a movie_id, Q returns the latent factor vector for that movie.
        Q = Sequential()
        Q.add(Embedding(m_items, k_factors, input_length=1))
        Q.add(Reshape((k_factors,)))

        super(CFModel, self).__init__(**kwargs)
        
        # The Merge layer takes the dot product of user and movie latent factor vectors to return the corresponding rating.
        self.add(Merge([P, Q], mode='dot', dot_axes=1))

    # The rate function to predict user's rating of unrated items
    def rate(self, user_id, item_id):
        return self.predict([np.array([user_id]), np.array([item_id])])[0][0]


In [34]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint


ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
max_userid = ratings['user_id'].drop_duplicates().max()
max_movieid = ratings['movie_id'].drop_duplicates().max()

users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

import numpy as np
percent  = 0.2
chosen_idx = np.random.choice(len(users), replace=False, size=int(len(users)*percent))
users = users.iloc[chosen_idx]
chosen_idx = np.random.choice(len(movies), replace=False, size=int(len(movies)*percent))
movies = movies.iloc[chosen_idx]
chosen_idx = np.random.choice(len(ratings), replace=False, size=int(len(ratings)*percent))
ratings = ratings.iloc[chosen_idx]


print(user_20.shape, movie_20.shape, ratings_20.shape)
# Create training set
shuffled_ratings = ratings.sample(frac=1.)


# Shuffling users
Users = shuffled_ratings['user_emb_id'].values
print('Users:', Users, ', shape =', Users.shape)
# Shuffling movies
Movies = shuffled_ratings['movie_emb_id'].values
print('Movies:', Movies, ', shape =', Movies.shape)

# Shuffling ratings
Ratings = shuffled_ratings['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

(1208, 5) (776, 3) (200041, 3)
Users: [5503 5687 3945 ... 4169  215  756] , shape = (200041,)
Movies: [2825 1672 3173 ... 1372 3143  456] , shape = (200041,)
Ratings: [4 1 3 ... 3 3 5] , shape = (200041,)


In [None]:
# Define constants
K_FACTORS = 100 # The number of dimensional embeddings for movies and users
TEST_USER = 2000 # A random test user (user_id = 2000)

# Define model
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Compile the model using MSE as the loss function and the AdaMax learning algorithm
model.compile(loss='mse', optimizer='adamax')

# Callbacks monitor the validation loss
# Save the model weights each time the validation loss has improved
callbacks = [EarlyStopping('val_loss', patience=2), 
             ModelCheckpoint('weights.h5', save_best_only=True)]

# Use 30 epochs, 90% training data, 10% validation data 
history = model.fit([Users, Movies], Ratings, nb_epoch=30, validation_split=.1, verbose=2, callbacks=callbacks)

# Show the best validation RMSE
min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))

# Use the pre-trained model
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Load weights
trained_model.load_weights('weights.h5')

# Pick a random test user
users[users['user_id'] == TEST_USER]

# Function to predict the ratings given User ID and Movie ID
def predict_rating(user_id, movie_id):
    return trained_model.rate(user_id - 1, movie_id - 1)

user_ratings = ratings[ratings['user_id'] == TEST_USER][['user_id', 'movie_id', 'rating']]
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)
user_ratings.sort_values(by='rating', 
                         ascending=False).merge(movies, 
                                                on='movie_id', 
                                                how='inner', 
                                                suffixes=['_u', '_m']).head(20)

recommendations = ratings[ratings['movie_id'].isin(user_ratings['movie_id']) == False][['movie_id']].drop_duplicates()
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)
recommendations.sort_values(by='prediction',
                          ascending=False).merge(movies,
                                                 on='movie_id',
                                                 how='inner',
                                                 suffixes=['_u', '_m']).head(20)



Train on 180036 samples, validate on 20005 samples
Epoch 1/30
 - 56s - loss: 14.0542 - val_loss: 13.8855
Epoch 2/30
 - 55s - loss: 12.6226 - val_loss: 10.3997
Epoch 3/30
 - 59s - loss: 7.3539 - val_loss: 5.3291
Epoch 4/30
 - 52s - loss: 4.0102 - val_loss: 3.3089
Epoch 5/30
 - 50s - loss: 2.6277 - val_loss: 2.3712
Epoch 6/30
 - 49s - loss: 1.9542 - val_loss: 1.8864
Epoch 7/30
 - 53s - loss: 1.5840 - val_loss: 1.6042
Epoch 8/30
 - 54s - loss: 1.3594 - val_loss: 1.4296
Epoch 9/30
 - 54s - loss: 1.2112 - val_loss: 1.3092
Epoch 10/30


# Deep Learning Model 2

In [None]:
import numpy as np
import time

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import MaxPooling2D
from keras import optimizers


from keras.models import load_model
from keras.callbacks import ModelCheckpoint

from keras.utils import np_utils

class Collaborative_Filtering_Neural_Net(object):

	def __init__(self, train_data, val_data, mask, num_layers=3, learn_rate=.2):

		self.train_data = train_data
		self.val_data   = val_data
		self.mask       = mask
		self.num_layers = num_layers

		self.m          = self.train_data.shape[0]
		self.n 			= self.train_data.shape[1]
		
		self.learn_rate = learn_rate

		self.construct_input()


	def construct_input(self):
		'''
		Construct training input/output from the training data matrix
		and 
		Construct validation input/output from the training/validation 
		'''
		def change_to_one_hot(value, value_range):
			one_hot_vec = np.zeros(len(value_range))
			one_hot_vec[int(value/.5)] = 1
			return one_hot_vec


		m = self.m
		n = self.n

		user_indices, movie_indices = (np.where(self.train_data > 0))
		scores = self.train_data[self.mask]

		num_train_samples = user_indices.shape[0]

		self.train_x = np.zeros((num_train_samples, m+n))
		self.train_y = np.zeros((num_train_samples, 11))

		start = time.time()

		#construct training input and output X, y
		for i in range(num_train_samples):
			u_ind = user_indices[i]
			m_ind = movie_indices[i]

			self.train_x[i, u_ind]   = 1
			self.train_x[i, m+m_ind] = 1

			score 			= self.train_data[u_ind, m_ind]
			self.train_y[i] = change_to_one_hot(score, np.arange(0,5.5,.5))



		#construct test inputs for where we need to predict values
		user_indices, movie_indices = np.where(self.mask)
		num_test_samples = user_indices.shape[0]
		self.test_x = np.zeros((num_test_samples, m+n))
		self.test_y = np.zeros((num_test_samples, 11))

		for i in range(num_test_samples):
			u_ind = user_indices[i]
			m_ind = movie_indices[i]

			self.test_x[i, u_ind]   = 1
			self.test_x[i, m+m_ind] = 1

			score 		   = self.val_data[u_ind, m_ind]
			self.test_y[i] = change_to_one_hot(score, np.arange(0,5.5,.5))

		print(time.time() - start)


	def construct_model(self, hidden_layer_pattern = 'exponential'):
		'''
		Constructs a Neural network with a given pattern.
		The pattern indicates how many neurons should exist at every layer.
		Param:
			hidden_layer_pattern - The input layer and output layer are fixed, but the rate at which the layer sizes
			decreases depends on the parameter, hidden_layer_pattern
		'''
		model = Sequential()
		input_size = self.m + self.n
		
		# add the first layer
		model.add(Dense(input_size, activation='relu', input_shape=(input_size,)))

		#one of the two model architectures tested
		if (hidden_layer_pattern == 'linear'):
			linear_decrease = int(input_size/self.num_layers)
			for i in range(self.num_layers):
				input_size = input_size - linear_decrease
				model.add(Dense(input_size, activation='relu') )

		if (hidden_layer_pattern == 'exponential'):
			exponential_decrease = int((np.exp(np.log(input_size)/(self.num_layers+2))))
			print(exponential_decrease)
			for i in range(self.num_layers):
				input_size = int(input_size/exponential_decrease);
				model.add(Dense(input_size, activation='relu') )

		print (model.output_shape)
		#one hot encoded output
		model.add(Dense(11, activation='relu'))


		# model says they optimized the log loss error

		adam = optimizers.Adam(lr=self.learn_rate, decay=.001)
		model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

		self.model = model

	def train_model(self, model_number = 0):
		'''
		Trains the model. Saves checkpoints of the model at every epoch.
		I personally just stop training when I find that the loss function has barely changed. Since it takes
		so long to perform each epoch on my computer, I just keep running a 20 epoch train, stop it when I
		have to, then train again later.
		Param:
			model_number - Just changes the filename that the model is saved to. 
						   Don't want to overwrite good save files during training, do you?

		Note: these checkpoints are 1GB each.
		'''
		# lets make checkpoints
		filepath = "nn_model_{}_lr_{}".format(model_number,self.learn_rate)
		filepath+= "_{epoch:02d}.hdf5"

		print('learn_rate = {}'.format(self.learn_rate))
		checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
		callbacks_list = [checkpoint]

		self.model.fit(self.train_x, self.train_y, batch_size=128, epochs=20, callbacks=callbacks_list, verbose=1)

	def load_model(self, filename):
		'''
		Loads the weights of an identically architectured neural net at the given filepath
		'''
		self.model.load_weights(filename)
		adam = optimizers.Adam(lr=self.learn_rate, decay=.001)
		self.model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])



	def predict_values(self, test_type='validation'):
		'''
		Predicts values based on training or validation data
		Return:
			scores
			predicted values
		'''
		# print(self.model.get_weights())
		if (test_type == 'validation'):
			scores = self.model.predict(self.test_x, verbose=True)
			return scores, self.test_y
		elif (test_type == 'training'):
			scores = self.model.predict(self.train_x, verbose=True)
			return scores, self.train_y



In [None]:
import numpy as np
import time
import util
from nn import Collaborative_Filtering_Neural_Net

def train():
	'''
	trains a neural net and saves snapshots every epoch. Runs 20 epochs or until you quit the process.
	'''
	train_mat, val_mat, masks = util.k_cross()
	A = util.load_data_matrix()

	start = time.time()
	net = Collaborative_Filtering_Neural_Net(train_mat[0], val_mat[0], masks[0])
	net.learn_rate=.1
	net.construct_model(hidden_layer_pattern = 'exponential')
	# net.load_model('nn_model_exponential_one_hot_learn_rate_.1_lr_0.1_04.hdf5')
	net.train_model(model_number='exponential_one_hot')
	print('time taken to train in seconds:', time.time() - start)

def test(model_name = '', test_type = 'validation'):
	'''
	Gets the accuracy and validation error of a model.
	This function assumes you have been saving your models
	'''
	train_mat, val_mat, masks = util.k_cross()
	A = util.load_data_matrix()

	net = Collaborative_Filtering_Neural_Net(train_mat[0], val_mat[0], masks[0])
	net.learn_rate=.1
	net.construct_model(hidden_layer_pattern = 'exponential')
	net.load_model(model_name)

	pred_scores , true_scores= net.predict_values(test_type = test_type)
	pred_scores = pred_scores.argmax(axis=1)
	true_scores    = true_scores.argmax(axis=1)

	#get Accuracy
	num_correct = np.sum(pred_scores == true_scores)
	accuracy    = num_correct/pred_scores.shape[0]*100

	#get MSE
	error = pred_scores-true_scores
	mse   = np.mean(np.power(error, 2))

	print('The {} accuracy of the model is {}%'.format(test_type, accuracy))
	print('The {} mean squared error of the model is {}'.format(test_type, mse))

if __name__ == '__main__':
	train()
	test('nn_model_exponential_one_hot_round_2_lr_0.1_08.hdf5', test_type='training')
	test('nn_model_exponential_one_hot_round_2_lr_0.1_08.hdf5', test_type='validation')