In [4]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from CFModel import CFModel


# Load the tab-separated ratings file, keeping only the columns used below
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
# Largest raw IDs present in the ratings (max() already ignores duplicates)
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()
# Load user and movie metadata (same tab-separated, latin-1 format)
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

# Fixed seed so that sampling/shuffling gives the same rows in the same order
# on every "Restart & Run All"
RANDOM_SEED = 42

# 2% random sample of the ratings
small_data = ratings.sample(frac=0.02, random_state=RANDOM_SEED)

# Create training set: all ratings in a (reproducible) random order.
# NOTE(review): if the model's fit() is the standard Keras one, validation_split
# holds out the tail of these arrays, so this order also fixes the split - confirm.
shuffled_ratings = ratings.sample(frac=1., random_state=RANDOM_SEED)

# User embedding indices, in shuffled order
Users = shuffled_ratings['user_emb_id'].values
print('Users: {} , shape = {}'.format(Users, Users.shape))

# Movie embedding indices, in the same shuffled order
Movies = shuffled_ratings['movie_emb_id'].values
print('Movies: {} , shape = {}'.format(Movies, Movies.shape))

# Ratings (the regression target), in the same shuffled order
Ratings = shuffled_ratings['rating'].values
print('Ratings: {} , shape = {}'.format(Ratings, Ratings.shape))

Users: [1779 1922 2094 ... 1740 2527 5919] , shape = (1000209,)
Movies: [2114 3792 1126 ... 2577 1376  589] , shape = (1000209,)
Ratings: [3 4 2 ... 4 2 5] , shape = (1000209,)


In [5]:
# --- Constants ---
K_FACTORS = 100   # number of embedding dimensions used for both users and movies
TEST_USER = 2000  # user_id of the user inspected later in the notebook

# --- Model ---
# Build the collaborative-filtering model, then compile it with the AdaMax
# optimizer and a mean-squared-error loss
model = CFModel(max_userid, max_movieid, K_FACTORS)
model.compile(optimizer='adamax', loss='mse')

# --- Callbacks (both watch the validation loss) ---
# Stop training once val_loss has not improved for 2 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
# Write the weights to disk only when val_loss reaches a new best
best_checkpoint = ModelCheckpoint(filepath='weights.h5', save_best_only=True)
callbacks = [early_stopping, best_checkpoint]

Train on 900188 samples, validate on 100021 samples
Epoch 1/30
 - 414s - loss: 8.3262 - val_loss: 2.3057
Epoch 2/30
 - 417s - loss: 1.4956 - val_loss: 1.1396
Epoch 3/30
 - 438s - loss: 1.0002 - val_loss: 0.9460
Epoch 4/30
 - 447s - loss: 0.8887 - val_loss: 0.8825
Epoch 5/30
 - 437s - loss: 0.8408 - val_loss: 0.8508
Epoch 6/30
 - 426s - loss: 0.8096 - val_loss: 0.8287
Epoch 7/30
 - 405s - loss: 0.7847 - val_loss: 0.8138
Epoch 8/30
 - 401s - loss: 0.7632 - val_loss: 0.7987
Epoch 9/30
 - 417s - loss: 0.7428 - val_loss: 0.7887
Epoch 10/30
 - 435s - loss: 0.7227 - val_loss: 0.7804
Epoch 11/30
 - 426s - loss: 0.7022 - val_loss: 0.7718
Epoch 12/30
 - 421s - loss: 0.6816 - val_loss: 0.7662
Epoch 13/30
 - 426s - loss: 0.6609 - val_loss: 0.7601
Epoch 14/30
 - 420s - loss: 0.6406 - val_loss: 0.7577
Epoch 15/30
 - 424s - loss: 0.6197 - val_loss: 0.7547
Epoch 16/30
 - 431s - loss: 0.5994 - val_loss: 0.7538
Epoch 17/30
 - 413s - loss: 0.5789 - val_loss: 0.7545
Epoch 18/30
 - 420s - loss: 0.5589 - va

Unnamed: 0,movie_id,prediction,title,genres
0,3092,5.052228,Chushingura (1962),Drama
1,1423,5.042253,Hearts and Minds (1996),Drama
2,2905,5.040606,Sanjuro (1962),Action|Adventure
3,858,5.011073,"Godfather, The (1972)",Action|Crime|Drama
4,326,4.993318,To Live (Huozhe) (1994),Drama
5,3091,4.962155,Kagemusha (1980),Drama|War
6,919,4.943676,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical
7,668,4.91996,Pather Panchali (1955),Drama
8,649,4.915228,Cold Fever (Á köldum klaka) (1994),Comedy|Drama
9,3030,4.91478,Yojimbo (1961),Comedy|Drama|Western


In [None]:
# Train for at most 30 epochs on 90% of the data, validating on the remaining 10%.
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
# verbose=2 logs one line per epoch; the callbacks may stop training early.
history = model.fit([Users, Movies], Ratings,
                    epochs=30,
                    validation_split=.1,
                    verbose=2,
                    callbacks=callbacks)

In [None]:
# Train for at most 100 epochs on 90% of the data, validating on the remaining 10%.
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
# NOTE(review): this reuses the same `model` object, so if an earlier fit cell was
# run it presumably continues from those weights rather than starting over - confirm.
history = model.fit([Users, Movies], Ratings,
                    epochs=100,
                    validation_split=.1,
                    verbose=2,
                    callbacks=callbacks)

In [None]:
# Show the best validation RMSE.
# val_loss is a mean squared error, so its square root is the RMSE.
val_losses = history.history['val_loss']
# min() over (loss, position) pairs returns the lowest loss and, on ties, the earliest epoch
min_val_loss, idx = min((loss, epoch) for (epoch, loss) in enumerate(val_losses))
# Single formatted string: prints the same text on Python 2 and Python 3 (idx is 0-based)
print('Minimum RMSE at epoch {:d} = {:.4f}'.format(idx + 1, math.sqrt(min_val_loss)))

# Restore the pre-trained model: build a fresh CFModel with the same
# dimensions, then load the saved weights from disk into it
weights_file = 'weights.h5'
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
trained_model.load_weights(weights_file)

# Look up the profile row of the test user (user_id == TEST_USER)
is_test_user = users['user_id'] == TEST_USER
users.loc[is_test_user]

# Predict the rating for a given User ID and Movie ID
def predict_rating(user_id, movie_id):
    """Return the rating `trained_model` predicts for a user/movie pair.

    Both IDs are shifted down by one before being handed to
    `trained_model.rate`.
    NOTE(review): presumably because the model is indexed from 0 while the
    raw IDs start at 1 - confirm against CFModel.

    Args:
        user_id: raw user ID.
        movie_id: raw movie ID.

    Returns:
        Whatever `trained_model.rate` returns for the shifted pair.
    """
    user_index = user_id - 1
    movie_index = movie_id - 1
    return trained_model.rate(user_index, movie_index)

# Ratings the test user actually gave. A single .loc selection replaces the
# chained ratings[mask][cols] lookup, and .copy() makes the result an independent
# frame so the column added below can never alias (or warn about) `ratings`.
is_test_user_rating = ratings['user_id'] == TEST_USER
user_ratings = ratings.loc[is_test_user_rating, ['user_id', 'movie_id', 'rating']].copy()

# Model prediction for every movie the test user rated, next to the true rating
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Top 20 by actual rating, joined with the movie metadata
(user_ratings
 .sort_values(by='rating', ascending=False)
 .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
 .head(20))

# Candidate movies: every movie appearing in `ratings` that the test user has
# not rated. `~mask` replaces the `mask == False` comparison, and one .loc
# selection plus an explicit copy replaces the chained indexing.
already_rated = ratings['movie_id'].isin(user_ratings['movie_id'])
recommendations = ratings.loc[~already_rated, ['movie_id']].drop_duplicates().copy()

# Predicted rating of the test user for each candidate movie
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Top 20 candidates by predicted rating, joined with the movie metadata
(recommendations
 .sort_values(by='prediction', ascending=False)
 .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
 .head(20))