In [37]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

In [38]:
# Mini-batch size constant for this notebook.
batch_size = 64

# Root folder of the recommendation data set (ratings.csv / movies.csv are read from here).
path = "/Users/trinakarmakar/anaconda2/data/fastai/recommendation/"

# Saved weights live in a 'models/' sub-folder, created the first time the cell runs.
model_path = path + 'models/'
if not os.path.exists(model_path):
    os.mkdir(model_path)

In [39]:
# Load the ratings table (`path` already ends with a separator) and preview it.
ratings = pd.read_csv(os.path.join(path, 'ratings.csv'))
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [40]:
# Number of rows in the ratings table.
ratings.shape[0]

100004

In [41]:
# Movie metadata, indexed by the first CSV column (movieId).
movies = pd.read_csv(os.path.join(path, 'movies.csv'), index_col=0)
movies.head(5)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [42]:
# Just for display, let's read the movie names too: a dict mapping movieId -> title.
movie_names = pd.read_csv(path+'movies.csv').set_index('movieId')['title'].to_dict()
# list(...) keeps the slice valid when dict.values() returns a view rather than a list.
print(list(movie_names.values())[:10])

['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)', 'Sudden Death (1995)', 'GoldenEye (1995)']


In [43]:
# Distinct raw user ids and movie ids that occur in the ratings table.
users = ratings['userId'].unique()
# NOTE: this rebinds `movies` (previously the movies DataFrame) to an array of raw movie ids;
# later cells index it by contiguous movie index to get the raw id back.
movies = ratings['movieId'].unique()
print(users.size)
print(movies.size)

671
9066


In [44]:
# Lookup tables: raw id -> position of that id in the `users` / `movies` arrays.
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [45]:
# Replace raw user and movie ids with contiguous integer indices - the embedding
# layers need ids in the range 0..n-1.
# NOTE: this overwrites the id columns in place, so the cell is not idempotent:
# running it twice would feed already-remapped indices through the raw-id lookups.
ratings['movieId'] = ratings['movieId'].apply(lambda raw_id: movieid2idx[raw_id])
ratings['userId'] = ratings['userId'].apply(lambda raw_id: userid2idx[raw_id])

In [46]:
# Sanity check of the remapped id ranges.
user_min, user_max = ratings.userId.min(), ratings.userId.max()
movie_min, movie_max = ratings.movieId.min(), ratings.movieId.max()
(user_min, user_max, movie_min, movie_max)

(0, 670, 0, 9065)

In [47]:
# Number of distinct users / movies; used as the embedding input dimensions below.
n_users, n_movies = ratings.userId.nunique(), ratings.movieId.nunique()
(n_users, n_movies)

(671, 9066)

In [48]:
# Number of latent factors: the output size of the user and movie embeddings built below.
n_factors = 50

In [49]:
# Seed NumPy's global RNG; the train/validation mask below is drawn from np.random.rand.
np.random.seed(42)

In [50]:
# Random train/validation split: a row goes to training when its uniform draw is below 0.8.
msk = np.random.rand(len(ratings)) < 0.8
trn, val = ratings[msk], ratings[~msk]

Create subset for Excel:


We create a crosstab of the most popular movies and most movie-addicted users which we'll copy into Excel for creating a simple example. This isn't necessary for any of the modeling below however.

Let's create a crosstab of the most popular movies and the most movie-addicted users.

In [51]:
# Ratings per user, then the 15 users with the most ratings.
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False).head(15)

In [52]:
# Ratings per movie, then the 15 movies with the most ratings.
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False).head(15)

In [53]:
# Keep only ratings given by a top user to a top movie (two successive inner joins).
top_r = (ratings
         .join(topUsers, rsuffix='_r', how='inner', on='userId')
         .join(topMovies, rsuffix='_r', how='inner', on='movieId'))

In [54]:
# User x movie table of ratings for the top users and top movies.
pd.crosstab(index=top_r.userId,
            columns=top_r.movieId,
            values=top_r.rating,
            aggfunc=np.sum)

movieId,27,49,57,72,79,89,92,99,143,179,180,197,402,417,505
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14,3.0,5.0,1.0,3.0,4.0,4.0,5.0,2.0,5.0,5.0,4.0,5.0,5.0,2.0,5.0
29,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,3.0,4.0,5.0
72,4.0,5.0,5.0,4.0,5.0,3.0,4.5,5.0,4.5,5.0,5.0,5.0,4.5,5.0,4.0
211,5.0,4.0,4.0,3.0,5.0,3.0,4.0,4.5,4.0,,3.0,3.0,5.0,3.0,
212,2.5,,2.0,5.0,,4.0,2.5,,5.0,5.0,3.0,3.0,4.0,3.0,2.0
293,3.0,,4.0,4.0,4.0,3.0,,3.0,4.0,4.0,4.5,4.0,4.5,4.0,
310,3.0,3.0,5.0,4.5,5.0,4.5,2.0,4.5,4.0,3.0,4.5,4.5,4.0,3.0,4.0
379,5.0,5.0,5.0,4.0,,4.0,5.0,4.0,4.0,4.0,,3.0,5.0,4.0,4.0
451,4.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,4.0,2.0,3.5,5.0
467,3.0,3.5,3.0,2.5,,,3.0,3.5,3.5,3.0,3.5,3.0,3.0,4.0,4.0


The most popular model is the dot product of a movie embedding and a user embedding. Let's see how this works.

In [55]:
# One integer id input per side, each feeding an L2-regularised embedding of n_factors values.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors,
              input_length=1,
              W_regularizer=l2(1e-4))(user_in)

movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors,
              input_length=1,
              W_regularizer=l2(1e-4))(movie_in)

In [56]:
# Predicted rating = dot product of the user embedding and the movie embedding.
x = merge([u, m], mode='dot')
x = Flatten()(x)
model = Model([user_in, movie_in], x)
model.compile(optimizer=Adam(0.001), loss='mse')

In [35]:
# Train the current `model` for one epoch, scoring against the validation split.
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          nb_epoch=1,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80099 samples, validate on 19905 samples
Epoch 1/1


<keras.callbacks.History at 0x1139637d0>

In [21]:
# NOTE(review): this rebinds optimizer.lr to a plain Python float; whether an
# already-built training function picks that up depends on the Keras version -
# confirm (the backend's set_value is the documented way to change a live rate).
model.optimizer.lr = 0.001
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          nb_epoch=1,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80099 samples, validate on 19905 samples
Epoch 1/1


<keras.callbacks.History at 0x1152891d0>

In [57]:
# Two more epochs with optimizer.lr set to 0.001.
# NOTE(review): plain attribute assignment may not reach an already-built
# training function - confirm against the installed Keras version.
model.optimizer.lr = 0.001
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          nb_epoch=2,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80099 samples, validate on 19905 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1165b0f10>

We are not there yet: we are missing bias terms that encode how positive or negative each user is and how good each movie is.
We can add them easily by creating a single-output embedding for each user and each movie and adding it to our output.

Bias:

The problem is likely to be that we don't have bias terms - that is, a single bias for each user and each movie representing how positive or negative each user is, and how good each movie is. We can add that easily by simply creating an embedding with one output for each movie and each user, and adding it to our output.

In [66]:
def embedding_input(name, n_in, n_out, reg):
    """Create an integer id input and an L2-regularised embedding applied to it.

    name  -- name given to the Input layer
    n_in  -- first Embedding argument (number of distinct ids)
    n_out -- second Embedding argument (length of each embedding vector)
    reg   -- weight handed to l2() for the embedding's W_regularizer

    Returns the pair (input tensor, embedding output tensor).
    """
    id_input = Input(shape=(1,), dtype='int64', name=name)
    embedded = Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))(id_input)
    return id_input, embedded

In [68]:
# Rebuild the user and movie inputs and embeddings through the helper.
user_in, u = embedding_input(name='user_in', n_in=n_users, n_out=n_factors, reg=1e-4)
movie_in, m = embedding_input(name='movie_in', n_in=n_movies, n_out=n_factors, reg=1e-4)

In [69]:
def create_bias(inp, n_in):
    """Return a flattened single-output embedding of `inp` (one learned value per id).

    inp  -- id input tensor the embedding is applied to
    n_in -- number of distinct ids
    """
    bias = Embedding(n_in, 1, input_length=1)(inp)
    return Flatten()(bias)

In [70]:
# Per-user and per-movie bias terms.
ub = create_bias(inp=user_in, n_in=n_users)
mb = create_bias(inp=movie_in, n_in=n_movies)

In [71]:
# Dot product of the two embeddings ...
x = merge([u, m], mode='dot')
x = Flatten()(x)
# ... plus the user bias, plus the movie bias.
x = merge([x, ub], mode='sum')
x = merge([x, mb], mode='sum')

model = Model([user_in, movie_in], x)
model.compile(optimizer=Adam(0.01), loss='mse')

In [72]:
# First epoch of the bias model.
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          nb_epoch=1,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80099 samples, validate on 19905 samples
Epoch 1/1


<keras.callbacks.History at 0x111735ad0>

In [None]:
# Four more epochs with optimizer.lr set to 0.01.
# NOTE(review): this rebinds optimizer.lr to a plain Python float; whether an
# already-built training function picks that up depends on the Keras version - confirm.
model.optimizer.lr = 0.01
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          nb_epoch=4,
          validation_data=([val.userId, val.movieId], val.rating))

In [None]:
# Five more epochs, intended to run at the lower rate of 0.001.
# NOTE(review): plain attribute assignment may not reach an already-built
# training function, in which case training continues at the old rate - confirm
# (the backend's set_value is the documented way to change a live rate).
model.optimizer.lr = 0.001
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          nb_epoch=5,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80099 samples, validate on 19905 samples
Epoch 1/5
Epoch 2/5

In [None]:
# Write the bias model's weights to disk and read them straight back
# (`model_path` already ends with a separator).
model.save_weights(os.path.join(model_path, 'bias.h5'))
model.load_weights(os.path.join(model_path, 'bias.h5'))

In [None]:
# We can predict the rating of a user for a movie: here user index 3 and movie index 6
# (contiguous indices, not raw ids). The two inputs go in as a list of arrays.
model.predict([np.array([3]), np.array([6])])

In [None]:
# Analyze results - to make the important factors easier to inspect we restrict
# ourselves to the 2000 most-rated movies.
# groupby is a method (call it, don't subscript it) and the column is 'movieId'.
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False)[:2000]
# Keep only the (contiguous) movie indices as a plain array.
topMovies = np.array(topMovies.index)

First, we'll look at the movie bias term. We create a 'model' - which in Keras is simply a way of associating one or more inputs with one or more outputs, using the functional API. Here, our input is the movie id (a single id), and the output is the movie bias (a single float).

In [None]:
# Sub-model mapping a movie index to its learned bias.
get_movie_bias = Model(movie_in, mb)
# Use .predict to evaluate on a numpy array (calling the Model itself expects a tensor).
movie_bias = get_movie_bias.predict(topMovies)
# (bias, title) pairs: movies[i] turns the contiguous index back into the raw movieId,
# which movie_names maps to the title.
movie_rating = [(b[0], movie_names[movies[i]]) for i, b in zip(topMovies, movie_bias)]

Now we can look at the top and bottom rated movies. These ratings are corrected for different levels of reviewer sentiment, as well as different types of movies that different reviewers watch.

In [None]:
# 15 movies with the lowest bias. The list is named `movie_rating`, and itemgetter must
# be passed as key= (the second positional slot of sorted is not the key function).
sorted(movie_rating, key=itemgetter(0))[:15]

In [None]:
# 15 movies with the highest bias. The list is named `movie_rating`, and itemgetter must
# be passed as key= (the second positional slot of sorted is not the key function).
sorted(movie_rating, key=itemgetter(0), reverse=True)[:15]

In [None]:
# Same idea for the embeddings: a sub-model from movie index to its embedding,
# evaluated on the top movies with singleton axes squeezed out.
get_movie_emb = Model(movie_in, m)
movie_emb = get_movie_emb.predict([topMovies]).squeeze()
movie_emb.shape

In [None]:
# 50 embedding dimensions are hard to interpret, so use PCA to reduce them to 3.
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
# PCA is fitted on the transposed embedding matrix; the rows of components_ are
# zipped against topMovies below, i.e. each component is read as one value per movie.
movie_pca = pca.fit(movie_emb.T).components_

In [None]:
# First principal component (row 0 of the PCA components).
fca0 = movie_pca[0]

In [None]:
# (component value, title) pairs for the first component; the tuple must be closed
# before the `for` clause for the comprehension to parse.
movie_comp = [(f, movie_names[movies[i]]) for f, i in zip(fca0, topMovies)]

Here is the first component. It seems to capture how critically acclaimed or 'classic' a movie is.

In [None]:
# 10 movies with the lowest value on this component (itemgetter goes in as key=).
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
# 10 movies with the highest value on this component (itemgetter goes in as key=).
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
# Second principal component (row 1 of the PCA components).
fca1 = movie_pca[1]

In [None]:
# (component value, title) pairs for the second component; the tuple must be closed
# before the `for` clause for the comprehension to parse.
movie_comp = [(f, movie_names[movies[i]]) for f, i in zip(fca1, topMovies)]

In [None]:
# 10 movies with the lowest value on this component (itemgetter goes in as key=).
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
# 10 movies with the highest value on this component (itemgetter goes in as key=).
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
# Third principal component (row 2 of the PCA components).
fca2 = movie_pca[2]

In [None]:
# (component value, title) pairs for the third component; the tuple must be closed
# before the `for` clause for the comprehension to parse.
movie_comp = [(f, movie_names[movies[i]]) for f, i in zip(fca2, topMovies)]

In [None]:
# 10 movies with the lowest value on this component (itemgetter goes in as key=).
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
# 10 movies with the highest value on this component (itemgetter goes in as key=).
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
# Next we draw how these movies are placed along the components - we will look at the
# 1st and 3rd components.
# Python 2 only: re-import sys and make UTF-8 the default string encoding, presumably so
# that non-ASCII movie titles can be drawn as plot labels below - TODO confirm.
# NOTE(review): reload(sys) is reported to reset sys.stdout in some notebook
# front-ends - verify that cell output still appears after running this.
reload(sys)
sys.setdefaultencoding('utf8')

In [None]:
# Scatter a slice of the top movies on the 1st vs 3rd principal components and
# label every point with its title.
start, end = 50, 100
X = fca0[start:end]
Y = fca2[start:end]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
# px/py (rather than x/y) so the loop does not clobber the notebook-level tensor `x`.
for i, px, py in zip(topMovies[start:end], X, Y):
    plt.text(px, py, movie_names[movies[i]], color=np.random.rand(3)*.7, fontsize=14)
plt.show()

Neural Net:
    
Rather than creating a special-purpose architecture (like our dot-product-with-bias example), it's often both easier and more accurate to use a standard neural network. For that we need to concatenate the user and movie embeddings and feed the result into the neural network.

In [None]:
# Fresh inputs and embeddings for the neural-net model.
user_in, u = embedding_input('user_in', n_users, n_factors, 1e-4)
movie_in, m = embedding_input('movie_in', n_movies, n_factors, 1e-4)

# Concatenate the two embeddings and feed them through a small dense network.
x = merge([u, m], mode='concat')
x = Flatten()(x)
x = Dropout(0.2)(x)   # the layer has to be applied to x, not assigned in its place
x = Dense(70, activation='relu')(x)
x = Dropout(0.75)(x)
x = Dense(1)(x)
nn = Model([user_in, movie_in], x)
nn.compile(Adam(0.001), loss='mse')

# Train on the same split as the earlier models (columns of the trn / val frames).
nn.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=8,
       validation_data=([val.userId, val.movieId], val.rating))