In [1]:
%run env_setup.py
import os
import pandas as pd
import numpy as np
import keras
from lessdeep.datasets.grouplens import movielens, movielens_small

Using TensorFlow backend.


Load data

In [2]:
# Fetch both dataset variants. sample_path is later used as the directory the
# CSV files are read from.
# NOTE(review): data_path is not referenced by any of the visible cells.
data_path = movielens.download_data()
sample_path = movielens_small.download_data()

In [3]:
# Directory that every CSV below is read from.
# Presumably data_path can be swapped in to run on the larger dataset — TODO confirm.
path = sample_path

In [4]:
# Ratings table with the columns userId, movieId, rating and timestamp.
rating = pd.read_csv(os.path.join(path, 'ratings.csv'))

In [5]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


Read the movie titles; they are used only for display purposes.

In [6]:
# movieId -> title lookup table, used to label results in the analysis cells.
movies_csv = os.path.join(path, 'movies.csv')
title_by_movie = pd.read_csv(movies_csv).set_index('movieId')['title']
movie_names = title_by_movie.to_dict()

In [7]:
# Assign every user and movie a contiguous 0-based index (in order of first
# appearance) so the ids can be used directly as embedding row numbers.
users = rating['userId'].unique()
movies = rating['movieId'].unique()
userid2index = dict(zip(users, range(len(users))))
movieid2index = dict(zip(movies, range(len(movies))))

# Overwrite the raw ids with their indices. `users` / `movies` keep the
# original ids, so movies[index] translates an index back to its movieId.
rating['movieId'] = rating['movieId'].map(movieid2index)
rating['userId'] = rating['userId'].map(userid2index)

In [8]:
# Distinct user / movie counts; these set the number of rows in each embedding table.
n_users, n_movies = (rating[column].nunique() for column in ('userId', 'movieId'))
n_users, n_movies

(671, 9066)

In [9]:
# Reproducible random split: a rating lands in the training set when its
# uniform draw is below 0.8, otherwise in the validation set.
np.random.seed(7)
train_select = np.random.rand(len(rating)) < 0.8
train_set, val_set = rating[train_select], rating[~train_select]
len(train_set), len(val_set)

(79831, 20173)

## Creating models

In [10]:
# Length of each latent-factor vector; used as output_dim of both the user and
# the movie embedding.
n_factors = 50

In [11]:
# User branch: a single integer index per sample, looked up in an
# L2-regularised (n_users x n_factors) embedding table.
user_in = keras.layers.Input(shape=(1,), dtype=rating.userId.dtype, name='user_in')
user_embedding_layer = keras.layers.Embedding(
    input_dim=n_users,
    output_dim=n_factors,
    embeddings_regularizer=keras.regularizers.l2(1e-4),
)
user_embedding = user_embedding_layer(user_in)

# Movie branch: same construction with an (n_movies x n_factors) table.
movie_in = keras.layers.Input(shape=(1,), dtype=rating.movieId.dtype, name='movie_in')
movie_embedding_layer = keras.layers.Embedding(
    input_dim=n_movies,
    output_dim=n_factors,
    embeddings_regularizer=keras.regularizers.l2(1e-4),
)
movie_embedding = movie_embedding_layer(movie_in)

In [12]:
# The predicted rating is the dot product of the user and movie factor vectors.
# (Keras 1 spelling: keras.layers.merge([user_embedding, movie_embedding], mode='dot'))
dot_product = keras.layers.Dot(axes=(2, 2))([user_embedding, movie_embedding])
x = keras.layers.Flatten()(dot_product)
model = keras.Model(inputs=[user_in, movie_in], outputs=x)

# Profiling hook left disabled: build tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
# and tf.RunMetadata(), then pass them to compile() as options= / run_metadata=.
adam = keras.optimizers.Adam(0.001)
model.compile(adam, loss='mse')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        33550       user_in[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        453300      movie_in[0][0]                   
__________________________________________________________________________________________________
dot_1 (Dot

In [13]:
def train(epoches, batch_size=512):
    """Fit the notebook-level ``model`` on ``train_set``, validating on ``val_set``.

    epoches: number of epochs to run.
    batch_size: batch size forwarded to ``model.fit``.

    Note: a later cell defines another ``train`` that takes the model as its
    first argument; once that cell has run, it replaces this helper.
    """
    train_inputs = [train_set.userId, train_set.movieId]
    val_inputs = [val_set.userId, val_set.movieId]
    model.fit(train_inputs, train_set.rating,
              batch_size=batch_size, epochs=epoches,
              validation_data=(val_inputs, val_set.rating))

In [14]:
# First pass: a single epoch with batch size 1024 (train() defaults to 512).
train(1, batch_size=1024)

Train on 79831 samples, validate on 20173 samples
Epoch 1/1


In [15]:
# Assigning a float to `optimizer.lr` only rebinds the Python attribute; the
# training function built by the earlier fit() keeps reading the original
# backend variable. Write the new value into that variable instead.
keras.backend.set_value(model.optimizer.lr, 0.001)

In [22]:
# NOTE(review): this call needs the train(epoches, batch_size) helper defined
# above. The TypeError recorded below appears to come from running this cell
# after the later cell that redefines train(model, epoches, batch_size).
train(5)

TypeError: train() missing 1 required positional argument: 'epoches'

# Add bias

In [34]:
def embedding(input_dim, output_dim, dtype, name):
    """Create an id input layer and its L2-regularised embedding.

    input_dim: number of distinct ids (rows of the embedding table).
    output_dim: length of each embedding vector.
    dtype: dtype of the id input tensor.
    name: name given to the input layer.

    Returns a tuple ``(input_layer, embedded_tensor)``.
    """
    # Honour the caller-supplied dtype; it used to be hard-wired to
    # rating.userId.dtype, so the `dtype` argument was silently ignored.
    input_layer = keras.layers.Input(shape=(1,), dtype=dtype, name=name)
    embedded = keras.layers.Embedding(input_dim=input_dim, output_dim=output_dim,
                                      embeddings_regularizer=keras.regularizers.l2(1e-4))(input_layer)
    return input_layer, embedded
# Build a second pair of inputs/embeddings for the bias model. The input layers
# get distinct names, and user_in / movie_in are rebound to these new inputs.
user_in, user_emb = embedding(n_users, output_dim=n_factors, dtype=rating.userId.dtype, name="user_in_1")
movie_in, movie_emb = embedding(n_movies, output_dim=n_factors, dtype=rating.movieId.dtype, name="movie_in_1")

def create_bias(num):
    """Return a function that attaches a per-id bias term to an input layer.

    num: number of distinct ids (rows of the bias table).

    The returned callable looks up a width-1 embedding for the given input and
    flattens the result.
    """
    def apply_bias(input_l):
        flatten = keras.layers.Flatten()
        bias_lookup = keras.layers.Embedding(input_dim=num, output_dim=1)
        return flatten(bias_lookup(input_l))
    return apply_bias

# Rating estimate = dot(user factors, movie factors) + user bias + movie bias.
interaction = keras.layers.Dot(axes=(2, 2))([user_emb, movie_emb])
interaction = keras.layers.Flatten()(interaction)
users_bias = create_bias(n_users)(user_in)
movies_bias = create_bias(n_movies)(movie_in)
x = keras.layers.Add()([interaction, users_bias, movies_bias])
model_1 = keras.Model(inputs=[user_in, movie_in], outputs=x)
model_1.compile(keras.optimizers.Adam(0.001), loss='mse')
model_1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in_1 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in_1 (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 50)        33550       user_in_1[0][0]                  
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1, 50)        453300      movie_in_1[0][0]                 
__________________________________________________________________________________________________
dot_3 (Dot

In [35]:
def train(model, epoches, batch_size=512):
    """Fit ``model`` on ``train_set`` and validate on ``val_set``.

    model: object whose ``fit`` method is called.
    epoches: number of epochs to run.
    batch_size: batch size forwarded to ``model.fit``.

    Note: this replaces the earlier ``train(epoches, batch_size)`` helper of
    the same name.
    """
    train_inputs = [train_set.userId, train_set.movieId]
    val_inputs = [val_set.userId, val_set.movieId]
    model.fit(train_inputs, train_set.rating,
              batch_size=batch_size, epochs=epoches,
              validation_data=(val_inputs, val_set.rating))

In [36]:
# Use a higher learning rate for the first few epochs. Write the value into the
# optimizer's backend variable rather than rebinding `optimizer.lr` to a plain
# float, so the optimizer keeps a variable that can still be updated later.
keras.backend.set_value(model_1.optimizer.lr, 0.01)
train(model_1, 3)

Train on 79831 samples, validate on 20173 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [38]:
# Lower the learning rate for fine-tuning. Rebinding `optimizer.lr` after the
# model has been fitted does not reach the already-built training function, so
# recompile with a fresh optimizer instead. The layer weights are kept; Adam's
# moment estimates start over.
model_1.compile(keras.optimizers.Adam(0.001), loss='mse')
train(model_1, 6, batch_size=512)

Train on 79831 samples, validate on 20173 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


# Analyze

In [39]:
# groupby group rows with same key, here group users to each movie
# then count users for each movie
g = rating.groupby('movieId')['rating'].count()

# take top 2000 most polular movies
# the more user rating, the more popular the movie is. No matter rating is high or low
g_top = g.sort_values(ascending=False)[:200]
top_movie = np.array(g_top.index)

## Movie bias
The movie bias captures a general property of each movie: how well it tends to be rated overall, independent of any particular user.

In [40]:
# Sub-model that maps a movie index directly to its learned bias term.
mbias_model = keras.Model(inputs=movie_in, outputs=movies_bias)
top_mbiase = mbias_model.predict(top_movie)

In [49]:
# Pair each top movie's bias with its title. top_movie holds embedding indices,
# so movies[idx] recovers the original movieId that movie_names is keyed by.
top_titles = [movie_names[movies[idx]] for idx in top_movie]
pred_rating = [(bias[0], title) for bias, title in zip(top_mbiase, top_titles)]

In [53]:
from operator import itemgetter
sorted(pred_rating, key=itemgetter(0))[:15]

[(0.23316805, 'Ace Ventura: When Nature Calls (1995)'),
 (0.28110349, 'Blair Witch Project, The (1999)'),
 (0.42569846, 'Ace Ventura: Pet Detective (1994)'),
 (0.4436073, 'Waterworld (1995)'),
 (0.48902661, 'Batman Forever (1995)'),
 (0.54419923, 'Dumb & Dumber (Dumb and Dumber) (1994)'),
 (0.61570925, 'Demolition Man (1993)'),
 (0.66523993, 'Mask, The (1994)'),
 (0.67755139, 'Home Alone (1990)'),
 (0.71063524, 'Star Wars: Episode I - The Phantom Menace (1999)'),
 (0.71754479, 'Cliffhanger (1993)'),
 (0.72607332, 'Net, The (1995)'),
 (0.74424171, 'Austin Powers: The Spy Who Shagged Me (1999)'),
 (0.74786627, 'Armageddon (1998)'),
 (0.75457269, 'Grease (1978)')]

In [54]:
sorted(pred_rating, key=itemgetter(0), reverse=True)[:15]

[(1.969792, 'Shawshank Redemption, The (1994)'),
 (1.7130233, 'Usual Suspects, The (1995)'),
 (1.7006024, 'Godfather, The (1972)'),
 (1.6850375, 'North by Northwest (1959)'),
 (1.6786536, 'Godfather: Part II, The (1974)'),
 (1.6743218, "Schindler's List (1993)"),
 (1.6656477, 'Rear Window (1954)'),
 (1.6387142, 'Graduate, The (1967)'),
 (1.6149404, 'Dark Knight, The (2008)'),
 (1.6141309, 'Fight Club (1999)'),
 (1.6084744, 'Amadeus (1984)'),
 (1.6081483, 'American Beauty (1999)'),
 (1.594745, 'Lord of the Rings: The Fellowship of the Ring, The (2001)'),
 (1.5921611, 'Fargo (1996)'),
 (1.5898565, 'Departed, The (2006)')]

## Movie Embedding
Each movie embedding has too many dimensions to analyze directly. We can use [PCA (Principal Component Analysis)](https://en.wikipedia.org/wiki/Principal_component_analysis) to find the most important features.

In [69]:
# Sub-model exposing the movie embedding. squeeze() drops the length-1 axis of
# the prediction so that each row is one movie's factor vector.
memb_model = keras.Model(inputs=movie_in, outputs=movie_emb)
top_emb = memb_model.predict(top_movie).squeeze()
top_emb.shape

(200, 50)

In [70]:
from sklearn.decomposition import PCA

# Fit on the transposed matrix so that each principal component carries one
# weight per movie (components_ has one row per component).
pca = PCA(n_components=3)
pca.fit(top_emb.T)
movie_pca = pca.components_
movie_pca.shape

(3, 200)

In [72]:
# Label every movie's weight on the first principal component with its title.
first_component = movie_pca[0]
movie_comp_0 = [(np.squeeze(weight), movie_names[movies[idx]])
                for idx, weight in zip(top_movie, first_component)]

In [73]:
sorted(movie_comp_0, key=itemgetter(0))[:15]

[(-0.061748736, 'Batman Forever (1995)'),
 (-0.054186881, 'Ace Ventura: When Nature Calls (1995)'),
 (-0.054059129, 'Demolition Man (1993)'),
 (-0.053379644, 'Twister (1996)'),
 (-0.039314698, 'Clear and Present Danger (1994)'),
 (-0.038626142, 'Armageddon (1998)'),
 (-0.034184832, 'Net, The (1995)'),
 (-0.03115828, 'Cliffhanger (1993)'),
 (-0.025284175, 'Back to the Future Part II (1989)'),
 (-0.021968322, 'Broken Arrow (1996)'),
 (-0.021323923, 'Sleepless in Seattle (1993)'),
 (-0.020879289, 'Back to the Future Part III (1990)'),
 (-0.020588996, 'Waterworld (1995)'),
 (-0.020182498, 'Mask, The (1994)'),
 (-0.018857751, 'GoldenEye (1995)')]

In [74]:
sorted(movie_comp_0, key=itemgetter(0), reverse=True)[:15]

[(0.18720871, 'Lord of the Rings: The Fellowship of the Ring, The (2001)'),
 (0.18555619, 'Star Wars: Episode IV - A New Hope (1977)'),
 (0.18156554, 'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (0.18152682, 'Lord of the Rings: The Two Towers, The (2002)'),
 (0.17502758, 'Star Wars: Episode V - The Empire Strikes Back (1980)'),
 (0.15960774, 'Lord of the Rings: The Return of the King, The (2003)'),
 (0.15704687,
  'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)'),
 (0.15482344, 'Matrix, The (1999)'),
 (0.15457687, 'Godfather: Part II, The (1974)'),
 (0.1383649, 'Silence of the Lambs, The (1991)'),
 (0.13636601, 'Godfather, The (1972)'),
 (0.13013883, 'Pulp Fiction (1994)'),
 (0.1290341, "Schindler's List (1993)"),
 (0.12668279, 'Sixth Sense, The (1999)'),
 (0.12034744, 'Back to the Future (1985)')]