In [1]:
%run env_setup.py
import os
import pandas as pd
import numpy as np
import keras
from lessdeep.datasets.grouplens import movielens, movielens_small

Using TensorFlow backend.


Load data

In [2]:
# Fetch both GroupLens datasets through the lessdeep helpers and keep whatever
# download_data() returns (presumably a local directory path, since it is later
# passed to os.path.join -- TODO confirm).
data_path = movielens.download_data()
sample_path = movielens_small.download_data()

In [3]:
# Work on the small dataset; point `path` at `data_path` instead to use the
# other (presumably full-size) download.
path = sample_path

In [4]:
# Load the ratings table from the selected dataset directory.
ratings_csv = os.path.join(path, 'ratings.csv')
rating = pd.read_csv(ratings_csv)

In [5]:
# Peek at the first rows to check which columns were read.
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


Read the movie titles; they are only used for display.

In [6]:
# Build a movieId -> title lookup, used to label results.
movies_table = pd.read_csv(os.path.join(path, 'movies.csv'))
movie_names = movies_table.set_index('movieId')['title'].to_dict()

In [7]:
# Distinct raw ids found in the ratings table.
users = rating['userId'].unique()
movies = rating['movieId'].unique()

# Raw id -> contiguous 0-based index (the position of the id in `users` / `movies`).
userid2index = dict(zip(users, range(len(users))))
movieid2index = dict(zip(movies, range(len(movies))))

# Replace the raw ids in the ratings table with their contiguous indices.
# `movies[index]` still recovers the original movieId afterwards.
rating.movieId = rating.movieId.map(movieid2index)
rating.userId = rating.userId.map(userid2index)

In [8]:
# Number of distinct users and movies in the ratings table.
n_users, n_movies = (rating[col].nunique() for col in ('userId', 'movieId'))
n_users, n_movies

(671, 9066)

In [9]:
# Seeded random split: a row goes to the training set when its uniform draw
# is below 0.8, and to the validation set otherwise.
np.random.seed(7)
train_select = np.random.rand(len(rating)) < 0.8
train_set, val_set = rating[train_select], rating[~train_select]
len(train_set), len(val_set)

(79831, 20173)

## Creating models

In [10]:
# Length of each user / movie embedding vector (passed as output_dim to the
# Embedding layers).
n_factors = 50

In [11]:
# User branch: one integer id per sample, looked up in an L2-regularised
# embedding table with n_factors columns.
user_in = keras.layers.Input(shape=(1,), dtype=rating.userId.dtype, name='user_in')
user_embedding_layer = keras.layers.Embedding(
    input_dim=n_users,
    output_dim=n_factors,
    embeddings_regularizer=keras.regularizers.l2(1e-4),
)
user_embedding = user_embedding_layer(user_in)

# Movie branch, built the same way.
movie_in = keras.layers.Input(shape=(1,), dtype=rating.movieId.dtype, name='movie_in')
movie_embedding_layer = keras.layers.Embedding(
    input_dim=n_movies,
    output_dim=n_factors,
    embeddings_regularizer=keras.regularizers.l2(1e-4),
)
movie_embedding = movie_embedding_layer(movie_in)

In [12]:
# Predicted rating = dot product of the user and movie embedding vectors,
# taken over the embedding axis (axis 2 of both inputs) and then flattened.
x = keras.layers.Dot(axes=(2, 2))([user_embedding, movie_embedding])
x = keras.layers.Flatten()(x)
model = keras.Model(inputs=[user_in, movie_in], outputs=x)

# Mean squared error against the true rating, optimised with Adam (lr 0.001).
model.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        33550       user_in[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        453300      movie_in[0][0]                   
__________________________________________________________________________________________________
dot_1 (Dot

In [13]:
def train(epoches, batch_size=512):
    """Fit the global `model` on `train_set`, validating on `val_set`.

    epoches: number of epochs, forwarded to `fit` as `epochs`.
    batch_size: batch size forwarded to `fit`.

    Note: a later cell defines another `train` that takes the model as its
    first argument; once that cell has run it replaces this function.
    """
    train_inputs = [train_set.userId, train_set.movieId]
    val_inputs = [val_set.userId, val_set.movieId]
    model.fit(train_inputs, train_set.rating,
              batch_size=batch_size, epochs=epoches,
              validation_data=(val_inputs, val_set.rating))

In [14]:
# First pass: a single epoch with a batch size of 1024.
train(1, batch_size=1024)

Train on 79831 samples, validate on 20173 samples
Epoch 1/1


In [15]:
# Set the learning rate by writing into the optimizer's existing lr variable.
# Re-binding `model.optimizer.lr` to a plain float after `fit` has already run
# is not picked up by the training function Keras built on the first `fit`.
keras.backend.set_value(model.optimizer.lr, 0.001)

In [16]:
# Five more epochs with train's default batch size.
train(5)

Train on 79831 samples, validate on 20173 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Add bias

In [17]:
def embedding(input_dim, output_dim, dtype, name, regular=1e-4):
    """Create an id input and its L2-regularised embedding.

    input_dim: number of distinct ids (size of the embedding table).
    output_dim: length of each embedding vector.
    dtype: dtype of the id input tensor.
    name: name given to the input layer.
    regular: L2 regularisation strength applied to the embedding weights.

    Returns (input_layer, embedded), where `embedded` is the Embedding layer
    applied to `input_layer`.
    """
    # Use the dtype argument rather than a column of the global ratings table,
    # so the helper does what its signature says for any id input.
    input_layer = keras.layers.Input(shape=(1,), dtype=dtype, name=name)
    embedding_layer = keras.layers.Embedding(
        input_dim=input_dim,
        output_dim=output_dim,
        embeddings_regularizer=keras.regularizers.l2(regular),
    )
    return input_layer, embedding_layer(input_layer)
# Fresh inputs and embeddings for the bias model. This rebinds user_in and
# movie_in, which previously referred to the first model's inputs.
user_in, user_emb = embedding(n_users, output_dim=n_factors, dtype=rating.userId.dtype, name="user_in_1")
movie_in, movie_emb = embedding(n_movies, output_dim=n_factors, dtype=rating.movieId.dtype, name="movie_in_1")

def create_bias(num):
    """Return a callable that maps an id input tensor to a per-id bias.

    num: number of distinct ids (input_dim of the bias table).

    Each call of the returned function builds a new Embedding with a single
    output column, applies it to the given input and flattens the result.
    """
    def apply_bias(input_l):
        bias_table = keras.layers.Embedding(input_dim=num, output_dim=1)
        return keras.layers.Flatten()(bias_table(input_l))

    return apply_bias

# Interaction term: dot product of the user and movie embeddings, flattened.
x = keras.layers.Dot(axes=(2, 2))([user_emb, movie_emb])
x = keras.layers.Flatten()(x)

# One learned bias per user and per movie, added to the interaction term.
users_bias = create_bias(n_users)(user_in)
movies_bias = create_bias(n_movies)(movie_in)
x = keras.layers.Add()([x, users_bias, movies_bias])

model_1 = keras.Model(inputs=[user_in, movie_in], outputs=x)
model_1.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')
model_1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in_1 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in_1 (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 50)        33550       user_in_1[0][0]                  
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 50)        453300      movie_in_1[0][0]                 
__________________________________________________________________________________________________
dot_2 (Dot

In [18]:
def train(model, epoches, batch_size=512):
    """Fit `model` on `train_set`, validating on `val_set`.

    model: the compiled model to train.
    epoches: number of epochs, forwarded to `fit` as `epochs`.
    batch_size: batch size forwarded to `fit`.
    """
    train_inputs = [train_set.userId, train_set.movieId]
    val_inputs = [val_set.userId, val_set.movieId]
    model.fit(train_inputs, train_set.rating,
              batch_size=batch_size, epochs=epoches,
              validation_data=(val_inputs, val_set.rating))

In [19]:
# Start with a higher learning rate. Writing the value into the optimizer's
# lr variable (instead of re-binding the attribute to a float) keeps it a
# backend variable, so later updates and learning-rate callbacks still work.
keras.backend.set_value(model_1.optimizer.lr, 0.01)
train(model_1, 3)

Train on 79831 samples, validate on 20173 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [20]:
# Lower the learning rate to 0.001 for the remaining epochs. model_1 has
# already been trained, so re-binding `optimizer.lr` to a float would not reach
# the training function Keras has already built. Recompiling with a new Adam
# rebuilds that function at the new rate: the learned weights are kept, while
# Adam's running moment estimates start over.
model_1.compile(keras.optimizers.Adam(0.001), loss='mse')
train(model_1, 6, batch_size=512)

Train on 79831 samples, validate on 20173 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


# Analyze

In [21]:
# Group the ratings by movie and count how many ratings each movie received.
g = rating.groupby('movieId')['rating'].count()

# Keep the 200 most-rated movies. Popularity here is the number of ratings,
# regardless of whether those ratings are high or low.
g_top = g.sort_values(ascending=False).iloc[:200]
top_movie = np.array(g_top.index)

## Movie bias
The movie bias captures a general property of the movie itself, independent of any particular user.

In [22]:
# Sub-model from the movie id input straight to the movie bias output, then
# evaluate it for the most-rated movies.
mbias_model = keras.Model(inputs=movie_in, outputs=movies_bias)
top_mbiase = mbias_model.predict(top_movie)

In [23]:
# Pair each bias with its movie title; movies[index] turns the contiguous
# index back into the original movieId used by movie_names.
pred_rating = [
    (bias, movie_names[movie_id])
    for bias, movie_id in zip(top_mbiase[:, 0], movies[top_movie])
]

In [24]:
from operator import itemgetter
# The 15 movies with the lowest bias (sorted ascending on the bias value).
sorted(pred_rating, key=itemgetter(0))[:15]

[(0.26705092, 'Blair Witch Project, The (1999)'),
 (0.42768431, 'Dumb & Dumber (Dumb and Dumber) (1994)'),
 (0.49323946, 'Ace Ventura: When Nature Calls (1995)'),
 (0.55017078, 'Austin Powers: The Spy Who Shagged Me (1999)'),
 (0.57084793, 'Chicken Run (2000)'),
 (0.57818061, 'Ace Ventura: Pet Detective (1994)'),
 (0.60657442, 'Beetlejuice (1988)'),
 (0.62372738, 'Kill Bill: Vol. 1 (2003)'),
 (0.65086234, 'Meet the Parents (2000)'),
 (0.66375995, 'Kill Bill: Vol. 2 (2004)'),
 (0.67179209, 'Waterworld (1995)'),
 (0.68217385, 'Spider-Man (2002)'),
 (0.68540901, 'American Pie (1999)'),
 (0.68938136, 'Lethal Weapon (1987)'),
 (0.69118989, 'Clueless (1995)')]

In [25]:
# The 15 movies with the highest bias (sorted descending on the bias value).
sorted(pred_rating, key=itemgetter(0), reverse=True)[:15]

[(1.3445534, 'Shawshank Redemption, The (1994)'),
 (1.2478828, 'Heat (1995)'),
 (1.1993316, 'Rear Window (1954)'),
 (1.1493307, 'Fugitive, The (1993)'),
 (1.1267735, 'Silence of the Lambs, The (1991)'),
 (1.1224185, 'Fargo (1996)'),
 (1.1082902, 'Usual Suspects, The (1995)'),
 (1.1054639, "Schindler's List (1993)"),
 (1.1014304, 'Pulp Fiction (1994)'),
 (1.096014, 'Godfather, The (1972)'),
 (1.0873785, 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)'),
 (1.0763277, 'Clear and Present Danger (1994)'),
 (1.0734397, 'Dark Knight, The (2008)'),
 (1.0728868, 'Forrest Gump (1994)'),
 (1.0684099, 'Sense and Sensibility (1995)')]

## Movie Embedding
The embedding has too many dimensions to inspect directly. We can use [PCA (Principal Component Analysis)](https://en.wikipedia.org/wiki/Principal_component_analysis) to find the most important features.

In [26]:
# Sub-model from the movie id input to its embedding; drop the length-1 axis
# so each of the most-rated movies gets one row.
memb_model = keras.Model(inputs=movie_in, outputs=movie_emb)
top_emb = memb_model.predict(top_movie)
top_emb = np.squeeze(top_emb)
top_emb.shape

(200, 50)

In [27]:
from sklearn.decomposition import PCA

# Fit PCA on the transposed matrix: rows are the embedding dimensions and
# columns are the movies, so each of the 3 components holds one weight per movie.
pca = PCA(n_components=3)
pca.fit(top_emb.T)
movie_pca = pca.components_
movie_pca.shape

(3, 200)

In [28]:
# Pair each movie's weight on the first principal component with its title.
movie_comp_0 = [
    (np.squeeze(weight), movie_names[movie_id])
    for weight, movie_id in zip(movie_pca[0], movies[top_movie])
]

In [29]:
# The 15 movies with the lowest weight on the first principal component.
sorted(movie_comp_0, key=itemgetter(0))[:15]

[(-0.12532365, 'Lord of the Rings: The Fellowship of the Ring, The (2001)'),
 (-0.11365488, 'Pulp Fiction (1994)'),
 (-0.11153743, "Schindler's List (1993)"),
 (-0.11016429, 'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (-0.11009175,
  'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)'),
 (-0.1077768, 'Lord of the Rings: The Return of the King, The (2003)'),
 (-0.10640314, 'Silence of the Lambs, The (1991)'),
 (-0.10548574, 'Star Wars: Episode IV - A New Hope (1977)'),
 (-0.10178412, 'Godfather: Part II, The (1974)'),
 (-0.1013743, 'Godfather, The (1972)'),
 (-0.10077167, 'Usual Suspects, The (1995)'),
 (-0.099855773, 'Seven (a.k.a. Se7en) (1995)'),
 (-0.096807972, 'Dark Knight, The (2008)'),
 (-0.09612079, 'Lord of the Rings: The Two Towers, The (2002)'),
 (-0.095977843, 'Terminator 2: Judgment Day (1991)')]

In [30]:
# The 15 movies with the highest weight on the first principal component.
sorted(movie_comp_0, key=itemgetter(0), reverse=True)[:15]

[(-0.017855579, 'Ace Ventura: When Nature Calls (1995)'),
 (-0.021866273, 'Batman Forever (1995)'),
 (-0.022151144, 'Twister (1996)'),
 (-0.029382469, 'Firm, The (1993)'),
 (-0.031611625, 'Broken Arrow (1996)'),
 (-0.031981662, 'Star Wars: Episode I - The Phantom Menace (1999)'),
 (-0.033278599, 'Pretty Woman (1990)'),
 (-0.034962945, 'Chicken Run (2000)'),
 (-0.03496344, 'Back to the Future Part II (1989)'),
 (-0.035524014, 'Net, The (1995)'),
 (-0.035920169, 'Blair Witch Project, The (1999)'),
 (-0.036154501, 'Demolition Man (1993)'),
 (-0.037076697, 'Happy Gilmore (1996)'),
 (-0.03742753, 'Waterworld (1995)'),
 (-0.038521368, 'Cliffhanger (1993)')]

# Neural net

In [156]:
# New inputs and embeddings for the neural-net model. This rebinds user_in,
# movie_in, user_emb and movie_emb, which the analysis cells above also use.
user_in, user_emb = embedding(n_users, output_dim=n_factors, dtype=rating.userId.dtype, name="user_in_2")
movie_in, movie_emb = embedding(n_movies, output_dim=n_factors, dtype=rating.movieId.dtype, name="movie_in_2")

# Concatenate the two embeddings, then pass them through the dense head:
# flatten -> dropout -> 100-unit ReLU layer -> batch norm -> dropout -> 1 output.
x = keras.layers.Concatenate()([user_emb, movie_emb])
head_layers = [
    keras.layers.Flatten(),
    keras.layers.Dropout(0.7),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(1),
]
for layer in head_layers:
    x = layer(x)

nn = keras.Model(inputs=[user_in, movie_in], outputs=x)
nn.compile(optimizer=keras.optimizers.Adam(), loss='mse')
nn.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in_2 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in_2 (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_67 (Embedding)        (None, 1, 50)        33550       user_in_2[0][0]                  
__________________________________________________________________________________________________
embedding_68 (Embedding)        (None, 1, 50)        453300      movie_in_2[0][0]                 
__________________________________________________________________________________________________
concatenat

In [165]:
# Set the learning rate by writing into the optimizer's lr variable. Unlike
# re-binding `nn.optimizer.lr` to a float, this also takes effect when the
# cell is re-run after `nn` has already been trained.
keras.backend.set_value(nn.optimizer.lr, 0.0001)
train(nn, epoches=15, batch_size=2048)

Train on 79831 samples, validate on 20173 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
