## Initialize

In [43]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably so long outputs (e.g. training logs) are shown in
// full instead of inside a scroll box - confirm.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [1]:
from theano.sandbox import cuda

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


In [3]:
# Dataset location; switch to the commented-out path to use the small sample.
path = "data/ml-20m/"
#path = "data/ml-latest-small/"

# Directory for saved models, nested under the dataset directory.
model_path = path + 'models/'
# makedirs (rather than mkdir) also creates any missing parent directory, so
# this cell does not crash when `path` itself has not been created yet.
if not os.path.isdir(model_path):
    os.makedirs(model_path)

In [4]:
# Mini-batch size shared by every fit/predict call in this notebook.
batch_size = 16 * 1024

## Set up data

We're working with the MovieLens data, which contains one rating per row. We also derive the weekday, month and hour of each rating from its timestamp, like this:

In [5]:
# Load the raw ratings with explicit column dtypes.
ratings = pd.read_csv(
    path + 'ratings.csv',
    dtype={'userId': 'int', 'movieId': 'int', 'rating': 'float', 'timestamp': 'int'},
)

# `timestamp` is in Unix seconds; convert once and reuse for every time feature.
rated_at = pd.to_datetime(ratings.timestamp, unit='s')
rated_at_index = pd.DatetimeIndex(rated_at)

# One assign per column keeps the column order time, weekday, month, hour.
ratings = (
    ratings
    .assign(time=rated_at)
    .assign(weekday=rated_at_index.dayofweek)
    .assign(month=rated_at_index.month - 1)   # shifted so months start at 0
    .assign(hour=rated_at_index.hour)
)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,time,weekday,month,hour
0,1,2,3.5,1112486027,2005-04-02 23:53:47,5,3,23
1,1,29,3.5,1112484676,2005-04-02 23:31:16,5,3,23
2,1,32,3.5,1112484819,2005-04-02 23:33:39,5,3,23
3,1,47,3.5,1112484727,2005-04-02 23:32:07,5,3,23
4,1,50,3.5,1112484580,2005-04-02 23:29:40,5,3,23


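Just for display purposes, we'll read in the movie names too (this happens later, in the *Analyze Results* section). First, let's collect the unique user and movie ids.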

In [6]:
# Distinct raw ids found in the ratings table (one array per id column).
users, movies = ratings['userId'].unique(), ratings['movieId'].unique()

In [7]:
# Lookup tables from raw id to its position in the unique-id arrays, which
# gives every user and movie a contiguous 0-based index.
userid2idx = {raw_id: position for position, raw_id in enumerate(users)}
movieid2idx = {raw_id: position for position, raw_id in enumerate(movies)}

We update the movie and user ids so that they are contiguous integers, which we want when using embeddings.

In [8]:
# Replace raw ids with their contiguous indices, in place.
# NOTE: this overwrites the id columns, so running the cell a second time would
# push already-remapped indices through the lookup tables again.
ratings['movieId'] = ratings['movieId'].apply(lambda raw_id: movieid2idx[raw_id])
ratings['userId'] = ratings['userId'].apply(lambda raw_id: userid2idx[raw_id])

In [9]:
# Number of distinct values per categorical feature; these become the
# vocabulary sizes of the embedding layers.
n_users, n_movies, n_weekdays, n_months, n_hours = (
    ratings[column].nunique()
    for column in ('userId', 'movieId', 'weekday', 'month', 'hour')
)
(n_users, n_movies, n_weekdays, n_months, n_hours)

(138493, 26744, 7, 12, 24)

In [10]:
# Seed NumPy's global RNG so the random train/validation split below is
# repeatable across kernel restarts.
# `seed` must be *called*: assigning `np.random.seed = 42` would replace the
# function with an integer and seed nothing.
np.random.seed(42)

Randomly split into training and validation.

In [11]:
# Draw one uniform number per rating; rows below 0.8 go to training and the
# remaining rows to validation.
msk = np.random.rand(ratings.shape[0]) < 0.8
trn, val = ratings[msk], ratings[~msk]

##  Neural net

Rather than creating a special purpose architecture (like our dot-product with bias earlier), it's often both easier and more accurate to use a standard neural network. Let's try it! Here, we simply concatenate the user and movie embeddings — together with embeddings for the weekday, month and hour of each rating — into a single vector, which we feed into the neural net.

In [54]:
# --- Inputs: one integer index per sample for each of the five features. ---
user_in = Input(shape=(1,), dtype='int64', name='user')
movie_in = Input(shape=(1,), dtype='int64', name='movie')
weekday_in = Input(shape=(1,), dtype='int64', name='weekday')
month_in = Input(shape=(1,), dtype='int64', name='month')
hour_in = Input(shape=(1,), dtype='int64', name='hour')

# --- Embedding layers: vocabulary size, then latent size (4, 6, 2, 2, 2). ---
# The layer objects are kept in variables so they can be re-applied later.
user_latents = Embedding(n_users, 4, input_length=1, name='user latents')
movie_latents = Embedding(n_movies, 6, input_length=1, name='movie latents')
weekday_latents = Embedding(n_weekdays, 2, input_length=1, name='weekday latents')
month_latents = Embedding(n_months, 2, input_length=1, name='month latents')
hour_latents = Embedding(n_hours, 2, input_length=1, name='hour latents')

# Join the five embedded features into one tensor
# (4 + 6 + 2 + 2 + 2 = 16 values per sample, assuming the default
# last-axis concatenation - TODO confirm).
x = Concatenate(name='merge latents')([user_latents(user_in),
                                       movie_latents(movie_in),
                                       weekday_latents(weekday_in),
                                       month_latents(month_in),
                                       hour_latents(hour_in)])
# Normalise, drop the length-1 sequence axis, then apply 50% dropout.
x = BatchNormalization()(x)
x = Flatten()(x)
x = Dropout(0.5)(x)

# Single linear unit: the predicted rating.
x = Dense(1, name='collapse')(x)
# Noise is added to the prediction itself.
# NOTE(review): Keras noise layers are normally active only while training - confirm.
x = GaussianNoise(0.25)(x)

nn = Model([user_in, movie_in, weekday_in, month_in, hour_in], x)
# Two-input variant kept for reference (not used):
# nn = Model([user_in, movie_in], x)
# Nadam with 0.01 as first argument (the learning rate read back later via
# get_config()['lr']); mean-squared-error loss.
nn.compile(Nadam(0.01, schedule_decay=0.004), loss='mse')

In [55]:
# Print the layer-by-layer overview of `nn` (output shapes, parameter counts).
nn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user (InputLayer)                (None, 1)             0                                            
____________________________________________________________________________________________________
movie (InputLayer)               (None, 1)             0                                            
____________________________________________________________________________________________________
weekday (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
month (InputLayer)               (None, 1)             0                                            
___________________________________________________________________________________________

In [56]:
# Feature columns in the same order as the model's inputs.
train_inputs = [trn.userId, trn.movieId, trn.weekday, trn.month, trn.hour]
valid_inputs = [val.userId, val.movieId, val.weekday, val.month, val.hour]

# First training run: 8 epochs, validating on the held-out ratings.
nn.fit(train_inputs, trn.rating,
       batch_size=batch_size, epochs=8,
       validation_data=(valid_inputs, val.rating))

# Show the learning rate the optimizer is configured with.
nn.optimizer.get_config()['lr']

Train on 15998906 samples, validate on 4001357 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


0.009999999776482582

In [None]:
# Cut the learning rate to a tenth of its configured value, then train for
# four more epochs on the same split.
configured_lr = nn.optimizer.get_config()['lr']
nn.optimizer.lr.set_value(configured_lr/10)

train_inputs = [trn.userId, trn.movieId, trn.weekday, trn.month, trn.hour]
valid_inputs = [val.userId, val.movieId, val.weekday, val.month, val.hour]
nn.fit(train_inputs, trn.rating,
       batch_size=batch_size, epochs=4,
       validation_data=(valid_inputs, val.rating))

# Show the learning rate after the change.
nn.optimizer.get_config()['lr']

## Use Pseudo-labeling from that first model, and re-learn with the supplemental data

In [45]:
# Pseudo-labels: the current model's predictions for the validation rows.
valid_inputs = [val.userId, val.movieId, val.weekday, val.month, val.hour]
val_pseudo = nn.predict(valid_inputs, batch_size=batch_size)

In [46]:
# Stack the true training ratings (reshaped to one column) on top of the
# pseudo-labels predicted for the validation rows.
trn_targets = trn.rating.values.reshape([-1, 1])
comb_ratings = np.concatenate((trn_targets, val_pseudo), axis=0)

In [47]:
# For every input feature, append the validation rows after the training rows
# (same row order as `comb_ratings`).
comb_userid, comb_movieid, comb_weekday, comb_month, comb_hour = (
    np.concatenate([trn[column], val[column]])
    for column in ('userId', 'movieId', 'weekday', 'month', 'hour')
)

In [48]:
# Continue training on the training rows plus the pseudo-labelled validation
# rows; validation still scores against the true validation ratings.
comb_inputs = [comb_userid, comb_movieid, comb_weekday, comb_month, comb_hour]
valid_inputs = [val.userId, val.movieId, val.weekday, val.month, val.hour]
nn.fit(comb_inputs, comb_ratings,
       batch_size=batch_size, epochs=2,
       validation_data=(valid_inputs, val.rating))

Train on 20000263 samples, validate on 4001357 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8dd8abc290>

In [49]:
# Divide the configured learning rate by ten, then run two more epochs on the
# combined (training + pseudo-labelled) data.
configured_lr = nn.optimizer.get_config()['lr']
nn.optimizer.lr.set_value(configured_lr/10)

comb_inputs = [comb_userid, comb_movieid, comb_weekday, comb_month, comb_hour]
valid_inputs = [val.userId, val.movieId, val.weekday, val.month, val.hour]
nn.fit(comb_inputs, comb_ratings,
       batch_size=batch_size, epochs=2,
       validation_data=(valid_inputs, val.rating))

Train on 20000263 samples, validate on 4001357 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8dce27b9d0>

## Analyze Results

In [None]:
# Raw movieId -> title lookup, used only to label results.
movies_table = pd.read_csv(path + 'movies.csv')
movie_names = movies_table.set_index('movieId')['title'].to_dict()
def create_bias(inp, n_in):
    """Return a flattened one-value-per-index embedding applied to `inp`.

    A new `Embedding(n_in, 1, input_length=1)` layer is created on every
    call, applied to `inp`, and its output is passed through `Flatten`.
    """
    bias_lookup = Embedding(n_in, 1, input_length=1)
    bias_per_sample = bias_lookup(inp)
    return Flatten()(bias_per_sample)
# NOTE(review): create_bias builds brand-new Embedding layers that are not part
# of `nn`. Unless weights are loaded into them, the "bias" values ranked below
# come from the layers' initial weights - confirm this is intended.
ub = create_bias(user_in, n_users)
mb = create_bias(movie_in, n_movies)

# Movie indices ordered by number of ratings, most-rated first.
# (This statement was previously fused onto the end of the `mb = ...` line,
# which made the whole cell a syntax error.)
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False)
topMovies = np.array(topMovies.index)

# Evaluate the movie-bias lookup for every movie index.
get_movie_bias = Model(movie_in, mb)
movie_bias = get_movie_bias.predict(topMovies)

# `movies[i]` turns a contiguous index back into the raw movieId, which is the
# key used by `movie_names`.
movie_ratings = [(b[0], movie_names[movies[i]]) for i,b in zip(topMovies,movie_bias)]
sorted(movie_ratings, key=itemgetter(0), reverse=True)[:15]

In [None]:
# Expose the movie embedding as a stand-alone model. The previous code passed
# an undefined name `m`; in this notebook the movie embedding is produced by
# the `movie_latents` layer, so re-apply that layer (reusing its weights) to
# `movie_in`.
get_movie_emb = Model(movie_in, movie_latents(movie_in))

# One embedding vector per movie index; squeeze drops the length-1 axis.
movie_emb = np.squeeze(get_movie_emb.predict([topMovies]))
movie_emb.shape

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the transposed embedding matrix and keep the fitted
# components (assumes movie_emb has one row per movie, so each component holds
# one value per movie - TODO confirm).
pca = PCA(n_components=2)
pca.fit(movie_emb.T)
movie_pca = pca.components_

# First component: pair every movie's value with its title, then list the
# ten highest.
fac0 = movie_pca[0]
movie_comp = [(weight, movie_names[movies[movie_idx]])
              for weight, movie_idx in zip(fac0, topMovies)]
sorted(movie_comp, key=lambda pair: pair[0], reverse=True)[:10]

In [None]:
# Ten movies with the lowest value on the current component.
sorted(movie_comp, key=lambda pair: pair[0])[:10]

In [None]:
# Second component: pair every movie's value with its title, then list the
# ten highest.
fac1 = movie_pca[1]
movie_comp = [(weight, movie_names[movies[movie_idx]])
              for weight, movie_idx in zip(fac1, topMovies)]
sorted(movie_comp, key=lambda pair: pair[0], reverse=True)[:10]

In [None]:
# Ten movies with the lowest value on the current component.
sorted(movie_comp, key=lambda pair: pair[0])[:10]

In [None]:
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# NOTE(review): presumably so non-ASCII movie titles can be used as plot labels
# in the next cell - confirm.
import sys
stdout, stderr = sys.stdout, sys.stderr # save notebook stdout and stderr
# Reloading `sys` is what makes `setdefaultencoding` callable here; the streams
# saved above are put back afterwards so notebook output keeps working.
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout, sys.stderr = stdout, stderr # restore notebook stdout and stderr

In [None]:
# Scatter the two PCA components for a slice of the most-rated movies and
# label every point with the movie title.
start, end = 0, 50
X = fac0[start:end]
Y = fac1[start:end]

plt.figure(figsize=(15,15))
plt.scatter(X, Y)
plt.xlabel('PCA component 0')
plt.ylabel('PCA component 1')

# The loop variables are deliberately not called `x`/`y`: at notebook top level
# a loop variable named `x` would overwrite the network output tensor `x`.
for movie_idx, x_pos, y_pos in zip(topMovies[start:end], X, Y):
    # Random dark-ish colour per label (scaled by 0.7).
    plt.text(x_pos, y_pos, movie_names[movies[movie_idx]],
             color=np.random.rand(3)*0.7, fontsize=11)
plt.show()