In [5]:
# implementation of probabilistic matrix factorisation

import pickle
import random
import numpy as np
from scipy.sparse import csr_matrix, save_npz, load_npz
from tqdm.notebook import tqdm
from itertools import chain
from collections import Counter, defaultdict
from pathlib import Path
from sklearn.naive_bayes import MultinomialNB
import csv
import tensorflow as tf

In [6]:
# All input CSVs live in one directory; the column layout of each file is noted alongside.
DATASET_DIR = Path("dataset")

tag_names = DATASET_DIR / "genome-tags.csv"                 # tag name lookup
movie_review_relevance = DATASET_DIR / "genome-scores.csv"  # movieid / tagid / relevance
movie_genres = DATASET_DIR / "movies.csv"                   # movieid / movie title / genres
reviews = DATASET_DIR / "tags_shuffled_rehashed.csv"        # userid / movieid / tag
train_set = DATASET_DIR / "train_ratings_binary.csv"        # train set: userid / movieid / ratings
val_set = DATASET_DIR / "val_ratings_binary.csv"            # val set: userid / movieid / ratings
test_set = DATASET_DIR / "test_ratings.csv"                 # test set: userid / movieids

# Hard-coded sizes; NUM_MOVIES and NUM_USERS size the embedding tables built later.
NUM_MOVIES = 26744
NUM_USERS = 138493
NUM_TRAINING_SET = 11946576


In [7]:
def userid_uid_lookup(userid):
    """Map a raw userId to its internal user index (uid), which is userId - 1."""
    return userid - 1

# Movies get their own internal ids (mids) because raw movieIds aren't contiguous.

# raw movieId -> internal mid, plus the next mid that has not been handed out yet
movieid_mid_lookup = {}
next_unassigned_mid = 0

def add_movieids_to_lookuptable(filename):
    """Assign an internal mid to every movieId in `filename` that has none yet.

    The file is read as a CSV with a `movieId` column. Ids already present in
    `movieid_mid_lookup` keep their mid; new ids receive consecutive mids in the
    order they are first encountered. Updates the module-level lookup table and
    the `next_unassigned_mid` counter in place and returns nothing.
    """
    global next_unassigned_mid

    print(f"updating lookuptable with mids from {filename}")
    with open(filename, newline="") as csvfile:
        for row in tqdm(csv.DictReader(csvfile)):
            # ids are parsed via float first, so values such as "12.0" are accepted
            raw_movieid = int(float(row["movieId"]))
            if raw_movieid in movieid_mid_lookup:
                continue
            movieid_mid_lookup[raw_movieid] = next_unassigned_mid
            next_unassigned_mid += 1

# Register the movie ids of every input file. mids are handed out in first-seen
# order, so the order of these calls determines which mid each movie receives.
add_movieids_to_lookuptable(train_set)
add_movieids_to_lookuptable(val_set)
add_movieids_to_lookuptable(test_set)
add_movieids_to_lookuptable(movie_genres)

# Persist the finished table so it can be reloaded without re-scanning the CSVs.
with open("movieid_mid_lookup", "wb+") as lookup_file:
    pickle.dump(movieid_mid_lookup, lookup_file)

updating lookuptable with mids from dataset/train_ratings_binary.csv


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [8]:
def userid_uid_lookup(userid):
    """Map a raw userId to its internal user index (uid), which is userId - 1."""
    return userid - 1

# Reload the movieId -> mid table from the pickle file instead of rebuilding it.
# pickle.load can execute arbitrary code: only use it on a file you produced yourself.
with open("movieid_mid_lookup", "rb") as lookup_file:
    movieid_mid_lookup = pickle.load(lookup_file)

In [9]:
# Load a sparse matrix saved earlier with scipy's save_npz.
# NOTE(review): not referenced by any of the visible cells below; confirm it is still needed.
movie_summary_matrix = load_npz("movie_summary_matrix.npz")

def get_dataset(filename, include_ys=True):
    """Read a ratings CSV into parallel numpy arrays of internal ids (and labels).

    Parameters:
        filename: path of a CSV with `userId` and `movieId` columns, plus a
            `rating` column when `include_ys` is true.
        include_ys: when true, also return the labels; pass False for files
            that carry no `rating` column.

    Returns:
        (user_Xs, movie_Xs, ys) when `include_ys` is true, otherwise
        (user_Xs, movie_Xs). `user_Xs` holds uids, `movie_Xs` holds mids, and
        `ys` holds +1 for a rating equal to 1 and -1 for anything else.

    Raises:
        KeyError: a movieId is missing from `movieid_mid_lookup`.
        ValueError: an id or rating field is not numeric.
    """
    print(f"retrieving dataset from {filename}")
    user_Xs = []
    movie_Xs = []
    ys = []
    with open(filename, newline="") as csvfile:
        for rating in tqdm(csv.DictReader(csvfile)):
            user_Xs.append(userid_uid_lookup(int(float(rating["userId"]))))
            movie_Xs.append(movieid_mid_lookup[int(float(rating["movieId"]))])

            if include_ys:
                # Parse the rating numerically, like the ids above. The previous
                # string test (== "1") silently labelled a value written as
                # "1.0" as negative.
                ys.append(1 if float(rating["rating"]) == 1 else -1)

    if include_ys:
        return np.array(user_Xs), np.array(movie_Xs), np.array(ys)
    return np.array(user_Xs), np.array(movie_Xs)

def batchify(*args, batch_size=1000, shuffle=True):
    """Yield aligned mini-batches from several equally long sequences.

    Parameters:
        *args: sequences sharing the same length. They must be numpy arrays
            when `shuffle` is true, because batches are taken by index array.
        batch_size: number of elements per batch; the last batch may be
            smaller. -1 means "everything in a single batch".
        shuffle: when true, elements are drawn in a fresh random order (the
            same permutation for every argument, so rows stay aligned).

    Yields:
        A list with one slice per argument, in argument order.

    Raises:
        ValueError: `batch_size` is neither -1 nor a positive number.
        TypeError: `shuffle` is true and an argument cannot be indexed with an
            index array (e.g. a plain list).
    """
    if batch_size == -1:
        # This function is a generator, so a plain `return args` would end the
        # iteration without producing anything. Yield the full data once.
        yield list(args)
        return
    if batch_size < 1:
        raise ValueError(f"batch_size must be -1 or a positive integer, got {batch_size}")

    num_elems = len(args[0])

    if shuffle:
        shuffle_indices = np.arange(num_elems, dtype=np.int64)
        np.random.shuffle(shuffle_indices)
        for start in range(0, num_elems, batch_size):
            array_indices = shuffle_indices[start: start + batch_size]
            try:
                batch = [arg[array_indices] for arg in args]
            except TypeError as err:
                raise TypeError("args to batchify must be numpy arrays if shuffle True") from err
            # Yield outside the try block: a bare except around the yield used
            # to swallow GeneratorExit when the consumer stopped early.
            yield batch
    else:
        for start in range(0, num_elems, batch_size):
            yield [arg[start: start + batch_size] for arg in args]

def genre_parser(genre):
    """Split a raw `genres` field into a list of genre names.

    The placeholder "(no genres listed)" maps to the single pseudo-genre
    "none/other"; any other value is split on "|".
    """
    return ["none/other"] if genre == "(no genres listed)" else genre.split("|")

# Fixed genre vocabulary; the position in this list is the column of the one-hot vector.
ALL_GENRES = ['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Crime', 'Horror', 'Documentary', 'Adventure', 'Sci-Fi', 'Mystery', 'Fantasy', 'War', 'Children', 'Musical', 'Animation', 'Western', 'Film-Noir', 'none/other', 'IMAX']

def _genre_one_hot(genres_field):
    """Return a boolean vector over ALL_GENRES for one raw `genres` CSV field."""
    # Go through genre_parser so that "(no genres listed)" sets the 'none/other'
    # column. A substring test against the raw field could never match it.
    listed = set(genre_parser(genres_field))
    return np.array([genre in listed for genre in ALL_GENRES])

# mid -> boolean genre vector, one entry per row of the movies CSV
with open(movie_genres, newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    movie_genres_one_hot = {movieid_mid_lookup[int(float(movie["movieId"]))]: _genre_one_hot(movie["genres"]) for movie in reader}

# Training split: parallel arrays of uids, mids and -1/+1 labels.
user_Xs, movie_Xs, ys = get_dataset(train_set)
# Validation currently uses the labelled val split; the commented line below
# loads the unlabelled test split instead.
# user_val_Xs, movie_val_Xs = get_dataset(test_set, include_ys=False)
user_val_Xs, movie_val_Xs, val_ys = get_dataset(val_set)

retrieving dataset from dataset/train_ratings_binary.csv


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


retrieving dataset from dataset/val_ratings_binary.csv


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# Eyeball check: print the raw genres next to the stored vector for the first five movies.
print(ALL_GENRES)
with open(movie_genres, newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    # range(5) comes first in the zip so no sixth row is pulled from the reader
    for i, movie in zip(range(5), reader):
        print(movie["genres"])
        print(movie_genres_one_hot[movieid_mid_lookup[int(float(movie["movieId"]))]])

['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Crime', 'Horror', 'Documentary', 'Adventure', 'Sci-Fi', 'Mystery', 'Fantasy', 'War', 'Children', 'Musical', 'Animation', 'Western', 'Film-Noir', 'none/other', 'IMAX']
Adventure|Animation|Children|Comedy|Fantasy
[False  True False False False False False False  True False False  True
 False  True False  True False False False False]
Adventure|Children|Fantasy
[False False False False False False False False  True False False  True
 False  True False False False False False False]
Comedy|Romance
[False  True False  True False False False False False False False False
 False False False False False False False False]
Comedy|Drama|Romance
[ True  True False  True False False False False False False False False
 False False False False False False False False]
Comedy
[False  True False False False False False False False False False False
 False False False False False False False False]


In [12]:
# no_genre_count = 0
# total = 0

# with open(test_set, newline="") as csvfile:
#     reader = csv.DictReader(csvfile)
#     for rating in tqdm(reader):
#         if movieid_mid_lookup[int(float(rating["movieId"]))] not in movie_genres_one_hot:
#             no_genre_count += 1
#         total += 1

# print(f"{no_genre_count}/{total} entries in the test data doesn't have genre info ({no_genre_count/total}%)")

In [13]:
# # no memory - implicitly calculating user movie matrix from now on

# movie_embeddings = tf.Variable(tf.random_normal([5, NUM_MOVIES], stddev=0.03, dtype=tf.float32))
# user_embeddings = tf.Variable(tf.random_normal([NUM_USERS, 5], stddev=0.03, dtype=tf.float32))
# movie_bias = tf.Variable(tf.random_normal([1, NUM_MOVIES], stddev=0.03, dtype=tf.float32))
# user_bias = tf.Variable(tf.random_normal([NUM_USERS, 1], stddev=0.03, dtype=tf.float32))

# user_movie_score = tf.tensordot(user_embeddings, movie_embeddings, axes = 1)+.14*tf.tile(movie_bias, [NUM_USERS, 1]) +.87*tf.tile(user_bias, [1, NUM_MOVIES])

In [14]:
# Build the TF graph for the factorisation model. A user is represented by a
# learned vector of length embedding_dim. A movie is represented by a learned
# vector of length embedding_dim-20 concatenated with a 20-wide genre vector
# that is fed in at run time.
# NOTE(review): the literal 20 presumably equals len(ALL_GENRES) - confirm and consider naming it.
embedding_dim = 40
assert embedding_dim > 20

# Genre vectors for the movies of the current batch, supplied through feed_dict.
movie_genre_embeddings = tf.placeholder(dtype=tf.float32, shape=[None, 20])
# Trainable parameters: one row per movie / user, plus one scalar bias each.
movie_embeddings = tf.Variable(tf.random_normal([NUM_MOVIES, embedding_dim-20], stddev=0.03, dtype=tf.float32))
user_embeddings = tf.Variable(tf.random_normal([NUM_USERS, embedding_dim], stddev=0.03, dtype=tf.float32))
movie_bias = tf.Variable(tf.random_normal([NUM_MOVIES], stddev=0.03, dtype=tf.float32))
user_bias = tf.Variable(tf.random_normal([NUM_USERS], stddev=0.03, dtype=tf.float32))

# user_movie_score = tf.tensordot(user_embeddings, movie_embeddings, axes = 1)+.14*tf.tile(movie_bias, [NUM_USERS, 1]) +.87*tf.tile(user_bias, [1, NUM_MOVIES])

# Row indices (uid / mid) of the batch, each given as a column of shape [None, 1].
user_slice_idxs = tf.placeholder(dtype=tf.int64, shape=[None, 1])
movie_slice_idxs = tf.placeholder(dtype=tf.int64, shape=[None, 1])
# user_bias_idxs = tf.placeholder(dtype=tf.int64, shape=[None, 1])
# movie_bias_idxs = tf.placeholder(dtype=tf.int64, shape=[None, 1])

# Pick the embedding row of every user / movie in the batch.
user_embedding_columns = tf.reshape(tf.gather_nd(user_embeddings, user_slice_idxs), [-1, embedding_dim])
movie_embedding_rows = tf.reshape(tf.gather_nd(movie_embeddings, movie_slice_idxs), [-1, embedding_dim-20])
print("movie_embedding_rows shape", movie_embedding_rows.shape)

# The same indices select the per-user and per-movie bias terms.
user_slice_bias = tf.gather_nd(user_bias, user_slice_idxs)
movie_slice_bias = tf.gather_nd(movie_bias, movie_slice_idxs)
print("user_slice_bias shape", user_slice_bias.shape)

# Prediction = row-wise dot product of the user vector with [learned movie part, genre part],
# plus the weighted biases.
# NOTE(review): the .14 / .87 bias weights are unexplained magic numbers - confirm their origin.
pred_y = tf.reduce_sum(user_embedding_columns * tf.concat((movie_embedding_rows, movie_genre_embeddings), axis=1), axis=1) + .14*movie_slice_bias + .87*user_slice_bias
# Debug output of the static shapes.
print(pred_y.shape)
print(tf.concat((movie_embedding_rows, movie_genre_embeddings), axis=1).shape)
print(movie_embedding_rows.shape)
print(user_embedding_columns.shape)
# Target value for every (user, movie) pair of the batch.
y_true = tf.placeholder(dtype=tf.float32, shape=[None])

movie_embedding_rows shape (?, 20)
user_slice_bias shape (?,)
(?,)
(?, 40)
(?, 20)
(?, 40)


In [17]:
def compute_slices(user_Xs, movie_Xs, embedding_dim):
    """Turn flat id sequences into the column-shaped index arrays the graph expects.

    Parameters:
        user_Xs: array-like of uids.
        movie_Xs: array-like of mids.
        embedding_dim: unused; kept so existing call sites keep working.

    Returns:
        [user_slice_idxs, movie_slice_idxs], each a new array of shape (n, 1).
    """
    # A single copy plus reshape replaces building one Python list per id,
    # which created millions of small objects on the full training set.
    # It also keeps the integer dtype when the input is empty.
    user_slice_idxs = np.array(user_Xs).reshape(-1, 1)
    movie_slice_idxs = np.array(movie_Xs).reshape(-1, 1)

    return [user_slice_idxs, movie_slice_idxs]

learning_rate = .05
epochs = 15

# Mean squared error between the predicted score and the -1/+1 label.
loss = tf.reduce_mean(tf.squared_difference(pred_y, y_true))
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# The validation inputs are identical for every epoch, so build them once up front.
val_slice_indices = compute_slices(user_val_Xs, movie_val_Xs, embedding_dim)
user_val_slice, movie_val_slice = val_slice_indices
val_genres = np.array([movie_genres_one_hot[x] for x in movie_val_Xs])
val_feed_dict = {user_slice_idxs: user_val_slice,
                 movie_slice_idxs: movie_val_slice,
                 movie_genre_embeddings: val_genres,
                 y_true: val_ys}

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    for epoch in tqdm(range(epochs), leave=False):
        # One pass over the training data in shuffled batches.
        for b_m_Xs, b_u_Xs, b_ys in batchify(movie_Xs, user_Xs, ys, batch_size=1000000):
            slice_indices = compute_slices(b_u_Xs, b_m_Xs, embedding_dim)
            user_slice, movie_slice = slice_indices

            # Genre vectors of the movies in this batch, in batch order.
            genres = np.array([movie_genres_one_hot[x] for x in b_m_Xs])
            feed_dict = {user_slice_idxs: user_slice,
                         movie_slice_idxs: movie_slice,
                         movie_genre_embeddings: genres,
                         y_true: b_ys}
            _, lossval, pred_y_vals = sess.run((train_step, loss, pred_y), feed_dict=feed_dict)
            print("train loss", lossval, "pred ys", pred_y_vals[:10].flatten(), "y_true", b_ys[:10].flatten())

        print("computing val acc...")
        val_y_pred, val_loss_val = sess.run((pred_y, loss), feed_dict=val_feed_dict)
        print("val loss", val_loss_val)
        # The labels are -1/+1, so the decision boundary is 0. The previous .5
        # cut-off counted every score in (0, .5] as a negative prediction.
        val_pred_labels = np.where(val_y_pred > 0, 1, -1)
        print("val acc", np.mean(val_pred_labels == val_ys))

KeyboardInterrupt: 

In [15]:
# NOTE(review): movie_genres_one_hot is keyed by the internal mid taken from
# movieid_mid_lookup, not by the raw movieId. This lookup raised KeyError in the
# recorded run, so 27278 is not an assigned mid - presumably a leftover probe; confirm or remove.
print(movie_genres_one_hot[27278])

KeyError: 27278

In [11]:
# Shape check: index columns and labels must have the same number of rows per split.
user_val_slice, movie_val_slice = val_slice_indices = compute_slices(user_val_Xs, movie_val_Xs, embedding_dim)
print(user_val_slice.shape, movie_val_slice.shape, np.shape(val_ys))

user_slice, movie_slice = slice_indices = compute_slices(user_Xs, movie_Xs, embedding_dim)
print(user_slice.shape, movie_slice.shape, np.shape(ys))

(3999236, 1) (3999236, 1) (3999236,)
(11946576, 1) (11946576, 1) (11946576,)


In [60]:
# Features and labels must line up one-to-one in the training and validation splits.
print(*map(len, (user_Xs, ys)))
print(*map(len, (user_val_Xs, val_ys)))

11946576 11946576
3999236 3999236
