In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Embedding, Dense, Dropout, Input, Flatten, Multiply, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1,l2
from tensorflow.keras.constraints import UnitNorm, NonNeg

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# Enable eager mode.
# `tf.enable_eager_execution` is a TensorFlow 1.x entry point; TensorFlow 2.x
# runs eagerly by default and no longer exposes that attribute, so only call
# it when it is available. The final expression displays the resulting mode.
if hasattr(tf, 'enable_eager_execution'):
    tf.enable_eager_execution()
tf.executing_eagerly()

In [None]:
# Raw ratings table; `df` keeps the original rating values because the
# rescaling below is applied to the shuffled copy only.
df = pd.read_csv("../../movie-lens-small-latest-dataset/ratings.csv")

# Shuffle all rows (frac=1), then rescale ratings with (r - 2.5) / 2.5,
# which maps 0 -> -1, 2.5 -> 0 and 5 -> 1.
train_data = df.sample(frac=1).assign(
    rating=lambda shuffled: (shuffled['rating'] - 2.5) / 2.5
)

In [None]:
# Movie metadata table. Later cells read its 'movieId' and 'genres' columns
# ('genres' is split on '|' further down).
movies = pd.read_csv("../../movie-lens-small-latest-dataset/movies.csv")

In [None]:
# Per-movie mean of the raw ratings, rescaled with the same (r - 2.5) / 2.5
# transform used for the training targets, exposed as columns
# ['movieId', 'avg_rating'].
movie_avg_rating = (
    df.groupby('movieId')['rating']
    .mean()
    .sub(2.5)
    .div(2.5)
    .rename('avg_rating')
    .reset_index()
)

In [None]:
# Scratch exploration of the first movie's genre string. Only the value of
# the last expression is displayed by the notebook; the first two are
# evaluated and discarded.
movies['genres'].iloc[0].split('|')
movies['genres'].iloc[[0]].astype(str)
# NOTE(review): Keras' `one_hot` is a hashing-based encoder, so distinct genres
# may collide in the 18 buckets — confirm before relying on it.
tf.keras.preprocessing.text.one_hot(movies['genres'].iloc[0], 18, split='|')

In [None]:
# Known genre names, in the order that defines their slot in the multi-hot
# genre vector. The spelling must match the '|'-separated tokens looked up in
# `_GENRE_DICT` (note 'Sci-Fi', with a hyphen).
_GENRE_LIST = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
               'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
               'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Genre name -> vector index. Index 0 is reserved for 'Unknown', the fallback
# slot for any genre token that is not listed above; the known genres occupy
# indices 1..18 in `_GENRE_LIST` order. Deriving the mapping from the list
# keeps the two definitions from drifting apart.
_GENRE_DICT = {genre: index for index, genre in enumerate(['Unknown'] + _GENRE_LIST)}

In [None]:
# Length of the multi-hot genre vector produced by `genre_str_to_array`
# (one slot per `_GENRE_DICT` entry, including 'Unknown').
len(_GENRE_DICT)

In [None]:
def genre_str_to_array(genre_str):
    """Convert a '|'-separated genre string into a multi-hot float vector.

    Args:
        genre_str: genre names joined by '|', e.g. 'Action|Comedy'.

    Returns:
        A 1-D numpy array of length ``len(_GENRE_DICT)`` holding 1 at the
        index of every genre present and 0 elsewhere. Names missing from
        ``_GENRE_DICT`` set the 'Unknown' slot instead.
    """
    unknown_slot = _GENRE_DICT['Unknown']
    multi_hot = np.zeros(len(_GENRE_DICT))
    for name in genre_str.split('|'):
        multi_hot[_GENRE_DICT.get(name, unknown_slot)] = 1
    return multi_hot

In [None]:
# Smoke-test the converter on the first movie (the value is not displayed,
# since it is not the last expression of the cell).
genre_str_to_array(movies.iloc[0]['genres'])

# Replace each genre string with its multi-hot vector.
# `Series.apply` has no axis argument: the stray second positional that used to
# be passed here landed in the `convert_dtype` slot, which pandas >= 2.1
# deprecates, so it is dropped.
# NOTE: this overwrites `movies['genres']` in place, so the cell can only be run
# once per load of `movies` (the converter expects strings).
movies['genres'] = movies['genres'].apply(genre_str_to_array)

In [None]:
# Per-movie features: rescaled average rating plus the multi-hot genre vector.
new_train_df = movie_avg_rating.merge(movies, on='movieId')[
    ['movieId', 'avg_rating', 'genres']
]

# Attach those per-movie features to every individual (shuffled) rating row.
new_train_df2 = train_data.merge(new_train_df, on='movieId')

In [None]:
def build_model(num_users=611, num_movies=193610, num_genres=19):
    """Build and compile the genre-masked embedding rating model.

    The predicted rating is

        dot(user_embedding, movie_raw_embedding * movie_genre_mask) + movie_base_score

    i.e. the dot product of a user embedding and a movie embedding whose
    components are gated by the movie's genre mask, added to a per-movie base
    score. IMO, this is similar to matrix factorization.

    Args:
        num_users: number of rows in the user embedding table; user ids fed to
            the model must be smaller than this.
            NOTE(review): the default 611 presumably is max(userId) + 1 for the
            dataset used in this notebook — confirm.
        num_movies: number of rows in the movie embedding table; movie ids fed
            to the model must be smaller than this.
            NOTE(review): the default 193610 presumably is max(movieId) + 1 —
            confirm.
        num_genres: width of the genre mask input. It is also used as the
            embedding size, because the movie embedding is multiplied
            element-wise with the mask.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, mean-squared-error loss)
        whose inputs are, in order, ``[user, movie, movie_genre_mask,
        movie_base_score]``.
    """
    user = Input(shape=(1,))
    movie = Input(shape=(1,))
    movie_genre_mask = Input(shape=(num_genres,))
    movie_base_score = Input(shape=(1,))
    # TODO(summerxyt): It might be better to use embeddings_constraints. But tf and keras throws
    # an error I couldn't figure out why.
    # Flatten drops the length-1 sequence axis that Embedding adds for the single id.
    user_embedding = Flatten()(
        Embedding(num_users, num_genres, embeddings_regularizer=l2(0.01),
                  name='user_embedding')(user))
    movie_raw_embedding = Flatten()(
        Embedding(num_movies, num_genres, embeddings_regularizer=l2(0.01),
                  name='movie_raw_embedding')(movie))
    # Zero out the embedding components of genres the movie does not belong to.
    movie_embedding = Multiply()([movie_raw_embedding, movie_genre_mask])
    genre_score = tf.keras.layers.Dot(axes=-1)([user_embedding, movie_embedding])
    y = tf.keras.layers.Add()([genre_score, movie_base_score])

    model = Model(inputs=[user, movie, movie_genre_mask, movie_base_score], outputs=y)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
model = build_model()

# Inputs follow the order declared by build_model: user id, movie id,
# genre mask, per-movie base score. `epochs` is only an upper bound, since
# EarlyStopping(patience=2) may end training sooner.
history = model.fit(
    x=[
        new_train_df2['userId'].values,
        new_train_df2['movieId'].values,
        np.stack(new_train_df2['genres']),
        new_train_df2['avg_rating'].values,
    ],
    y=new_train_df2['rating'].values,
    epochs=1000,
    batch_size=2048 * 32,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)],
    verbose=0,
)

In [None]:
# Pull the learned embedding matrices (first weight array of each layer),
# addressed by the layer names assigned in build_model.
user_embeddings, movie_embeddings = (
    model.get_layer(layer_name).get_weights()[0]
    for layer_name in ('user_embedding', 'movie_raw_embedding')
)

In [None]:
# Row 0 of the user embedding table is skipped for both steps below.
# NOTE(review): presumably because user ids start at 1 — confirm.

# Two-way K-means cluster label for every remaining user embedding.
users_clusters = KMeans(n_clusters=2).fit_predict(user_embeddings[1:])

# 2-D t-SNE projection of the same embeddings, for plotting only.
users_tsne = TSNE(n_components=2).fit_transform(user_embeddings[1:])

# Scatter the projection, coloured by cluster label.
plt.scatter(x=users_tsne[:, 0], y=users_tsne[:, 1], c=users_clusters)