In [71]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, Reshape,dot,Input,Dense
from keras.models import Sequential,Model
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt

In [72]:
data_dir = Path('/Users/kimdongkyu/Downloads/ml-100k/')
outdir = Path('/Users/kimdongkyu/Downloads/ml-100k/')

#Function to read data 
def create_data(rating,header_cols):
    data = pd.read_csv(rating,header=None,sep='\t')
    #print(data)
    data.columns = header_cols
    return data
 
#Movie id to movie name dict 
def create_movie_dict(movie_file):
    print(movie_file)
    df = pd.read_csv(movie_file,sep='|', encoding='latin-1',header=None)
    movie_dict = {}
    movie_ids = list(df[0].values)
    movie_name = list(df[1].values)
    for k,v in zip(movie_ids,movie_name):
        movie_dict[k] = v 
    return movie_dict
# Function to create training validation and test data
def train_val(df,val_frac=None):
    X,y = df[['userID','movieID']].values,df['rating'].values
    #Offset the ids by 1 for the ids to start from zero
    X = X - 1 
    if val_frac != None:
        X_train, X_test, y_train, y_val = train_test_split(X, y, test_size=val_frac,random_state=0)
        return X_train, X_val, y_train, y_val
    else:
        return X,y

#  Define Model
# Converts the userId and the MovieId to embedding sizes of a given latent dimension and then takes the dot product 
# of those embeddings to get the rating score. 
def model(max_users,max_movies,latent_factors):
    user_ID = Input(shape=(1,))
    movie_ID = Input(shape=(1,))
    x = Embedding(max_users,latent_factors, input_length=1)(user_ID)
    print(x.shape)
    y = Embedding(max_movies,latent_factors, input_length=1)(movie_ID)
    print(y.shape)
    out = dot([x,y],axes=2, normalize=False)
    out= Reshape((1,))(out)
    model = Model(inputs=[user_ID,movie_ID],outputs=out)
    print(model.summary())
    return model

In [73]:
#Data Processing and Model Training 

train_ratings_df = create_data(f'{data_dir}/u1.base',['userID','movieID','rating','timestamp']) 
test_ratings_df = create_data(f'{data_dir}/u1.test',['userID','movieID','rating','timestamp']) 
X_train, X_val,y_train, y_val = train_val(train_ratings_df,val_frac=0.2)
movie_dict = create_movie_dict(f'{data_dir}/u.item')
num_users = len(train_ratings_df['userID'].unique())
num_movies = len(train_ratings_df['movieID'].unique())

print(f'Number of users {num_users}')
print(f'Number of movies {num_movies}')
model = model(num_users,num_movies,40)
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
model.compile(loss='mse',optimizer='adam')
callbacks = [EarlyStopping('val_loss', patience=2), 
             ModelCheckpoint(f'{outdir}/nn_factor_model.h5', save_best_only=True)]
model.fit([X_train[:,0],X_train[:,1]], y_train, epochs=30, validation_data=([X_val[:,0],X_val[:,1]], y_val), verbose=2, callbacks=callbacks)

/Users/kimdongkyu/Downloads/ml-100k/u.item
Number of users 943
Number of movies 1650
(None, 1, 40)
(None, 1, 40)
Model: "functional_33"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_32 (Embedding)        (None, 1, 40)        37720       input_33[0][0]                   
__________________________________________________________________________________________________
embedding_33 (Embedding)        (None, 1, 40)        66000       input_3

InvalidArgumentError:  indices[27,0] = 1675 is not in [0, 1650)
	 [[node functional_33/embedding_33/embedding_lookup (defined at <ipython-input-73-fab53f901943>:17) ]] [Op:__inference_train_function_10020]

Errors may have originated from an input operation.
Input Source operations connected to node functional_33/embedding_33/embedding_lookup:
 functional_33/embedding_33/embedding_lookup/9796 (defined at /Users/kimdongkyu/opt/anaconda3/lib/python3.7/contextlib.py:112)

Function call stack:
train_function
