# Recommendation System 
Trying out the process of creating a recommendation system using the [Movielens dataset](https://grouplens.org/datasets/movielens/)

Every movie and user can be assigned a vector which should describe their characterisits in such a way that when multiplied results in the users rating for that movie. These two embeddings could also be concatenated to form the input to a network which also could be used to predict the rating for this particular user-movie pair. 

In [348]:
import csv
import numpy as np
import pandas as pd

from math import floor, sqrt
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers import Input, Embedding, Activation, Flatten, Lambda, Concatenate, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam

## Preparing the data
We have to clean the data a bit since the user and movie id's have to be in a contiguous order for the Keras Embedding layer to work propperly. We also want to make use of the movie tags when training the model, requering us to somehow turn these into vectors. 

One idea is to also create a embedding for each tag and combine these in an average fashion.

In [389]:
# user, movie, rating, time
ratings = np.genfromtxt('ml-latest-small/ratings.csv', delimiter=',', skip_header=1)

#id, title, tags
with open('ml-latest-small/movies.csv', 'r') as f:
    reader = csv.reader(f)
    movie_header = next(reader)
    mid2cmid = {} #movie id to contiguous id
    mid2tags = {} #contigious movie id to tag
    tags = []
    for i, row in enumerate(reader):
        mid2cmid[int(row[0])] = i #mapping movie id to contigious integers
        mid2tags[i] = row[2].split('|') 
        tags.append(mid2tags[i])
    #end for
#end with
mlb = MultiLabelBinarizer()
mlb_tags = mlb.fit_transform(tags)


uid2cuid = {} #user id to contiguous user id
users = set(ratings[:,0])
for i, u in enumerate(users):
    uid2cuid[u] = i


#Update the rating matrix
for i, row in enumerate(ratings):
    ratings[i][0] = uid2cuid[row[0]]
    ratings[i][1] = mid2cmid[int(row[1])]


In [276]:
n_users = 610 
n_movies = 9742
assert(len(users) == n_users)
assert(len(movie_map) == n_movies)

In [390]:
mlb_tags

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [278]:
set(tags)

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

Test-train split the data

In [394]:
train, test = train_test_split([ratings, mlb_tags], train_size=0.8)
print(len(train))
train_user, test_user = train[:,0], test[:,0]
train_movie, test_movie = train[:,1], test[:,1]
train_rating, test_rating = train[:,2], test[:,2]

1


TypeError: list indices must be integers or slices, not tuple

In [223]:
min_rateing = np.min(ratings[:,2])
max_rateing = np.max(ratings[:,2])
n_tags = mlb.classes_

Create the model

In [391]:
common_emb_ez = 50
movie_emb_sz = 50
user_emb_sz = 50
tag_emb_sz = 5

def gen_dot_model():
    movie_imp = Input(shape=(1,), name='movie_input')
    movie_emb = Embedding(n_movies, common_emb_ez, name='movie_embedding')(movie_imp)
    
    user_imp = Input(shape=(1,), name='user_input')
    user_emb = Embedding(n_users, common_emb_ez, name='user_embedding')(user_imp)
    
    movie_do = Dropout(0.1)(movie_emb)
    user_do = Dropout(0.1)(user_emb)
    
    dot = Flatten()(keras.layers.dot([movie_do, user_do], axes=2, name='dot'))
    dot_do = Dropout(0.1)(dot)
    
    act = Activation('sigmoid', name='activation')(dot_do)
    out = Lambda(lambda x: min_rateing + (max_rateing - min_rateing)*x)(act)

    model = Model(inputs=[movie_imp, user_imp], outputs=[out])
    model.compile(loss = 'mean_squared_error', optimizer = Adam(lr=0.001), metrics = ['mse'])
    model.summary()
    return model
#end def

def gen_net_model():
    movie_imp = Input(shape=(1,), name='movie_input')
    movie_emb = Embedding(n_movies, movie_emb_sz, name='movie_embedding')(movie_imp)
    
    user_imp = Input(shape=(1,), name='user_input')
    user_emb = Embedding(n_users, user_emb_sz, name='user_embedding')(user_imp)
    
    tag_imp = Input(shape=(n_tags,), name='tag_input')
    tag_dense_emb = Dense(tag_emb_sz)(tag_imp)
    
    movie_do = Dropout(0.1)(movie_emb)
    user_do = Dropout(0.1)(user_emb)
    
    concat = Flatten()(Concatenate(axis=2, name='concat')([movie_emb, user_emb, tag_emb]))
    
    dense1 = Dense(floor(2/3*(movie_emb_sz + user_emb_sz)), activation='tanh')(concat)
    d1_do = Dropout(0.1)(dense1)
    
    dense2 = Dense(floor(1/5*(movie_emb_sz + user_emb_sz)), activation='tanh')(d1_do)
    d2_do = Dropout(0.1)(dense2)
    
    dense3 = Dense(1, activation='sigmoid')(d2_do)    
    out = Lambda(lambda x: min_rateing + (max_rateing - min_rateing)*x)(dense3)

    model = Model(inputs=[movie_imp, user_imp], outputs=[out])
    model.compile(loss = 'mean_squared_error', optimizer = Adam(lr=0.001), metrics = ['mse'])
    model.summary()
    return model
#end def

es = EarlyStopping(patience=5, restore_best_weights=True)

In [344]:
model = gen_dot_model()
history = model.fit([train_movie, train_user, ], [train_rating], validation_split=0.1, epochs=100, batch_size=256, callbacks = [es])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        487100      movie_input[0][0]                
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 50)        30500       user_input[0][0]                 
__________________________________________________________________________________________________
dropout_71

In [345]:
mse = model.evaluate([test_movie, test_user], [test_rating])
rmse = sqrt(mse[0])
print(f'RMSE on test set: {rmse}')

RMSE on test set: 0.8885204333922158


In [346]:
model = gen_net_model()
history = model.fit([train_movie, train_user], [train_rating], validation_split=0.1, epochs=100, batch_size=256, callbacks = [es])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        487100      movie_input[0][0]                
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 50)        30500       user_input[0][0]                 
__________________________________________________________________________________________________
concat (Co

In [347]:
mse = model.evaluate([test_movie, test_user], [test_rating])
rmse = sqrt(mse[0])
print(f'RMSE on test set: {rmse}')

RMSE on test set: 0.8664244600557804
