In [8]:
import torch
import torchvision
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

%config InlineBackend.figure_format = 'svg' 
plt.style.use('seaborn')

In [9]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        print(e)


In [10]:
ratings_data = pd.read_csv('./ml-latest-small/ratings.csv')
movie_names_data = pd.read_csv('./ml-latest-small/movies.csv')

In [11]:
n_movies = len(movie_names_data)
n_user = len(ratings_data['userId'].unique())

In [12]:
ratings_data = pd.merge(ratings_data, movie_names_data, on='movieId', how='inner')

In [13]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [14]:
# us = ratings_data[ratings_data['userId'] == 1]
# a = us[us['movieId'] == 1].loc[:, 'genres']

In [15]:
from sklearn.preprocessing import LabelEncoder
import random
Y = ratings_data.rating
user_enc = LabelEncoder()
movie_enc = LabelEncoder()
X = np.array([user_enc.fit_transform(ratings_data.userId),
              movie_enc.fit_transform(ratings_data.title)]).T

In [16]:
user_enc.classes_[4], movie_enc.classes_[8871]

(5, 'Toy Story (1995)')

In [17]:
# sorted(X[:, 0])

In [18]:
for x, y in zip(X[:10], Y[:10]):
    print(list(x), y)

[0, 8871] 4.0
[4, 8871] 4.0
[6, 8871] 4.5
[14, 8871] 2.5
[16, 8871] 4.5
[17, 8871] 3.5
[18, 8871] 4.0
[20, 8871] 3.5
[26, 8871] 3.0
[30, 8871] 5.0


In [19]:
movie_enc.classes_[8871]

'Toy Story (1995)'

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [21]:
num_users = len(X)
num_movies = len(X)

In [22]:
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Activation, Dropout
from keras.models import Model

def build_model():
    movie_input = Input(shape=[1], name="Book-Input")
    movie_embedding = Embedding(n_movies+1, 15, name="Book-Embedding")(movie_input)
    movie_vec = Flatten(name="Flatten-Books")(movie_embedding)

    user_input = Input(shape=[1], name="User-Input")
    user_embedding = Embedding(n_user+1, 15, name="User-Embedding")(user_input)
    user_vec = Flatten(name="Flatten-Users")(user_embedding)
    
    prod = Dot(name="Dot-Product", axes=1)([user_vec, movie_vec])
    
    prod = Dense(32)(prod)
    prod = Activation('relu')(prod)
    prod = Dropout(0.5)(prod)

    prod = Dense(16)(prod)
    prod = Activation('relu')(prod)
    prod = Dropout(0.5)(prod)
    prod = Dense(1)(prod)


    model = Model([user_input, movie_input], prod)
    model.compile('adam', 'mean_squared_error', metrics=['accuracy'])

    return model


model = build_model()

Using TensorFlow backend.


In [27]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1)

history = model.fit([X_train[:, 0], X_train[:, 1]], Y_train, 
            epochs=15, 
            verbose=1,
            batch_size=64, 
            validation_data=([X_test[:, 0], X_test[:, 1]], Y_test), 
            callbacks=[model_checkpoint_callback])

Train on 80668 samples, validate on 20168 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.95883, saving model to ./checkpoint
Epoch 2/15

Epoch 00002: val_loss did not improve from 0.95883
Epoch 3/15

Epoch 00003: val_loss did not improve from 0.95883
Epoch 4/15

Epoch 00004: val_loss did not improve from 0.95883
Epoch 5/15

Epoch 00005: val_loss did not improve from 0.95883
Epoch 6/15

Epoch 00006: val_loss did not improve from 0.95883
Epoch 7/15

Epoch 00007: val_loss did not improve from 0.95883
Epoch 8/15

Epoch 00008: val_loss did not improve from 0.95883
Epoch 9/15

Epoch 00009: val_loss did not improve from 0.95883
Epoch 10/15

Epoch 00010: val_loss did not improve from 0.95883
Epoch 11/15

Epoch 00011: val_loss did not improve from 0.95883
Epoch 12/15

Epoch 00012: val_loss did not improve from 0.95883
Epoch 13/15

Epoch 00013: val_loss did not improve from 0.95883
Epoch 14/15

Epoch 00014: val_loss did not improve from 0.95883
Epoch 15/15

Epoch 00015: val_los

In [None]:
X_test[:5], Y_test[:5]

In [None]:
predictions = model.predict([X_test[:5, 0], X_test[:5, 1]])

In [None]:
print(predictions,"\n\n", Y_test[:5].values)

In [26]:
movie_enc.classes_[4]

"'Til There Was You (1997)"

In [27]:
def predict_ratings(user_id, X_test):
    '''
    given user id predict all ratings for movies
    '''
    user_data = ratings_data[ratings_data['userId'] == user_id]
    movie_ids, movie_names, predictions, movie_genres = list(), list(), list(), list()
    i = 0
    for _id, movie_id in X_test:
        if user_id == X_test[i][0]:
            movie_ids.append(X_test[i, 1])
            movie_names.append(movie_enc.classes_[movie_id])
            # movie_genres.append(user_data[user_data['movieId'] == movie_id].loc[:, 'genres'].values[0])

            pred = model.predict([ np.array([X_test[i, 0]]), np.array([X_test[i, 1]]) ])
            predictions.append(pred[0][0])
        i += 1
    return movie_ids, movie_names, movie_genres, predictions

In [298]:
X_test

array([[ 413, 8216],
       [ 297, 8032],
       [ 543, 1727],
       ...,
       [ 437, 5512],
       [ 593, 2770],
       [ 369, 9119]])

In [296]:
ratings_data[ratings_data['userId'] == 413]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
2035,413,296,5.0,1484439565,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3119,413,457,5.0,1484439876,"Fugitive, The (1993)",Thriller
3711,413,527,5.0,1484439759,Schindler's List (1993),Drama|War
4496,413,593,5.0,1484439762,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
5293,413,736,5.0,1484439991,Twister (1996),Action|Adventure|Romance|Thriller
7272,413,1198,1.0,1484439850,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
7841,413,1213,4.5,1484439561,Goodfellas (1990),Crime|Drama
10902,413,2028,5.0,1484439998,Saving Private Ryan (1998),Action|Drama|War
11947,413,2329,5.0,1484439570,American History X (1998),Crime|Drama
12048,413,2353,5.0,1484440019,Enemy of the State (1998),Action|Thriller


In [288]:
user_id = 40
movie_ids, movie_names, movie_genres, predictions = predict_ratings(user_id, X_test)

2791


IndexError: index 0 is out of bounds for axis 0 with size 0

In [215]:
dictionary = {"user_id": [user_id]*len(movie_ids),
              "movie_id": movie_ids,
              "movie_name":movie_names,
              "ratings":predictions 
              }

In [216]:
prediction_dataframe = pd.DataFrame.from_dict(dictionary, orient='index').transpose()
prediction_dataframe.sort_values('ratings', ascending=False)

Unnamed: 0,user_id,movie_id,movie_name,ratings
8,40,5484,Mary and Max (2009),4.75105
36,40,4413,Intouchables (2011),4.67731
31,40,7073,Requiem for a Dream (2000),4.42205
12,40,6865,Pulp Fiction (1994),4.37195
0,40,2791,Eternal Sunshine of the Spotless Mind (2004),4.32208
38,40,9407,Whiplash (2014),4.27964
27,40,563,Apocalypse Now (1979),4.25545
34,40,9442,"Whole Nine Yards, The (2000)",4.24181
11,40,405,American Beauty (1999),4.24065
4,40,2940,Fargo (1996),4.2274


Unnamed: 0,user_id,movie_id,movie_name,ratings
1,10,7761,Sling Blade (1996),4.67632
13,10,7680,"Silence of the Lambs, The (1991)",4.54579
5,10,1337,Braveheart (1995),4.40964
10,10,8363,Terminator 2: Judgment Day (1991),4.3694
9,10,3279,"Fugitive, The (1993)",4.28595
6,10,3529,GoldenEye (1995),3.99254
15,10,8779,Titanic (1997),3.94247
2,10,1828,Clear and Present Danger (1994),3.90622
0,10,3508,Godzilla (1998),3.45628
4,10,8949,True Lies (1994),3.22529
