In [8]:
import torch
import torchvision
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

%config InlineBackend.figure_format = 'svg' 
plt.style.use('seaborn')

In [9]:
import os
# Set TensorFlow's native (C++) minimum log level to 3.
# NOTE(review): tensorflow is already imported in an earlier cell; confirm the
# variable still has an effect when it is set this late.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Enable memory growth on every GPU TensorFlow can see. With no GPU the loop
# body simply never runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the problem instead of aborting the notebook.
    print(err)


In [10]:
# Read the two CSV files from the ./ml-latest-small directory (path is relative
# to the notebook's working directory). Judging by the merged preview shown
# further down, ratings.csv carries userId/movieId/rating/timestamp and
# movies.csv carries movieId/title/genres.
ratings_data = pd.read_csv('./ml-latest-small/ratings.csv')
movie_names_data = pd.read_csv('./ml-latest-small/movies.csv')

In [11]:
# Sizes later used for the embedding tables: number of rows in the movie
# catalogue and number of distinct user ids in the ratings.
n_movies = movie_names_data.shape[0]
n_user = ratings_data['userId'].nunique(dropna=False)

In [12]:
# Attach each movie's catalogue columns to its rating rows (inner join on movieId).
# NOTE: this rebinds ratings_data, so re-running the cell merges the already
# merged frame a second time.
ratings_data = ratings_data.merge(movie_names_data, how='inner', on='movieId')

In [13]:
# Preview the merged frame: each rating row now also carries the movie's title and genres.
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [15]:
from sklearn.preprocessing import LabelEncoder
import random
# Target: the rating column.
Y = ratings_data['rating']

# One encoder per id space, so user ids and movie titles each map to dense
# integer codes that can index an embedding table.
user_enc, movie_enc = LabelEncoder(), LabelEncoder()

# Feature matrix with one row per rating:
# column 0 = user code, column 1 = movie-title code.
X = np.column_stack((
    user_enc.fit_transform(ratings_data['userId']),
    movie_enc.fit_transform(ratings_data['title']),
))

In [16]:
# Spot-check the encoders by mapping one user code and one movie code back to
# their original labels.
user_enc.classes_[4], movie_enc.classes_[8871]

(5, 'Toy Story (1995)')

In [18]:
# Print the first ten encoded (user, movie) pairs beside their ratings.
for x, y in zip(X[:10], Y.iloc[:10]):
    print([*x], y)

[0, 8871] 4.0
[4, 8871] 4.0
[6, 8871] 4.5
[14, 8871] 2.5
[16, 8871] 4.5
[17, 8871] 3.5
[18, 8871] 4.0
[20, 8871] 3.5
[26, 8871] 3.0
[30, 8871] 5.0


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows as a test set; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [21]:
# X has one row per rating, so len(X) would count ratings rather than users or
# movies. Each encoder holds exactly one class per distinct user id / movie
# title, which is the vocabulary size of the corresponding column of X.
num_users = len(user_enc.classes_)
num_movies = len(movie_enc.classes_)

In [28]:
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Activation, Dropout
from keras.models import Model

def build_model():
    """Build and compile the rating-prediction network.

    A 15-dimensional embedding is looked up for the user code and for the
    movie code, the two vectors are combined with a dot product, and the
    result is passed through a small dense head (32 -> 16 -> 1, with ReLU and
    dropout 0.5 after each hidden layer) that outputs the predicted rating.

    The embedding table sizes come from the notebook-level ``n_user`` and
    ``n_movies``.

    Returns:
        A compiled Keras ``Model`` that takes ``[user_codes, movie_codes]``
        and produces one value per row. It is optimised with Adam on mean
        squared error and reports mean absolute error.
    """
    # The layer names still say "Book" although the inputs are movies; they
    # are left untouched in case saved weights or other tooling refer to them.
    movie_input = Input(shape=[1], name="Book-Input")
    movie_embedding = Embedding(n_movies+1, 15, name="Book-Embedding")(movie_input)
    movie_vec = Flatten(name="Flatten-Books")(movie_embedding)

    user_input = Input(shape=[1], name="User-Input")
    user_embedding = Embedding(n_user+1, 15, name="User-Embedding")(user_input)
    user_vec = Flatten(name="Flatten-Users")(user_embedding)

    # Interaction between the two embeddings.
    prod = Dot(name="Dot-Product", axes=1)([user_vec, movie_vec])

    prod = Dense(32)(prod)
    prod = Activation('relu')(prod)
    prod = Dropout(0.5)(prod)

    prod = Dense(16)(prod)
    prod = Activation('relu')(prod)
    prod = Dropout(0.5)(prod)
    # Single linear unit: the predicted rating.
    prod = Dense(1)(prod)

    model = Model([user_input, movie_input], prod)
    # The target is a continuous rating, so classification accuracy carries no
    # information here; track mean absolute error next to the MSE loss.
    model.compile('adam', 'mean_squared_error', metrics=['mae'])

    return model


# Create the (untrained) network that the following cells fit and query.
model = build_model()

In [29]:
# Checkpoint callback: after each epoch, write the weights to ./checkpoint,
# but only when the validation loss is lower than the best seen so far.
# NOTE(review): none of the cells shown load these weights back, so `model`
# keeps whatever weights it has when training ends.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Train for 15 epochs in batches of 64. The model takes the user-code column
# and the movie-code column as two separate inputs.
# NOTE(review): the held-out test split is also used as validation data, so
# checkpoint selection is not independent of the test set.
history = model.fit(
    x=[X_train[:, 0], X_train[:, 1]],
    y=Y_train,
    batch_size=64,
    epochs=15,
    verbose=1,
    validation_data=([X_test[:, 0], X_test[:, 1]], Y_test),
    callbacks=[model_checkpoint_callback],
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Train on 80668 samples, validate on 20168 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 1.15957, saving model to ./checkpoint
Epoch 2/15

Epoch 00002: val_loss improved from 1.15957 to 0.91608, saving model to ./checkpoint
Epoch 3/15

Epoch 00003: val_loss improved from 0.91608 to 0.90212, saving model to ./checkpoint
Epoch 4/15

Epoch 00004: val_loss improved from 0.90212 to 0.88638, saving model to ./checkpoint
Epoch 5/15

Epoch 00005: val_loss improved from 0.88638 to 0.87976, saving model to ./checkpoint
Epoch 6/15

Epoch 00006: val_loss did not improve from 0.87976
Epoch 7/15

Epoch 00007: val_loss did not improve from 0.87976
Epoch 8/15

Epoch 00008: val_loss did not improve from 0.87976
Epoch 9/15

Epoch 00009: val_loss did not improve from 0.87976
Epoch 10/15

Epoch 00010: val_loss did not improve from 0.87976
Epoch 11/15

Epoch 00011: val_loss did not improve from 0.87976
Epoch 12/15

Epo

In [30]:
# Show the first five test rows ([user code, movie code]) together with their ratings.
X_test[:5], Y_test[:5]

(array([[ 413, 8216],
        [ 297, 8032],
        [ 543, 1727],
        [ 190, 2478],
        [ 344,  221]]), 41008    2.0
 94274    2.0
 77380    5.0
 29744    4.0
 40462    4.0
 Name: rating, dtype: float64)

In [34]:
# Predict ratings for the first five test rows; the two columns of X_test are
# passed as the model's two separate inputs (users first, movies second).
# NOTE: a later cell rebinds `predictions` to the list returned by predict_ratings.
predictions = model.predict([X_test[:5, 0], X_test[:5, 1]])

In [35]:
# Print the predicted ratings followed by the true ratings for the same five rows.
print(predictions,"\n\n", Y_test[:5].values)

[[2.6292038]
 [2.9112046]
 [3.2737513]
 [3.3789482]
 [4.058033 ]] 

 [2. 2. 5. 4. 4.]


In [36]:
# Decode movie code 4 back to its title.
movie_enc.classes_[4]

"'Til There Was You (1997)"

In [130]:
# NOTE(review): extract_true_ratings is defined in the next cell and
# test_user_id is assigned further down, so this cell fails on a fresh
# Restart & Run All unless those cells are moved above it.
extract_true_ratings(test_user_id, X_test)

[3.0, 1.0, 5.0, 3.0, 5.0, 3.0, 3.0, 5.0]

In [131]:
def extract_true_ratings(user_id, X_test):
    """Return the recorded ratings for one user's rows of ``X_test``.

    Args:
        user_id: Encoded user code (a value from column 0 of ``X_test``),
            not the raw ``userId`` stored in ``ratings_data``.
        X_test: Array-like of ``[user_code, movie_code]`` rows.

    Returns:
        A list with one rating per row of ``X_test`` that belongs to
        ``user_id``, in row order. Empty when the user has no rows.
    """
    X_test = np.asarray(X_test)
    if X_test.size == 0:
        return []

    movie_codes = X_test[X_test[:, 0] == user_id, 1]
    if len(movie_codes) == 0:
        return []

    # Narrow the ratings frame to this user once, instead of re-scanning the
    # whole frame for every matching test row.
    raw_user_id = user_enc.classes_[user_id]
    user_rows = ratings_data.loc[ratings_data['userId'] == raw_user_id, ['title', 'rating']]

    # Keep the first rating per title, which is what a first-match lookup
    # yields when a title occurs more than once for the same user.
    rating_by_title = user_rows.drop_duplicates('title').set_index('title')['rating']

    titles = movie_enc.classes_[movie_codes]
    return rating_by_title.loc[titles].tolist()

In [132]:
def predict_ratings(user_id, X_test):
    '''
    Predict ratings for every row of X_test that belongs to one user.

    user_id is the encoded user code (a value from column 0 of X_test), not
    the raw userId stored in ratings_data. X_test is an array-like of
    [user_code, movie_code] rows.

    Returns four parallel lists in X_test row order:
    movie_ids (encoded movie codes), movie_names (decoded titles),
    movie_genres (genres string per title, NaN when a title is unknown) and
    predictions (model output per row). All four are empty when the user has
    no rows in X_test.
    '''
    X_test = np.asarray(X_test)
    if X_test.size == 0:
        return [], [], [], []

    user_rows = X_test[X_test[:, 0] == user_id]
    if len(user_rows) == 0:
        return [], [], [], []

    movie_ids = list(user_rows[:, 1])
    movie_names = list(movie_enc.classes_[user_rows[:, 1]])

    # Look genres up by title; the first occurrence wins when a title repeats.
    genres_by_title = ratings_data.drop_duplicates('title').set_index('title')['genres']
    movie_genres = genres_by_title.reindex(movie_names).tolist()

    # One batched call instead of one model.predict call per row.
    scores = model.predict([user_rows[:, 0], user_rows[:, 1]])
    predictions = list(np.asarray(scores).reshape(-1))

    return movie_ids, movie_names, movie_genres, predictions

In [154]:
# test_user_id is an encoded user code: predict_ratings and
# extract_true_ratings both compare it against column 0 of X_test.
# ratings_data['userId'] holds raw ids, so translate the code back through the
# encoder before filtering; otherwise a different user's rows are selected.
test_user_id = 7
userid_rating_data = ratings_data[ratings_data['userId'] == user_enc.classes_[test_user_id]]
# userid_rating_data

In [155]:
# Collect the test-set movies of the chosen user together with the model's
# predicted rating for each. NOTE: this rebinds `predictions` from the earlier
# five-row array to a list.
movie_ids, movie_names, movie_genres, predictions = predict_ratings(test_user_id, X_test)

In [156]:
# Column-wise record of the results for the chosen user: one entry per
# test-set movie, with the predicted rating next to the recorded one.
# Note that "user_id" holds the encoded user code, not the raw userId.
dictionary = dict(
    user_id=[test_user_id] * len(movie_ids),
    movie_id=movie_ids,
    movie_name=movie_names,
    predicted_ratings=predictions,
    true_ratings=extract_true_ratings(test_user_id, X_test),
)

In [157]:
# Each dictionary entry becomes a row first; transposing then turns the keys
# into columns.
prediction_dataframe = pd.DataFrame.from_dict(dictionary, orient='index').T
# Display the rows from highest to lowest predicted rating.
prediction_dataframe.sort_values(by='predicted_ratings', ascending=False)

Unnamed: 0,user_id,movie_id,movie_name,predicted_ratings,true_ratings
7,7,7523,Seven (a.k.a. Se7en) (1995),4.67376,4
6,7,713,Babe (1995),4.60559,5
4,7,8982,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),4.2711,3
5,7,1337,Braveheart (1995),3.74487,3
0,7,1091,"Birdcage, The (1996)",3.66631,3
3,7,836,Batman (1989),3.65645,3
1,7,9403,While You Were Sleeping (1995),2.708,3
2,7,2663,Ed Wood (1994),2.25248,3
