In [None]:
import itertools
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Modelling Helpers :
from sklearn.preprocessing import Normalizer , scale
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score



# Evaluation metrics :

# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 

# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [None]:
# Read the four input CSV tables (same files, same order as listed).
movies, ratings, tags, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/movies.csv',
        './input/ratings.csv',
        './input/tags.csv',
        './input/links.csv',
    )
)

# Independent working copies of the ratings and movies tables.
df_r, df_m = ratings.copy(), movies.copy()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
# (rows, columns) of the ratings table.
ratings.shape

(25000095, 4)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for each numeric column.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


Combining the movie and rating data sets

In [None]:
# Attach title/genres to every rating by joining on the shared movieId key.
df_combined = ratings.merge(movies, on='movieId')


In [None]:
# Preview the first rows of the merged ratings + movies table.
df_combined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


 Matrix Factorization using Deep Learning (Keras)

In [None]:
from keras.layers import Embedding, Input, dot, concatenate
from keras.models import Model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

In [None]:
# Deep Learning Libraries
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from keras.utils import to_categorical

In [None]:
# Features are the (userId, movieId) pair; the target is the rating.
# Columns are selected by name rather than by position so the split does not
# silently pick the wrong columns if the CSV column order ever changes
# (the training cell indexes these frames by 'userId' / 'movieId').
X = ratings[['userId', 'movieId']]
Y = ratings['rating']

# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=66)

In [None]:
# Size of each latent embedding vector.
n_latent_factors = 50

# Embedding vocabulary sizes.
# The raw userId / movieId values are fed straight into the Embedding layers,
# which only accept indices in [0, input_dim). The ids are not a dense 0-based
# range (they start at 1 and movieId has large gaps), so counting unique ids
# under-sizes the tables and the largest ids index out of range.
# Sizing by "max id + 1" makes every id present in the data a valid index.
n_users = int(ratings['userId'].max()) + 1
n_movies = int(ratings['movieId'].max()) + 1

In [None]:
# Model architecture: predicted rating = dot(user latent vector, movie latent vector).

# User branch: a single integer id -> embedding of size n_latent_factors -> flat vector.
user_input = Input(name='User_Input', shape=(1,))
user_embedding_layer = Embedding(
    n_users,
    n_latent_factors,
    input_length=1,
    name='User_Embedding',
)
user_embeddings = user_embedding_layer(user_input)
user_vector = Flatten(name='User_Vector')(user_embeddings)

# Movie branch: same structure as the user branch, with its own embedding table.
movie_input = Input(name='Movie_Input', shape=(1,))
movie_embedding_layer = Embedding(
    n_movies,
    n_latent_factors,
    input_length=1,
    name='Movie_Embedding',
)
movie_embeddings = movie_embedding_layer(movie_input)
movie_vector = Flatten(name='Movie_Vector')(movie_embeddings)

# Combine the two latent vectors with a dot product along the feature axis.
merged_vectors = dot([user_vector, movie_vector], axes=1, name='Dot_Product')
model = Model(inputs=[user_input, movie_input], outputs=merged_vectors)

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 User_Input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 Movie_Input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 User_Embedding (Embedding)     (None, 1, 50)        8127050     ['User_Input[0][0]']             
                                                                                                  
 Movie_Embedding (Embedding)    (None, 1, 50)        2952350     ['Movie_Input[0][0]']            
                                                                                              

In [None]:
import keras.backend as K
import keras

# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
optimizer = Adam(learning_rate=0.0005)

# Mean squared error is the training objective.
# NOTE(review): ratings are a continuous target, so the 'accuracy' metric is not very
# informative here; it is kept because the evaluation cell reports score[1].
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

  super().__init__(name, **kwargs)


In [None]:
# Mini-batch size and number of training epochs.
batch_size, epochs = 128, 20

In [None]:
# Train on (userId, movieId) -> rating and evaluate on the held-out split after
# every epoch. The returned History object keeps the per-epoch loss/metric values.
# (A stray, unattached `keras.layers.Embedding(15000, 16)` expression that was
# created and immediately discarded has been removed.)
history = model.fit(
    x=[x_train['userId'], x_train['movieId']],
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    validation_data=([x_test['userId'], x_test['movieId']], y_test),
)

Train on 80668 samples, validate on 20168 samples
Epoch 1/20
 - 6s - loss: 13.3356 - acc: 0.0000e+00 - val_loss: 12.9812 - val_acc: 0.0000e+00
Epoch 2/20
 - 3s - loss: 11.0575 - acc: 0.0080 - val_loss: 8.0642 - val_acc: 0.0314
Epoch 3/20
 - 2s - loss: 6.2983 - acc: 0.0826 - val_loss: 5.2491 - val_acc: 0.1221
Epoch 4/20
 - 2s - loss: 4.7860 - acc: 0.1524 - val_loss: 4.5375 - val_acc: 0.1723
Epoch 5/20
 - 2s - loss: 4.2771 - acc: 0.1938 - val_loss: 4.2153 - val_acc: 0.2019
Epoch 6/20
 - 3s - loss: 4.0258 - acc: 0.2206 - val_loss: 4.0425 - val_acc: 0.2203
Epoch 7/20
 - 2s - loss: 3.8829 - acc: 0.2364 - val_loss: 3.9427 - val_acc: 0.2309
Epoch 8/20
 - 2s - loss: 3.7942 - acc: 0.2473 - val_loss: 3.8821 - val_acc: 0.2397
Epoch 9/20
 - 2s - loss: 3.7352 - acc: 0.2558 - val_loss: 3.8423 - val_acc: 0.2452
Epoch 10/20
 - 3s - loss: 3.6939 - acc: 0.2617 - val_loss: 3.8170 - val_acc: 0.2470
Epoch 11/20
 - 2s - loss: 3.6625 - acc: 0.2656 - val_loss: 3.7981 - val_acc: 0.2509
Epoch 12/20
 - 2s - loss: 3.6375 - acc: 0.2709 - val_loss: 3.7846 - val_acc: 0.2503
Epoch 13/20
 - 2s - loss: 3.6163 - acc: 0.2765 - val_loss: 3.7770 - val_acc: 0.2527
Epoch 14/20
 - 2s - loss: 3.5982 - acc: 0.2810 - val_loss: 3.7711 - val_acc: 0.2520
Epoch 15/20
 - 3s - loss: 3.5812 - acc: 0.2856 - val_loss: 3.7659 - val_acc: 0.2538
Epoch 16/20
 - 2s - loss: 3.5653 - acc: 0.2922 - val_loss: 3.7629 - val_acc: 0.2540
Epoch 17/20
 - 2s - loss: 3.5503 - acc: 0.2965 - val_loss: 3.7613 - val_acc: 0.2548
Epoch 18/20
 - 2s - loss: 3.5352 - acc: 0.3023 - val_loss: 3.7603 - val_acc: 0.2536
Epoch 19/20
 - 2s - loss: 3.5209 - acc: 0.3086 - val_loss: 3.7598 - val_acc: 0.2547

In [None]:
# Per-epoch loss values recorded during training ('val_loss' comes from the
# validation_data passed to fit, i.e. the held-out split).
training_loss = history.history['loss']
test_loss = history.history['val_loss']

# 1-based epoch numbers for the x-axis.
epoch_count = range(1, len(training_loss) + 1)

# Visualize loss history. `plt` is matplotlib.pyplot, imported with the other
# top-of-notebook imports. Each label is attached to its own line so the legend
# cannot drift out of sync with the plotting order.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(epoch_count, training_loss, 'r--', label='Training Loss')
ax.plot(epoch_count, test_loss, 'b-', label='Test Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Evaluate on the held-out split; score[0] is the loss, score[1] the compiled metric.
test_inputs = [x_test['userId'], x_test['movieId']]
score = model.evaluate(test_inputs, y_test)

# The loss is mean squared error, so its square root is the RMSE.
rmse = np.sqrt(score[0])
print('RMSE: {:.4f}'.format(rmse))
print('Accuracy of Model: ', score[1])

In [None]:
# Score every movie for a single user.
#
# All (user, movie) pairs are sent through the model in ONE batched predict call
# and the result table is built once, instead of calling model.predict and
# constructing/concatenating a DataFrame for every movie individually.
df_comb = df_combined['movieId'].to_numpy()
df_movie_id = np.unique(df_comb)  # each rated movie id exactly once, sorted

# User whose predicted ratings are computed.
input_user_id = 200

# Pair the same user id with every candidate movie id.
user_id_column = np.full(len(df_movie_id), input_user_id)
prediction = model.predict([user_id_column, df_movie_id])

# One row per movie: its id and the predicted rating for input_user_id.
# The model emits one value per input pair, so the output is flattened to 1-D.
final_pred_movie_frame = pd.DataFrame({
    'movieId': df_movie_id,
    'prediction': np.asarray(prediction).reshape(-1).astype(float),
})
print(final_pred_movie_frame)

In [None]:
# Report the movie with the highest predicted rating.
#
# Uses vectorized selection instead of Python-level loops over the prediction
# table and over every row of df_combined.
best_prediction = final_pred_movie_frame['prediction'].max()

# Boolean mask of the row(s) holding the maximum prediction; on a tie the last
# matching row is used.
max_rating_index = final_pred_movie_frame['prediction'] == best_prediction
predicted_movieid = final_pred_movie_frame.loc[max_rating_index, 'movieId'].iloc[-1]

# Look up the title of that movie in the merged ratings/movies table.
matching_titles = df_combined.loc[df_combined['movieId'] == predicted_movieid, 'title']
predicted_movie_name = matching_titles.iloc[-1]

print(predicted_movie_name)
print(best_prediction)