In [10]:
import time
import numpy as np
import pandas as pd
import keras
import random

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


from keras.layers import Dropout, Flatten,Activation,Input,Embedding
from keras.models import Model
from keras.layers.merge import dot
from keras.optimizers import Adam
from keras.layers import Dense , merge
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

In [11]:
movies = pd.read_csv('./data/movie.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
ratings = pd.read_csv('./data/rating.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [13]:
movies.movieId.unique().shape

(27278,)

In [14]:
ratings.movieId.unique().shape

(26744,)

In [15]:
merge_ratings_movies = pd.merge(movies, ratings, on='movieId', how='inner')
del merge_ratings_movies["userId"]
merge_ratings_movies.head()

Unnamed: 0,movieId,title,genres,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5,2009-01-02 01:13:41


In [16]:
movies_avg_ratings = merge_ratings_movies.groupby('movieId').mean()
movies_avg_ratings.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.92124
2,3.211977
3,3.15104
4,2.861393
5,3.064592


In [17]:
movies_ratings=pd.merge(movies, movies_avg_ratings, on='movieId', how='inner')
movies_ratings.head()

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92124
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.211977
2,3,Grumpier Old Men (1995),Comedy|Romance,3.15104
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.861393
4,5,Father of the Bride Part II (1995),Comedy,3.064592


In [18]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3),max_features=10000, min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies_ratings['genres'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_similarities)

[[1.         0.20589718 0.04910177 ... 0.2050947  0.         0.11992171]
 [0.20589718 1.         0.         ... 0.26692393 0.         0.15607412]
 [0.04910177 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.2050947  0.26692393 0.         ... 1.         0.         0.2537066 ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.11992171 0.15607412 0.         ... 0.2537066  0.         1.        ]]


In [19]:
results = {}

for idx, row in movies_ratings.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], movies_ratings['movieId'][i]) for i in similar_indices]

    results[row['movieId']] = similar_items[1:]

In [20]:
def suggest(item_id, amount):
    count = amount
    rating_sum = 0
    recs = results[item_id]
    for rec in recs:
        if rec[1]!=item_id:
            index = movies_ratings[movies_ratings['movieId'] == rec[1]].index
            rating = movies_ratings.iloc[index]['rating']
            rating_sum += float(rating)
            amount -= 1
        if amount <= 0:
            break
    return rating_sum/count

In [21]:
pre_rating = suggest(5625,10)

In [22]:
print(pre_rating)

3.2646752861015402


In [23]:
np.random.seed(123)
# load data
def loadData():
    ratings = pd.read_csv('./data/rating.csv', parse_dates=['timestamp'])
    return ratings

In [24]:
# for test convenience, only use num% of data
def cutData(num, ratings):
    rand_userIds = np.random.choice(ratings['userId'].unique(), size=int(len(ratings['userId'].unique())*num), replace=False)
    ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]
    return ratings

In [25]:
def splitData(ratings):
    users = ratings.userId.unique()
    movies = ratings.movieId.unique()

    userid2idx = {o:i for i,o in enumerate(users)}
    movieid2idx = {o:i for i,o in enumerate(movies)}

    train_ratings, test_ratings= train_test_split(ratings, test_size=0.2, random_state=42)
    train_ratings_ori = train_ratings.copy()
    test_ratings_ori = test_ratings.copy()
    
    train_ratings['userId'] = train_ratings['userId'].apply(lambda x: userid2idx[x])
    train_ratings['movieId'] = train_ratings['movieId'].apply(lambda x: movieid2idx[x])
    test_ratings['userId'] = test_ratings['userId'].apply(lambda x: userid2idx[x])
    test_ratings['movieId'] = test_ratings['movieId'].apply(lambda x: movieid2idx[x])
    
    """
    ratings['userId'] = ratings['userId'].apply(lambda x: userid2idx[x])
    ratings['movieId'] = ratings['movieId'].apply(lambda x: movieid2idx[x])
    split = np.random.rand(len(ratings)) < 0.8
    train_ratings = ratings[split]
    test_ratings = ratings[~split]
    """
    
    print(train_ratings.shape , test_ratings.shape)
    print(train_ratings_ori.shape, test_ratings_ori)
    return train_ratings, test_ratings, train_ratings_ori, test_ratings_ori

In [26]:
def embeddingNNModel(ratings):
    n_movies=len(ratings['movieId'].unique())
    n_users=len(ratings['userId'].unique())
    n_latent_factors=50  # hyperparamter to deal with. 

    user_input=Input(shape=(1,),name='user_input',dtype='int64')
    user_embedding=Embedding(n_users,n_latent_factors,name='user_embedding')(user_input)
    user_vec =Flatten(name='FlattenUsers')(user_embedding)
    user_vec=Dropout(0.40)(user_vec)

    movie_input=Input(shape=(1,),name='movie_input',dtype='int64')
    movie_embedding=Embedding(n_movies,n_latent_factors,name='movie_embedding')(movie_input)
    movie_vec=Flatten(name='FlattenMovies')(movie_embedding)
    movie_vec=Dropout(0.40)(movie_vec)

    sim=dot([user_vec,movie_vec],name='Simalarity-Dot-Product',axes=1)
    nn_inp=Dense(96,activation='relu')(sim)
    nn_inp=Dropout(0.4)(nn_inp)
    nn_inp=Dense(1,activation='relu')(nn_inp)
    nn_model =keras.models.Model([user_input, movie_input],nn_inp)
    return nn_model

In [27]:
def cfFit(model, train_ratings, epochs, batch_size):
    model.compile(optimizer=Adam(lr=1e-4),loss='mse')
    model.fit([train_ratings.userId,train_ratings.movieId], train_ratings.rating, epochs=epochs, batch_size=batch_size, verbose=1)
    return model

In [28]:
def cfPredict(model, test_ratings):
    pre_ratings = model.predict([test_ratings.userId,test_ratings.movieId])
    return pre_ratings

In [29]:
def cbPredict(test_ratings_ori):
    cb_pre_ratings = []
    for index, row in test_ratings_ori.iterrows():
        pre_rateing = suggest(row['movieId'], 5)
        cb_pre_ratings.append(pre_rateing)
    cb_pre_ratings = np.asarray(cb_pre_ratings)
    cb_pre_ratings_reshape = np.reshape(cb_pre_ratings, (cb_pre_ratings.shape[0], 1))
    return cb_pre_ratings_reshape

In [30]:
def finalPredict(cbPre, cfPre, num1, num2):
    finalPre = num1 * cbPre + num2 * cfPre
    return finalPre

In [31]:
def rmse(prediction, ground_truth):
    return sqrt(mean_squared_error(prediction, ground_truth))

In [33]:
start = time.time()
ratings = loadData()
ratings_cut = cutData(0.1, ratings)
train_ratings, test_ratings, train_ratings_ori, test_ratings_ori = splitData(ratings_cut)

nn_model = embeddingNNModel(ratings_cut)
batch_size = 512
epochs = 10
new_model = cfFit(nn_model, train_ratings, epochs, batch_size)

cf_pre_ratings = cfPredict(new_model, test_ratings)
cb_pre_ratings = cbPredict(test_ratings_ori)

cbNums = [0.1, 0.25]
cfNums = [0.9, 0.75]
for i in range(len(cbNums)):
    final_pre_ratings = finalPredict(cb_pre_ratings, cf_pre_ratings, cbNums[i], cfNums[i])

    RMSE = rmse(cf_pre_ratings, test_ratings.rating)
    RMSE1 = rmse(final_pre_ratings, test_ratings.rating)

    print (f'Batch_size is: {batch_size}, epochs is: {epochs}, Neural Network RMSE is: {RMSE}')
    print (f'cbPre is {cbNums[i]}, cfPre is {cfNums[i]},  Hybrid RMSE is: {RMSE1}')



end = time.time()
print(f"Runtime of the program is {end - start}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ratings['userId'] = train_ratings['userId'].apply(lambda x: userid2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ratings['movieId'] = train_ratings['movieId'].apply(lambda x: movieid2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ratings['userId'] = test_ratings['u

(1567070, 4) (391768, 4)
(1567070, 4)           userId  movieId  rating           timestamp
12647531   87415     2396     2.5 2008-10-29 16:44:54
149321       989     2881     3.0 2004-09-09 05:40:36
1132856     7729     1099     4.0 2006-04-02 04:02:26
4127020    28082      858     4.0 1997-05-04 16:23:23
1891552    12778     7099     3.5 2005-12-12 19:13:38
...          ...      ...     ...                 ...
1471997     9955      180     4.5 2005-03-22 06:18:16
15896093  109961   111362     4.0 2015-02-03 11:52:51
13223078   91355        5     4.0 1997-06-18 12:35:41
11733087   80990     1591     1.0 1997-10-19 19:56:36
4212443    28702      176     5.0 1996-06-09 21:57:29

[391768 rows x 4 columns]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Batch_size is: 512, epochs is: 10, Neural Network RMSE is: 0.8650405849193189
cbPre is 0.1, cfPre is 0.9,  Hybrid RMSE is: 0.8724945347426856
Batch_size is: 512, epochs is: 10,