In [78]:
import numpy as np
import pandas as pd

In [79]:
rating_df = pd.read_csv(r'D:\Jupyter notebook\ratings_small.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [80]:
rating_df.shape

(100004, 4)

In [81]:
rating_df.nunique()

userId         671
movieId       9066
rating          10
timestamp    78141
dtype: int64

In [82]:
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [83]:
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [84]:
num_train = int(len(user_ids) * 0.8)
num_train

80003

In [85]:
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]
train_user_ids.shape, train_movie_ids.shape, train_ratings.shape, val_user_ids.shape, val_movie_ids.shape, val_ratings.shape

((80003,), (80003,), (80003,), (20001,), (20001,), (20001,))

In [86]:
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings
user2movie

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [87]:
def compute_user_similarity_matrix(user2movie):
    similarity_matrix = np.zeros([num_users, num_users])
    for i in range(num_users):
        for j in range(i, num_users):
            
            corr = pearson_correlation(user2movie[i], user2movie[j])
            
            similarity_matrix[i,j] = corr
            similarity_matrix[j,i] = corr
        
        return similarity_matrix
    
def pearson_correlation(x,y):
    
    filt = (x!=0) * (y!=0)
    
    x_mean = x.sum() / x[x!=0].shape
    y_mean = y.sum() / y[y!=0].shape
    
    x = x[filt]
    y = y[filt]
    
    corr = np.sum((x - x_mean) * (y - y_mean)) / (np.sum((y-y_mean) **2) * np.sum((x-x_mean) ** 2)) **0.5
    
    return corr
similarity_matrix = compute_user_similarity_matrix(user2movie)
similarity_matrix[:10]



array([[ 1., nan, nan, ...,  1., nan, -1.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0.,  0.,  0.]])

In [88]:
def compute_ucf(user2movie, similarity_matrix):
    #yk
    mean_ratings = np.sum(user2movie, axis = 1) / (user2movie !=0).sum(axis =1)
    
    #ykj - yk, user2movie: (num_users, num_movies), mean_ratings: (num_users) -> (num_users, 1)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, axis =1)
    
    sim_sum = np.sum(np.abs(similarity_matrix), axis =1)
    
    user2movie_diff[user2movie == 0] = 0
    
    #similarity_matrix: (num_users, num_users), user2movie_diff: (num_users, num_movies)
    # -> (num_users, num_movies), sim_sum: (num_users) -> (num_users, 1)
    weighted_sum = np.matmul(similarity_matrix, user2movie_diff) / np.expand_dims(sim_sum, axis=1)
    
    #weighted_sum: (num_users, num_movies), mean_ratings: (num_users)
    scores = weighted_sum + np.expand_dims(mean_ratings, axis=1)
    
    return scores

predictions = compute_ucf(user2movie, similarity_matrix)
predictions[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


array([[       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       ...,
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [3.75555556, 3.75555556, 3.75555556, ..., 3.75555556, 3.75555556,
        3.75555556],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [89]:
def dcg_at_k(r,k):
    r= r[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    
    return dcg

In [90]:
def ndcg_at_k(r,k):
    dcg_max = dcg_at_k(sorted(r, reverse= True),k)
    
    return dcg_at_k(r,k) / dcg_max

In [91]:
def evaluate_prediction(predictions):
    
    ndcgs = []
    
    for target_user in np.unique(val_user_ids):
        
        target_val_movie_ids = val_movie_ids[val_user_ids == target_user]
        target_val_ratings = val_ratings[val_user_ids == target_user]
        
        ndcg = ndcg_at_k(target_val_ratings[np.argsort(-predictions[val_user_ids == target_user])], k=30)
        ndcgs.append(ndcg)
    ndcg = np.mean(ndcgs)
    return ndcg

In [92]:
train_ratings.max()

5.0

把train_ratings.max() normalize到0和1之間 

In [93]:
train_ratings /=5
val_ratings /=5

模型

In [101]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Dense,Embedding,Flatten,Multiply

def get_mf_model():
    
    user_inp = Input((1,))
    user_hidden = Embedding(input_dim=num_users,output_dim=64)(user_inp)
    user_hidden = Flatten()(user_hidden)
    
    item_inp = Input((1,))
    item_hidden = Embedding(input_dim=num_movies,output_dim=64)(item_inp)
    item_hidden = Flatten()(item_hidden)
    
    hidden = Multiply()([user_hidden, item_hidden])
    
    output = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(inputs=[user_inp, item_inp], outputs=output)
    model.compile(loss='mse', optimizer='adam')
    return model
model = get_mf_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 64)        42944       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 64)        580224      input_2[0][0]                    
______________________________________________________________________________________________

訓練模型

避免模型過度訓練

EarlyStopping(patience=1) 只要看到一個沒有變好的狀況訓練就終止

In [104]:
from tensorflow.keras.callbacks import EarlyStopping

callbacks = [EarlyStopping(patience=1)]

model.fit([train_user_ids, train_movie_ids], train_ratings,\
         validation_data=([val_user_ids, val_movie_ids], val_ratings),epochs=50, batch_size=128, callbacks=callbacks)

Epoch 1/50
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function 

<tensorflow.python.keras.callbacks.History at 0x12c81e16be0>

開始預測

In [105]:
predictions = model.predict([val_user_ids, val_movie_ids])
predictions.shape

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


(20001, 1)

把二維teansor的後面去掉變成一維tensor

In [106]:
evaluate_prediction(predictions[:,0])

0.8311279888662216

MLP model

In [107]:
from tensorflow.keras.layers import concatenate, Dropout
def get_mf_model():
    
    user_inp = Input((1,))
    user_hidden = Embedding(input_dim=num_users,output_dim=64)(user_inp)
    user_hidden = Flatten()(user_hidden)
    
    item_inp = Input((1,))
    item_hidden = Embedding(input_dim=num_movies,output_dim=64)(item_inp)
    item_hidden = Flatten()(item_hidden)
    
    hidden = concatenate([user_hidden, item_hidden])
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    
    
    output = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(inputs=[user_inp, item_inp], outputs=output)
    model.compile(loss='mse', optimizer='adam')
    return model
model = get_mf_model()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 64)        42944       input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 64)        580224      input_4[0][0]                    
____________________________________________________________________________________________

In [108]:
model.fit([train_user_ids, train_movie_ids], train_ratings,\
         validation_data=([val_user_ids, val_movie_ids], val_ratings),epochs=50, batch_size=128, callbacks=callbacks)

Epoch 1/50
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function 

<tensorflow.python.keras.callbacks.History at 0x12c81f06a20>

In [114]:
predictions = model.predict([val_user_ids, val_movie_ids])
evaluate_prediction(predictions[:,0])

0.8754804526227554

MLP比較好

將上面兩個加起來 MF+MLP

In [111]:
def get_mf_model():
    
    user_inp = Input((1,))
    user_hidden = Embedding(input_dim=num_users,output_dim=64)(user_inp)
    user_hidden = Flatten()(user_hidden)
    
    item_inp = Input((1,))
    item_hidden = Embedding(input_dim=num_movies,output_dim=64)(item_inp)
    item_hidden = Flatten()(item_hidden)
    
    mf_output = Multiply()([user_hidden, item_hidden])
    
    hidden = concatenate([user_hidden, item_hidden])
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    
    mlp_output = Dense(64, activation='relu')(hidden)
    
    output = concatenate([mf_output, mlp_output])
    
    output = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(inputs=[user_inp, item_inp], outputs=output)
    model.compile(loss='mse', optimizer='adam')
    return model
model = get_mf_model()
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 64)        42944       input_5[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 64)        580224      input_6[0][0]                    
____________________________________________________________________________________________

In [112]:
model.fit([train_user_ids, train_movie_ids], train_ratings,\
         validation_data=([val_user_ids, val_movie_ids], val_ratings),epochs=50, batch_size=128, callbacks=callbacks)

Epoch 1/50
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function 

<tensorflow.python.keras.callbacks.History at 0x12c84f5bc88>

In [113]:
predictions = model.predict([val_user_ids, val_movie_ids])
evaluate_prediction(predictions[:,0])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


0.8754804526227554