In [17]:
import numpy as np
import pandas as pd

In [18]:
rating_df = pd.read_csv(r'D:\Jupyter notebook\ratings_small.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [19]:
rating_df.shape

(100004, 4)

In [20]:
rating_df.nunique()

userId         671
movieId       9066
rating          10
timestamp    78141
dtype: int64

In [21]:
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [22]:
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [23]:
num_train = int(len(user_ids) * 0.8)
num_train

80003

In [24]:
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]
train_user_ids.shape, train_movie_ids.shape, train_ratings.shape, val_user_ids.shape, val_movie_ids.shape, val_ratings.shape

((80003,), (80003,), (80003,), (20001,), (20001,), (20001,))

In [25]:
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings
user2movie

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [27]:
def compute_user_similarity_matrix(user2movie):
    similarity_matrix = np.zeros([num_users, num_users])
    for i in range(num_users):
        for j in range(i, num_users):
            
            corr = pearson_correlation(user2movie[i], user2movie[j])
            
            similarity_matrix[i,j] = corr
            similarity_matrix[j,i] = corr
        
        return similarity_matrix
    
def pearson_correlation(x,y):
    
    filt = (x!=0) * (y!=0)
    
    x_mean = x.sum() / x[x!=0].shape
    y_mean = y.sum() / y[y!=0].shape
    
    x = x[filt]
    y = y[filt]
    
    corr = np.sum((x - x_mean) * (y - y_mean)) / (np.sum((y-y_mean) **2) * np.sum((x-x_mean) ** 2)) **0.5
    
    return corr
similarity_matrix = compute_user_similarity_matrix(user2movie)
similarity_matrix[:10]



array([[ 1., nan, nan, ...,  1., nan, -1.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0.,  0.,  0.]])

In [31]:
def compute_ucf(user2movie, similarity_matrix):
    #yk
    mean_ratings = np.sum(user2movie, axis = 1) / (user2movie !=0).sum(axis =1)
    
    #ykj - yk, user2movie: (num_users, num_movies), mean_ratings: (num_users) -> (num_users, 1)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, axis =1)
    
    sim_sum = np.sum(np.abs(similarity_matrix), axis =1)
    
    user2movie_diff[user2movie == 0] = 0
    
    #similarity_matrix: (num_users, num_users), user2movie_diff: (num_users, num_movies)
    # -> (num_users, num_movies), sim_sum: (num_users) -> (num_users, 1)
    weighted_sum = np.matmul(similarity_matrix, user2movie_diff) / np.expand_dims(sim_sum, axis=1)
    
    #weighted_sum: (num_users, num_movies), mean_ratings: (num_users)
    scores = weighted_sum + np.expand_dims(mean_ratings, axis=1)
    
    return scores

predictions = compute_ucf(user2movie, similarity_matrix)
predictions[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


array([[       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       ...,
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [3.75555556, 3.75555556, 3.75555556, ..., 3.75555556, 3.75555556,
        3.75555556],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [32]:
def dcg_at_k(r,k):
    r= r[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    
    return dcg

In [33]:
def ndcg_at_k(r,k):
    dcg_max = dcg_at_k(sorted(r, reverse= True),k)
    
    return dcg_at_k(r,k) / dcg_max

In [34]:
def evaluate_prediction(predictions):
    
    ndcgs = []
    
    for target_user in np.unique(val_user_ids):
        
        target_val_movie_ids = val_movie_ids[val_user_ids == target_user]
        target_val_ratings = val_ratings[val_user_ids == target_user]
        
        ndcg = ndcg_at_k(target_val_ratings[np.argsort(-predictions[target_user][target_val_movie_ids])], k=30)
        ndcgs.append(ndcg)
    ndcg = np.mean(ndcgs)
    return ndcg
evaluate_prediction(predictions)

0.8250944950179779

CSR形式的稀疏矩陣

In [35]:
from scipy.sparse import csr_matrix

user2movie=csr_matrix(user2movie)
user2movie

<671x9066 sparse matrix of type '<class 'numpy.float64'>'
	with 80003 stored elements in Compressed Sparse Row format>

開始做奇異值分解
u=>使用者矩陣
s=>奇異值矩陣
vt=>轉至矩陣

In [36]:
from scipy.sparse.linalg import svds

u, s, vt=svds(user2movie, k=30)
u.shape, s.shape, vt.shape

((671, 30), (30,), (30, 9066))

predictions預測結果

In [37]:
predictions = np.matmul(u * np.expand_dims(s, 0), vt)
evaluate_prediction(predictions)

0.841376648531207

結果比使用者好比物品差
但是執行速度非常快