In [3]:
import numpy as np
import pandas as pd

In [5]:
rating_df = pd.read_csv('C:\\Users\\User\\Downloads\\ratings_small.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
rating_df.shape


(100004, 4)

In [7]:
rating_df.nunique

<bound method DataFrame.nunique of         userId  movieId  rating   timestamp
0            1       31     2.5  1260759144
1            1     1029     3.0  1260759179
2            1     1061     3.0  1260759182
3            1     1129     2.0  1260759185
4            1     1172     4.0  1260759205
...        ...      ...     ...         ...
99999      671     6268     2.5  1065579370
100000     671     6269     4.0  1065149201
100001     671     6365     4.0  1070940363
100002     671     6385     2.5  1070979663
100003     671     6565     3.5  1074784724

[100004 rows x 4 columns]>

In [8]:
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [9]:
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [10]:
num_train = int(len(user_ids) * 0.8)
num_train

80003

In [11]:
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]
train_user_ids.shape,train_movie_ids.shape,train_ratings.shape,val_user_ids.shape, val_movie_ids.shape, val_ratings.shape,

((80003,), (80003,), (80003,), (20001,), (20001,), (20001,))

In [12]:
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings
user2movie

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [14]:
def compute_user_similarity_matrix(user2movie):
    similarity_matrix = np.zeros([num_users, num_users])
    for i in range(num_users):
        for j in range(i, num_users):
            
            corr = pearson_correlation(user2movie[i], user2movie[j])
            
            similarity_matrix[i,j] = corr
            similarity_matrix[j,i] = corr
        
        return similarity_matrix
    
def pearson_correlation(x,y):
    
    filt = (x!=0) * (y!=0)
    
    x_mean = x.sum() / x[x!=0].shape
    y_mean = y.sum() / y[y!=0].shape
    
    x = x[filt]
    y = y[filt]
    
    corr = np.sum((x - x_mean) * (y - y_mean)) / (np.sum((y-y_mean) **2) * np.sum((x-x_mean) ** 2)) **0.5
    
    return corr
similarity_matrix = compute_user_similarity_matrix(user2movie)
similarity_matrix[:10]



array([[ 1., nan, nan, ...,  1., nan, -1.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0.,  0.,  0.]])

In [15]:
similarity_matrix[np.arange(num_users), np.arange(num_users)] = 0
similarity_matrix[np.isnan(similarity_matrix)] = 0
similarity_matrix[:10]

array([[ 0.,  0.,  0., ...,  1.,  0., -1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [17]:
def compute_ucf(user2movie, similarity_matrix):
    #yk
    mean_ratings = np.sum(user2movie, axis = 1) / (user2movie !=0).sum(axis =1)
    
    #ykj - yk, user2movie: (num_users, num_movies), mean_ratings: (num_users) -> (num_users, 1)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, axis =1)
    
    sim_sum = np.sum(np.abs(similarity_matrix), axis =1)
    
    user2movie_diff[user2movie == 0] = 0
    
    #similarity_matrix: (num_users, num_users), user2movie_diff: (num_users, num_movies)
    # -> (num_users, num_movies), sim_sum: (num_users) -> (num_users, 1)
    weighted_sum = np.matmul(similarity_matrix, user2movie_diff) / np.expand_dims(sim_sum, axis=1)
    
    #weighted_sum: (num_users, num_movies), mean_ratings: (num_users)
    scores = weighted_sum + np.expand_dims(mean_ratings, axis=1)
    
    return scores

predictions = compute_ucf(user2movie, similarity_matrix)
predictions[:10]
    

  This is separate from the ipykernel package so we can avoid doing imports until
  


array([[2.60394488, 2.52142328, 2.53811983, ..., 2.55      , 2.55      ,
        2.55      ],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       ...,
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [3.75555556, 3.75555556, 3.75555556, ..., 3.75555556, 3.75555556,
        3.75555556],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [18]:
def dcg_at_k(r,k):
    r= r[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    
    return dcg

In [19]:
def ndcg_at_k(r,k):
    dcg_max = dcg_at_k(sorted(r, reverse= True),k)
    
    return dcg_at_k(r,k) / dcg_max

In [20]:
def evaluate_prediction(predictions):
    
    ndcgs = []
    
    for target_user in np.unique(val_user_ids):
        
        target_val_movie_ids = val_movie_ids[val_user_ids == target_user]
        target_val_ratings = val_ratings[val_user_ids == target_user]
        
        ndcg = ndcg_at_k(target_val_ratings[np.argsort(-predictions[target_user][target_val_movie_ids])], k=30)
        ndcgs.append(ndcg)
    ndcg = np.mean(ndcgs)
    return ndcg
evaluate_prediction(predictions)

0.8250944950179779

In [23]:
def compute_item_similarity_matrix(user2movie):
    
    x_mean = user2movie.sum(axis=0) / (user2movie !=0).sum(axis=0)
    
    filt = user2movie == 0
    rating_diff = user2movie - np.expand_dims(x_mean, axis =0)
    
    rating_diff[filt] =0
    
    #(num_movies, num_movies)
    corr = np.matmul(rating_diff.T, rating_diff) / \
    (np.matmul(rating_diff.T **2, (rating_diff !=0)) * np.matmul(rating_diff.T **2, (rating_diff !=0)).T) ** 0.5
    
    return corr
similarity_matrix = compute_item_similarity_matrix(user2movie)
similarity_matrix[:10]
    
    
    
    
    
    
    
    
    

  This is separate from the ipykernel package so we can avoid doing imports until
  if sys.path[0] == '':


array([[ 1.        ,  0.32949699,  0.14883085, ...,         nan,
                nan,         nan],
       [ 0.32949699,  1.        ,  0.11771157, ...,         nan,
                nan,         nan],
       [ 0.14883085,  0.11771157,  1.        , ...,         nan,
                nan,         nan],
       ...,
       [ 0.75689798,  0.79614979,  0.71287669, ...,         nan,
                nan,         nan],
       [-0.21912884,  0.5874723 ,  0.34578862, ...,         nan,
                nan,         nan],
       [ 0.02433877,  0.43192313, -0.08695837, ...,         nan,
                nan,         nan]])

In [24]:
similarity_matrix[np.arange(num_movies), np.arange(num_movies)] = 0
similarity_matrix[np.isnan(similarity_matrix)] =0
similarity_matrix[:10]

array([[ 0.        ,  0.32949699,  0.14883085, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.32949699,  0.        ,  0.11771157, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.14883085,  0.11771157,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.75689798,  0.79614979,  0.71287669, ...,  0.        ,
         0.        ,  0.        ],
       [-0.21912884,  0.5874723 ,  0.34578862, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02433877,  0.43192313, -0.08695837, ...,  0.        ,
         0.        ,  0.        ]])

In [25]:
def compute_icf(user2movie, similarity_matrix):
    
    #yk
    mean_ratings = np.sum(user2movie, axis = 0) / (user2movie !=0).sum(axis =0)
    
    #ykj - yk, user2movie: (num_users, num_movies), mean_ratings: (num_users) -> (num_users, 1)
    user2movie_diff = user2movie - np.expand_dims(mean_ratings, axis =0)
    
    sim_sum = np.sum(np.abs(similarity_matrix), axis =1)
    
    user2movie_diff[user2movie == 0] = 0
    
    #similarity_matrix: (num_movies, num_movies), user2movie_diff: (num_movies, num_users)
    # -> (num_users, num_movies), sim_sum: (num_users) -> (num_users, 1)
    weighted_sum = np.matmul(user2movie_diff, similarity_matrix) / np.expand_dims(sim_sum, axis=0)
    
    #weighted_sum: (num_users, num_movies), mean_ratings: (num_users)
    scores = weighted_sum + np.expand_dims(mean_ratings, axis=0)
    
    return scores

predictions = compute_icf(user2movie, similarity_matrix)
predictions[:10]

  after removing the cwd from sys.path.
  from ipykernel import kernelapp as app


array([[3.91995938, 3.36217492, 3.19289318, ...,        nan,        nan,
               nan],
       [3.92128976, 3.3640285 , 3.19388683, ...,        nan,        nan,
               nan],
       [3.91963665, 3.36367295, 3.19216674, ...,        nan,        nan,
               nan],
       ...,
       [3.92075554, 3.36605949, 3.1937442 , ...,        nan,        nan,
               nan],
       [3.92067407, 3.36313627, 3.1926433 , ...,        nan,        nan,
               nan],
       [3.92048034, 3.36408442, 3.19267732, ...,        nan,        nan,
               nan]])

In [26]:
evaluate_prediction(predictions)

0.8721779505007486