# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:
# please do not change this cell

from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
item_popularity = np.zeros(n_items)
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
    item_popularity[row[2]-1] =  item_popularity[row[2]-1] + 1
#train_ds = pd.DataFrame(train_ds)

# Testing Dataset
testsize = 0
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    if item_popularity[row[2]-1] > 30:
        test_ds[row[1]-1, row[2]-1] = row[3]
        testsize = testsize + 1
#test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

print("Testsize = " + str(testsize))

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Testsize = 17678


# Utils

In [4]:
# Please don't change this cell
# you can use this devaluate Utils here, and you can also implement your own MAE and RMSE calculation. 

EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

In [5]:
# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 
train_ds = pd.DataFrame(train_ds)
test_ds = pd.DataFrame(test_ds)

# calculate the similarity values

# initialised an empty matrix to store similarity values
np_user_aed_corr = np.zeros((n_users, n_users))
for user, user_vector in enumerate(train_ds.values):
    for other_user, other_user_vector in enumerate(train_ds.values[user+1:], start=user+1):
        # ignore the same user
        if user == other_user:
            continue
        # find user and other_user's rated items
        mask_user = user_vector > 0
        mask_other_user = other_user_vector > 0
        # find user and other_user's co-rated items
        mask_both = mask_user & mask_other_user
        # find the co-rated items's id
        corated_item_id = np.where(mask_both)[0]+1
        # ignore other_user if they don't have co-rated item
        if len(corated_item_id) == 0:
            continue
        # calculate the max euclidean distance
        max_euclidean_distance =(16 * len(corated_item_id)) ** 0.5
        # calculate the euclidean distance
        euclidean_distance = np.sqrt(np.sum(np.square(user_vector[corated_item_id-1] - other_user_vector[corated_item_id-1])))
        # calculate the similarity between user and other_user and store the value into the matrix
        aed_similarity = 1 - euclidean_distance / (max_euclidean_distance + EPSILON)
        np_user_aed_corr[user, other_user] = aed_similarity
        np_user_aed_corr[other_user, user] = aed_similarity

# calculate the predictions
# initialised an empty matrix to store predictions values
np_predictions = np.zeros((n_users, n_items))

#define a k-value
K = 5

for (i,j), rating in np.ndenumerate(test_ds.values):
    if rating > 0:
        user_i_similarity = np_user_aed_corr[i]
        same_ratings_users = np.where(user_i_similarity == 1)[0]
        # sort by similarity and retrieve the top k items with the highest rankings
        sim_user_index = np.argsort(np_user_aed_corr[i])[-K:]
        # get the similarity values 
        sim_value = np_user_aed_corr[i][sim_user_index]
        # valid_users: similarity =1 and rated item j
        valid_users = [user for user in same_ratings_users if train_ds.values[user, j] > 0]

        if valid_users: #  use valid users set to predict the rating
            # the number of co-rated items for user and valid_user
            common_items_counts = [np.sum((train_ds.values[user] > 0) & (train_ds.values[i] > 0)) for user in valid_users]
            weighted_ratings_sum = np.sum([train_ds.values[user, j] * common for user, common in zip(valid_users, common_items_counts)])
            total_common_items = np.sum(common_items_counts)
            if total_common_items > 0: 
                np_predictions[i,j] = weighted_ratings_sum / total_common_items + EPSILON 
                np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)
        else: # no valid_user has ratings on item j, use the k neighbour has rating on item j to predict
            sim_user_index = [user for user in sim_user_index if train_ds.values[user, j] > 0]
            if sim_user_index:
                sim_value = np_user_aed_corr[i][sim_user_index]
                np_predictions[i,j] = np.sum(sim_value * train_ds.values[sim_user_index,j]) / (np.sum(np.abs(sim_value)) + EPSILON)
                np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)
            else:
                # no neighbour has ratings on item j, the prediction of j will be the average rating of user 
                user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
                np_predictions[i,j] = user_mean
                np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5)

np_predictions_S = pd.DataFrame(np_predictions)

MAE, RMSE = evaluate(test_ds, np_predictions_S)

# MAE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
# RMSE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.



In [6]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0       0.776571
1       0.680076
2       0.906579
3       0.685531
4       0.553694
          ...   
1677         NaN
1678         NaN
1679         NaN
1680         NaN
1681         NaN
Length: 1682, dtype: float64, RMSE: 0       1.034047
1       0.905090
2       1.122954
3       0.867659
4       0.723389
          ...   
1677         NaN
1678         NaN
1679         NaN
1680         NaN
1681         NaN
Length: 1682, dtype: float64
