# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:
from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
#train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
#test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [4]:
# Please don't change this cell
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

In [5]:
# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

MAE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++                    
# Step 1
# User-based
train_dataframe = pd.DataFrame(train_ds)
test_dataframe = pd.DataFrame(test_ds)
GAMMA = 30
EPSILON = 1e-9

np_user_pearson_corr = np.zeros((n_users, n_users))

for i, user_i_vec in enumerate(train_dataframe.values):
    for j, user_j_vec in enumerate(train_dataframe.values):     
        # ratings corated by the current pair of users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)
        
        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j
        
        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
        # significance weighting
        weighted_sim = sim
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim
        np_user_pearson_corr[i][j] = weighted_sim
np_user_pearson_corr
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++                    
# Step 2
# Create a blank dataframe for predictions
np_predictions = np.zeros((n_users, n_items))

EPSILON = 1e-9
LAMBDA = 1

for (i, j), rating in np.ndenumerate(test_dataframe.values):
    if rating > 0:
        devji_each_item = []
        cji = []
        # Loop through all items
        for item_id in range(n_items):
            if item_id != j and train_dataframe.iloc[i][item_id] > 0:
                # Filter co-rated users
                co_rated_users = np.where((train_ds[:, j] > 0) & (train_ds[:,item_id] > 0))[0]
                no_card_itemid_j = len(co_rated_users)
                
                if no_card_itemid_j != 0:
                    # Calculate ui - uj - calculate difference in ratings
                    uj_minus_ui = train_ds[co_rated_users, j] - train_ds[co_rated_users, item_id]
                    avg_1st_part_dev = np.sum(uj_minus_ui) / (no_card_itemid_j + EPSILON)

                    # Calculate 2nd part of the devj,i - use similarity from step 1
                    devji_2nd_part_upper = np.sum(uj_minus_ui * (2 ** np_user_pearson_corr[i,co_rated_users]))
                    devji_2nd_part_lower = np.sum((2 ** np_user_pearson_corr[i, co_rated_users])*no_card_itemid_j)
                    avg_devji_2nd_part = devji_2nd_part_upper / (devji_2nd_part_lower + EPSILON)

                    # Calculate devji
                    devji = LAMBDA * avg_1st_part_dev + (1 - LAMBDA) * avg_devji_2nd_part

                    # add rating for user i and save this part to use in prediction
                    item_id_ratings = train_dataframe.iloc[i][item_id]
                    
                    # Calculate (devji + uprime)
                    predict_upper = devji + train_ds[i, item_id]
                    devji_each_item.append(predict_upper * no_card_itemid_j)
                    cji.append(no_card_itemid_j)
                    
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++                    
# Step 3        
# Prediction of active user (u') on that particular item j
        for devji, no_card_itemid_j in zip(devji_each_item, cji):
            prediction_summation = np.array(devji_each_item)
            cji = np.array(cji)
        prediction = np.sum(prediction_summation) / (np.sum(cji) + EPSILON)
        np_predictions[i, j] = np.clip(prediction, 0, 5)
print(evaluate(test_ds, np_predictions))

(0.7435957831804696, 0.9516747501357116)


In [6]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0, RMSE: 0
