In [21]:
def ComputeRMSE(x, y):
    """Computes the RMSE of a dot product model and a trivial model.

    Args:
      x: the input embeddings, an array-like of length n, such that x[i] is a
        pair of equally sized user and item embeddings (the whole input must
        convert to a numeric array of shape (n, 2, dim); scalar embeddings,
        i.e. shape (n, 2), are accepted as well).
      y: the labels, an array-like of length n.

    Returns:
      rmse_trivial: the RMSE of a trivial model that always predicts 0.
      rmse_dot: the RMSE of a dot product model.

    Raises:
      ValueError: if x is empty, if x is not a sequence of embedding pairs,
        or if x and y do not have the same length.
    """
    # np.asarray lets plain Python lists work (as the contract above promises)
    # and float64 keeps the accumulation precise even for float32 inputs.
    embeddings = np.asarray(x, dtype=np.float64)
    labels = np.asarray(y, dtype=np.float64)

    if embeddings.size == 0:
        raise ValueError("ComputeRMSE needs at least one example; x is empty.")
    if embeddings.ndim not in (2, 3) or embeddings.shape[1] < 2:
        raise ValueError(
            "x must hold (user_embedding, item_embedding) pairs; "
            f"got an input of shape {embeddings.shape}."
        )

    num_examples = embeddings.shape[0]
    if labels.ndim == 0 or labels.shape[0] != num_examples:
        raise ValueError(
            f"x and y must have the same length; got {num_examples} "
            f"examples and labels of shape {labels.shape}."
        )

    # Row-wise dot product of every user embedding with its item embedding.
    user_emb = embeddings[:, 0]
    item_emb = embeddings[:, 1]
    predictions = (user_emb * item_emb).reshape(num_examples, -1).sum(axis=1)

    # Align predictions with labels that carry trailing axes (e.g. shape
    # (n, 1)) so the subtraction stays per-example instead of broadcasting
    # into an (n, n) matrix.
    predictions = predictions.reshape((num_examples,) + (1,) * (labels.ndim - 1))

    # The trivial model predicts 0, so its error is the label itself.
    rmse_trivial = np.sqrt(np.mean(np.square(labels), axis=0))
    rmse_dot = np.sqrt(np.mean(np.square(predictions - labels), axis=0))
    return rmse_trivial, rmse_dot

In [22]:
# Collected RMSE values; filled in by the dataset sweep further down.
results = {}

# Embedding sizes to sweep: successive doublings, i.e. 16, 32, 64, 128.
embedding_dims = [2 ** exponent for exponent in range(4, 8)]

# User counts to sweep: 4000 doubled three times, i.e. 4000 ... 32000.
num_users_ = [4000 * 2 ** doublings for doublings in range(4)]

In [24]:
# Directory holding the pre-generated .npy splits.
dirr = 'datasets'

# Visit every (embedding size, user count) combination, embedding size outermost.
for embedding_dim, num_users in (
    (dim, count) for dim in embedding_dims for count in num_users_
):
    # The file names carry an item count that always equals the user count.
    num_items = num_users
    load_spec = f'{embedding_dim}_{num_users}_{num_items}'

    # Load the features/labels of each split for this configuration.
    train_x, train_y = (
        np.load(f'{dirr}/train_x_{load_spec}.npy'),
        np.load(f'{dirr}/train_y_{load_spec}.npy'),
    )
    test_x, test_y = (
        np.load(f'{dirr}/test_x_{load_spec}.npy'),
        np.load(f'{dirr}/test_y_{load_spec}.npy'),
    )
    fresh_x, fresh_y = (
        np.load(f'{dirr}/fresh_x_{load_spec}.npy'),
        np.load(f'{dirr}/fresh_y_{load_spec}.npy'),
    )

    # ComputeRMSE returns (trivial-model RMSE, dot-model RMSE) per split.
    rmse_train_naive, rmse_train_dot = ComputeRMSE(train_x, train_y)
    rmse_test_naive, rmse_test_dot = ComputeRMSE(test_x, test_y)
    rmse_fresh_naive, rmse_fresh_dot = ComputeRMSE(fresh_x, fresh_y)

    # Only the dot-model scores are recorded; the naive ones are not stored.
    results.update({
        f'emb{embedding_dim}_num_u{num_users}_dot_train_rmse': rmse_train_dot,
        f'emb{embedding_dim}_num_u{num_users}_dot_test_rmse': rmse_test_dot,
        f'emb{embedding_dim}_num_u{num_users}_dot_fresh_rmse': rmse_fresh_dot,
    })

In [36]:
# NOTE(review): this import sits mid-notebook; if an imports cell exists at the
# top of the notebook (not visible here), it likely belongs there.
import pandas as pd

# One row per results key, with the stored value in a single 'rmse' column.
df = pd.DataFrame.from_dict(
    data=results,
    orient='index',
    columns=['rmse'],
)

In [38]:
# Persist the RMSE table next to the notebook (path is relative to the working directory).
output_path = 'dot_results.parquet'
df.to_parquet(output_path)