In [1]:
"""How well can a MLP approximate a dot product.
"""
import numpy as np
from tqdm import tqdm
# Target RMSE of a perfect model (the dot product) on the generated data;
# GenerateData uses it as the standard deviation of the label noise.
rmse_best = 0.85
# Target RMSE of the naive model that always predicts 0; together with
# rmse_best it determines the standard deviation of the embeddings.
rmse_naive = 1.13


def _noisy_dot_labels(left, right, sd_noise):
    """Returns the row-wise dot products of two matrices plus Gaussian noise.

    Args:
      left: array of shape [n, emb_dim].
      right: array of shape [n, emb_dim].
      sd_noise: standard deviation of the zero-mean noise added to each label.

    Returns:
      A float array of shape [n].
    """
    dots = np.einsum('nd,nd->n', left, right)
    return dots + np.random.normal(0.0, sd_noise, size=dots.shape[0])


def GenerateData(emb_dim, num_users, num_items, num_train_samples,
                 num_test_samples, num_fresh_samples):
    """Generates a data set where ground truth is a dot product with noise.

    Each generated case x is a real valued array of shape [2, emb_dim] (the
    user embedding followed by the item embedding for which we want to learn
    the similarity) and a label y that encodes the similarity.

    The data is generated such that a perfect model (the dot product) will
    have an RMSE of rmse_best. The naive model that predicts always 0 will
    have an RMSE of rmse_naive. See the paper for more details.

    The (user, item) pairs are drawn with one vectorised Bernoulli draw per
    user instead of one scalar draw per pair, and the selected pairs are
    collected in growable lists, so the number of sampled pairs can never
    overrun a preallocated buffer.

    Args:
      emb_dim: the embedding dimension.
      num_users: the total number of users.
      num_items: the total number of items.
      num_train_samples: the size of the training set.
      num_test_samples: the size of the first test set.
      num_fresh_samples: the size of the second test set.

    Returns:
      A tuple (train_x, train_y, test_x, test_y, fresh_x, fresh_y) where the
      *_x arrays have shape [n, 2, emb_dim] and the *_y arrays have shape [n]:
      * train: consists of pairs of user,item embeddings and their label.
               User and item embeddings are drawn from a fixed set of
               <num_users> and <num_items> embeddings.
      * test:  same as train but with the constraint that train and test do
               not overlap.
      * fresh: same as train but using fresh embeddings, i.e., embeddings are
               not limited to the <num_users> and <num_items> embeddings from
               train and test.
      Because the pairs are sampled at random, train and test together may
      hold slightly fewer than num_train_samples + num_test_samples rows.
    """

    # Calculate standard deviation of embedding distribution and noise
    # distribution such that the data will have the desired RMSE properties.
    # The dot product of two emb_dim-dimensional N(0, sd_emb^2) vectors has
    # variance emb_dim * sd_emb^4, which must equal rmse_naive^2 - rmse_best^2.
    print('Calculate standard deviation')
    sd_noise = rmse_best
    sd_emb = np.sqrt(np.sqrt((np.square(rmse_naive)
                              - np.square(rmse_best)) / emb_dim))

    # Generate the embeddings:
    print('Generate the embeddings:')
    user_embs = np.random.normal(0, sd_emb, size=[num_users, emb_dim])
    item_embs = np.random.normal(0, sd_emb, size=[num_items, emb_dim])

    # Sample n combinations of user x item without replacement: every pair is
    # kept independently with probability sampling_prob.
    num_samples = num_train_samples + num_test_samples
    sampling_prob = num_samples / (num_users * num_items)
    sampling_prob *= 1.1  # oversample to make sure we have enough samples
    user_idx_chunks = []
    item_idx_chunks = []
    for u in tqdm(range(num_users), position=0, leave=False,
                  desc='compute train/test'):
        picked_items = np.flatnonzero(
            np.random.uniform(size=num_items) < sampling_prob)
        user_idx_chunks.append(np.full(picked_items.shape, u, dtype=np.intp))
        item_idx_chunks.append(picked_items)

    # discard any additional items
    # NOTE(review): as in the original implementation, the surplus is cut off
    # in user-major order *before* shuffling, so the dropped pairs all belong
    # to the highest-numbered users.
    print('discard any additional items')
    user_idx = np.concatenate(user_idx_chunks)[:num_samples]
    item_idx = np.concatenate(item_idx_chunks)[:num_samples]
    counter = user_idx.shape[0]

    # shuffle (the index arrays, which is cheaper than shuffling embeddings)
    print('shuffle')
    p = np.random.permutation(counter)
    user_idx = user_idx[p]
    item_idx = item_idx[p]

    # Materialise the embedding pairs and their noisy dot-product labels.
    pairs_x = np.empty([counter, 2, emb_dim], dtype=float)
    pairs_x[:, 0] = user_embs[user_idx]
    pairs_x[:, 1] = item_embs[item_idx]
    pairs_y = _noisy_dot_labels(pairs_x[:, 0], pairs_x[:, 1], sd_noise)

    # Split into 90% training, 10% testing
    print('Split into 90% training, 10% testing')
    split_at = int((counter * num_train_samples) / num_samples)
    train_x, test_x = np.split(pairs_x, [split_at])
    train_y, test_y = np.split(pairs_y, [split_at])

    # Second set of holdout interactions, i.e., embeddings are new:
    fresh_x = np.random.normal(0, sd_emb, size=[num_fresh_samples, 2, emb_dim])
    fresh_y = _noisy_dot_labels(fresh_x[:, 0], fresh_x[:, 1], sd_noise)

    return train_x, train_y, test_x, test_y, fresh_x, fresh_y

In [2]:
# Generate data and print some statistics.
#
# The commented command below lists a wider parameter grid than the single
# user/item count that is actually run in this cell:
# python approx_dot.py --embedding_dim {16,32,64,128} \
#    --num_users {4000,8000,16000,32000,64000,128000} \
#    --num_items {4000,8000,16000,32000,64000,128000} \
#    --first_layer_mult {1,2,4} --learning_rate 0.001

embedding_dims = [16, 32, 64, 128]
num_users = 4000  # other candidates: [4000, 8000, 16000, 32000, 64000, 128000]
num_items = 4000  # other candidates: [4000, 8000, 16000, 32000, 64000, 128000]

for embedding_dim in embedding_dims:
    # 100 items per user on average
    num_samples = 100 * num_users
    generated = GenerateData(
        emb_dim=embedding_dim,
        num_users=num_users,
        num_items=num_items,
        num_train_samples=int(num_samples * 0.9),
        num_test_samples=int(num_samples * 0.1),
        num_fresh_samples=100000,
    )
    train_x, train_y, test_x, test_y, fresh_x, fresh_y = generated

    # Report the shape of each feature array.
    for label, features in (
            ('Num training examples: ', train_x),
            ('Num test examples: ', test_x),
            ('Num fresh examples: ', fresh_x),
    ):
        print(label, features.shape)

    # Persist every array; the file name encodes the generation settings.
    save_spec = f'{embedding_dim}_{num_users}_{num_items}'
    outputs = [
        (f'train_x_{save_spec}.npy', train_x),
        (f'train_y_{save_spec}.npy', train_y),
        (f'test_x_{save_spec}.npy', test_x),
        (f'test_y_{save_spec}.npy', test_y),
        (f'fresh_x_{save_spec}.npy', fresh_x),
        (f'fresh_y_{save_spec}.npy', fresh_y),
    ]
    for path, array in outputs:
        np.save(path, array)
    
    

compute train/test:   0%|          | 6/4000 [00:00<01:17, 51.43it/s]

Calculate standard deviation
Generate the embeddings:


                                                                       

discard any additional items
shuffle
Split into 90% training, 10% testing


                                                                                              

Num training examples:  (360000, 2, 16)
Num test examples:  (40000, 2, 16)
Num fresh examples:  (100000, 2, 16)


compute train/test:   0%|          | 6/4000 [00:00<01:17, 51.50it/s]

Calculate standard deviation
Generate the embeddings:


                                                                       

discard any additional items
shuffle
Split into 90% training, 10% testing


                                                                                             

Num training examples:  (360000, 2, 32)
Num test examples:  (40000, 2, 32)
Num fresh examples:  (100000, 2, 32)


compute train/test:   0%|          | 6/4000 [00:00<01:21, 49.07it/s]

Calculate standard deviation
Generate the embeddings:


                                                                       

discard any additional items
shuffle
Split into 90% training, 10% testing


                                                                                             

Num training examples:  (360000, 2, 64)
Num test examples:  (40000, 2, 64)
Num fresh examples:  (100000, 2, 64)


compute train/test:   0%|          | 6/4000 [00:00<01:12, 54.95it/s]

Calculate standard deviation
Generate the embeddings:


                                                                       

discard any additional items
shuffle
Split into 90% training, 10% testing


                                                                                             

Num training examples:  (360000, 2, 128)
Num test examples:  (40000, 2, 128)
Num fresh examples:  (100000, 2, 128)
