In [27]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import csv
import errno
import os
import re

import pandas as pd
import tensorflow as tf
from tensorboard.backend.event_processing import plugin_event_multiplexer as event_multiplexer  # pylint: disable=line-too-long


# Control downsampling: how many scalar data do we keep for each run/tag
# combination?
SIZE_GUIDANCE = {'scalars': 1000}


def extract_scalars(multiplexer, run, tag):
  """Extract tabular data from the scalars at a given run and tag.
  The result is a list of 3-tuples (wall_time, step, value).
  """
  tensor_events = multiplexer.Tensors(run, tag)
  return [
      (event.wall_time, event.step, tf.make_ndarray(event.tensor_proto).item())
      for event in tensor_events
  ]


def create_multiplexer(logdir):
  multiplexer = event_multiplexer.EventMultiplexer(
      tensor_size_guidance=SIZE_GUIDANCE)
  multiplexer.AddRunsFromDirectory(logdir)
  multiplexer.Reload()
  return multiplexer


def export_scalars(multiplexer, run, tag, filepath, write_headers=True):
  data = extract_scalars(multiplexer, run, tag)
  with open(filepath, 'w') as outfile:
    writer = csv.writer(outfile)
    if write_headers:
      writer.writerow(('wall_time', 'step', 'value'))
    for row in data:
      writer.writerow(row)


NON_ALPHABETIC = re.compile('[^A-Za-z0-9_]')

def munge_filename(name):
  """Remove characters that might not be safe in a filename."""
  return NON_ALPHABETIC.sub('_', name)


def mkdir_p(directory):
  try:
    os.makedirs(directory)
  except OSError as e:
    if not (e.errno == errno.EEXIST and os.path.isdir(directory)):
      raise
    
logdir = '/home/ec2-user/emb3/runs'
output_dir = '//home/ec2-user/emb3/csv_output'
mkdir_p(output_dir)

print("Loading data...")
multiplexer = create_multiplexer(logdir)

runs = list(multiplexer.Runs().keys())

tmpl = []
for i in range(len(runs)):
    try:
        data = extract_scalars(multiplexer, runs[i], "Epoch/pairs_ndcg")
        ddf =  pd.DataFrame(data, columns=["timestamp", "Epoch", "pairs_ndcg"])
        ddf["run_name"] = runs[i]
        tmpl.append(ddf)
    except:
        pass

    
pd.set_option('display.max_colwidth', -1)

all_vals = pd.concat(tmpl)
all_vals[all_vals.groupby("run_name")["pairs_ndcg"].transform(max) == all_vals["pairs_ndcg"]].sort_values(by="pairs_ndcg", ascending=False).head(10)

Loading data...


Unnamed: 0,timestamp,Epoch,pairs_ndcg,run_name
32,1567488000.0,32,0.285385,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.99-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None
22,1567467000.0,22,0.283133,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.97-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None
47,1567456000.0,47,0.281562,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.9-smpl-1.0-ng-5-bt-1024-drp-None-amsgrad-True-adamw=None
63,1567483000.0,63,0.280697,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.95-smpl-1.0-ng-5-bt-1024-drp-None-amsgrad-True-adamw=None
54,1567475000.0,54,0.278612,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.95-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None
54,1567483000.0,54,0.278262,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.98-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None
32,1567470000.0,32,0.277345,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.96-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None
11,1567490000.0,11,0.277079,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.999-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None
45,1567455000.0,45,0.27681,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.95-smpl-1.0-ng-5-bt-1024-drp-None-amsgrad-False-adamw=None
38,1567440000.0,38,0.273613,32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.92-smpl-1.0-ng-5-bt-1024-drp-None-amsgrad-False-adamw=None


In [31]:
%ls ~/emb3/models/32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.99-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None-32

/home/ec2-user/emb3/models/32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.99-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None-32


In [36]:
import torch
model = torch.load("/home/ec2-user/emb3/models/32-3-bpr-MLP-suf-500K-with-3-vids-loss-adaptive_hinge-lr-0.001-l2-1e-05-mom-0.99-smpl-1.0-ng-3-bt-1024-drp-None-amsgrad-True-adamw=None-32")

In [45]:
%load_ext autotime

In [46]:
suffix = "500K-with-3-vids"
neg_data_path  = "data/validate-neg-flatten-aug-28-phase" + suffix
pos_data_path = "data/validate-pos-flatten-aug-28-phase" + suffix

validate_neg_flatten_vids = pd.read_parquet(neg_data_path)
validate_pos_flatten_vids = pd.read_parquet(pos_data_path)

evaluate_data = [validate_pos_flatten_vids["uindex"].to_numpy(),
                     validate_pos_flatten_vids["vindex"].to_numpy(),
                     validate_neg_flatten_vids["uindex"].to_numpy(),
                     validate_neg_flatten_vids["nvindex"].to_numpy()]

time: 472 ms


In [52]:
import numpy as np
use_cuda=True
model.train(False)

MLP(
  (embedding_user): Embedding(500044, 64)
  (embedding_item): Embedding(17389, 64)
  (fc_layers): ModuleList(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=32, bias=True)
  )
  (affine_output): Linear(in_features=32, out_features=1, bias=True)
)

time: 2.64 ms


In [56]:
def eval_results_in_batch(serve_model,
                          test_users,
                          test_items,
                          batch_size=1024):
    total_size = len(test_users)
    tmp_ranges = np.arange(0, total_size + batch_size, batch_size)
    lower_indices = tmp_ranges[:-1]
    upper_indices = tmp_ranges[1:]
    subsets = []
    for i in range(len(lower_indices)):
        subset_users = test_users[lower_indices[i]:upper_indices[i]]
        subset_items = test_items[lower_indices[i]:upper_indices[i]]
        if len(subset_users) > 0:
            subsets.append(serve_model(subset_users, subset_items))
    return torch.cat(subsets, 0)

with torch.no_grad():
    test_users, test_items = torch.LongTensor(evaluate_data[0]), torch.LongTensor(evaluate_data[1])
    negative_users, negative_items = torch.LongTensor(evaluate_data[2]), torch.LongTensor(evaluate_data[3])
    if use_cuda is True:
        test_users = test_users.cuda()
        test_items = test_items.cuda()
        negative_users = negative_users.cuda()
        negative_items = negative_items.cuda()
    test_scores = eval_results_in_batch(model, test_users, test_items, batch_size=1024 * 3)
    negative_scores = eval_results_in_batch(model, negative_users, negative_items, batch_size=1024 * 3)
    if use_cuda is True:
        test_users = test_users.cpu()
        test_items = test_items.cpu()
        test_scores = test_scores.cpu()
        negative_users = negative_users.cpu()
        negative_items = negative_items.cpu()
        negative_scores = negative_scores.cpu()

time: 1.36 s


In [60]:
from spotlight.evaluation import MetronAtK
metron = MetronAtK(top_k=10)

metron.subjects = [test_users.data.view(-1).tolist(),
                                     test_items.data.view(-1).tolist(),
                                     test_scores.data.view(-1).tolist(),
                                     negative_users.data.view(-1).tolist(),
                                     negative_items.data.view(-1).tolist(),
                                     negative_scores.data.view(-1).tolist()]

time: 15.1 s


In [85]:
from spotlight.interactions import Interactions
train_data_path = "data/train-aug-28-phase" + suffix
original_train_data = pd.read_parquet(train_data_path)
train_data = original_train_data
interactions = Interactions(train_data["uindex"].to_numpy(),
            train_data["vindex"].to_numpy(),
            train_data["pct_cvt"].to_numpy(),
            train_data["latest_watch_time"].to_numpy(),
            num_users=len(original_train_data["uindex"].unique()),
            num_items=len(original_train_data["vindex"].unique()))

time: 1.43 s


In [83]:
sq_interactions = interactions.to_sequence(max_sequence_length=20)

ValueError: 0 is used as an item id, conflicting with the sequence padding value.

time: 20.7 ms


In [61]:
hit_ratio, ndcg = metron.cal_hit_ratio(), metron.cal_ndcg()
hit_ratio, ndcg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: math.log(2) / math.log(1 + x))  # the rank starts from 1


(0.9129733333333333, 0.6774019496519258)

time: 277 ms


In [67]:
from spotlight.evaluation import *

time: 547 µs


In [70]:
os.environ["BASE_DIR"] = "/home/ec2-user/emb3"
os.environ["SUFFIX"] = "500K-with-3-vids"
aa = pd.read_parquet(os.environ['BASE_DIR'] + "/data/video2index-pandas-aug-28-phase" + suffix)
videoid2index = dict(zip(aa["k"], aa["v"]))
als_embds = pd.read_parquet("/home/ec2-user/emb3/data/als-embs-pandas-aug-28-phase" + suffix)
number_of_videos = len(videoid2index)
embs = np.ma.masked_all((number_of_videos, 100))
for i, row in als_embds.iterrows():
    vindex = row["vindex"]
    if vindex != -1:
        embs[vindex, :] = row["vector"]
pairs_ndcg_score(embs)

  embeds_norm = np.divide(embeds, np.sqrt(np.square(embeds).sum(axis=1)).reshape(-1, 1))
  embeds_norm = np.divide(embeds, np.sqrt(np.square(embeds).sum(axis=1)).reshape(-1, 1))


0.2869818003525584

time: 1min 55s


In [79]:
number_of_videos = len(videoid2index)
aembs = np.zeros((number_of_videos, 100))
for i, row in als_embds.iterrows():
    vindex = row["vindex"]
    aembs[vindex, :] = row["vector"]
    
# evaluate_data = validation_data
test_users, test_items = evaluate_data[0], evaluate_data[1]
negative_users, negative_items = evaluate_data[2], evaluate_data[3]

suffix = os.environ["SUFFIX"]
train_data_path = "data/train-aug-28-phase" + suffix
train_regr_dataset = pd.read_parquet(train_data_path)

uniq_test_users = list(set(test_users))
test_users_vids = train_regr_dataset[train_regr_dataset["uindex"].isin(uniq_test_users)].groupby("uindex")["vindex"].agg(list)
user_avg_vid_embs = test_users_vids.apply(lambda x: aembs[x].mean(axis=0))

from numpy.linalg import norm

def cosine_sims(a,b):
    return np.dot(a,b)/(norm(a)*norm(b))

pos_scores = []
for tu, ti in zip(test_users, test_items):
    pos_scores.append(cosine_sims(user_avg_vid_embs[tu], aembs[ti]))
    
neg_scores = []
for tu, ti in zip(negative_users, negative_items):
    neg_scores.append(cosine_sims(user_avg_vid_embs[tu], aembs[ti]))
    
metron = MetronAtK(top_k=10)

metron.subjects = [test_users.tolist(),
                     test_items.tolist(),
                     pos_scores,
                     negative_users.tolist(),
                     negative_items.tolist(),
                     neg_scores]
hit_ratio, ndcg = metron.cal_hit_ratio(), metron.cal_ndcg()
hit_ratio, ndcg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lambda x: math.log(2) / math.log(1 + x))  # the rank starts from 1


(0.9171066666666666, 0.7022161946587601)

time: 14.3 s


In [59]:
negative_scores.mean(), test_scores.mean()

(tensor(7.1872), tensor(50.8512))

time: 5.41 ms


In [2]:
import ray
from ray import tune
import os

os.environ['SUFFIX']="500K-with-3-vids"
os.environ['BASE_DIR']="/home/ec2-user/emb3"

from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import torch
from torch.utils.tensorboard import SummaryWriter
from spotlight.factorization.representations import *
import os
from spotlight.evaluation import *
from spotlight.evaluation import mrr_score

def ray_train(input_config, reporter):
    if 'SUFFIX' in os.environ:
        suffix = os.environ['SUFFIX']
    else:
        suffix = "1"
    if 'LOSS' in input_config:
        loss = input_config['LOSS']
    else:
        loss="bpr"

    if 'LR' in input_config:
        lr = float(input_config['LR'])
    else:
        lr=1e-3

    if 'L2' in input_config:
        l2 = float(input_config['L2'])
    else:
        l2=1e-5

    if 'MOM' in input_config:
        mom = float(input_config['MOM'])
    else:
        mom=0.9

    if 'SAMPLE' in input_config:
        train_sample = float(input_config['SAMPLE'])
    else:
        train_sample = 1.0

    if 'NEGSAMPLES' in input_config:
        num_negative_samples = int(input_config['NEGSAMPLES'])
    else:
        num_negative_samples = 5


    if 'BATCH' in input_config:
        batch_size = int(input_config['BATCH'])
    else:
        batch_size=1024

    dropout = None
    if 'DROPOUT' in input_config and input_config['DROPOUT']:
        dropout = float(input_config['DROPOUT'])

    net_conf = "32-3-bpr-MLP"
    if 'NETCONG' in input_config and input_config['NETCONG']:
        net_conf = input_config['NETCONG']

    betas=(mom, 0.999)
    use_cuda=True

    tensorboard_base_dir="runs"
    model_alias = "{}-suf-{}-loss-{}-lr-{}-l2-{}-mom-{}-smpl-{}-ng-{}-bt-{}-drp-{}-ray".format(net_conf, suffix, loss, lr, l2, 
                                                                                          mom, train_sample,
                                                                                          num_negative_samples,
                                                                                         batch_size,
                                                                                                        dropout)
    model_store_dir="/home/ec2-user/emb3/models"
    n_iters=30
    #loss="adaptive_hinge"

    log_loss_interval=100
    log_eval_interval=5000
    #train_data_path = "s3a://tubi-playground-production/smistry/emb3/train-aug-28-phase1"
    train_data_path = os.environ['BASE_DIR'] + "/data/train-aug-28-phase" + suffix

    
    original_train_data = pd.read_parquet(train_data_path)
    writer = SummaryWriter(log_dir='{}/{}'.format(tensorboard_base_dir, model_alias))
    writer.add_text('alias', model_alias, 0)

    def notify_loss_completion(epoch_id, batch_id, loss, net, model):
        #print("notify_loss_completion")
        writer.add_scalar("Batch/loss", loss, batch_id)
        reporter(mean_accuracy=1-loss, timesteps_total=int(batch_id/log_loss_interval), checkpoint=model_alias)

    def notify_batch_eval_completion(epoch_id, batch_id, loss, net, model):
        #print("notify_batch_eval_completion")
        pairs_ndcg = nn_pairs_ndcg_score(net)
        writer.add_scalar("Batch/pairs_ndcg", pairs_ndcg, batch_id)


    def notify_epoch_completion(epoch_num, total_loss, net, model):
        #print("notify_epoch_completion")
        writer.add_scalar("Epoch/loss", total_loss, epoch_num)
        pairs_ndcg = nn_pairs_ndcg_score(net)
        writer.add_scalar("Epoch/pairs_ndcg", pairs_ndcg, epoch_num)
    #     hit_ratio, ndcg = evaluate_hit_ratio_and_ndcg(model)
    #     writer.add_scalar("Epoch/HR", hit_ratio, epoch_num)
    #     writer.add_scalar("Epoch/NDCG", ndcg, epoch_num)
        hit_ratio, ndcg = -1,-1
        torch.save(net, model_store_dir + "/" + model_alias + "-" + str(epoch_num))

    num_users=len(original_train_data["uindex"].unique())
    num_items=len(original_train_data["vindex"].unique())

    train_data = original_train_data.sample(frac=train_sample)

    interactions = Interactions(train_data["uindex"].to_numpy(),
                train_data["vindex"].to_numpy(),
                train_data["pct_cvt"].to_numpy(),
                train_data["latest_watch_time"].to_numpy(),
                num_users=len(original_train_data["uindex"].unique()),
                num_items=len(original_train_data["vindex"].unique()))

    if "-" in net_conf:
        args = net_conf.split("-")
        config = {
              "factor_size": int(args[0]),
              "num_layers": int(args[1]),
              "loss_type": args[2],
              "model_type": args[3],
            "num_users": num_users,
            "num_items": num_items,
        }
        if dropout:
            config["dropout"] = dropout

        num_layers = int(args[1])
        factor_size = int(args[0])
        config["layers"] = [4 * factor_size] + [factor_size * (2 ** i) for i in range(num_layers - 1, -1, -1)]
        config["latent_dim"] = 2 * factor_size
        writer.add_text('config', str(config), 0)

        rep = MLP(config)
    else:
        rep = None

    model = ImplicitFactorizationModel(n_iter=n_iters,
                                       loss=loss,
                                      notify_loss_completion=notify_loss_completion,
                                      notify_batch_eval_completion=notify_batch_eval_completion,
                                      notify_epoch_completion=notify_epoch_completion,
                                      log_loss_interval=log_loss_interval,
                                      log_eval_interval=log_eval_interval,
                                      betas=betas,
                                      learning_rate=lr,
                                      batch_size=batch_size,
                                      random_state=np.random.RandomState(2),
                                      num_negative_samples=num_negative_samples,
                                      l2=l2,
                                      use_cuda=use_cuda,
                                      representation=rep)
    model.fit(interactions)
    
    
NUM_GPUS=8
ray.init(ignore_reinit_error=True, num_gpus=NUM_GPUS)

from ray.tune.schedulers import AsyncHyperBandScheduler


exp_config = {
    "LOSS": tune.grid_search(["adaptive_hinge"]),
    "LR": tune.grid_search([1e-3, 1e-4, 1e-2, 0.005]),
    "L2": tune.grid_search([1e-5, 1e-8, 1e-10]),
    "DROPOUT": tune.grid_search([0.1,0.2,0.4,0.5]),
    "MOM": tune.grid_search([0.9,0.92,0.95,0.8]),
    "NEGSAMPLES": tune.grid_search([5, 3, 10])
}

configuration = tune.Experiment(
    "check_for_500K_3_vids",
    run=ray_train,
    num_samples=1,
    resources_per_trial={"gpu": 1},
    stop={"mean_accuracy": 0.95},  # TODO: Part 1
    config=exp_config
)

import sys
hyperband = AsyncHyperBandScheduler(
    time_attr='timesteps_total',
    reward_attr='mean_accuracy')
trials = tune.run_experiments(configuration, scheduler=hyperband, verbose=True, reuse_actors=True)

def get_sorted_trials(trial_list, metric):
    return sorted(trial_list, key=lambda trial: trial.last_result.get(metric, 0), reverse=True)
  
sorted_trials = get_sorted_trials(trials, metric="mean_accuracy")
print(str([(x.last_result.get("mean_accuracy", 0),  x.last_result.get("iterations_since_restore"), x) for x in sorted_trials]))

In [1]:
import torch.optim as optim

In [3]:
??optim.AdamW

[0;31mInit signature:[0m
[0moptim[0m[0;34m.[0m[0mAdamW[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbetas[0m[0;34m=[0m[0;34m([0m[0;36m0.9[0m[0;34m,[0m [0;36m0.999[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meps[0m[0;34m=[0m[0;36m1e-08[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweight_decay[0m[0;34m=[0m[0;36m0.01[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mamsgrad[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mAdamW[0m[0;34m([0m[0mOptimizer[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Implements AdamW algorithm.[0m
[0;34m[0m
[0;34m    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.[0m
[0;34m    The AdamW variant was proposed in `Decoupled 