In [1]:
import functools
import itertools
import logging
import math
import os
import pickle
import sys
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import seaborn as sns
import yaml

# Load the autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inside the notebook, using the high-resolution
# 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Configure context, style and default figure size in ONE call.
# `sns.set(...)` (alias of `set_theme`) resets the plotting context to its
# default ("notebook"), so a `sns.set_context("poster")` issued before it is
# silently discarded. Passing `context` and `style` here makes the intended
# "poster" scaling actually take effect.
sns.set(context="poster", style="whitegrid", rc={"figure.figsize": (16, 12.0)})

import numpy as np
import pandas as pd
import torch.nn.functional as F

# Let pandas show up to 120 rows and 120 columns when displaying a frame.
pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

# Route INFO-and-above log records to stdout so they show up in cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
from lda4rec.datasets import Interactions, DataLoader, random_train_test_split
from lda4rec.evaluations import mrr_score, precision_recall_score, summary
from lda4rec.estimators import MFEst, PopEst, LDA4RecEst, SNMFEst
from lda4rec.utils import process_ids, cmp_ranks

In [3]:
import pyro
import pyro.distributions as dist
import pyro.optim as optim
import torch
from pyro.distributions import constraints
from pyro.infer import SVI, Predictive, Trace_ELBO, TraceEnum_ELBO, config_enumerate

In [4]:
import neptune.new as neptune
# Initialise Neptune in "offline" mode; the trailing semicolon keeps the
# returned object from being echoed as the cell's result.
neptune.init(mode="offline");

offline/42a2ed19-709c-46f6-af2b-a667faad3100


## Experimenting with Matrix Factorization as an adjoint LDA formulation

In [5]:
# Load the MovieLens "100k" dataset through the project's DataLoader.
loader = DataLoader()
data = loader.load_movielens("100k")

In [6]:
# Preprocess `data` and split it into train and test sets.
# NOTE(review): the trailing-underscore methods presumably modify `data` in
# place (cap interactions per user at 200, then convert to implicit feedback
# with threshold 0.) - confirm against lda4rec.datasets. If so, re-running
# this cell without re-running the load cell operates on already-modified data.
data.max_user_interactions_(200)
data.implicit_(0.)
# NOTE(review): no seed / RNG is passed here, so the split may differ between
# runs - confirm whether random_train_test_split accepts one.
train, test = random_train_test_split(data)

In [7]:
# Fit the matrix-factorization estimator (8-dimensional embeddings,
# 20 iterations) on the training split; the per-epoch loss is logged below.
mf_est = MFEst(embedding_dim=8, n_iter=20)
mf_est.fit(train)

INFO:lda4rec.estimators:Epoch     0: loss 0.3823770273376155
INFO:lda4rec.estimators:Epoch     1: loss 0.15964230655415637
INFO:lda4rec.estimators:Epoch     2: loss 0.11581193530881727
INFO:lda4rec.estimators:Epoch     3: loss 0.09692244044429547
INFO:lda4rec.estimators:Epoch     4: loss 0.08652593110864226
INFO:lda4rec.estimators:Epoch     5: loss 0.07624727755382255
INFO:lda4rec.estimators:Epoch     6: loss 0.07235154715442174
INFO:lda4rec.estimators:Epoch     7: loss 0.06692514476945272
INFO:lda4rec.estimators:Epoch     8: loss 0.06370584648102522
INFO:lda4rec.estimators:Epoch     9: loss 0.06167086124722217
INFO:lda4rec.estimators:Epoch    10: loss 0.0598684781885429
INFO:lda4rec.estimators:Epoch    11: loss 0.059196134699458204
INFO:lda4rec.estimators:Epoch    12: loss 0.0577837115631917
INFO:lda4rec.estimators:Epoch    13: loss 0.05519735169647312
INFO:lda4rec.estimators:Epoch    14: loss 0.05447612520105936
INFO:lda4rec.estimators:Epoch    15: loss 0.05591662263346685
INFO:lda4r

0.051906296968258714

### Overall summaries showing equivalence of MF and adjoint LDA formulation

In [8]:
# Score all items for one user with the LDA transformation switched off.
# The output shows unbounded scores, including negative values.
user_id = 42
mf_est.lda_trafo = False
mf_est.predict(user_id)

array([14.78094   ,  6.281404  , 17.850199  , ..., -3.3343346 ,
        0.38743448, -6.0783505 ], dtype=float32)

In [9]:
# Tabulate precision, recall and MRR on the train and test splits for the
# estimator in its current (lda_trafo = False) configuration.
summary(mf_est, train=train, test=test)

Unnamed: 0_level_0,train,test
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
prec,0.309672,0.112479
recall,0.059727,0.080443
mrr,0.030368,0.037841


In [10]:
# Switch the LDA transformation on and score the same user again; the output
# now consists of small positive values instead of unbounded scores.
mf_est.lda_trafo = True
mf_est.predict(user_id) # the numbers differ as expected due to the transformation

array([1.44475067e-04, 1.22261292e-04, 1.52496679e-04, ...,
       9.71303089e-05, 1.06857246e-04, 8.99587394e-05], dtype=float32)

In [11]:
# Re-evaluate with lda_trafo = True. The metrics below are identical to the
# table obtained with lda_trafo = False, i.e. the transformation did not
# change the evaluation results.
summary(mf_est, train=train, test=test)

Unnamed: 0_level_0,train,test
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
prec,0.309672,0.112479
recall,0.059727,0.080443
mrr,0.030368,0.037841


### Check that MF and the adjoint LDA formulation produce the same ranking for a single user

In [12]:
# Compute, for a single user, both the estimator's predicted scores and the
# item probabilities so that the resulting rankings can be compared.
user_id = 140
# `np.int` was only a deprecated alias of the builtin `int` (deprecated in
# NumPy 1.20, removed in 1.24 where it raises AttributeError). Using the
# builtin yields exactly the same dtype on every NumPy version.
orig_scores = mf_est.predict(np.array([user_id], dtype=int))
item_probs = mf_est.get_item_probs(torch.tensor([user_id]))

In [13]:
# Compare the two score vectors via cmp_ranks; the result shown below is True.
# NOTE(review): presumably this checks that both vectors induce the same item
# ranking, with `eps` as a numerical tolerance - confirm in lda4rec.utils.
cmp_ranks(orig_scores, item_probs, eps=1e-5)

True