In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch
import scipy

from data_preprocessing.data_preprocessing import get_experimental_data

from inference.loss import LossTeaching
from inference.train import train

from plot.plot_hist_loss import plot_loss
from plot.plot_posterior import plot_posterior

In [2]:
%config InlineBackend.figure_format = 'retina'
sns.set()

We assume the following model:

\begin{align}
Z_u^\rho &\sim \mathcal{N}(0, \sigma_u^\rho)\\
Z_w^\rho &\sim \mathcal{N}(0, \sigma_w^\rho) \\
Z_{u, w}^\rho &= \mu^\rho + Z_u^\rho + Z_w^\rho \\
\end{align}
where $Z_u^{\rho}$ is a random variable whose distribution is specific to user $u$ and parameter $\rho$, and $\rho \in \{\alpha, \beta\}$.

The probability of recall for user $u$ and item/word $w$ at time $t$ is defined as:
\begin{align}
p(\omega = 1 \mid t, u, w) &= e^{-Z_{u, w}^\alpha (1-Z_{u, w}^\beta)^n \delta_{u, w}^t}  \\
\end{align}
where $\delta_{u, w}^t$ is the time elapsed, at time $t$, since item $w$ was last presented to user $u$.


# Run on experimental data

In [3]:
# Load the experimental dataset through the project's preprocessing helper.
# The cell output shows the summary counts (users, items, observations) that
# the call prints.
data = get_experimental_data()

Number of user 53
Number of items 1998
Total number of observations (excluding first presentation) 70618
Minimum number of observation for a single user 1285
Maximum number of observation for a single user 1404


In [None]:
# Fit the model on the experimental data. The call returns three values:
# `z_flow` and `theta_flow` (used below for plotting, saving and sampling)
# and `hist_loss` (the loss history plotted below).
# NOTE(review): `n_sample` is presumably the number of samples drawn per loss
# estimate and `epochs` the number of optimisation steps — confirm in
# inference.train.
z_flow, theta_flow, hist_loss = train(
    data,
    n_sample=40,
    epochs=5000)

  5%|█▍                        | 265/5000 [00:29<08:43,  9.04it/s, loss=5.11e+4]

In [None]:
# Label for this run; passed to the plotting and save calls that follow.
run_name = "exp_data"

In [None]:
# Plot the loss history returned by train().
# NOTE(review): `name` presumably tags the saved figure — confirm in
# plot.plot_hist_loss.
plot_loss(hist_loss, name=run_name)

In [None]:
# Plot the posterior represented by the fitted theta flow.
# NOTE(review): `name` presumably tags the saved figure — confirm in
# plot.plot_posterior.
plot_posterior(theta_flow, name=run_name)

In [None]:
# Persist both fitted flows under the run label.
# NOTE(review): both calls receive the same name — confirm that save() derives
# a distinct path per flow so the second call does not overwrite the first.
z_flow.save(run_name)
theta_flow.save(run_name)

In [None]:
# Number of draws taken from the theta flow's base distribution.
batch_size = 100000

# Draw base samples, then push them through the fitted flow; the call also
# returns the base log-probabilities and the log-determinant term.
z0_θ = theta_flow.sample_base_dist(batch_size)
zk_θ, base_dist_logprob_θ, log_det_θ = theta_flow(z0_θ)

# Split the transformed samples column by column: the first three columns go
# to the "1" parameters, the remaining ones to the "2" parameters (each triple
# is named mu, log_var_u, log_var_w).
mu1, log_var_u1, log_var_w1 = zk_θ.data[:, :3].unbind(dim=1)
mu2, log_var_u2, log_var_w2 = zk_θ.data[:, 3:].unbind(dim=1)

In [None]:
def _sigma_from_log_var(log_var_samples):
    """Turn sampled log-variances into one standard deviation.

    The samples are averaged first and the mean log-variance is then mapped
    to a standard deviation via exp(0.5 * mean).
    """
    return np.exp(0.5 * log_var_samples.mean().item())


# One summary value per parameter, in the column order used by the table.
unconstrained = dict(
    mu1=mu1.mean().item(),
    sigma_u1=_sigma_from_log_var(log_var_u1),
    sigma_w1=_sigma_from_log_var(log_var_w1),
    mu2=mu2.mean().item(),
    sigma_u2=_sigma_from_log_var(log_var_u2),
    sigma_w2=_sigma_from_log_var(log_var_w2),
)

# Single-row frame indexed by the parameterisation label; the bare name on
# the last line displays it as the cell output.
df_param = pd.DataFrame([unconstrained], index=["unconstrained"])
df_param

In [None]:
# Write the parameter summary to bkp/param_exp_data.csv. The backup directory
# is created first because to_csv does not create missing parent directories
# and would otherwise fail on a fresh checkout.
backup_dir = "bkp"
os.makedirs(backup_dir, exist_ok=True)
df_param.to_csv(os.path.join(backup_dir, "param_exp_data.csv"))

In [None]:
# Draw 1000 values from a normal distribution centred on the "unconstrained"
# mu1 estimate, using sigma_u1 as its standard deviation.
unc_a = np.random.normal(
    loc=df_param.loc["unconstrained", "mu1"],
    scale=df_param.loc["unconstrained", "sigma_u1"],
    size=1000,
)