In [1]:
import numpy as np
import matplotlib.pyplot as plt
import stan

import multiprocessing
multiprocessing.set_start_method("fork")

from concurrent.futures import ThreadPoolExecutor as _ThreadPoolExecutor

def _exec_async(func, *args, **kwargs):
    with _ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(func, *args, **kwargs)
    return future.result()

def stan_build(*args, **kwargs): return _exec_async(stan.build, *args, **kwargs)

In [2]:
# names of 500 movies to be rated:
with open('top-names.txt') as f: names = f.read().split('\n')
    
# ratings = int(2000 x 500) ratings of 500 movies by 2000 people; -1 = not rated
ratings = np.loadtxt('top-ratings-missing.txt')
nUsers,nMovies = ratings.shape
# Let's split into training & test:
np.random.seed(0)
pi = np.random.permutation(nUsers)
iTr,iTe = pi[:int(nUsers*.7)], pi[int(nUsers*.7):]
Xtr,Xte = ratings[iTr,:],ratings[iTe,:]


In [3]:
!ls

[34mbuild[m[m                   top-names.txt           top-ratings-missing.txt
subset-data.py          top-ratings-imputed.txt


In [4]:
recsys = """
data {
  int<lower=1> M;             // Total number of movies
  int<lower=1> N;             // Total number of users
  int<lower=1> R;             // Total number of ratings
  int<lower=0> K;             // Number of latent dimensions
  int usr[R];                     // user id for r'th rating
  int movie[R];                   // movie id for r'th rating
  vector<lower=1,upper=10> [R] rating; // vector of rating values (1..10)
}
transformed data {            // transform interval 1..10 to [-3,3]
  vector [R] pred = log( (rating-.5)./(10.5-rating) );
}                             // inverse is: 0.5 + 10./(1+exp(-pred))
parameters {
  vector [N] u;
  vector [M] v;
  matrix [N,K] U;              // latent dimensions
  matrix [M,K] V;  
}
model{
  u ~ normal(0,1);
  v ~ normal(0,1);
  to_vector(U) ~ normal(0,1);
  to_vector(V) ~ normal(0,1);
  for (r in 1:R) pred[r] ~ normal(u[usr[r]]+v[movie[r]]+U[usr[r],]*V[movie[r],]', 0.1);
}
"""

In [5]:
Xte.shape

(600, 500)

In [6]:
Xtr.shape

(1400, 500)

array([  0,   1,   2, ..., 497, 498, 499])

In [7]:
N,M = Xtr.shape

rating = Xtr
## NOTE: Stan uses 1-based indexing, compared to python's zero-based, so adjust:
recdata = {"M": M, "N": N, "R":len(rating), "K":3,
    "usr": usr+1,
    "movie": movie+1,
    "rating":rating+1
}

In [8]:
posterior = stan_build(recsys, data=recdata, random_seed=0)

Building...



Building: found in cache, done.

RuntimeError: Error calling get_param_names: `Exception: mismatch in dimension declared and found in context; processing stage=data initialization; variable name=usr; position=0; dims declared=(1400); dims found=(700000) (in '/var/folders/xm/gyg77x2j7j7fybr9t8sgpc980000gn/T/httpstan_hwnx4fn1/model_4j74c46m.stan', line 7, column 2 to column 13)`