In [None]:
import pandas as pd
import pystan
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
# Read the ratings CSV and attach a binary 'like' label:
# 1 where the rating is strictly above 3.5, 0 everywhere else.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings['like'] = ratings['rating'].gt(3.5).astype(int)

In [None]:
# Preview the first rows of `ratings` to sanity-check its columns,
# including the 'like' column derived from `rating`.
ratings.head()

In [None]:
 = len(like)
num_users = 

In [None]:
# Keep only the ratings of user 1 for the single-user model.
# .assign() returns a new frame, so the boolean 'like' column is set without
# writing into a filtered slice of `ratings` (which raises
# SettingWithCopyWarning and may silently not stick).
user = ratings.loc[ratings['userId'] == 1].assign(
    like=lambda frame: frame['rating'] > 3.5
)
# Convert the boolean flags to 0/1 integers for the Stan data block.
like = user['like'].astype(int)
N = len(like)  # number of ratings made by this user

like.head()

# One person classifier

In [None]:
# Define the Stan model: one user, one latent trait per item.
# Each item n has a latent `trait[n]`; the user has a single `preference`.
# The log-odds of a like is trait[n] * preference.
# Uses the pre-2.26 array syntax (`int likes[N]`) that PyStan 2 expects.
# NOTE(review): only the product trait * preference enters the likelihood, so
# sign and scale of the two factors are not separately identified.
model_definition = """

data {
    int<lower=0> N;                     // number of data items
    int<lower=0, upper=1> likes[N];     // 1 if item n was liked, 0 otherwise
}
parameters {
    vector[N] trait;                    // latent trait of each item
    real preference;                    // user's weight on the trait
}
model {
    preference ~ normal(0, 10);
    trait ~ normal(0, 10);
    // vectorised: likes[n] ~ bernoulli_logit(trait[n] * preference) for all n
    likes ~ bernoulli_logit(trait * preference);
}
"""

In [None]:
# Inputs for the Stan data block: item count and the 0/1 like flags.
data = dict(N=N, likes=like)

In [None]:
%%time
# Compile the Stan program held in `model_definition`, then sample from it
# with NUTS using a single chain and a fixed seed.
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(
    data=data,
    algorithm="NUTS",
    iter=10000,
    chains=1,
    seed=42,
    verbose=True,
)

In [None]:
# Print the text representation of the fit returned by sm.sampling
# (presumably PyStan's posterior summary table — confirm for the installed version).
print(fit)

In [None]:
# Draw PyStan's trace plot for the single-user fit
# (presumably one panel per parameter — confirm against the PyStan version in use).
fit.traceplot()

# Multiple traits

In [None]:
# Define the Stan model: one user, several latent traits per item.
# Item n has a row of `num_traits` latent traits; the user has one preference
# weight per trait. The log-odds of a like is the dot product of the two.
# Uses the pre-2.26 array syntax (`int likes[N]`) that PyStan 2 expects.
model_definition = """

data {
    int<lower=0> N;                     // number of data items
    int<lower=0, upper=1> likes[N];     // 1 if item n was liked, 0 otherwise
    int<lower=1> num_traits;            // number of latent traits per item
}
parameters {
    matrix[N, num_traits] trait;        // latent traits, one row per item
    vector[num_traits] preference;      // user's weight on each trait
}
model {
    preference ~ normal(0, 10);
    to_vector(trait) ~ normal(0, 10);
    // (trait * preference)[n] = sum over t of trait[n, t] * preference[t]
    likes ~ bernoulli_logit(trait * preference);
}
"""

In [None]:
# Inputs for the multi-trait model: item count, 0/1 like flags, and two latent traits.
data = dict(N=N, likes=like, num_traits=2)

In [None]:
%%time
# Compile the multi-trait Stan program in `model_definition`, then sample from
# it with NUTS using a single chain and a fixed seed.
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(
    data=data,
    algorithm="NUTS",
    iter=10000,
    chains=1,
    seed=42,
    verbose=True,
)

In [None]:
# Print the text representation of the multi-trait fit
# (presumably PyStan's posterior summary table — confirm for the installed version).
print(fit)

In [None]:
# Average the sampled `trait` values over axis 0 (assumed to be the draw axis —
# TODO confirm), then show the gap between the first and second trait columns.
trait_mean = fit['trait'].mean(axis=0)
trait_mean[:, 0] - trait_mean[:, 1]

# Multiple people

In [None]:
# Dataset dimensions for the multi-user model.
num_movies = len(ratings['movieId'].unique())
num_users = len(ratings['userId'].unique())

# Enumerate every (movie_id, user_id) pair that has no rating.
# The pairs carry the raw ids found in `ratings`, not positional indices.
all_users = ratings.userId
every_user = set(all_users)  # built once, outside the per-movie iteration
# One pass over the table collects the set of raters for each movie;
# sort=False keeps movies in order of first appearance.
raters_by_movie = ratings.groupby('movieId', sort=False)['userId'].apply(set)
missing_values = [
    (movie_id, user_id)
    for movie_id, raters in raters_by_movie.items()
    for user_id in every_user - raters
]

In [None]:
# Define the Stan model: many users, many movies, several latent traits.
# Each movie has a row of latent traits and each user a column of preference
# weights; the log-odds that user p likes movie n is trait[n] . preference[:, p].
#
# Only observed (movie, user) pairs enter the likelihood. Stan cannot sample
# discrete unknowns, and bernoulli_logit needs an integer outcome, so the
# missing likes are not parameters; their like-probabilities are produced in
# `generated quantities` under the same name `likes_missing`.
# NOTE(review): `likes_missing` stores one value per missing pair per draw,
# which can be very large — subsample `idx_missing` if memory is a concern.
# Uses the pre-2.26 array syntax that PyStan 2 expects.
model_definition = """

data {
    int<lower=1> num_movies;            // number of distinct movies
    int<lower=1> num_traits;            // number of latent traits per movie
    int<lower=1> num_users;             // number of distinct users

    int<lower=0> num_likes;             // number of observed (movie, user) pairs
    int<lower=0> num_missing;           // number of unobserved (movie, user) pairs

    int<lower=0, upper=1> likes_obs[num_likes];   // observed like flags
    int<lower=1> idx_obs[num_likes, 2];           // [movie, user] index per observation

    int<lower=1> idx_missing[num_missing, 2];     // [movie, user] index per missing pair
}
parameters {
    matrix[num_movies, num_traits] trait;         // latent traits, one row per movie
    matrix[num_traits, num_users] preference;     // trait weights, one column per user
}
model {
    to_vector(preference) ~ normal(0, 10);
    to_vector(trait) ~ normal(0, 10);

    for (i in 1:num_likes) {
        // row_vector * vector -> scalar affinity of this (movie, user) pair
        likes_obs[i] ~ bernoulli_logit(
            trait[idx_obs[i, 1]] * col(preference, idx_obs[i, 2]));
    }
}
generated quantities {
    // predicted probability of a like for every unobserved pair
    vector[num_missing] likes_missing;
    for (i in 1:num_missing) {
        likes_missing[i] = inv_logit(
            trait[idx_missing[i, 1]] * col(preference, idx_missing[i, 2]));
    }
}
"""

In [None]:
# Stan indexes from 1..size, while the ids in `ratings` need not be contiguous,
# so map each raw id to its 1-based position among the unique ids.
movie_pos = {movie_id: pos
             for pos, movie_id in enumerate(ratings['movieId'].unique(), start=1)}
user_pos = {user_id: pos
            for pos, user_id in enumerate(ratings['userId'].unique(), start=1)}

# Index columns are ordered (movie, user) because the model indexes
# [movie, user]; `missing_values` already holds (movie_id, user_id) pairs.
idx_obs = np.column_stack((
    ratings['movieId'].map(movie_pos),
    ratings['userId'].map(user_pos),
)).astype(int)
missing_ids = np.asarray(missing_values, dtype=int).reshape(-1, 2)
idx_missing = np.column_stack((
    pd.Series(missing_ids[:, 0]).map(movie_pos),
    pd.Series(missing_ids[:, 1]).map(user_pos),
)).astype(int)

data = {'num_movies': num_movies,
        'likes_obs': ratings['like'],
        'num_traits': 2,
        'num_users': num_users,
        # must equal the length of likes_obs, i.e. every row of `ratings`
        'num_likes': len(ratings),
        'num_missing': len(missing_values),
        'idx_obs': idx_obs,
        'idx_missing': idx_missing
       }

In [None]:
%%time
# Compile the multi-user Stan program in `model_definition`, then sample from
# it with NUTS using a single chain and a fixed seed.
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(
    data=data,
    algorithm="NUTS",
    iter=10000,
    chains=1,
    seed=42,
    verbose=True,
)