In [None]:
import pandas as pd
import pystan
import numpy as np
import matplotlib.pyplot as plt
import pystan_utils
%matplotlib notebook

In [None]:
# Load the MovieLens ratings table.
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Binary target: 1 when the rating is above 3.5, otherwise 0.
ratings['like'] = (ratings['rating'] > 3.5).astype('int64')

# The raw movieId values are not sequential, so re-index them as 0..K-1 in
# order of first appearance.
unique_movies = ratings['movieId'].unique()
movie_dict = dict(zip(unique_movies, range(len(unique_movies))))
ratings['movieId'] = ratings['movieId'].map(movie_dict)

In [None]:
# Preview the first five rows of the prepared ratings table.
ratings.head(n=5)

In [None]:
# Ratings of a single user (userId == 1).  Take an explicit copy so that the
# column assignment below modifies an independent frame instead of a filtered
# slice of `ratings` (which triggers pandas' SettingWithCopyWarning).
user = ratings[ratings['userId'] == 1].copy()
user['like'] = user.rating > 3.5     # boolean like-flag for this user
like = user['like'] + 0              # same flag as 0/1 integers
N = len(like)                        # number of movies this user rated

like.head()

# One person classifier

In [None]:
# Stan model for a single user: every rated movie n has one latent scalar
# trait, the user has one scalar preference, and the like indicator is
# Bernoulli with logit equal to trait[n] * preference.
# Only the product trait * preference enters the likelihood, so the sign and
# scale of the two factors are pinned down by the priors alone.
model_definition = """

data {
    int<lower=0> N;                     // number of movies rated by the user
    int<lower=0, upper=1> likes[N];     // 1 if the user liked movie n, else 0
}
parameters {
    vector[N] trait;                    // latent trait of each movie
    real preference;                    // the user's preference for the trait
}
model {
    preference ~ normal(0, 10);
    trait ~ normal(0, 10);

    // vectorised over all movies: logit of liking movie n is trait[n] * preference
    likes ~ bernoulli_logit(trait * preference);
}
"""

In [None]:
# Inputs for the single-user model: the number of rated movies and the 0/1 likes.
data = dict(N=N, likes=like)

In [None]:
%%time
# Compile the Stan program, then draw from the posterior with one NUTS chain.
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(
    data=data,
    iter=10000,
    chains=1,
    algorithm="NUTS",
    seed=42,
    verbose=True,
)

In [None]:
# Print the text representation of the fit object returned by sm.sampling.
print(fit)

In [None]:
# Plot the sampler traces for the fitted parameters.
fit.traceplot()

# Multiple traits

In [None]:
# Stan model for a single user with several latent traits: movie n has a row
# of num_traits trait values, the user has one preference per trait, and the
# logit of liking movie n is the dot product of the two.
model_definition = """

data {
    int<lower=0> N;                     // number of movies rated by the user
    int<lower=0, upper=1> likes[N];     // 1 if the user liked movie n, else 0
    int<lower=1> num_traits;            // number of latent traits per movie
}
parameters {
    matrix[N, num_traits] trait;        // row n holds the traits of movie n
    vector[num_traits] preference;      // the user's preference for each trait
}
model {
    preference ~ normal(0, 10);
    to_vector(trait) ~ normal(0, 10);

    // (trait * preference)[n] = sum over t of trait[n, t] * preference[t]
    likes ~ bernoulli_logit(trait * preference);
}
"""

In [None]:
# Inputs for the multi-trait single-user model, using two latent traits.
data = dict(N=N, likes=like, num_traits=2)

In [None]:
%%time
# Compile the Stan program, then draw from the posterior with one NUTS chain.
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(
    data=data,
    iter=10000,
    chains=1,
    algorithm="NUTS",
    seed=42,
    verbose=True,
)

In [None]:
# Print the text representation of the fit object returned by sm.sampling.
print(fit)

In [None]:
# Average the extracted `trait` draws over axis 0, then show the per-row gap
# between the first and the second trait column.
trait_mean = fit['trait'].mean(axis=0)
trait_mean[:, 0] - trait_mean[:, 1]

# Multiple people

In [None]:
# Dimensions of the full user x movie table.
num_movies = len(ratings.movieId.unique())
num_users = len(ratings.userId.unique())

# Enumerate every (user, movie) pair that has no rating.  The two lists are
# parallel: missing_userId[k] did not rate missing_movieId[k].
# NOTE(review): the movie ids stored here are the values currently in
# ratings.movieId; the observed ids are shifted by +1 before being passed to
# Stan, so these would presumably need the same shift -- confirm before use.
missing_userId = []
missing_movieId = []
all_users = ratings.userId
all_user_set = set(all_users)   # built once rather than once per movie

# Collect the raters of every movie in a single pass over the table instead of
# filtering the whole ratings frame again for each movie.
raters_by_movie = ratings.groupby('movieId')['userId'].apply(set).to_dict()

for movie_id in ratings.movieId.unique():
    missing_users = all_user_set.difference(raters_by_movie[movie_id])
    missing_userId.extend(missing_users)
    missing_movieId.extend([movie_id] * len(missing_users))


In [None]:
# Stan model for all users at once (logistic matrix factorisation): movie m has
# a row of latent traits, user u has a row of preferences, and the logit of
# user u liking movie m is the dot product of those two rows.
#
# The priors are stated once per parameter.  Placing them inside the loop over
# observations would add the prior density once per rating, so users/movies
# with many ratings would be shrunk towards zero far more than intended.
model_definition = """

data {
    int<lower=1> num_movies;            // number of distinct movies
    int<lower=1> num_traits;            // number of latent traits
    int<lower=1> num_users;             // number of distinct users

    int<lower=0> num_likes;             // number of observed ratings

    int<lower=0, upper=1> likes_obs[num_likes];              // 1 = liked, 0 = not liked
    int<lower=1, upper=num_users> userId_obs[num_likes];     // 1-based user index
    int<lower=1, upper=num_movies> movieId_obs[num_likes];   // 1-based movie index

    //int num_missing;
    //int userId_missing [num_missing];
    //int movieId_missing [num_missing];
}
parameters {
    matrix[num_movies, num_traits] trait;       // row m: traits of movie m
    matrix[num_users, num_traits] preference;   // row u: preferences of user u
}

model {
    // priors, one contribution per parameter
    to_vector(trait) ~ normal(0, 10);
    to_vector(preference) ~ normal(0, 10);

    // likelihood: affinity of observation n is the dot product of the trait row
    // of its movie with the preference row of its user
    likes_obs ~ bernoulli_logit(
        rows_dot_product(trait[movieId_obs], preference[userId_obs]));
}

// Sampling predictions takes too much RAM
//generated quantities {
//    int predictions[num_missing];

//    for(i in 1:num_missing){
//        real affinity = 0;
//        for (t in 1:num_traits){
//            affinity += trait[movieId_missing[i], t] * preference[userId_missing[i], t];
//       }
//        predictions[i] = bernoulli_logit_rng(affinity);
//    }
//}

"""

In [None]:
# Inputs for the multi-user model.  The movie index is shifted by one before
# being handed to Stan; userId is passed through unchanged.
# The inputs for the unrated (user, movie) pairs stay disabled because the
# Stan block that would consume them is commented out.
data = dict(
    num_movies=num_movies,
    likes_obs=ratings['like'],
    num_traits=2,
    num_users=num_users,
    num_likes=len(ratings),
    userId_obs=ratings['userId'],
    movieId_obs=ratings['movieId'] + 1,
    # num_missing=len(missing_userId),
    # userId_missing=missing_userId,
    # movieId_missing=missing_movieId,
)

In [None]:
%%time
# Create the Stan model object from the program text currently held in
# `model_definition`.
sm = pystan.StanModel(model_code=model_definition)

In [None]:
%%time
# Sampling takes forever for this model, but variational Bayes (VB) seems to
# work really well, so the NUTS call is kept only for reference.
#fit = sm.sampling(data=data, iter=10, algorithm="NUTS", chains=1, seed=42, verbose=True)
# VB is a stochastic optimisation: fix the seed (same value as the sampling
# calls in this notebook) so that a re-run reproduces the same estimates.
fit2 = sm.vb(data=data, seed=42)

In [None]:
# Extract the VB estimates of the two parameter matrices.  The trait count is
# read from the model inputs rather than hard-coded, so the requested
# dimensions cannot drift out of sync with data['num_traits'].
num_traits = data['num_traits']
preferences = pystan_utils.vb_extract_variable(
    fit2, 'preference', var_type='matrix', dims=[num_users, num_traits])
traits = pystan_utils.vb_extract_variable(
    fit2, 'trait', var_type='matrix', dims=[num_movies, num_traits])

In [None]:
# Draw on a fresh figure: a bare plt.scatter adds to whichever figure is
# currently active, which with the interactive notebook backend can be the
# figure of an earlier cell.
fig, ax = plt.subplots()
ax.scatter(preferences[:, 0], preferences[:, 1])
ax.set_xlabel('preference, trait 1')
ax.set_ylabel('preference, trait 2')
ax.set_title('User preferences');

In [None]:
# Draw on a fresh figure: a bare plt.scatter adds to whichever figure is
# currently active, which with the interactive notebook backend can be the
# figure of an earlier cell.
fig, ax = plt.subplots()
ax.scatter(traits[:, 0], traits[:, 1])
ax.set_xlabel('trait 1')
ax.set_ylabel('trait 2')
ax.set_title('Movie traits');