In [None]:
import pandas as pd
import pystan
import numpy as np
import matplotlib.pyplot as plt
import pystan_utils
import os
%matplotlib notebook

# Setting things up

In [None]:
# Single RNG seed reused by the train/validation split and the Stan runs.
seed = 42

# Which MovieLens dump to load the ratings from.
#dataset = 'ml-20m' #big
dataset = 'ml-latest-small' #small
ratings_path = os.path.join(dataset, 'ratings.csv')
ratings = pd.read_csv(ratings_path)

# HACK -- the small movies.csv is apparently missing movies that occur in the
# small ratings.csv, so the movie table is always read from the ml-20m folder.
movies_path = os.path.join('ml-20m', 'movies.csv')
movies = pd.read_csv(movies_path)

# Binary target: 1 when the rating is strictly above 3.5, otherwise 0.
ratings['like'] = ratings.rating.gt(3.5) + 0

# The raw movieIds are not sequential, so replace each one by its 0-based
# position in order of first appearance.
unique_movies = ratings['movieId'].unique()
movie_dict = {raw_id: position for position, raw_id in enumerate(unique_movies)}
ratings['movieId'] = ratings['movieId'].map(movie_dict)

Data samples

In [None]:
# Show the first five rows of the prepared ratings table.
ratings.head(5)

For now, instead of doing multiclass classification on the star ratings, turn the problem into binary classification: a movie rated above 3.5 stars is a 'like', and a movie rated 3.5 stars or below is a 'not-like'.

In [None]:
# Ratings of a single user (userId 1), used by the one-person models.
# `.copy()` makes `user` an independent frame, so the column assignment below
# does not write into a filtered slice of `ratings` (SettingWithCopyWarning).
user = ratings[ratings['userId'] == 1].copy()
user['like'] = user.rating > 3.5
# Adding 0 turns the boolean column into integer 0/1 labels.
like = user['like'] + 0
# Number of ratings given by this user.
N = len(like)

For each user, randomly sample (without replacement) 10% of their ratings as validation data and keep the remaining 90% as training data. As some movies might never end up in the training set, remove those movies from the validation set.

In [None]:
# Fraction of every user's ratings that is held out for validation.
val_size = 0.1
# Sample the validation rows separately for each user (note that some movies
# might therefore never appear in the training set).
val_set = ratings.groupby('userId').apply(lambda g: g.sample(frac=val_size,random_state=seed))
# Drop the userId level added by groupby so the index holds the original row labels again.
val_set.index =  val_set.index.droplevel()
# The training set is the complement of the validation set. Dropping by row
# label selects exactly the rows that were not sampled and returns a new frame,
# instead of comparing every cell value with `isin(...).all(1)`.
train_set = ratings.drop(val_set.index)
# Remove validation rows whose movie never occurs in the training set.
# `.copy()` keeps later column assignments from writing into a slice.
val_set = val_set[val_set.movieId.isin(train_set.movieId)].copy()

Make sure that not too many samples were removed. The fraction of removed data is:

In [None]:
# Share of all ratings that ended up in neither the training nor the validation set.
n_removed = len(ratings) - len(train_set) - len(val_set)
n_removed / len(ratings)

As the movieIds do not necessarily correspond to consecutive integer indices, create new ids that can be used as indices into Stan vectors/matrices:

In [None]:
# Movies that occur in the training data, in order of first appearance.
unique_keys = train_set.movieId.unique()
# Dense ids start at 1 so they can be used directly as indices in Stan.
indices = range(1,len(unique_keys)+1)
movie_id_dict = dict(zip(unique_keys, indices ))   # movieId -> dense 1-based id
id_movie_dict = dict(zip(indices, unique_keys))    # dense 1-based id -> movieId
# `assign` returns new frames, so the new column is never written into a
# filtered slice of another frame (which raises SettingWithCopyWarning).
train_set = train_set.assign(
    movieIdNoHoles=train_set['movieId'].apply(lambda movie_id: movie_id_dict[movie_id])
)
val_set = val_set.assign(
    movieIdNoHoles=val_set['movieId'].apply(lambda movie_id: movie_id_dict[movie_id])
)

# One person classifier

In [None]:
# define Stan model
model_definition = """

data {
    int<lower=0> N;             // number of data items
    int likes[N];
    
}
parameters {
    vector[N] trait;
    real preference;
} 
model {
    vector[N] affinity;
    vector[N] noisy_affinity;
    
    preference ~ normal(0,10);
    for (n in 1:N){
        trait[n] ~ normal(0,10);
        affinity[n] = trait[n]*preference;
        likes[n] ~ bernoulli_logit(affinity[n]);
    }

}
"""

In [None]:
# Inputs for the one-person Stan program: number of ratings and the 0/1 labels.
data = dict(N=N, likes=like)

In [None]:
%%time
# create Stan model object
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(data=data, iter=10000, algorithm="NUTS", chains=1, seed=42, verbose=True)

In [None]:
# Print the textual representation of the sampling result.
print(fit)

In [None]:
# Call the fit's traceplot() to inspect the sampled chain visually.
fit.traceplot()

# Multiple traits

In [None]:
# define Stan model
model_definition = """

data {
    int<lower=0> N;             // number of data items
    int likes[N];
    int num_traits;
    
}
parameters {
    matrix[N,num_traits] trait;
    vector[num_traits] preference;
} 
model {
    //matrix[N, num_traits] trait_affinity ;
    vector[N] affinity ;
    
    preference ~ normal(0,10);
    for (n in 1:N){
        real tmp = 0;
        for (t in 1:num_traits){
            trait[n,t] ~ normal(0,10);
            tmp += trait[n,t]*preference[t];
        
        }
        affinity[n] = tmp;
        likes[n] ~ bernoulli_logit(affinity[n]);
        
    }
}
"""

In [None]:
# Inputs for the multi-trait Stan program; two latent traits are used.
data = dict(N=N, likes=like, num_traits=2)

In [None]:
%%time
# create Stan model object
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(data=data, iter=10000, algorithm="NUTS", chains=1, seed=42, verbose=True)

In [None]:
# Print the textual representation of the sampling result.
print(fit)

In [None]:
# Average the extracted 'trait' values along axis 0, then show the difference
# between the first and the second trait column.
trait_mean = np.mean(fit['trait'], axis=0)
trait_mean[:, 0] - trait_mean[:, 1]

# Multiple people

### Generative Process

```
for (userId, movieId) in [(u1,m1),(u2,m2),...,(uN,mN)]
    affinity = 0;
    for (t in 1:num_traits){
        traitAffinity = trait[movieId, t] * preference[userId, t];
        affinity += traitAffinity;
    }
    generate prediction such that prediction ~ bernoulli_logit(affinity);
```

Summing the trait affinities and using the resulting affinity as the logit is probably not the best way to discriminate between likes and not-likes.

### PGM

![alt text](figs/PGM.png "Title")
*PGM of the model. We use the notation of http://www.mbmlbook.com, which specifies the PGM as a bipartite graph in which the squares explicitly denote the distributions.*

### STAN

In [None]:
#num_movies = len(ratings.movieId.unique())
#num_users = len(ratings.userId.unique())
#missing_userId = []
#missing_movieId = []
#Find missing values
#all_users = ratings.userId
#for movie_id in ratings.movieId.unique():
#    missing_users = set(all_users).difference(set(ratings[ratings['movieId']==movie_id].userId))
#    for i in missing_users:
#        missing_userId.append(i)
#        missing_movieId.append(movie_id)


In [None]:
# Stan program for the multi-user model: every movie has a row of latent
# traits, every user a row of preferences, and the log-odds of a 'like' is the
# dot product of the two rows. The generated quantities block simulates a 0/1
# prediction for every held-out (user, movie) pair.
model_definition = """ data {
    int<lower=0> num_movies;                                      // rows of the trait matrix
    int<lower=0> num_traits;                                      // number of latent traits
    int<lower=0> num_users;                                       // rows of the preference matrix

    int<lower=0> num_likes;                                       // number of observed ratings

    int<lower=0, upper=1> likes_obs[num_likes];                   // 1 = like, 0 = not-like
    int<lower=1, upper=num_users> userId_obs[num_likes];          // row of preference per rating
    int<lower=1, upper=num_movies> movieId_obs[num_likes];        // row of trait per rating

    int<lower=0> num_missing;                                     // number of pairs to predict
    int<lower=1, upper=num_users> userId_missing[num_missing];
    int<lower=1, upper=num_movies> movieId_missing[num_missing];
}
parameters {
    matrix[num_movies, num_traits] trait;
    matrix[num_users, num_traits] preference;
}

model {
    // Priors are stated once per parameter. Inside a loop over the ratings the
    // prior term would be added once for every rating a user or movie has, so
    // the effective prior would tighten with the number of ratings, and rows
    // without any rating would get no prior at all.
    to_vector(trait) ~ normal(0, 10);
    to_vector(preference) ~ normal(0, 10);

    // affinity[n] = sum over t of trait[movieId_obs[n], t] * preference[userId_obs[n], t]
    likes_obs ~ bernoulli_logit(rows_dot_product(trait[movieId_obs], preference[userId_obs]));
}

generated quantities {
    int predictions[num_missing];

    for (i in 1:num_missing) {
        predictions[i] = bernoulli_logit_rng(
            dot_product(trait[movieId_missing[i]], preference[userId_missing[i]]));
    }
}
"""

In [None]:
# Sizes of the trait and preference matrices, taken from the training data.
num_movies = train_set.movieIdNoHoles.nunique()
num_users = train_set.userId.nunique()
# NOTE(review): userId is handed to Stan unchanged and used there as a 1-based
# row index, which presumably relies on the userIds running contiguously from
# 1 to num_users -- confirm for the chosen dataset.
data = {'num_movies': num_movies,
        'likes_obs': train_set['like'],
        'num_traits': 2,
        'num_users': num_users,
        'num_likes': len(train_set),
        'userId_obs': train_set['userId'],
        'movieId_obs': train_set['movieIdNoHoles'],
        # The validation pairs are the "missing" entries the model predicts.
        'num_missing': len(val_set),
        'userId_missing': val_set['userId'],
        'movieId_missing': val_set['movieIdNoHoles']
       }

In [None]:
%%time
# create Stan model object from the multi-user program (the cell is timed by %%time)
sm = pystan.StanModel(model_code=model_definition)

In [None]:
%%time
#sampling takes forever here, but VB seems to work really well
#fit = sm.sampling(data=data, iter=100, algorithm="NUTS", chains=1, seed=seed, verbose=True)
fit2 = sm.vb(data=data)

## Show preferences
Here we extract the latent traits and preferences and plot them.

In [None]:
# Take the number of trait columns from the data that was handed to Stan, so
# the extracted shapes cannot drift from the fitted model.
num_traits = data['num_traits']
# One row per user / per movie, one column per latent trait.
preferences = pystan_utils.vb_extract_variable(fit2, 'preference', var_type='matrix', dims=[num_users, num_traits])
traits = pystan_utils.vb_extract_variable(fit2, 'trait', var_type='matrix', dims=[num_movies, num_traits])

In [None]:
# Open a dedicated figure: with the interactive notebook backend a bare
# plt.scatter draws into whichever figure is still active.
fig_pref, ax_pref = plt.subplots()
ax_pref.scatter(preferences[:, 0], preferences[:, 1])
ax_pref.set(title='User preferences', xlabel='preference 0', ylabel='preference 1');

In [None]:
# Open a dedicated figure: with the interactive notebook backend a bare
# plt.scatter draws into whichever figure is still active.
fig_trait, ax_trait = plt.subplots()
ax_trait.scatter(traits[:, 0], traits[:, 1])
ax_trait.set(title='Movie traits', xlabel='trait 0', ylabel='trait 1');

Note that the plots are a little bit misleading, as there is no guarantee that the traits are orthogonal.

Let's look at some extreme values of trait 0. We would expect the trait to discriminate between films according to some latent property of the film.

In [None]:
# How many movies to inspect at each end of trait 0.
n_extreme = 10
# Row positions of `traits`, ordered by ascending value of trait 0.
sorted_trait_0_ids = np.argsort(traits[:, 0])
# Head of the ordering = smallest values, tail = largest values.
lowest_ids, highest_ids = sorted_trait_0_ids[:n_extreme], sorted_trait_0_ids[-n_extreme:]
traits[lowest_ids, 0]

In [None]:
# Trait-0 values of the rows selected in `highest_ids`.
traits[highest_ids,0]

In [None]:
# Row positions (as returned by argsort) of the lowest trait-0 values.
lowest_ids

In [None]:
# `lowest_ids` are 0-based row positions from argsort, while id_movie_dict is
# keyed by the 1-based ids handed to Stan, hence the +1 (assumes the rows of
# `traits` follow Stan's row order -- confirm against pystan_utils).
# The dictionary value is the sequential id that replaced ratings['movieId']
# during setup; indexing `unique_movies` with it recovers the original
# MovieLens movieId, which is what the `movies` table is keyed by.
lowest_movie_ids = [unique_movies[id_movie_dict[lowest_id + 1]] for lowest_id in lowest_ids]
lowest_movie_ids

In [None]:
# Rows of the movie table whose movieId is among the lowest-scoring ids.
is_lowest = movies['movieId'].isin(lowest_movie_ids)
movies[is_lowest]

In [None]:
# `highest_ids` are 0-based row positions from argsort, while id_movie_dict is
# keyed by the 1-based ids handed to Stan, hence the +1 (assumes the rows of
# `traits` follow Stan's row order -- confirm against pystan_utils).
# The dictionary value is the sequential id that replaced ratings['movieId']
# during setup; indexing `unique_movies` with it recovers the original
# MovieLens movieId, which is what the `movies` table is keyed by.
highest_movie_ids = [unique_movies[id_movie_dict[highest_id + 1]] for highest_id in highest_ids]
movies[movies.movieId.isin(highest_movie_ids)]

By visual inspection of the lowest- and highest-scoring movies, we cannot really see any latent trait that is used for discriminating.

# Calculate accuracy of classification

In [None]:
# Pull the 'predictions' variable out of the VB result, one entry per validation row.
n_validation = len(val_set)
predictions = pystan_utils.vb_extract_variable(
    fit2, 'predictions', var_type='vector', dims=[n_validation]
)

In [None]:
# Number of extracted predictions (the extraction asked for len(val_set) entries).
len(predictions)

In [None]:
true_labels = val_set['like']
# NOTE(review): the extracted predictions may be averages of the simulated 0/1
# draws rather than hard labels -- confirm what pystan_utils returns.
# Thresholding at 0.5 leaves hard 0/1 values untouched and turns averages into
# class labels, so the number below is a classification accuracy either way.
predicted_labels = (np.asarray(predictions) >= 0.5).astype(int)
# Fraction of validation rows whose predicted label equals the true label.
np.mean(predicted_labels == true_labels.values)

The accuracy is low, close to random, and therefore at this point we cannot say that our model is actually predicting anything useful.