In [1]:
import pandas as pd
import pystan
import numpy as np
import matplotlib.pyplot as plt
import pystan_utils
import os
from  movie_recommendation_aux import *
%matplotlib notebook
%load_ext autoreload
%autoreload 2

# Setting things up

In [11]:
seed = 42
#dataset = 'ml-20m' #big
dataset = 'ml-latest-small' #small
ratings = pd.read_csv(os.path.join(dataset,'ratings.csv'))
# HACK -- small movies.csv is apararently missing movies from small ratings.csv
movies = pd.read_csv(os.path.join('ml-20m','movies.csv')) 
#Create like column
ratings['like'] = (ratings.rating >= 3.0 )+ 0

unique_movies = ratings['movieId'].unique()

#movieId is not sequential
movie_dict = {movieId: i for i, movieId in enumerate(unique_movies)}
ratings['movieId'] = ratings['movieId'].apply(lambda movieId: movie_dict[movieId])

Data samples

In [None]:
ratings.head()

In [28]:
movies[movies.title.str.contains('Lord of the Rings')]

Unnamed: 0,movieId,title,genres
2032,2116,"Lord of the Rings, The (1978)",Adventure|Animation|Children|Fantasy
4897,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
5853,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
7041,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy


For now instead of multiclass classification based on stars, turn problem into binary classification by defining 'like' for all movies rated above 3.5 stars, and 'not-like' for all movies below 3.0

In [29]:
user = ratings[ratings['userId'] == 1]
user['like'] = user.rating >= 3.0
like = user['like'] + 0
N = len(like)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# One person classifier

In [None]:
# define Stan model
model_definition = """

data {
    int<lower=0> N;             // number of data items
    int likes[N];
    
}
parameters {
    vector[N] trait;
    real preference;
} 
model {
    vector[N] affinity;
    vector[N] noisy_affinity;
    
    preference ~ normal(0,10);
    for (n in 1:N){
        trait[n] ~ normal(0,10);
        affinity[n] = trait[n]*preference;
        likes[n] ~ bernoulli_logit(affinity[n]);
    }

}
"""

In [None]:
data = {'N': N, 'likes': like}

In [None]:
%%time
# create Stan model object
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(data=data, iter=10000, algorithm="NUTS", chains=1, seed=42, verbose=True)

In [None]:
print(fit)

In [None]:
fit.traceplot()

# Multiple traits

In [None]:
# define Stan model
model_definition = """

data {
    int<lower=0> N;             // number of data items
    int likes[N];
    int num_traits;
    
}
parameters {
    matrix[N,num_traits] trait;
    vector[num_traits] preference;
} 
model {
    //matrix[N, num_traits] trait_affinity ;
    vector[N] affinity ;
    
    preference ~ normal(0,10);
    for (n in 1:N){
        real tmp = 0;
        for (t in 1:num_traits){
            trait[n,t] ~ normal(0,10);
            tmp += trait[n,t]*preference[t];
        
        }
        affinity[n] = tmp;
        likes[n] ~ bernoulli_logit(affinity[n]);
        
    }
}
"""

In [None]:
data = {'N': N, 'likes': like, 'num_traits':2}

In [None]:
%%time
# create Stan model object
sm = pystan.StanModel(model_code=model_definition)
fit = sm.sampling(data=data, iter=10000, algorithm="NUTS", chains=1, seed=42, verbose=True)

In [None]:
print(fit)

In [None]:
trait_mean =np.mean(fit['trait'],0)
np.subtract(trait_mean[:,0],trait_mean[:,1])