In [162]:
import numpy as np
import pandas as pd
import math
from scipy.misc import logsumexp
from datetime import datetime

### Define the functions

In [69]:
def product_array(array):
    '''
    Takes the product of all the items in an iterable, one dimensional array
    '''
    array_prod = 1
    for i in range(len(array)):
        array_prod = array_prod * array[i]
    return(array_prod)

In [17]:
def indicator_funct(a,b):
    '''
    returns 1 if a == b is true; returns 0 if false
    '''
    if(a == b):
        return(1)
    else:
        return(0)

In [170]:
def likelihood(data, num_users, num_j, num_c, mu, gamma):
    '''
    Gets the log likelihood of the ratings EM algorithm
    '''
    
    #pre-set arrays for operations
    mu_prob = np.empty(num_c) #this will actually hold the product of the mus and the gamma product
    like_array = np.empty(num_users) #place to store all the parts of the likelihood
    
    #iterate through the users
    for i in range(num_users):
        
        #find where the relevant js are
        where_rel_js = np.logical_not(np.isnan(data[i,]))
        
        #get the relevant js, their length, and their index
        rel_j_index = np.arange(num_j)[where_rel_js]
        num_j_prime = len(rel_j_index)
        
        #pre-set selected gammas array
        selected_gammas = np.empty(num_j_prime) 
        
        #iterate through the groups
        for c in range(num_c):
            
            #iterate through the items
            for j in range(num_j_prime):
                
                #get the indeces of for the gamma array
                index_k = data[i,rel_j_index[j]] - 1 #subtract 1 to covert the rating to python's indexing
                #put the relevant gamma in the array
                selected_gammas[j] = gamma[c, rel_j_index[j], int(index_k)]
            
            #take the product of the selected gammas
            gamma_prod = logsumexp(selected_gammas)
            
            #take gamma_prod, and multiple it by the relevant mu. Put that in a list for later
            mu_prob[c] = mu[c] * gamma_prod
        
        #calculate a portion of the log likelihood
        like_array[i] = math.log(sum(mu_prob))
        
    #get the log likelihood once we exit the loops
    log_like = sum(like_array)
    
    return(log_like)

In [206]:
def E_step(data, num_users, num_j_prime, where_rel_js, rel_j_index, index_k, num_c, mu, gamma):
    '''
    Expectation step of of the rankings EM algorithm; returns the assignment matrix
    '''
    
    #allocate assignment matrix
    a = np.empty([num_users, num_c])
    
    #pre-set arrays for operations
    mu_prob = np.empty(num_c) #this will actually hold the product of the mus and the gamma product
    
    '''
    for c in range(num_c):
        select_gammas = gamma[c,:,:]
    #'''
    
    #loop through the users
    for i in range(num_users):
        
        #pre-set selected gammas array
        selected_gammas = np.empty(num_j_prime[i])
        
        #loop through the groups
        for c in range(num_c):
            
            #get an array of the relevant gammas times the relevant mus
            
            #iterate through the items
            for j in range(num_j_prime[i]):
                
                #put the relevant gamma in the array
                selected_gammas[j] = gamma[c, rel_j_index[i][j], int(index_k[i][j])]
            
            #take gamma product, and multiply it by the relevant mu. Put that in a list for later
            mu_prob[c] = mu[c] * logsumexp(selected_gammas)
            
        #get the sum of seleced_mus
        total_mu = sum(mu_prob)
        
        #calculate a. This requires looping through the mu_prob array
        for c in range(num_c):
            a[i,c] = mu_prob[c]/total_mu
        
        
    return(a)

In [231]:
def M_step(data, num_users, num_i, num_j, where_rel_i, rel_j_index, num_c, num_k, a, gamma):
    '''
    computes the M step of the ratings EM algorithm. Returns a tuple containing mu and gamma
    '''
    
    #get new mu
    new_mu = np.apply_along_axis(sum, 0, a)/num_users
    
    for c in range(num_c):
        
        #now start to fill the new gamma
        for j in range(num_j):
            
            a_col_sum = sum(a[:,c][where_rel_i[j]])
            
            tmp_a_col = np.empty(num_i[j])
            
            for k in range(num_k):
                
                mask = data[:,j] == k
                
                tmp_rel_i = where_rel_i[j][mask]
                tmp_a = a[:,c][mask]
                gamma[c, j, k] = sum(tmp_a[tmp_rel_i])/a_col_sum
                '''
                for i in range(num_i[j]):
                    
                    if data[rel_i_index[j][i],rel_j_index[i][j]] == k:
                        tmp_a_col[i] = a[i,c]
                
                gamma[c, j, k] = sum(tmp_a_col)/(a_col_sum)
    '''
    return((new_mu, gamma))

In [225]:
def EM_ratings_algo(data, num_c, num_k, eta, h = 500):
    '''
    EM algo for ratings problems. Returns a tuple with mu (a 1 dim array) and gamma (a 3 dim array)
    
    data is a matrix which contains the users' vote value for each item they voted on
    c is the number of groups
    k is the number of ratings/vote types
    eta is the termination threshold
    h is the cap on iterations of the while loop
    '''
    #get the dimensions of the data
    num_users = data.shape[0]
    num_j = data.shape[1] #number of things the users can vote on
    
    #pre-set arrays for finding the relevant js
    where_rel_js = np.empty(num_users, dtype=object)
    rel_j_index = np.empty(num_users, dtype=object)
    num_j_prime = np.empty(num_users, dtype=int)
    #preset array for finding the ks from the data
    index_k = np.empty(num_users, dtype=object)
    
    #loop through the users to get the relevant j indeces for each user
    for i in range(num_users):
        
        #find where the relevant js are
        where_rel_js[i] = np.logical_not(np.isnan(data[i,]))
        
        #get the relevant js, their length, and their index
        rel_j_index[i] = np.arange(num_j)[where_rel_js[i]]
        num_j_prime[i] = len(rel_j_index[i])
        
        index_k[i] = data[i,][where_rel_js[i]] - 1
    
    #set up arrays for the relevant i s
    where_rel_i = np.empty(num_j,dtype=object)
    rel_i_index = np.empty(num_j,dtype=object)
    num_i = np.empty(num_j, dtype=int)
    
    index_ki = np.empty(num_j, dtype=object)
    
    for j in range(num_j):
        
        where_rel_i[j] = np.logical_not(np.isnan(data[:,j]))
        #get the relevant i s, their length, and their index
        rel_i_index[j] = np.arange(num_users)[where_rel_i[j]]
        num_i[j] = len(rel_i_index[j])
        
        index_ki[j] = data[:,j][where_rel_i[j]] - 1
        
    
    #initialize mu as a vector with a slot of each group. pre-fill it with the uniform
    mu = np.empty(num_c)
    mu.fill(1/num_c)
    
    #initalize gamma as 3 dimensional array pre-filled with unifrom over ratings space
    gamma = np.empty([num_c, num_j, num_k])
    gamma.fill(1/num_k)
    
    #counter t
    t = 0
    print(str(datetime.now()))
    #likelihood intialize
    like = likelihood(data, num_users, num_j, num_c, mu, gamma)
    oldlike = like
    
    print(str(datetime.now()))
    
    #run the actual algorithm.
    while like <= (eta + oldlike) and t < h:
        
        t = t + 1
        
        #E step
        a = E_step(data, num_users, num_j_prime, where_rel_js, rel_j_index, index_k, num_c, mu, gamma)
        print(str(datetime.now()))
        #M step
        mu, gamma = M_step(data, num_users, num_i, num_j, where_rel_i, rel_j_index, num_c, num_k, a, gamma)
        
        #Likelihood calc
        if t % 10 == 0:
            oldlike = like
            like = likelihood(data, num_users, num_j, num_c, mu, gamma)
            print(like)
        
        print(str(datetime.now()))
    
    
    return((mu, gamma))

### Get the data

In [8]:
movie_trn = pd.read_csv("../data/movie_train.csv")
movie_tst = pd.read_csv("../data/movie_test.csv")

In [11]:
movie_trn.head(10)

Unnamed: 0.1,Unnamed: 0,1,2,3,4,5,6,7,8,9,...,1629,1630,1631,1632,1633,1634,1635,1638,1641,1648
0,1,4.0,4.0,,,,,,,,...,,,,,,,,,,6.0
1,23,5.0,,,,,,5.0,,,...,,,,,,,,,,
2,27,6.0,5.0,1.0,,5.0,,6.0,,,...,,,,,,,,,,
3,71,,5.0,4.0,,3.0,,5.0,,,...,,,,,,,,,,
4,119,5.0,,,,,5.0,,,4.0,...,,,,,,,,,,
5,160,,,3.0,4.0,4.0,,,,,...,,,,,,,,,,
6,162,,,2.0,3.0,2.0,3.0,3.0,3.0,3.0,...,,,,,,,,,,
7,245,,,3.0,,,5.0,5.0,,,...,,,,,,,,,,
8,251,6.0,,,,,,,,,...,,,,,,,,,,
9,254,6.0,4.0,,,,,,,,...,,,,,,,,,,


In [121]:
M_trn = movie_trn.values

In [122]:
M_trn = np.delete(M_trn, 0, 1)

In [123]:
max(M_trn[1,])

6.0

In [117]:
for i in (range(len(M_trn[1,]))):
    if(M_trn[1,][i] > 6):
        print(i)

0


In [140]:
len(M_trn[6,][np.logical_not(np.isnan(M_trn[6,]))])

499

In [169]:
x, y = (np.zeros(3),np.ones([2,3,4]))
print(x)
print(y)

[ 0.  0.  0.]
[[[ 1.  1.  1.  1.]
  [ 1.  1.  1.  1.]
  [ 1.  1.  1.  1.]]

 [[ 1.  1.  1.  1.]
  [ 1.  1.  1.  1.]
  [ 1.  1.  1.  1.]]]


In [228]:
M_trn.shape

(5055, 1619)

### Run the Algorithm

In [None]:
mu, gamma = EM_ratings_algo(M_trn, num_c = 5, num_k = 6, eta = .1, h = 200)

2018-04-09 00:13:08.094859
2018-04-09 00:13:13.536849
2018-04-09 00:13:17.281530
2018-04-09 00:13:19.081820
2018-04-09 00:13:22.875280
2018-04-09 00:13:24.681588
2018-04-09 00:13:28.424297
2018-04-09 00:13:30.235104
2018-04-09 00:13:33.989317
2018-04-09 00:13:35.795097
2018-04-09 00:13:39.715525
2018-04-09 00:13:41.548392
2018-04-09 00:13:45.549916
2018-04-09 00:13:47.450724
2018-04-09 00:13:51.347149
2018-04-09 00:13:53.241014
2018-04-09 00:13:57.214243
2018-04-09 00:13:59.058101
2018-04-09 00:14:02.870833
2018-04-09 00:14:04.711661
2018-04-09 00:14:08.608559
2405.59865138
2018-04-09 00:14:15.860563
2018-04-09 00:14:19.636298
2018-04-09 00:14:21.486638
2018-04-09 00:14:25.404493
2018-04-09 00:14:27.249895
2018-04-09 00:14:31.108251
2018-04-09 00:14:32.921045
2018-04-09 00:14:36.922814
2018-04-09 00:14:38.717590
2018-04-09 00:14:42.499807
2018-04-09 00:14:44.299073
2018-04-09 00:14:48.118806
2018-04-09 00:14:49.923572
2018-04-09 00:14:53.670268
2018-04-09 00:14:55.483574
2018-04-09 00: