In [5]:
import numpy as np
import math
import random

In [None]:
def product_array(array):
    '''
    Multiply together every element of a one-dimensional iterable.

    Starts from the multiplicative identity 1, so an empty iterable
    yields 1.
    '''
    running_product = 1
    for factor in array:
        running_product = running_product * factor
    return running_product

In [None]:
def indicator_funct(a,b):
    '''
    Indicator of equality: gives 1 when a == b holds, otherwise 0.
    '''
    return 1 if a == b else 0

In [None]:
def likelihood(data, num_users, num_j, num_c, mu, gamma):
    '''
    Gets the log likelihood of the ratings EM algorithm.

    For every user i the mixture probability
        sum_c mu[c] * prod_j gamma[c, j, data[i, j] - 1]
    is computed, and the logs of those probabilities are summed.

    data      -- 2-d array indexed [user, item] holding 1-based ratings
    num_users -- number of users (rows of data) to include
    num_j     -- number of items (columns of data) to include
    num_c     -- number of groups
    mu        -- sequence of length num_c, indexed by group
    gamma     -- 3-d array indexed [group, item, rating - 1]

    Returns the total log likelihood as a float.
    Raises ValueError (from math.log) if a user's mixture probability is 0.
    '''
    data = np.asarray(data)
    gamma = np.asarray(gamma)

    #one slot per user for that user's log mixture probability
    like_array = np.empty(num_users)

    #iterate through the users
    for i in range(num_users):

        #mu[c] times the product of the gammas picked out by user i's ratings
        mu_prob = np.empty(num_c)

        #iterate through the groups
        for c in range(num_c):

            #collect one gamma per item; the rating is cast to int and shifted
            #by 1 so that a 1-based rating becomes a 0-based index
            selected_gammas = np.empty(num_j)
            for j in range(num_j):
                index_k = int(data[i, j]) - 1
                selected_gammas[j] = gamma[c, j, index_k]

            mu_prob[c] = mu[c] * np.prod(selected_gammas)

        #store this user's contribution instead of overwriting the array
        like_array[i] = math.log(np.sum(mu_prob))

    #get the log likelihood once we exit the loops
    return float(np.sum(like_array))

In [None]:
def E_step(data, num_users, num_j, num_c, mu, gamma):
    '''
    Expectation step of the ratings EM algorithm; returns the assignment matrix.

    For each user i and group c the unnormalised weight is
        mu[c] * prod_j gamma[c, j, data[i, j] - 1]
    and each row is then divided by its own sum.

    data      -- 2-d array indexed [user, item] holding 1-based ratings
    num_users -- number of users (rows of data) to include
    num_j     -- number of items (columns of data) to include
    num_c     -- number of groups
    mu        -- sequence of length num_c, indexed by group
    gamma     -- 3-d array indexed [group, item, rating - 1]

    Returns a (num_users, num_c) array a, where a[i, c] is the normalised
    weight of group c for user i. A row whose weights are all 0 is not
    guarded against and comes out as nan.
    '''
    data = np.asarray(data)
    gamma = np.asarray(gamma)

    #allocate assignment matrix
    a = np.empty([num_users, num_c])

    #loop through the users
    for i in range(num_users):

        #unnormalised weight of every group for user i
        mu_prob = np.empty(num_c)

        #loop through the groups
        for c in range(num_c):

            #collect one gamma per item; the rating is cast to int and shifted
            #by 1 so that a 1-based rating becomes a 0-based index
            selected_gammas = np.empty(num_j)
            for j in range(num_j):
                index_k = int(data[i, j]) - 1
                selected_gammas[j] = gamma[c, j, index_k]

            mu_prob[c] = mu[c] * np.prod(selected_gammas)

        #normalise so the row sums to 1
        a[i, :] = mu_prob / np.sum(mu_prob)

    return a

In [None]:
def M_step(data, num_users, num_j, num_c, num_k, a):
    '''
    Computes the M step of the ratings EM algorithm. Returns a tuple
    containing mu and gamma.

    data      -- 2-d array indexed [user, item] holding 1-based ratings
    num_users -- number of users (rows of data and of a) to include
    num_j     -- number of items (columns of data) to include
    num_c     -- number of groups (columns of a)
    num_k     -- number of distinct rating values
    a         -- assignment matrix indexed [user, group]

    Returns (new_mu, new_gamma):
      new_mu[c]          = sum_i a[i, c] / num_users
      new_gamma[c, j, k] = sum_i a[i, c] * [data[i, j] - 1 == k] / sum_i a[i, c]

    A group whose column of a sums to 0 is not guarded against; its gamma
    entries come out as nan.
    '''
    data = np.asarray(data)
    a = np.asarray(a, dtype=float)

    #allocate new mu and gamma
    new_mu = np.empty(num_c)
    new_gamma = np.empty([num_c, num_j, num_k])

    for c in range(num_c):

        #get the relevant column of a, and its sum
        a_col = a[:num_users, c]
        a_col_sum = a_col.sum()

        #fill the slot of new mu belonging to this group
        new_mu[c] = a_col_sum / num_users

        #now start to fill the new gamma
        for j in range(num_j):

            #0-based rating index every user gave item j
            rating_index = data[:num_users, j] - 1

            for k in range(num_k):
                #the boolean mask plays the role of the indicator function:
                #only users who gave rating k contribute their weight
                new_gamma[c, j, k] = a_col[rating_index == k].sum() / a_col_sum

    return (new_mu, new_gamma)


In [None]:
def EM_ratings_algo(data, num_c, _num_k, eta, h = 1000):
    '''
    EM algo for ratings problems. Returns a tuple with mu (a 1 dim array) and gamma (a 3 dim array)

    data is a 2-d matrix indexed [user, item] which contains the users' vote value for each item they voted on
    num_c is the number of groups
    _num_k is the number of ratings/vote types
    eta is the termination threshold: iteration stops once one E/M round
        improves the log likelihood by eta or less
    h is the cap on the number of E/M rounds
    '''
    #the parameter keeps its original name for callers; use a local alias
    num_k = _num_k

    #get the dimensions of the data
    data = np.asarray(data)
    num_users, num_j = data.shape #num_j is the number of things the users can vote on

    #initialize mu as a vector with a slot for each group, pre-filled with the
    #uniform; 1.0 forces float division regardless of interpreter version
    mu = np.full(num_c, 1.0 / num_c)

    #initialize gamma as a 3 dimensional array pre-filled with the uniform over ratings space
    #NOTE(review): with mu and gamma both uniform every group starts out
    #identical, so the E step gives every user equal weight in every group --
    #confirm whether a randomised start was intended
    gamma = np.full((num_c, num_j, num_k), 1.0 / num_k)

    #log likelihood of the starting point, used to measure improvement
    prev_log_like = likelihood(data, num_users, num_j, num_c, mu, gamma)

    #counter t
    t = 0

    #run the actual algorithm
    while t < h:
        t = t + 1

        #E step
        a = E_step(data, num_users, num_j, num_c, mu, gamma)

        #M step
        mu, gamma = M_step(data, num_users, num_j, num_c, num_k, a)

        #stop as soon as the round no longer improves the fit by more than eta
        log_like = likelihood(data, num_users, num_j, num_c, mu, gamma)
        if log_like - prev_log_like <= eta:
            break
        prev_log_like = log_like

    return (mu, gamma)