In [7]:
import pandas as pd
import numpy as np
from scipy.special import digamma

In [8]:
df1 = pd.read_csv('unique_userid_sample_1.csv')
df1.drop(df1.columns[1:10], axis = 1, inplace = True)
df1.rename(columns = {'count': 'Count', 'group': 'UserId', 'movie_id':'ItemId'}, inplace = True)

In [9]:
import pandas as pd
import numpy as np

# Sample data structure
# Note: Replace this with your actual dataframe


# Ensure every user and every item that is in the validation set is also in the training set
def create_validation_set(df, val_size=0.2):
    # Start by shuffling the dataframe to ensure randomness
    users = df['UserId'].unique()
    items = df['ItemId'].unique()

    # Create a mask to filter validation set
    mask = df.duplicated(subset=['UserId'], keep=False) & df.duplicated(subset=['ItemId'], keep=False)
    temp_df = df[mask]

    # Now let's take the top `val_size` proportion of this temporary dataframe for validation
    val_count = int(len(temp_df) * val_size)
    validation_set = temp_df.head(val_count)
    training_set = pd.concat([df, validation_set]).drop_duplicates(keep=False)

    # Ensure that all users and items in the validation set are also in the training set
    validation_set = validation_set[validation_set['UserId'].isin(training_set['UserId'])]
    validation_set = validation_set[validation_set['ItemId'].isin(training_set['ItemId'])]

    # Adjust the training set in case any validation rows are left
    training_set = pd.concat([training_set, validation_set]).drop_duplicates(keep=False)

    return training_set, validation_set



In [10]:
# Create the validation set
training_set, validation_set = create_validation_set(df1)

# Display the sizes of the datasets to check the ratio
(training_set.shape[0], validation_set.shape[0])
# Fill missing observations with 0
interaction_matrix = training_set.pivot_table(index='ItemId', columns='UserId', values='Count', fill_value=0)


values_users = [i for i in range(len(interaction_matrix))]
user_dict = {k:v for (k,v) in zip(interaction_matrix.columns, values_users)}
values_items = [i for i in range(len(interaction_matrix))]
item_dict = {k:v for (k,v) in zip(interaction_matrix.index, values_items)}

y_iu = interaction_matrix.values
y_ui = y_iu.transpose()


In [31]:
def initiliaze_parameters(num_users, num_items, k, a=0.3, a_prime=0.3, c=0.3, c_prime=0.3, b_prime=1.0, d_prime=1.0):
    
    k_shp = (a_prime + k * a) * np.ones(num_users)
    t_shp = (c_prime + k * c) * np.ones(num_items)
    k_rte = 1 * np.ones(num_users)
    t_rte = 1 * np.ones(num_items)
    
    rng = np.random.default_rng()
    gamma_rte = a_prime + 0.01 * rng.random((num_users, k))
    gamma_shp = a_prime + 0.01 * rng.random((num_users, k))
    lambda_rte = c_prime + 0.01 * rng.random((num_items, k))
    lambda_shp = c_prime + 0.01 * rng.random((num_items, k))
    
    phi = np.empty((num_users, num_items, k))
    
    return gamma_shp, gamma_rte, lambda_shp, lambda_rte, k_shp, k_rte, t_shp, t_rte, phi, a, a_prime, c, c_prime, b_prime, d_prime

In [32]:
def CAVI(num_users, num_items, k, y_ui, gamma_shp, gamma_rte, lambda_shp, lambda_rte, k_shp, k_rte, t_shp, t_rte, phi, a=0.3, a_prime=0.3, c=0.3, c_prime=0.3, b_prime=1.0, d_prime=1.0):
    
    # Update phi
    phi = digamma(gamma_shp)[:, None, :] - np.log(gamma_rte)[:, None, :] \
          + digamma(lambda_shp)[None, :, :] - np.log(lambda_rte)[None, :, :]
    phi -= np.max(phi, axis=2, keepdims=True)  # for numerical stability
    np.exp(phi, out=phi)  # in-place exponentiation
    phi /= np.sum(phi, axis=2, keepdims=True)
    
    print('Phi updated')
    
    # Users (Theta) - Gamma Shape Update
    gamma_shp = a + np.einsum('ui, uik -> uk', y_ui, phi)
    
    print('Users (Theta) - Gamma Shape Updated')
    
    # Users (Theta) - Gamma Rate Update
    gamma_rte = k_shp[:, None] / k_rte[:, None] + np.einsum('ik, uik -> uk', lambda_shp / lambda_rte, phi)
    
    print('Users (Theta) - Gamma Rate Updated')
    
    # Users Activity - Gamma Rate Update
    k_rte = a_prime / b_prime + np.sum(gamma_shp / gamma_rte, axis=1)
    
    print('Users Activity - Gamma Rate Updated')
    
    # Items (Beta) - Gamma Shape Update
    lambda_shp = c + np.einsum('ui, uik -> ik', y_ui, phi)
    
    print('Items (Beta) - Gamma Shape Updated')
    
    # Items (Beta) - Gamma Rate Update
    lambda_rte = t_shp[:, None] / t_rte[:, None] + np.einsum('uk, uik -> ik', gamma_shp / gamma_rte, phi)
    
    print('Items (Beta) - Gamma Rate Updated')
    
    # Item Popularity - Gamma Rate Update
    t_rte = c_prime / d_prime + np.sum(lambda_shp / lambda_rte, axis=1)
    
    print('Item Popularity - Gamma Rate Updated')
    
    return gamma_shp, gamma_rte, lambda_shp, lambda_rte, k_shp, k_rte, t_shp, t_rte, phi


In [27]:
def expectation(gamma_shp, gamma_rte, lambda_shp, lambda_rte, k, y_ui):
    
    num_users = y_ui.shape[0]
    num_items = y_ui.shape[1]
    
    ev_theta = np.empty((num_users,k))
    ev_beta = np.empty((num_items,k))
    for u in range(num_users):
        for d in range(k):
            ev_theta[u,d] = gamma_shp[u,d] / gamma_rte[u,d]

    for i in range(num_items):
        for d in range(k):
            ev_beta[i,d] = lambda_shp[i,d] / lambda_rte[i,d]
            
    return ev_theta, ev_beta
    
        

In [28]:
def log_likelihood(ev_theta, ev_beta, user_dict, item_dict, validation_set):
    likelihood_parameters = np.matmul(ev_theta, ev_beta.T)
    llk = 0
    for v in range(len(validation_set)):
        uu = validation_set['UserId'].iloc[v]
        ii = validation_set['ItemId'].iloc[v]
        u = user_dict[uu]
        i = item_dict[ii]
        value = validation_set['Count'].iloc[v]
        llk += - likelihood_parameters[u,i] - np.log(np.math.factorial(value)) + np.log(likelihood_parameters[u,i])*value
    return llk

In [36]:
i = 0
results = []
gamma_shp, gamma_rte, lambda_shp, lambda_rte, k_shp, k_rte, t_shp, t_rte, phi, a, a_prime, c, c_prime, b_prime, d_prime = initiliaze_parameters(y_ui.shape[0], y_ui.shape[1], k=5, a=0.3, a_prime=0.3, c=0.3, c_prime=0.3, b_prime=1.0, d_prime=1.0)
results.append(-np.inf)
while i < 500:
    print(f"Iteration number {i}")
    gamma_shp, gamma_rte, lambda_shp, lambda_rte, k_shp, k_rte, t_shp, t_rte, phi = CAVI(y_ui.shape[0], y_ui.shape[1], 5, y_ui, gamma_shp, gamma_rte, lambda_shp, lambda_rte, k_shp, k_rte, t_shp, t_rte, phi)
    ev_theta, ev_beta = expectation(gamma_shp, gamma_rte, lambda_shp, lambda_rte, 5, y_ui)
    llk = log_likelihood(ev_theta, ev_beta, user_dict, item_dict, validation_set)
    print(llk)
    results.append(llk)
    if ((results[-1] - results[-2])/ np.abs(results[-2])) < 0.000001:
        break
    i += 1
    

Iteration number 0
Phi updated
Users (Theta) - Gamma Shape Updated
Users (Theta) - Gamma Rate Updated
Users Activity - Gamma Rate Updated
Items (Beta) - Gamma Shape Updated
Items (Beta) - Gamma Rate Updated
Item Popularity - Gamma Rate Updated
-232882.0472536995
Iteration number 1


  if ((results[-1] - results[-2])/ np.abs(results[-2])) < 0.000001:


Phi updated
Users (Theta) - Gamma Shape Updated
Users (Theta) - Gamma Rate Updated
Users Activity - Gamma Rate Updated
Items (Beta) - Gamma Shape Updated
Items (Beta) - Gamma Rate Updated
Item Popularity - Gamma Rate Updated
-236080.44355552742


In [35]:
results

[-68517.8399245928, -232880.97265696622]