In [None]:
import numpy as np
import pandas as pd
import scipy
import scipy.special
import torch

In [None]:
import pandas as pd

# ---------------------------------------------------------------------------
# Item side: read the item table and build the item feature matrix.
# ---------------------------------------------------------------------------
item_data_path = 'data/sushi3-2016/sushi3.idata'
item_columns = [
    'item_id', 'name', 'style', 'major_group', 'minor_group',
    'heaviness', 'consumption_frequency', 'normalized_price', 'sell_frequency',
]
item_df = pd.read_csv(item_data_path, sep='\t', header=None, names=item_columns)

# Keep only the 10 items used in the paper.
# (Alternative kept from the original notebook: use every id in range(100).)
item_set_A_ids = [0, 1, 2, 3, 4, 6, 7, 8, 26, 29]
item_set_A_df = item_df.loc[item_df['item_id'].isin(item_set_A_ids)]

categorical_features = ['style', 'major_group', 'minor_group']
numerical_features = ['heaviness', 'consumption_frequency', 'normalized_price', 'sell_frequency']

# Categorical codes are cast to str first so get_dummies one-hot encodes them
# instead of passing the integer codes through; numeric columns are appended as-is.
item_features = pd.concat(
    [
        pd.get_dummies(item_set_A_df[categorical_features].astype(str)),
        item_set_A_df[numerical_features],
    ],
    axis=1,
)

print("Item features shape:", item_features.shape)

# ---------------------------------------------------------------------------
# User side: read the user table and one-hot encode its categorical columns.
# ---------------------------------------------------------------------------
user_data_path = 'data/sushi3-2016/sushi3.udata'
user_columns = [
    'user_id', 'gender', 'age', 'total_time',
    'prefecture_longest', 'region_longest', 'east_west_longest',
    'prefecture_current', 'region_current', 'east_west_current',
    'prefecture_diff',
]
user_df = pd.read_csv(user_data_path, sep='\t', header=None, names=user_columns)

categorical_features_user = [
    'gender', 'age',
    'prefecture_longest', 'region_longest', 'east_west_longest',
    'prefecture_current', 'region_current', 'east_west_current',
]

# The categorical columns of user_df itself become strings (the remaining
# columns are left untouched), then only those columns are one-hot encoded.
user_df = user_df.astype({column: str for column in categorical_features_user})
user_features = pd.get_dummies(user_df[categorical_features_user])

print("User features shape:", user_features.shape)


# Load the preference file and drop the first two metadata columns

In [None]:
import pandas as pd

# Read the preference-order file by hand: one whitespace-separated record per line.
preference_data_path = 'data/sushi3-2016/sushi3a.5000.10.order'
with open(preference_data_path, 'r') as file:
    # The first row of the file is metadata, not a record, so it is skipped.
    lines = file.readlines()[1:]

# Tokenise every remaining line (split() already ignores surrounding whitespace).
preference_data = [line.split() for line in lines]

# Build an all-integer table; at this point the columns are labelled 0, 1, 2, ...
preference_df = pd.DataFrame(preference_data).astype(int)

# The two leading columns of each record are treated as metadata and removed,
# leaving only the ordered item ids.
preference_df = preference_df.drop(columns=[0, 1])

# Label the remaining columns pref_0, pref_1, ... by position.
preference_df.columns = [f'pref_{i}' for i in range(preference_df.shape[1])]

# Show the size of the cleaned preference table.
print(preference_df.shape)

In [None]:
def convert_to_pairwise(preference_df):
    """Expand each row of ordered items into all ordered item pairs.

    For every row of ``preference_df`` and every pair of column positions
    ``i < j``, one output row ``(user_id, item_i, item_j)`` is emitted, where
    ``item_i`` is the value found at the earlier position and ``item_j`` the
    value at the later one.
    NOTE(review): downstream code appears to read this as "item_i is preferred
    over item_j", i.e. rows are assumed to be sorted most-preferred first --
    confirm against the data file's documentation.

    Parameters
    ----------
    preference_df : pandas.DataFrame
        One row per user; the row's index label is used as ``user_id`` and the
        columns hold the items in rank order.

    Returns
    -------
    pandas.DataFrame
        Columns ``['user_id', 'item_i', 'item_j']`` with ``m * (m - 1) / 2``
        rows per input row, ``m`` being the number of columns. Empty input
        yields an empty frame with the same three columns.
    """
    pairwise_data = []
    # Rows are read positionally from the underlying array. Integer lookup on a
    # row Series whose labels are strings ('pref_0', ...) relies on a positional
    # fallback that pandas has deprecated, so labels are avoided altogether.
    for user_id, ranking in zip(preference_df.index, preference_df.to_numpy()):
        n_ranked = len(ranking)
        for i in range(n_ranked):
            for j in range(i + 1, n_ranked):
                pairwise_data.append([user_id, ranking[i], ranking[j]])
    return pd.DataFrame(pairwise_data, columns=['user_id', 'item_i', 'item_j'])

# Expand the per-user rankings into one row per ordered item pair.
pairwise_df = convert_to_pairwise(preference_df)

# Preview the first rows of the pairwise table, followed by its overall shape.
print(pairwise_df.head(), pairwise_df.shape, sep='\n')

In [None]:
# Subsample the pairwise data: keep the first 50 users and draw exactly 5 pairs
# for each of them. Sampling is done per user (groupby) rather than from the
# pooled rows, because a pooled draw of 5 * 50 rows gives uneven per-user counts.
N_SUBSAMPLE_USERS = 50
PAIRS_PER_USER = 5

pairwise_df = pairwise_df[pairwise_df['user_id'] < N_SUBSAMPLE_USERS]
# random_state fixes the draw so the cell is reproducible.
pairwise_df = pairwise_df.groupby('user_id').sample(n=PAIRS_PER_USER, random_state=0)

pairwise_df

In [None]:
class GPPrefenceElicitation:
    """Gaussian-process preference model over (user, item) pairs.

    A latent utility ``f`` is defined for every combination of a training user
    and a training item. Its prior covariance is the Kronecker product of a
    squared-exponential kernel on user features and one on item features, so
    the latent vector is laid out user-major::

        flat_index = user_index * n_items + item_index

    A comparison ``(t, k_1, k_2)`` contributes the probit likelihood
    ``Phi((f[t, k_1] - f[t, k_2]) / noise_sd)``, i.e. user ``t`` is taken to
    prefer item ``k_1`` over item ``k_2``. ``t``, ``k_1`` and ``k_2`` are
    positional row indices into the user and item feature matrices.

    The posterior is handled with a Laplace approximation: a Newton search for
    the posterior mode plus a Gaussian around it.

    Attributes set by ``fit``: ``user_features``, ``item_features``,
    ``preference_data``, ``n_users``, ``n_items``, ``sv_t``, ``ls_t``,
    ``sv_k``, ``ls_k``, ``noise_sd``, ``covariance_matrix``,
    ``precision_matrix`` and ``f_map``.
    """

    # Added to the diagonal of the prior covariance. Users or items with
    # identical feature rows make the kernel matrix exactly singular; the
    # jitter keeps it invertible.
    JITTER = 1e-6

    def __init__(self):
        pass

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _as_feature_matrix(features):
        """Return ``features`` as a 2-D float array with one row per entity."""
        matrix = np.asarray(features, dtype=float)
        if matrix.ndim == 1:
            # A flat vector is read as n entities with a single feature each.
            matrix = matrix.reshape(-1, 1)
        return matrix

    @staticmethod
    def _squared_distances(x, y):
        """Pairwise squared Euclidean distances between rows of ``x`` and ``y``."""
        squared = (
            (x ** 2).sum(axis=1)[:, None]
            + (y ** 2).sum(axis=1)[None, :]
            - 2.0 * (x @ y.T)
        )
        # Round-off can push tiny distances slightly below zero.
        return np.maximum(squared, 0.0)

    def _preference_indices(self):
        """Flat latent indices ``(index_1, index_2)`` of every comparison."""
        prefs = np.asarray(self.preference_data)
        if prefs.size == 0:
            empty = np.zeros(0, dtype=np.int64)
            return empty, empty
        prefs = prefs.reshape(-1, 3).astype(np.int64)
        users, first, second = prefs.T
        # User-major layout, matching np.kron(user_covariance, item_covariance).
        return users * self.n_items + first, users * self.n_items + second

    def _likelihood_terms(self, f):
        """Return ``(index_1, index_2, z, log_cdf, ratio)`` for latent vector ``f``.

        ``z`` is the scaled utility difference, ``log_cdf`` is ``log Phi(z)``
        and ``ratio`` is ``phi(z) / Phi(z)``, formed in log space so it stays
        finite for strongly violated comparisons.
        """
        index_1, index_2 = self._preference_indices()
        f = np.asarray(f, dtype=float)
        z = (f[index_1] - f[index_2]) / self.noise_sd
        log_cdf = scipy.special.log_ndtr(z)
        ratio = np.exp(-0.5 * z ** 2 - 0.5 * np.log(2.0 * np.pi) - log_cdf)
        return index_1, index_2, z, log_cdf, ratio

    def _prior_covariance(self):
        """Jittered Kronecker prior covariance for the stored hyperparameters."""
        users = self._as_feature_matrix(self.user_features)
        items = self._as_feature_matrix(self.item_features)
        user_covariance = self.kernel_covariance_matrix(users, users, self.sv_t, self.ls_t)
        item_covariance = self.kernel_covariance_matrix(items, items, self.sv_k, self.ls_k)
        covariance = np.kron(user_covariance, item_covariance)
        covariance[np.diag_indices_from(covariance)] += self.JITTER
        return covariance

    # --------------------------------------------------------------------- API
    def fit(self, user_features, item_features, preference_data, max_iter=100):
        """Learn hyperparameters and the posterior mode of the latent utilities.

        Parameters
        ----------
        user_features : array-like or DataFrame, shape (n_users, d_user)
        item_features : array-like or DataFrame, shape (n_items, d_item)
        preference_data : array-like or DataFrame of ``(t, k_1, k_2)`` rows
            Read positionally (first three columns), integer row indices.
        max_iter : int
            Number of optimiser steps used for the hyperparameters.
        """
        # Inputs are stored as plain arrays: indexing a DataFrame with [i]
        # selects a column and iterating one yields column names.
        self.user_features = self._as_feature_matrix(user_features)
        self.item_features = self._as_feature_matrix(item_features)
        self.preference_data = np.asarray(preference_data).reshape(-1, 3).astype(np.int64)
        self.n_users = self.user_features.shape[0]
        self.n_items = self.item_features.shape[0]
        self.maximum_hyperparameters(max_iter=max_iter)
        self.covariance_matrix = self._prior_covariance()
        self.precision_matrix = np.linalg.inv(self.covariance_matrix)
        self.f_map = self.maximum_a_posteriori(np.zeros(self.n_users * self.n_items))

    def log_p_D_given_f(self, f):
        """Log-likelihood ``sum log Phi(z)`` of the stored comparisons at ``f``."""
        return float(np.sum(self._likelihood_terms(f)[3]))

    def grad_log_p_D_given_f(self, f):
        """Gradient of the log-likelihood with respect to the latent vector ``f``."""
        index_1, index_2, _, _, ratio = self._likelihood_terms(f)
        grad = np.zeros(np.shape(f), dtype=float)
        step = ratio / self.noise_sd
        # add.at accumulates correctly when an index occurs in several comparisons.
        np.add.at(grad, index_1, step)
        np.add.at(grad, index_2, -step)
        return grad

    def hessian_log_p_D_given_f(self, f):
        """Hessian of the log-likelihood with respect to ``f`` (negative semi-definite)."""
        index_1, index_2, z, _, ratio = self._likelihood_terms(f)
        # -d^2/df^2 log Phi(z) with z = (f_1 - f_2) / sigma; ratio = phi / Phi.
        second_derivative = (z * ratio + ratio ** 2) / self.noise_sd ** 2
        size = np.shape(f)[0]
        hessian = np.zeros((size, size))
        np.add.at(hessian, (index_1, index_1), -second_derivative)
        np.add.at(hessian, (index_2, index_2), -second_derivative)
        np.add.at(hessian, (index_1, index_2), second_derivative)
        np.add.at(hessian, (index_2, index_1), second_derivative)
        return hessian

    def maximum_a_posteriori(self, initial_f, max_iter=100, tol=1e-8):
        """Newton search for the posterior mode of the latent utilities.

        Uses ``self.covariance_matrix`` (prior covariance K), ``self.noise_sd``
        and ``self.preference_data``. With ``W = -hessian`` and ``g`` the
        likelihood gradient, each step is
        ``f <- K (I + W K)^{-1} (W f + g)``, which equals
        ``(K^{-1} + W)^{-1} (W f + g)`` without inverting K.

        Parameters
        ----------
        initial_f : array-like, shape (n_users * n_items,)
        max_iter : int
            Maximum number of Newton steps.
        tol : float
            Stop once the largest absolute change of ``f`` falls below this.

        Returns
        -------
        numpy.ndarray
            The latent vector at the last iterate.
        """
        f = np.array(initial_f, dtype=float)  # copy: the caller's array is never modified
        covariance = np.asarray(self.covariance_matrix, dtype=float)
        identity = np.eye(f.shape[0])
        for _ in range(max_iter):
            weights = -self.hessian_log_p_D_given_f(f)
            grad = self.grad_log_p_D_given_f(f)
            alpha = np.linalg.solve(identity + weights @ covariance, weights @ f + grad)
            f_next = covariance @ alpha
            change = np.max(np.abs(f_next - f)) if f.size else 0.0
            f = f_next
            if change < tol:
                break
        return f

    def se_kernel(self, x, y, sv, ls):
        """Squared-exponential kernel ``sv * exp(-||x - y||^2 / (2 ls^2))``."""
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        return sv * np.exp(-0.5 * np.linalg.norm(x - y) ** 2 / ls ** 2)

    def kernel_covariance_matrix(self, x, y, sv, ls):
        """Squared-exponential kernel matrix between the rows of ``x`` and ``y``.

        Returns an array of shape ``(n_x, n_y)`` whose ``[i, j]`` entry equals
        ``se_kernel(x[i], y[j], sv, ls)``. Accepts arrays or DataFrames.
        """
        x = self._as_feature_matrix(x)
        y = self._as_feature_matrix(y)
        return sv * np.exp(-0.5 * self._squared_distances(x, y) / ls ** 2)

    def maximum_hyperparameters(self, init_sv_t=1, init_ls_t=1, init_sv_k=1, init_ls_k=1, init_noise_sd=0.1, max_iter=100, lr=0.01):
        """Tune kernel and noise hyperparameters with Adam.

        The objective is the negative Laplace approximation of the log
        marginal likelihood,
        ``0.5 * logdet(I + K W) + 0.5 * f^T K^{-1} f - log p(D | f)``,
        evaluated at the posterior mode ``f`` for the current parameters. The
        mode is recomputed every step and treated as a constant when
        differentiating, so its dependence on the hyperparameters is ignored.

        Parameters are optimised in log space, which keeps them positive.
        Results are stored as floats in ``sv_t``, ``ls_t``, ``sv_k``, ``ls_k``
        and ``noise_sd``. ``covariance_matrix`` is overwritten with the
        covariance of the last evaluated parameters.

        Raises
        ------
        ValueError
            If any initial value is not strictly positive.
        """
        initial = np.array(
            [init_sv_t, init_ls_t, init_sv_k, init_ls_k, init_noise_sd], dtype=float
        )
        if np.any(initial <= 0):
            raise ValueError("initial hyperparameters must be strictly positive")

        # float64 throughout: integer tensors cannot carry gradients and the
        # log-determinant is sensitive to precision.
        log_params = torch.tensor(np.log(initial), dtype=torch.float64, requires_grad=True)
        optimizer = torch.optim.Adam([log_params], lr=lr)

        users = self._as_feature_matrix(self.user_features)
        items = self._as_feature_matrix(self.item_features)
        sq_dist_users = torch.from_numpy(self._squared_distances(users, users))
        sq_dist_items = torch.from_numpy(self._squared_distances(items, items))

        size = self.n_users * self.n_items
        identity = torch.eye(size, dtype=torch.float64)

        index_1, index_2 = self._preference_indices()
        first = torch.from_numpy(index_1.astype(np.int64))
        second = torch.from_numpy(index_2.astype(np.int64))
        # Signed incidence matrix A (one row per comparison): W = A^T diag(w) A.
        rows = np.arange(index_1.size)
        incidence_np = np.zeros((index_1.size, size))
        np.add.at(incidence_np, (rows, index_1), 1.0)
        np.add.at(incidence_np, (rows, index_2), -1.0)
        incidence = torch.from_numpy(incidence_np)

        for _ in range(max_iter):
            optimizer.zero_grad()
            sv_t, ls_t, sv_k, ls_k, noise_sd = torch.exp(log_params)
            user_covariance = sv_t * torch.exp(-0.5 * sq_dist_users / ls_t ** 2)
            item_covariance = sv_k * torch.exp(-0.5 * sq_dist_items / ls_k ** 2)
            covariance = torch.kron(user_covariance, item_covariance) + self.JITTER * identity

            # Posterior mode for the current parameters, computed in NumPy
            # outside the autograd graph.
            self.noise_sd = float(noise_sd.detach())
            self.covariance_matrix = covariance.detach().numpy()
            f = torch.from_numpy(self.maximum_a_posteriori(np.zeros(size)))

            z = (f[first] - f[second]) / noise_sd
            log_cdf = torch.log(torch.clamp(0.5 * torch.erfc(-z / 2 ** 0.5), min=1e-300))
            ratio = torch.exp(-0.5 * z ** 2 - 0.5 * np.log(2.0 * np.pi) - log_cdf)
            weights = (z * ratio + ratio ** 2) / noise_sd ** 2
            w_matrix = incidence.T @ (weights[:, None] * incidence)

            alpha = torch.linalg.solve(covariance, f.unsqueeze(-1)).squeeze(-1)
            loss = (
                0.5 * torch.logdet(identity + covariance @ w_matrix)
                + 0.5 * (f @ alpha)
                - log_cdf.sum()
            )
            loss.backward()
            optimizer.step()

        sv_t, ls_t, sv_k, ls_k, noise_sd = np.exp(log_params.detach().numpy())
        self.sv_t = float(sv_t)
        self.ls_t = float(ls_t)
        self.sv_k = float(sv_k)
        self.ls_k = float(ls_k)
        self.noise_sd = float(noise_sd)

    def predictive_mean_and_variance(self, user_features, item_features):
        """Laplace-approximate posterior of the latent utility at new points.

        Every combination of a row of ``user_features`` with a row of
        ``item_features`` is a test point, ordered user-major
        (``user_index * n_test_items + item_index``).

        Returns
        -------
        predictive_mean : numpy.ndarray, shape (m,)
        predictive_variance : numpy.ndarray, shape (m, m)
            Full predictive covariance; its diagonal holds the variances.
        """
        new_users = self._as_feature_matrix(user_features)
        new_items = self._as_feature_matrix(item_features)
        train_users = self._as_feature_matrix(self.user_features)
        train_items = self._as_feature_matrix(self.item_features)

        cross_covariance = np.kron(
            self.kernel_covariance_matrix(new_users, train_users, self.sv_t, self.ls_t),
            self.kernel_covariance_matrix(new_items, train_items, self.sv_k, self.ls_k),
        )
        prior_covariance = np.kron(
            self.kernel_covariance_matrix(new_users, new_users, self.sv_t, self.ls_t),
            self.kernel_covariance_matrix(new_items, new_items, self.sv_k, self.ls_k),
        )

        f_map = getattr(self, 'f_map', None)
        if f_map is None:
            f_map = self.maximum_a_posteriori(np.zeros(self.n_users * self.n_items))
            self.f_map = f_map

        grad = self.grad_log_p_D_given_f(f_map)
        weights = -self.hessian_log_p_D_given_f(f_map)

        # At the mode K^{-1} f equals the likelihood gradient, so the mean
        # needs no explicit inverse.
        predictive_mean = cross_covariance @ grad
        # (K + W^{-1})^{-1} = (I + W K)^{-1} W, valid even though W is singular.
        system = np.eye(weights.shape[0]) + weights @ self.covariance_matrix
        predictive_variance = prior_covariance - cross_covariance @ np.linalg.solve(
            system, weights @ cross_covariance.T
        )

        return predictive_mean, predictive_variance