In [1]:
import numpy as np
from scipy import sparse
from scipy.stats import ttest_rel
from typing import Union, Tuple


def load_data(data_path, args):
    raw_data = np.loadtxt(data_path, dtype=np.float, delimiter=args.delim, usecols=[0, 1, 2])
    if args.implicit:
        raw_data = raw_data[raw_data[:, 2] > 3]
        raw_data[:, 2] = 1
    users = list(set(raw_data[:, 0].astype(np.int)))
    users.sort()
    user_dict = {k: i for i, k in enumerate(users)}
    items = list(set(raw_data[:, 1].astype(np.int)))
    items.sort()
    item_dict = {k: i for i, k in enumerate(items)}
    for i in range(len(raw_data)):
        raw_data[i, 0] = user_dict[raw_data[i, 0]]
        raw_data[i, 1] = item_dict[raw_data[i, 1]]
    return raw_data


def build_user_item_matrix(ratings, n_user, n_item):
    data = ratings[:, 2]
    row_index = ratings[:, 0]
    col_index = ratings[:, 1]
    shape = (n_user, n_item)
    return sparse.csr_matrix((data, (row_index, col_index)), shape=shape)


def RMSE(estimation, truth):
    truth_coo = truth.tocoo()
    row_idx = truth_coo.row
    col_idx = truth_coo.col
    data = truth.data
    pred = np.zeros(shape=data.shape)
    for i in range(len(data)):
        pred[i] = estimation[row_idx[i], col_idx[i]]
    sse = np.sum(np.square(data - pred))
    return np.sqrt(np.divide(sse, len(data)))


def RMSE_with_ttest(estimation, old_estimation, truth):
    truth_coo = truth.tocoo()
    row_idx = truth_coo.row
    col_idx = truth_coo.col
    data = truth_coo.data
    pred_dis = np.zeros(shape=data.shape)
    old_pred_dis = np.zeros(shape=data.shape)
    for i in range(len(data)):
        pred_dis[i] = abs(estimation[row_idx[i], col_idx[i]] - data[i])
        old_pred_dis[i] = abs(old_estimation[row_idx[i], col_idx[i]] - data[i])
    _, p_value = ttest_rel(pred_dis, old_pred_dis)
    sse = np.sum(np.square(pred_dis))
    sse_old = np.sum(np.square(old_pred_dis))
    return np.sqrt(np.divide(sse, len(data))), np.sqrt(np.divide(sse_old, len(data))), p_value


def RMSE_weighted_with_t_test(estimation, old_estimation, val_confidence):
    val_confidence_dense = val_confidence.toarray()
    val_preference_dense = val_confidence_dense.copy()
    val_preference_dense[val_preference_dense > 0] = 1
    val_confidence_dense[val_confidence_dense == 0] = 1
    old_error = val_confidence_dense * np.power(old_estimation - val_preference_dense, 2)
    new_error = val_confidence_dense * np.power(estimation - val_preference_dense, 2)
    _, p_val = ttest_rel(new_error.flatten(), old_error.flatten())
    return np.sqrt(np.mean(new_error)), np.sqrt(np.mean(old_error)), p_val


def roc_auc_grouped(labels: np.ndarray,
                    predictions: np.ndarray,
                    group_ids: np.ndarray,
                    return_aucs_list: bool = False) -> Union[Tuple[float, float, int], np.ndarray]:
    # efficient implementation of grouped auc, see test_metrics.py for the correctness check

    # l_max = labels.max()
    # l_min = labels.min()
    # logging.info(str(l_max) + ' ' + str(l_min))
    # labels = (labels > l_max * 0.8).astype(int)
    # sort group_ids, predictions and labels jointly by (group_id, prediction) key
    indices = np.lexsort((predictions, group_ids))
    group_ids = group_ids[indices]
    labels = labels[indices]

    # unique monotonic group_id
    _, group_ids2 = np.unique(group_ids, return_inverse=True)
    _, unique_counts = np.unique(group_ids, return_counts=True)

    offsets = np.cumsum(unique_counts)
    offsets = np.insert(offsets, 0, 0)

    # number of negatives up to current element
    nneg_thru = np.cumsum(1 - labels)

    # number of negatives at the beginning of each group
    group_starts = nneg_thru[offsets - 1]
    group_starts[0] = 0

    # number of negatives up to current element, restarting at each group
    nneg = nneg_thru - group_starts[group_ids2]

    # number of ordered pairs with the current element
    inversions = (nneg * labels)

    # number of negatives in each group
    nneg_counts = nneg[offsets[1:] - 1]
    npos_counts = unique_counts - nneg_counts

    total_pairs = nneg_counts * npos_counts

    # Number of ordered pairs in each group
    ordered_pairs = np.bincount(group_ids2, weights=inversions)

    aucs = ordered_pairs[total_pairs > 0] / total_pairs[total_pairs > 0]

    if return_aucs_list:
        return aucs
    else:
        return float(np.mean(aucs)), float(np.std(aucs)), int(np.sum(total_pairs > 0))


def roc_auc_with_t_test(estimation, old_estimation, truth):
    user_ids = np.repeat(np.array(range(estimation.shape[1])), estimation.shape[0])
    aucs_old = roc_auc_grouped(truth.toarray().flatten(), old_estimation.flatten(), user_ids, True)
    aucs_new = roc_auc_grouped(truth.toarray().flatten(), estimation.flatten(), user_ids, True)
    _, p_value = ttest_rel(aucs_new, aucs_old)
    return np.mean(aucs_new), np.mean(aucs_old), p_value


def precision_at_k_grouped(labels: np.ndarray,
                           predictions: np.ndarray,
                           group_ids: np.ndarray,
                           k: int = 10,
                           return_precision_list: bool = False) -> Union[Tuple[float, float, int], np.ndarray]:
    # efficient implementation of grouped precision@k, see test_metrics.py for the correctness check

    # l_max = labels.max()
    # l_min = labels.min()
    # logging.info(str(l_max) + ' ' + str(l_min))
    # labels = (labels > l_max * 0.8).astype(int)
    # sort group_ids, predictions and labels jointly by (group_id, prediction) key
    indices = np.lexsort((-predictions, group_ids))
    group_ids = group_ids[indices]
    labels = labels[indices]

    # 0000, 1111, 222, 3, 555555

    # unique monotonic group_id
    _, group_ids2 = np.unique(group_ids, return_inverse=True)
    _, unique_counts = np.unique(group_ids, return_counts=True)

    offsets = np.cumsum(unique_counts)
    offsets = np.insert(offsets, 0, 0)

    # independent indexing in each group. e.g., [0, 1, 2, 0, 0, 0, 1, 2, 3, 4, 0, 1, 2]
    group_indices = np.arange(group_ids.shape[0]) - offsets[group_ids2]

    # number of points in each group or k
    denominator = np.minimum(unique_counts[group_ids2], np.repeat(k, group_ids.shape[0]))
    pr_at_k_vals = labels / denominator

    pr_at_k_vals[group_indices >= k] = 0

    group_pr_at_k = np.zeros(unique_counts.shape[0])
    np.add.at(group_pr_at_k, group_ids2, pr_at_k_vals)

    if return_precision_list:
        return group_pr_at_k
    else:
        return float(np.mean(group_pr_at_k)), float(np.std(group_pr_at_k)), group_pr_at_k.shape[0]


def precision_at_10_with_t_test(estimation, old_estimation, truth):
    user_ids = np.repeat(np.array(range(estimation.shape[1])), estimation.shape[0])
    precisions_old = precision_at_k_grouped(truth.toarray().flatten(), old_estimation.flatten(), user_ids, 10, True)
    precisions_new = precision_at_k_grouped(truth.toarray().flatten(), estimation.flatten(), user_ids, 10, True)
    _, p_value = ttest_rel(precisions_new, precisions_old)
    return np.mean(precisions_new), np.mean(precisions_old), p_value


def u_emb_d_c(lamb, C, R, v, user_ind, vvt):
    # calculates derivatives of each component of the embedding of user 'user_ind'
    # wrt each confidence value of the user
    # return shape (embedding_dim, num non-zero elements in C[user_ind]

    # see test_gradients.py for the correctness check
    idxs = np.argwhere(C[user_ind]).flatten()
    m_inv = np.linalg.inv(lamb * np.eye(v.shape[1], v.shape[1]) + vvt + \
                          np.einsum('i,ik->ik', C[user_ind, idxs] - R[user_ind, idxs], v[idxs]).T.dot(v[idxs]))
    outer_products = np.einsum('ij,il->ijl', v[idxs], v[idxs])
    m_inv_v_outer = np.einsum('ij,kj->ki', m_inv, v[idxs])
    m_inv_dot_outer_products = np.einsum('ij,cjk->cik', m_inv, outer_products)
    first_part = np.einsum('cji,i->cj', m_inv_dot_outer_products,
                           m_inv.dot(np.einsum('i,ik->k', C[user_ind, idxs], v[idxs])))
    return -first_part + m_inv_v_outer


def i_emb_d_c(lamb, C, R, u, item_ind, uut):
    # calculates derivatives of each component of the embedding of item 'item_ind'
    # wrt each confidence value of the item
    # return shape (embedding_dim, num non-zero elements in C[:, item_ind]

    # see test_gradients.py for the correctness check
    idxs = np.argwhere(C[:, item_ind]).flatten()
    m_inv = np.linalg.inv(lamb * np.eye(u.shape[1], u.shape[1]) + uut + \
                          np.einsum('i,ik->ik', C[idxs, item_ind] - R[idxs, item_ind], u[idxs]).T.dot(u[idxs]))
    outer_products = np.einsum('ij,il->ijl', u[idxs], u[idxs])
    m_inv_u_outer = np.einsum('ij,kj->ki', m_inv, u[idxs])
    m_inv_dot_outer_products = np.einsum('ij,cjk->cik', m_inv, outer_products)
    first_part = np.einsum('cji,i->cj', m_inv_dot_outer_products,
                           m_inv.dot(np.einsum('i,ik->k', C[idxs, item_ind], u[idxs])))
    return -first_part + m_inv_u_outer


def loss_d_emb(confidence_val: np.ndarray,
               preference_val: np.ndarray,
               pred_val: np.ndarray,
               user_embeddings: np.ndarray,
               item_embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    # calculates derivatives of loss on validation
    # wrt user embeddings and item embeddings
    # (note that following the original paper we ignore regularization here)

    # see test_gradients.py for the correctness check
    error_weights = confidence_val.copy()
    error_weights[error_weights == 0] = 1
    print(error_weights.shape, (pred_val-preference_val).shape)
    diffs = 2 * np.multiply(error_weights, (pred_val - preference_val))
    print(diffs.shape, item_embeddings.shape)
    grad_r_user = diffs.dot(item_embeddings)
    grad_r_item = diffs.T.dot(user_embeddings)
    return grad_r_user, grad_r_item


In [2]:
#overload for fit method in als class for implementing negative weighting

from implicit.utils import check_blas_config, check_random_state, nonzeros
from implicit.cpu import _als
from scipy.sparse import diags
from scipy.sparse import csr_matrix
import scipy
import time
import logging
from tqdm import tqdm
from implicit.cpu.als import AlternatingLeastSquares as als

log = logging.getLogger("implicit")

def check_csr(user_items):
    if not isinstance(user_items, scipy.sparse.csr_matrix):
        class_name = user_items.__class__.__name__
        start = time.time()
        user_items = user_items.tocsr()
        warnings.warn(
            f"Method expects CSR input, and was passed {class_name} instead. "
            f"Converting to CSR took {time.time() - start} seconds",
            ParameterWarning,
        )
    return user_items

class als_with_weights(als):
    def __init__(
        self,
        factors=100,
        regularization=0.01,
        alpha=1.0, 
        weights = False,
        dtype=np.float32,
        use_native=True,
        use_cg=True,
        iterations=15,
        calculate_training_loss=False,
        num_threads=0,
        random_state=None,
    ):
        super().__init__(
            factors=factors,
            regularization=regularization,
            dtype=dtype,
            use_native=use_native,
            use_cg=use_cg,
            iterations=iterations,
            calculate_training_loss=calculate_training_loss,
            num_threads=num_threads,
            random_state=random_state,
        )
        self.alpha = alpha
        self.weights = weights
        
        
    def fit(self, user_items, show_progress=True, callback=None):
        """Factorizes the user_items matrix.
        After calling this method, the members 'user_factors' and 'item_factors' will be
        initialized with a latent factor model of the input data.
        The user_items matrix does double duty here. It defines which items are liked by which
        users (P_ui in the original paper), as well as how much confidence we have that the user
        liked the item (C_ui).
        The negative items are implicitly defined: This code assumes that positive items in the
        user_items matrix means that the user liked the item. The negatives are left unset in this
        sparse matrix: the library will assume that means Piu = 0 and Ciu = 1 for all these items.
        Negative items can also be passed with a higher confidence value by passing a negative
        value, indicating that the user disliked the item.
        Parameters
        ----------
        user_items: csr_matrix
            Matrix of confidences for the liked items. This matrix should be a csr_matrix where
            the rows of the matrix are the users, the columns are the items liked that user,
            and the value is the confidence that the user liked the item.
        show_progress : bool, optional
            Whether to show a progress bar during fitting
        callback: Callable, optional
            Callable function on each epoch with such arguments as epoch, elapsed time and progress
        """
        # initialize the random state
        random_state = check_random_state(self.random_state)

        Cui = check_csr(user_items)
        if Cui.dtype != np.float32:
            Cui = Cui.astype(np.float32)

        # Give the positive examples more weight if asked for
#         if self.alpha != 1.0:
#             Cui = self.alpha * Cui
        if self.weights:   #negative weighting wiu = alpha*|Iu|
            neg_Cui = csr_matrix(np.abs(Cui.astype(bool).toarray()-1))
            sums = np.array(np.sum(Cui, axis = 1)).squeeze()      #weights can be really high, so just make them lower
            neg_weights = diags(sums/np.mean(sums)/6)@neg_Cui
            #tri = csr_matrix(np.random.random_integers(0, 1, (neg_weights.shape)))
            val = np.prod(neg_weights.shape)
            tri = np.zeros(val)
            idx = np.random.choice(np.arange(0, val, 1), size = val//3, replace = False)
            tri[idx]=1
            shape = neg_weights.shape
            neg_weights = csr_matrix(np.where(tri.reshape(shape)==1, neg_weights.toarray(), 0))           
            Cui = neg_weights+Cui
        s = time.time()
        Ciu = Cui.T.tocsr()
        log.debug("Calculated transpose in %.3fs", time.time() - s)

        items, users = Ciu.shape

        s = time.time()
        # Initialize the variables randomly if they haven't already been set
        if self.user_factors is None:
            self.user_factors = random_state.rand(users, self.factors).astype(self.dtype) * 0.01
        if self.item_factors is None:
            self.item_factors = random_state.rand(items, self.factors).astype(self.dtype) * 0.01

        log.debug("Initialized factors in %s", time.time() - s)

        # invalidate cached norms and squared factors
        self._item_norms = self._user_norms = None
        self._YtY = None
        self._XtX = None
        loss = None

        solver = self.solver

        log.debug("Running %i ALS iterations", self.iterations)
        with tqdm(total=self.iterations, disable=not show_progress) as progress:
            # alternate between learning the user_factors from the item_factors and vice-versa
            for iteration in range(self.iterations):
                s = time.time()
                solver(
                    Cui,
                    self.user_factors,
                    self.item_factors,
                    self.regularization,
                    num_threads=self.num_threads,
                )
                solver(
                    Ciu,
                    self.item_factors,
                    self.user_factors,
                    self.regularization,
                    num_threads=self.num_threads,
                )
                progress.update(1)

                if self.calculate_training_loss:
                    loss = _als.calculate_loss(
                        Cui,
                        self.user_factors,
                        self.item_factors,
                        self.regularization,
                        num_threads=self.num_threads,
                    )
                    progress.set_postfix({"loss": loss})

                    if not show_progress:
                        log.info("loss %.4f", loss)

                # Backward compatibility
                if not callback:
                    callback = self.fit_callback
                if callback:
                    callback(iteration, time.time() - s, loss)

        if self.calculate_training_loss:
            log.info("Final training loss %.4f", loss)

        self._check_fit_errors()

  f"CUDA extension is built, but disabling GPU support because of '{e}'",


In [3]:
import argparse
import os
from copy import deepcopy
from multiprocessing import Process
from numpy.linalg import inv
from scipy import sparse
import logging
import implicit

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)



def partition(ratings, seed, fold):
    np.random.RandomState(seed).shuffle(ratings)
    test_size = int(0.2 * ratings.shape[0])

    test_data = ratings[:test_size]
    val_data = dict()
    fold_size = (ratings.shape[0] - test_size) // fold
    for i in range(fold - 1):
        val_data[i + 1] = ratings[test_size + i * fold_size: test_size + (i + 1) * fold_size]
    val_data[fold] = ratings[test_size + (fold - 1) * fold_size:]
    return val_data, test_data


def data_split(data_path, args):
    fold = args.fold
    ratings = load_data(data_path, args)
    n_users = int(max(ratings[:, 0]) + 1)
    n_items = int(max(ratings[:, 1]) + 1)
    max_rating = max(ratings[:, 2])
    min_rating = min(ratings[:, 2])

    val_data, test_data = partition(ratings, 0, fold)

    lambda_dict = dict()
    for i in range(fold):
        if i == 0:
            lambda_dict[1] = deepcopy(val_data[2])
            for j in range(3, fold + 1):
                lambda_dict[1] = np.vstack((lambda_dict[1], val_data[j]))
        else:
            lambda_dict[i + 1] = deepcopy(val_data[1])
            for j in range(2, i + 1):
                lambda_dict[i + 1] = np.vstack((lambda_dict[i + 1], val_data[j]))
            for j in range(i + 2, fold + 1):
                lambda_dict[i + 1] = np.vstack((lambda_dict[i + 1], val_data[j]))

    zipped_index_dict = dict()
    for i in range(fold):
        zipped_index_dict[i + 1] = [(int(_[0]), int(_[1])) for _ in lambda_dict[i + 1]]

    train_csr_dict = dict()
    for i in range(fold):
        train_csr_dict[i + 1] = build_user_item_matrix(lambda_dict[i + 1], n_users, n_items)
    val_csr_dict = dict()
    for i in range(fold):
        val_csr_dict[i + 1] = build_user_item_matrix(val_data[i + 1], n_users, n_items)
    test_csr = build_user_item_matrix(test_data, n_users, n_items)
    return train_csr_dict, val_csr_dict, test_csr, zipped_index_dict, max_rating, min_rating


def cf_ridge_regression(csr_matrix, reg_lambda, fixed_feature, update_feature):
    n_feature = fixed_feature.shape[1]
    for i in range(csr_matrix.shape[0]):
        _, idx = csr_matrix[i, :].nonzero()
        valid_feature = fixed_feature.take(idx, axis=0)
        ratings = csr_matrix[i, idx].todense()
        A_i = np.dot(valid_feature.T, valid_feature) + reg_lambda * np.eye(n_feature)
        V_i = np.dot(valid_feature.T, ratings.T)
        update_feature[i, :] = np.squeeze(np.dot(inv(A_i), V_i))


def ALS(train_csr, args, n_iters, init_user_features=None, init_item_features=None):
    if args.implicit:
        logging.info('Implicit ALS, alpha {} max rating {}'.format(args.alpha, train_csr.data.max()))
        model = als_with_weights(factors=args.factor, iterations=n_iters, weights = args.negative, #num_threads=args.als_threads,
                                                     regularization=max(args.lambda_u, args.lambda_v),
                                                     random_state=0)
        model.fit(train_csr, show_progress=False)
        return model.user_factors, model.item_factors
    else:
        user_features = 0.1 * np.random.RandomState(seed=0).rand(train_csr.shape[0], args.factor)
        item_features = 0.1 * np.random.RandomState(seed=0).rand(train_csr.shape[1], args.factor)
        if init_user_features is not None:
            user_features = init_user_features
        if init_item_features is not None:
            item_features = init_item_features
        train_csr_transpose = train_csr.T.tocsr()
        for iteration in range(n_iters):
            logging.info('Explicit ALS iteration {}'.format(iteration))
            cf_ridge_regression(train_csr, args.lambda_u, item_features, user_features)
            cf_ridge_regression(train_csr_transpose, args.lambda_v, user_features, item_features)
        return user_features, item_features


def grad_calc(train_csr, val_csr, zipped_index, user_features, item_features, args):
    n_users = train_csr.shape[0]
    n_items = train_csr.shape[1]

    grad_r_user = np.zeros(shape=user_features.shape, dtype=np.float)
    grad_r_item = np.zeros(shape=item_features.shape, dtype=np.float)

    val_coo = val_csr.tocoo()
    pred_val = np.dot(user_features, item_features.T)
    for i, j, v in zip(val_coo.row, val_coo.col, val_coo.data):
        loss = 2 * (pred_val[i, j] - v)
        grad_r_user[i] += loss * item_features[j]
        grad_r_item[j] += loss * user_features[i]

    grad_user_m = np.zeros(shape=(train_csr.nnz, args.factor))
    grad_user_dict = {}
    cnt = 0
    for i in range(n_users):
        _, item_idx = train_csr[i, :].nonzero()
        item_feat = item_features.take(item_idx, axis=0)
        A = np.eye(args.factor, dtype=np.float) * args.lambda_u + np.dot(item_feat.T, item_feat)
        grad_user_m_i = np.dot(item_features, inv(A))
        for i_idx in item_idx:
            tup = (i, i_idx)
            grad_user_dict[tup] = cnt
            grad_user_m[cnt] = grad_user_m_i[i_idx][:]
            cnt += 1

    train_csc = train_csr.tocsc()
    grad_item_m = np.zeros(shape=(train_csc.nnz, args.factor))
    grad_item_dict = {}
    cnt = 0
    for i in range(n_items):
        user_idx, _ = train_csc[:, i].nonzero()
        user_feat = user_features.take(user_idx, axis=0)
        A = np.eye(args.factor, dtype=np.float) * args.lambda_v + np.dot(user_feat.T, user_feat)
        grad_item_m_i = np.dot(user_features, inv(A))
        for u_idx in user_idx:
            tup = (i, u_idx)
            grad_item_dict[tup] = cnt
            grad_item_m[cnt] = grad_item_m_i[u_idx][:]
            cnt += 1

    row = [i for i, j in zipped_index]
    col = [j for i, j in zipped_index]
    data = [(np.dot(grad_r_user[i], grad_user_m[grad_user_dict[(i, j)]].T)
             + np.dot(grad_r_item[j], grad_item_m[grad_item_dict[(j, i)]].T))
            for i, j in zipped_index]
    return sparse.coo_matrix((data, (row, col)), shape=(n_users, n_items)).tocsr()


def grad_calc_implicit(train_csr, val_csr, zipped_index, user_features, item_features, args):
    n_users = train_csr.shape[0]
    n_items = train_csr.shape[1]

    pred_val = np.dot(user_features, item_features.T)
    confidence_val_dense = val_csr.toarray()
    confidence_train_dense = train_csr.toarray()
    preference_val_dense = confidence_val_dense.copy()
    preference_val_dense[preference_val_dense > 0] = 1
    preference_train_dense = confidence_train_dense.copy()
    preference_train_dense[preference_train_dense > 0] = 1
    grad_r_user, grad_r_item = loss_d_emb(confidence_val_dense,
                                          preference_val_dense,
                                          pred_val,
                                          user_features,
                                          item_features)

    grad_user_m = np.zeros(shape=(train_csr.nnz, args.factor))
    grad_user_dict = {}
    cnt = 0
    VVT = np.dot(item_features.T, item_features)
    UUT = np.dot(user_features.T, user_features)

    for i in range(n_users):
        _, item_idx = train_csr[i, :].nonzero()
        grad_user_m_i = u_emb_d_c(args.lambda_u, confidence_train_dense, preference_train_dense, item_features, i, VVT)
        for item_num, i_idx in enumerate(item_idx):
            tup = (i, i_idx)
            grad_user_dict[tup] = cnt
            grad_user_m[cnt] = grad_user_m_i[item_num][:]
            cnt += 1

    train_csc = train_csr.tocsc()
    grad_item_m = np.zeros(shape=(train_csc.nnz, args.factor))
    grad_item_dict = {}
    cnt = 0
    for i in range(n_items):
        user_idx, _ = train_csc[:, i].nonzero()
        grad_item_m_i = i_emb_d_c(args.lambda_v, confidence_train_dense, preference_train_dense, user_features, i, UUT)
        for user_num, u_idx in enumerate(user_idx):
            tup = (i, u_idx)
            grad_item_dict[tup] = cnt
            grad_item_m[cnt] = grad_item_m_i[user_num][:]
            cnt += 1

    row = [i for i, j in zipped_index]
    col = [j for i, j in zipped_index]
    data = [(np.dot(grad_r_user[i], grad_user_m[grad_user_dict[(i, j)]].T)
             + np.dot(grad_r_item[j], grad_item_m[grad_item_dict[(j, i)]].T))
            for i, j in zipped_index]
    return sparse.coo_matrix((data, (row, col)), shape=(n_users, n_items)).tocsr()


def grad_update_loop(train_csr, val_csr, zipped_index, max_rating, min_rating, args):
    user_feature, item_feature = ALS(train_csr, args, args.als_iter)

    A_i = train_csr
    C_i = sparse.csr_matrix(train_csr.shape)
    user_feature_i = user_feature
    item_feature_i = item_feature
    for i in range(args.debug_iter):
        if args.implicit:
            gradients = grad_calc_implicit(A_i, val_csr, zipped_index, user_feature_i, item_feature_i, args)
        else:
            gradients = grad_calc(A_i, val_csr, zipped_index, user_feature_i, item_feature_i, args)
        A_i = A_i - gradients * args.debug_lr
        for _ in range(A_i.nnz):
            A_i.data[_] = min(max_rating, max(min_rating, A_i.data[_]))
        logging.info("A_i mean {}, min {}, max {}".format(A_i.data.mean(), A_i.data.min(), A_i.data.max()))
        if args.retrain == "full":
            user_feature_i, item_feature_i = ALS(A_i, args, args.als_iter)
        if args.retrain == "inc":
            user_feature_i, item_feature_i = ALS(A_i, args, 1, user_feature_i, item_feature_i)
        C_i = A_i - train_csr
    return C_i


def get_path(args, part_id):
    path = f"./save/{args.dataset}/f{args.fold}_m{args.debug_iter}_lr{args.debug_lr}_part{part_id}_{args.retrain}"
    if args.implicit:
        path += '_implicit'
    return path + '.txt'


def debug_process(train_csr, val_csr, zipped_index, max_rating, min_rating, id, args):
    if args.implicit:
        alpha = args.alpha
    else:
        alpha = 1
    change_csr = grad_update_loop(alpha * train_csr, alpha * val_csr, zipped_index, max_rating, min_rating, args)
    change_arr = change_csr.toarray()
    path = get_path(args, id)
    with open(path, "w+") as f:
        for i, j in zipped_index:
            print(i, j, change_arr[i, j], file=f, sep=',')


def aggregate_process(edit, sorted_edges, train_csr, test_csr, args, old_pred, max_rating, min_rating, percent):
    if args.implicit:
        alpha = args.alpha
    else:
        alpha = 1
    cut_pos = int(len(sorted_edges) * percent * 0.01)
    base_arr = train_csr.todense()
    for i, j, v in sorted_edges[:cut_pos]:
        if edit == "del":
            base_arr[i, j] = 0
        elif edit == "mod":
            base_arr[i, j] += v
            base_arr[i, j] = min(max_rating, max(min_rating, base_arr[i, j]))
    user_feature, item_feature = ALS(alpha * sparse.csr_matrix(base_arr), args, args.als_iter)
    new_pred = np.dot(user_feature, item_feature.T)
    if args.implicit:
        aucs = roc_auc_with_t_test(new_pred, old_pred, test_csr)
        mse = RMSE_weighted_with_t_test(new_pred, old_pred, alpha * test_csr)
        precisions = precision_at_10_with_t_test(new_pred, old_pred, test_csr)
    else:
        test_csr_binarized = test_csr.copy()
        test_csr_binarized[test_csr_binarized <= 3] = 0
        test_csr_binarized[test_csr_binarized > 3] = 1
        aucs = roc_auc_with_t_test(new_pred, old_pred, test_csr_binarized)
        mse = RMSE_with_ttest(new_pred, old_pred, test_csr)
        precisions = precision_at_10_with_t_test(new_pred, old_pred, test_csr_binarized)
    return aucs, mse, precisions




In [4]:
class tempo:
    def __init__(self, mode, implicit, negative):
        self.dataset = 'movielens'
        self.delim = '::'
        self.fold = 4
        self.factor = 10
        self.lambda_u = 0.1
        self.lambda_v = 0.1
        self.als_iter = 15
        self.debug_iter = 20
        self.debug_lr = 0.05
        self.retrain = 'full'
        self.process = 4
        self.mode = mode#'test'
        self.implicit = implicit#'store_true'
        self.alpha = 5
        self.als_threads = 2
        self.negative = negative#False   #change here to switch between negative weighting and confidence

In [6]:
%%time
import pandas as pd
def run(args):
    file_path = "./data/" + args.dataset + ".txt"
    if not os.path.exists(f"./save/{args.dataset}"):
        os.mkdir(f"./save/{args.dataset}")

    if args.mode == "debug":
        train_csr, val_csr, test_csr, zipped_index, max_rating, min_rating = data_split(file_path, args)
        if args.implicit:
            max_rating *= args.alpha
            min_rating = 0
        fold_id = 0
        for rnd in range((args.fold + args.process - 1) // args.process):
            processes = []
            for i in range(fold_id, fold_id + args.process):
                process = Process(target=debug_process, args=(train_csr[i + 1], val_csr[i + 1], zipped_index[i + 1],
                                                              max_rating, min_rating, i + 1, args))
                processes.append(process)
            for p in processes:
                p.start()
            for p in processes:
                p.join()
            fold_id += args.process
    elif args.mode == "test":
        del_rmse = []
        rem_rmse = []
        del_roc = []
        rem_roc = []
        del_prec = []
        rem_prec = []
        train_csr, val_csr, test_csr, zipped_index, max_rating, min_rating = data_split(file_path, args)
        if args.implicit:
            max_rating *= args.alpha
            min_rating = 0
            alpha = args.alpha
        else:
            alpha = 1
        test_train_csr = sparse.csr_matrix(test_csr.shape)
        for i in range(args.fold):
            test_train_csr = test_train_csr + val_csr[i + 1]
        user_feature, item_feature = ALS(alpha * test_train_csr, args, args.als_iter)
        old_pred = np.dot(user_feature, item_feature.T)

        edge_dict = dict()
        for i in range(1, args.fold + 1):
            path = get_path(args, i)
            for line in open(path):
                l = line.strip().split(',')
                x = int(l[0])
                y = int(l[1])
                r = float(l[2])
                if (x, y) not in edge_dict.keys():
                    edge_dict[(x, y)] = r / (args.fold - 1)
                else:
                    if edge_dict[(x, y)] * r > 0:
                        edge_dict[(x, y)] += r / (args.fold - 1)
                    else:
                        edge_dict[(x, y)] = 0
        edges = [(key[0], key[1], values) for key, values in edge_dict.items()]
        if args.implicit:
            sorted_edges = sorted(edges, key=lambda _: _[2], reverse=False)
        else:
            sorted_edges = sorted(edges, key=lambda _: abs(_[2]), reverse=True)
        for edit in ["del", "mod"]:
            for percent in [0.1, 0.2, 0.5, 1, 2, 5, 10]:
                aucs, mse, precisions = aggregate_process(edit, sorted_edges, test_train_csr,
                                                          test_csr, args, old_pred,
                                                          max_rating, min_rating, percent)
                if edit =='del':
                    del_rmse.append(mse[0])
                    del_roc.append(aucs[0])
                    del_prec.append(precisions[0])
                else:
                    rem_rmse.append(mse[0])
                    rem_roc.append(aucs[0])
                    rem_prec.append(precisions[0])
                if args.implicit:
                    print(f"{edit} {percent}% training data, weighted rmse on test: {mse[1]} -> {mse[0]}, p_value: {mse[2]}")
                else:
                    print(f"{edit} {percent}% training data, rmse on test: {mse[1]} -> {mse[0]}, p_value: {mse[2]}")
                print(f"{edit} {percent}% training data, aucs on test: {aucs[1]} -> {aucs[0]}, p_value: {aucs[2]}")
                print(f"{edit} {percent}% training data, p@10 on test: {precisions[1]} -> {precisions[0]}, p_value: {precisions[2]}")
        explicit = {}
        explicit['modify'] = {'rmse':rem_rmse, 'roc_auc':rem_roc, 'precision': rem_prec}
        explicit['delete'] = {'rmse':del_rmse, 'roc_auc': del_roc, 'precision': del_prec}

        tr = pd.DataFrame(explicit).transpose()
        pos = pd.DataFrame({'rmse':tr.rmse.explode(), 'roc_auc':tr.roc_auc.explode(), 'precision':tr.precision.explode()})
        modify_exp = pos[pos.index=='modify'].copy()
        modify_exp['percentage'] = [0.1, 0.2, 0.5, 1, 2, 5, 10]
        cols = ['percentage', 'rmse', 'roc_auc', 'precision']
        modify_exp = modify_exp[cols]
        delete_exp = pos[pos.index=='delete'].copy()
        delete_exp['percentage'] = [0.1, 0.2, 0.5, 1, 2, 5, 10]
        cols = ['percentage', 'rmse', 'roc_auc', 'precision']
        delete_exp = delete_exp[cols]
        return modify_exp, delete_exp

CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 18.6 µs


In [None]:
args = tempo(mode = 'debug', implicit = 'store_true', negative = False)
run(args)



In [7]:
args = tempo(mode = 'test', implicit='', negative = False)
modify_exp, delete_exp = run(args)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if sys.path[0] == "":
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app
  self._set_arrayXarray(i, j, x)


del 0.1% training data, rmse on test: 0.913437474491702 -> 0.9073873138880987, p_value: 1.4588867653275616e-15
del 0.1% training data, aucs on test: 0.7688397347194403 -> 0.7723590040132347, p_value: 6.129901054179996e-47
del 0.1% training data, p@10 on test: 0.0001618996222342148 -> 0.00021586616297895306, p_value: 0.5271617308085856
del 0.2% training data, rmse on test: 0.913437474491702 -> 0.9033484900440794, p_value: 8.050089614980532e-23
del 0.2% training data, aucs on test: 0.7688397347194403 -> 0.7743213250567609, p_value: 8.555573053898794e-76
del 0.2% training data, p@10 on test: 0.0001618996222342148 -> 0.00021586616297895306, p_value: 0.5637730167013997
del 0.5% training data, rmse on test: 0.913437474491702 -> 0.898292760326892, p_value: 1.1773408796878419e-36
del 0.5% training data, aucs on test: 0.7688397347194403 -> 0.7801331147497039, p_value: 5.780865113039953e-157
del 0.5% training data, p@10 on test: 0.0001618996222342148 -> 0.00035078251484079874, p_value: 0.0706960

In [8]:
modify_exp

Unnamed: 0,percentage,rmse,roc_auc,precision
modify,0.1,0.905294,0.772578,0.000162
modify,0.2,0.901122,0.776043,0.00027
modify,0.5,0.894385,0.781843,0.000297
modify,1.0,0.888028,0.788721,0.000459
modify,2.0,0.881206,0.79775,0.000486
modify,5.0,0.873512,0.805692,0.002779
modify,10.0,0.869514,0.81021,0.005774


In [9]:
delete_exp

Unnamed: 0,percentage,rmse,roc_auc,precision
delete,0.1,0.907387,0.772359,0.000216
delete,0.2,0.903348,0.774321,0.000216
delete,0.5,0.898293,0.780133,0.000351
delete,1.0,0.892717,0.785114,0.000405
delete,2.0,0.887628,0.792592,0.000486
delete,5.0,0.885676,0.802563,0.001052
delete,10.0,0.888981,0.811353,0.001214


In [35]:
args = tempo(mode = 'test', implicit = 'store_true', negative = False)
modify_pos, delete_pos = run(args)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if sys.path[0] == "":
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app


del 0.1% training data, weighted rmse on test: 0.18257745873775652 -> 0.18249644353091043, p_value: 0.0
del 0.1% training data, aucs on test: 0.9267385280885827 -> 0.9266967009060519, p_value: 0.0001276966667295583
del 0.1% training data, p@10 on test: 0.13523917350693462 -> 0.1351259552787999, p_value: 0.6625835311614025
del 0.2% training data, weighted rmse on test: 0.18257745873775652 -> 0.18241544387685954, p_value: 0.0
del 0.2% training data, aucs on test: 0.9267385280885827 -> 0.926672111344329, p_value: 0.0002249629032130969
del 0.2% training data, p@10 on test: 0.13523917350693462 -> 0.13453155958109259, p_value: 0.03523967584882992
del 0.5% training data, weighted rmse on test: 0.18257745873775652 -> 0.18217356975189314, p_value: 0.0
del 0.5% training data, aucs on test: 0.9267385280885827 -> 0.9266177272628835, p_value: 4.315969578028615e-05
del 0.5% training data, p@10 on test: 0.13523917350693462 -> 0.13424851401075574, p_value: 0.019068586819048775
del 1% training data, we

In [20]:
%%time
args = tempo(mode = 'test', implicit = 'store_true', negative = True)
modify_neg, delete_neg = run(args)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if sys.path[0] == "":
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app


del 0.1% training data, weighted rmse on test: 0.22937876966031367 -> 0.22918851646859728, p_value: 9.172257738508885e-149
del 0.1% training data, aucs on test: 0.925282652862054 -> 0.9251733977402747, p_value: 0.3002032108743159
del 0.1% training data, p@10 on test: 0.13470138692329467 -> 0.1356071327483725, p_value: 0.3351366276267669
del 0.2% training data, weighted rmse on test: 0.22937876966031367 -> 0.22919547113560973, p_value: 5.2221957849643696e-136
del 0.2% training data, aucs on test: 0.925282652862054 -> 0.9250716421787, p_value: 0.038785521765133044
del 0.2% training data, p@10 on test: 0.13470138692329467 -> 0.13422020945372207, p_value: 0.6094581331873967
del 0.5% training data, weighted rmse on test: 0.22937876966031367 -> 0.22882014853625174, p_value: 0.0
del 0.5% training data, aucs on test: 0.925282652862054 -> 0.9252554912996659, p_value: 0.80160644342261
del 0.5% training data, p@10 on test: 0.13470138692329467 -> 0.13458816869515994, p_value: 0.9034108218656115
de

In [23]:
modify_neg


Unnamed: 0,percentage,rmse,roc_auc,precision
modify,0.1,0.229359,0.925198,0.134616
modify,0.2,0.229303,0.925305,0.135267
modify,0.5,0.229165,0.925126,0.133598
modify,1.0,0.22906,0.925295,0.134815
modify,2.0,0.228744,0.925128,0.133145
modify,5.0,0.227932,0.925277,0.134333
modify,10.0,0.226704,0.925167,0.134758


In [24]:
delete_neg

Unnamed: 0,percentage,rmse,roc_auc,precision
delete,0.1,0.229189,0.925173,0.135607
delete,0.2,0.229195,0.925072,0.13422
delete,0.5,0.22882,0.925255,0.134588
delete,1.0,0.228511,0.925191,0.134333
delete,2.0,0.227749,0.925031,0.134843
delete,5.0,0.225486,0.923873,0.134928
delete,10.0,0.221687,0.922842,0.133909
