# P3Alpha

### Import libraries

In [1]:
import json
import os
import platform
import shutil
import sys
import time
import warnings
import zipfile

import numpy as np
import pandas as pd
import scipy.sparse as sps

### Read data

In [None]:
# Folder containing every CSV file read by this notebook.
base_path = "data"

# Locations of the individual CSV inputs, all resolved relative to base_path.
interactions_df_path = f"{base_path}/interactions_and_impressions.csv"
items_length_df_path = f"{base_path}/data_ICM_length.csv"
items_type_df_path = f"{base_path}/data_ICM_type.csv"
users_df_path = f"{base_path}/data_target_users_test.csv"

In [None]:
# dtypes requested for the interactions file; the keys are integers,
# presumably column positions (verify against the CSV header).
dtype = {
    0: int,
    1: int,
    2: str,
    3: int,
}

# Load the four CSV inputs into DataFrames.
interactions_df = pd.read_csv(interactions_df_path, dtype=dtype)
items_length_pf = pd.read_csv(items_length_df_path)
items_types_df = pd.read_csv(items_type_df_path)
users_df = pd.read_csv(users_df_path)

### Utilities

In [None]:
def check_matrix(X, format='csc', dtype=np.float32):
    """
    Convert a matrix to the requested format and dtype.

    The input can be either a scipy sparse matrix or a numpy ndarray.
    A dense input is first turned into a CSR matrix (explicit zeros removed)
    and then converted to the requested sparse format.

    :param X: scipy sparse matrix or numpy ndarray
    :param format: one of 'csc', 'csr', 'coo', 'dok', 'bsr', 'dia', 'lil' or 'npy' (dense ndarray).
                   Any other value leaves the sparse format untouched and only applies dtype
    :param dtype: dtype applied to the result, default np.float32.
                  It is NOT applied when format is 'npy' and X is already dense
    :return: the converted matrix
    """

    # Dense output is handled separately because it is the only non-sparse target
    if format == 'npy':
        if sps.issparse(X):
            return X.toarray().astype(dtype)
        return np.array(X)

    # A dense input has no .tocsc()/.tocsr()/... methods, so it must become sparse
    # BEFORE the format dispatch below, otherwise the conversion raises AttributeError
    if isinstance(X, np.ndarray):
        X = sps.csr_matrix(X, dtype=dtype)
        X.eliminate_zeros()

    # target format -> (class identifying that format, name of the conversion method)
    sparse_converters = {
        'csc': (sps.csc_matrix, 'tocsc'),
        'csr': (sps.csr_matrix, 'tocsr'),
        'coo': (sps.coo_matrix, 'tocoo'),
        'dok': (sps.dok_matrix, 'todok'),
        'bsr': (sps.bsr_matrix, 'tobsr'),
        'dia': (sps.dia_matrix, 'todia'),
        'lil': (sps.lil_matrix, 'tolil'),
    }

    if format in sparse_converters:
        target_class, converter_name = sparse_converters[format]

        if not isinstance(X, target_class):
            return getattr(X, converter_name)().astype(dtype)

    # Already in the desired format (or format not recognized): only apply the dtype
    return X.astype(dtype)

In [None]:
def similarityMatrixTopK(item_weights, k=100, verbose = False, use_absolute_values = False):
    """
    Select, column-wise, the TopK largest elements of a square weight matrix.

    Entries equal to zero are never kept. Columns that already contain at most
    k non-zero entries are copied unchanged.

    :param item_weights: square matrix, either a numpy ndarray or a scipy sparse matrix
    :param k: maximum number of entries kept for each column
    :param verbose: if True, print progress and timing messages
    :param use_absolute_values: if True, entries are ranked by absolute value instead of signed value
    :return: the matrix returned by Incremental_Similarity_Builder.get_SparseMatrix()
    :raises AssertionError: if item_weights is not square
    """
    # NOTE(review): Incremental_Similarity_Builder is not defined in the visible part of
    # this file; it must be available in the namespace before this function is called.

    assert (item_weights.shape[0] == item_weights.shape[1]), "selectTopK: ItemWeights is not a square matrix"

    n_items = item_weights.shape[0]
    similarity_builder = Incremental_Similarity_Builder(n_items, initial_data_block=n_items*k, dtype = np.float32)

    start_time = time.time()

    if verbose:
        print("Generating topK matrix")

    # Anything that is not an ndarray is treated as a scipy sparse matrix
    sparse_weights = not isinstance(item_weights, np.ndarray)

    # CSC gives direct access to the data and row indices of each column
    if sparse_weights:
        item_weights = check_matrix(item_weights, format='csc', dtype=np.float32)

    for item_idx in range(n_items):

        if sparse_weights:
            start_position = item_weights.indptr[item_idx]
            end_position = item_weights.indptr[item_idx+1]

            column_data = item_weights.data[start_position:end_position]
            column_row_index = item_weights.indices[start_position:end_position]

        else:
            column_data = item_weights[:,item_idx]
            column_row_index = np.arange(n_items, dtype=np.int32)

        # Drop explicit zeros so that they are never selected nor stored
        if np.any(column_data==0):
            non_zero_data = column_data!=0
            column_data = column_data[non_zero_data]
            column_row_index = column_row_index[non_zero_data]

        # If there is less data than k, there is no need to select anything
        if k < len(column_data):
            # argpartition is enough: only "which" elements are the topK matters, not their order
            ranking_values = np.abs(column_data) if use_absolute_values else column_data
            top_k_idx = np.argpartition(-ranking_values, k-1, axis=0)[:k]

            # top_k_idx comes from column_data, which has the same length as column_row_index,
            # so both lookups are valid and are applied together (no partial update possible)
            column_row_index = column_row_index[top_k_idx]
            column_data = column_data[top_k_idx]

        # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
        similarity_builder.add_data_lists(row_list_to_add = column_row_index,
                                          col_list_to_add = np.full(len(column_row_index), item_idx, dtype = int),
                                          data_list_to_add = column_data)

    if verbose:
        print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time))

    return similarity_builder.get_SparseMatrix()


In [None]:
def seconds_to_biggest_unit(time_in_seconds):
    """
    Express a duration in the largest unit for which its value is at least 1.

    Units are tried in the order sec, min, hour, day, year. A duration shorter
    than one minute (including values below one second) is returned unchanged in "sec".

    :param time_in_seconds: duration in seconds
    :return: tuple (value, unit) where unit is one of "sec", "min", "hour", "day", "year"
    """

    # Each factor converts from the previous unit to the one on the same line
    conversion_factor_list = [
        ("sec", 1),
        ("min", 60),
        ("hour", 60),
        ("day", 24),
        ("year", 365),
    ]

    new_time_value = time_in_seconds
    new_time_unit = "sec"

    for unit_name, conversion_factor in conversion_factor_list[1:]:

        candidate_value = new_time_value / conversion_factor

        # Stop at the first unit that would make the value drop below 1
        if candidate_value < 1.0:
            break

        new_time_value = candidate_value
        new_time_unit = unit_name

    # Both elements are returned: callers unpack the result as (value, unit)
    return new_time_value, new_time_unit

### Model

In [None]:
class DataIO(object):
    """
    Save and load a dictionary of heterogeneous objects as a single zip archive.

    Each dictionary entry is written to its own file inside a temporary folder:
    DataFrames as ".h5" (plus a hidden ".csv" copy), sparse matrices as ".npz",
    ndarrays as ".npy", JSON-serializable objects as ".json" and dictionaries that
    are not JSON-serializable as a nested ".zip". The folder is then compressed
    into the final archive and removed.
    """
    # NOTE(review): json_not_serializable_handler is referenced below but is not defined
    # in the visible part of this file; it must be available in the namespace.

    _DEFAULT_TEMP_FOLDER = ".temp"

    _MAX_PATH_LENGTH_WINDOWS = 255

    def __init__(self, folder_path):
        """
        :param folder_path: folder in which archives are written and read, a trailing "/" is added if missing
        """
        super(DataIO, self).__init__()

        self._is_windows = platform.system() == "Windows"

        self.folder_path = folder_path if folder_path[-1] == "/" else folder_path + "/"
        self._key_string_alert_done = False


    def _print(self, message):
        print("{}: {}".format("DataIO", message))


    def _get_temp_folder(self, file_name):
        """
        Creates a temporary folder to be used during the data saving or loading.
        The folder name contains the process id and the archive name.
        If the folder already exists it is removed and created again.
        :param file_name: name of the archive, expected to end with ".zip"
        :return: path of the newly created folder, ending with "/"
        """

        # Ignore the .zip extension
        file_name = file_name[:-4]
        current_temp_folder = "{}{}_{}_{}/".format(self.folder_path, self._DEFAULT_TEMP_FOLDER, os.getpid(), file_name)

        if os.path.exists(current_temp_folder):
            self._print("Folder {} already exists, could be the result of a previous failed save attempt or multiple saver are active in parallel. " \
            "Folder will be removed.".format(current_temp_folder))

            shutil.rmtree(current_temp_folder, ignore_errors=True)

        os.makedirs(current_temp_folder)

        return current_temp_folder


    def _check_dict_key_type(self, dict_to_save):
        """
        Check whether the keys of the dictionary are string. If not, transforms them into strings
        :param dict_to_save:
        :return: dict_to_save itself if all keys are strings, otherwise a new dictionary with str keys
        :raises AssertionError: if the conversion to string makes two keys collide
        """

        if all(isinstance(key, str) for key in dict_to_save):
            return dict_to_save

        # The alert is printed only once per DataIO instance
        if not self._key_string_alert_done:
            self._print("Json dumps supports only 'str' as dictionary keys. Transforming keys to string, note that this will alter the mapper content.")
            self._key_string_alert_done = True

        dict_to_save_key_str = {str(key):val for (key,val) in dict_to_save.items()}

        assert len(dict_to_save_key_str) == len(dict_to_save), \
            "DataIO: Transforming dictionary keys into strings altered its content. Duplicate keys may have been produced."

        return dict_to_save_key_str


    def save_data(self, file_name, data_dict_to_save):
        """
        Save the content of a dictionary into the archive folder_path + file_name.
        :param file_name: name of the archive, ".zip" is appended if missing
        :param data_dict_to_save: dictionary attribute name -> object to save
        :raises TypeError: if an object is of a type that cannot be saved
        """

        # If directory does not exist, create it
        os.makedirs(self.folder_path, exist_ok=True)

        if not file_name.endswith(".zip"):
            file_name += ".zip"

        current_temp_folder = self._get_temp_folder(file_name)

        try:

            data_format = {}
            attribute_to_save_as_json = {}

            data_dict_to_save = self._check_dict_key_type(data_dict_to_save)

            for attrib_name, attrib_data in data_dict_to_save.items():

                current_file_path = current_temp_folder + attrib_name

                if isinstance(attrib_data, pd.DataFrame):
                    # Save human readable version as a precaution. Prepend "." so that it is classified as auxiliary file and not loaded
                    attrib_data.to_csv(current_temp_folder + "." + attrib_name + ".csv", index=True)

                    # Using "fixed" as a format causes a PerformanceWarning because it saves types that are not native of C
                    # This is acceptable because it provides the flexibility of using python objects as types (strings, None, etc..)
                    with warnings.catch_warnings():
                        warnings.filterwarnings("ignore")
                        attrib_data.to_hdf(current_file_path + ".h5", key="DataFrame", mode='w', append = False, format="fixed")

                elif sps.issparse(attrib_data):
                    sps.save_npz(current_file_path, attrib_data)

                elif isinstance(attrib_data, np.ndarray):
                    # allow_pickle is FALSE to prevent using pickle and ensure portability
                    np.save(current_file_path, attrib_data, allow_pickle=False)

                else:
                    # Try to parse it as json, if it fails and the data is a dictionary, use another zip file
                    try:
                        json.dumps(attrib_data, default=json_not_serializable_handler)

                    except TypeError:

                        if not isinstance(attrib_data, dict):
                            raise TypeError("Type not recognized for attribute: {}".format(attrib_name))

                        nested_dataIO = DataIO(folder_path = current_temp_folder)
                        nested_dataIO.save_data(file_name = attrib_name, data_dict_to_save=attrib_data)

                    else:
                        attribute_to_save_as_json[attrib_name] = attrib_data


            # data_format is written only if not empty; nothing in this method fills it
            if len(data_format)>0:
                attribute_to_save_as_json[".data_format"] = data_format.copy()

            for attrib_name, attrib_data in attribute_to_save_as_json.items():
                current_file_path = current_temp_folder + attrib_name

                # Only used to validate the path length, the file is opened with the original path
                absolute_path = os.path.abspath(current_file_path + ".json")

                assert not self._is_windows or len(absolute_path) <= self._MAX_PATH_LENGTH_WINDOWS, \
                    "DataIO: Path of file exceeds {} characters, which is the maximum allowed under standard paths for Windows.".format(self._MAX_PATH_LENGTH_WINDOWS)

                with open(current_file_path + ".json", 'w') as outfile:
                    if isinstance(attrib_data, dict):
                        attrib_data = self._check_dict_key_type(attrib_data)

                    json.dump(attrib_data, outfile, default=json_not_serializable_handler)


            with zipfile.ZipFile(self.folder_path + file_name + ".temp", 'w', compression=zipfile.ZIP_DEFLATED) as myzip:
                for file_to_compress in os.listdir(current_temp_folder):
                    myzip.write(current_temp_folder + file_to_compress, arcname = file_to_compress)

            # Replace file only after the new archive has been successfully created
            # Prevents accidental deletion of previous versions of the file if the current write fails
            os.replace(self.folder_path + file_name + ".temp", self.folder_path + file_name)

        finally:
            # The temporary folder is removed both on success and on failure
            shutil.rmtree(current_temp_folder, ignore_errors=True)


    def load_data(self, file_name):
        """
        Load the content of the archive folder_path + file_name.
        Archive members whose name starts with "." are auxiliary files and are not loaded.
        :param file_name: name of the archive, ".zip" is appended if missing
        :return: dictionary attribute name -> loaded object
        :raises zipfile.BadZipFile: if a member of the archive is corrupted
        :raises Exception: if a member has an extension that is not recognized
        """

        if not file_name.endswith(".zip"):
            file_name += ".zip"

        # The context manager guarantees the archive is closed, also if loading fails
        with zipfile.ZipFile(self.folder_path + file_name) as dataFile:

            corrupted_member = dataFile.testzip()

            if corrupted_member is not None:
                raise zipfile.BadZipFile("DataIO: archive '{}' contains a corrupted member: '{}'".format(
                    self.folder_path + file_name, corrupted_member))

            current_temp_folder = self._get_temp_folder(file_name)

            try:

                # The data format file is optional; its content is read but not used by this method
                try:
                    data_format_path = dataFile.extract(".data_format.json", path = current_temp_folder)
                except KeyError:
                    data_format = {}
                else:
                    with open(data_format_path, "r") as json_file:
                        data_format = json.load(json_file)

                data_dict_loaded = {}

                for member_name in dataFile.namelist():

                    # Discard auxiliary data structures
                    if member_name.startswith("."):
                        continue

                    decompressed_file_path = dataFile.extract(member_name, path = current_temp_folder)
                    file_extension = member_name.split(".")[-1]
                    attrib_name = member_name[:-len(file_extension)-1]

                    if file_extension == "csv":
                        # Compatibility with previous version
                        attrib_data = pd.read_csv(decompressed_file_path, index_col=False)

                    elif file_extension == "h5":
                        attrib_data = pd.read_hdf(decompressed_file_path, key=None, mode='r')

                    elif file_extension == "npz":
                        attrib_data = sps.load_npz(decompressed_file_path)

                    elif file_extension == "npy":
                        # allow_pickle is FALSE to prevent using pickle and ensure portability
                        attrib_data = np.load(decompressed_file_path, allow_pickle=False)

                    elif file_extension == "zip":
                        # Nested archive, extracted in the temporary folder and loaded recursively
                        nested_dataIO = DataIO(folder_path = current_temp_folder)
                        attrib_data = nested_dataIO.load_data(file_name = member_name)

                    elif file_extension == "json":
                        with open(decompressed_file_path, "r") as json_file:
                            attrib_data = json.load(json_file)

                    else:
                        raise Exception("Attribute type not recognized for: '{}' of class: '{}'".format(decompressed_file_path, file_extension))

                    data_dict_loaded[attrib_name] = attrib_data

            finally:
                # The temporary folder is removed both on success and on failure
                shutil.rmtree(current_temp_folder, ignore_errors=True)

        return data_dict_loaded

In [None]:
class BaseRecommender(object):
    """
    Abstract BaseRecommender.

    Stores the training user-item matrix (URM) and implements the generation of the
    recommendation list. Subclasses must implement _compute_item_score and save_model.
    """

    RECOMMENDER_NAME = "Recommender_Base_Class"

    def __init__(self, URM_train, verbose=True):
        """
        :param URM_train: user-item matrix, a copy is stored in CSR format as np.float32
        :param verbose: if False, the messages of _print are suppressed
        """

        super(BaseRecommender, self).__init__()

        self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
        self.URM_train.eliminate_zeros()

        self.n_users, self.n_items = self.URM_train.shape
        self.verbose = verbose

        # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
        self.filterTopPop = False
        self.filterTopPop_ItemsID = np.array([], dtype=int)

        self.items_to_ignore_flag = False
        self.items_to_ignore_ID = np.array([], dtype=int)

        # A user is "cold" when its CSR row contains no stored value
        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        if self._cold_user_mask.any():
            self._print("URM Detected {} ({:4.1f}%) users with no interactions.".format(
                self._cold_user_mask.sum(), self._cold_user_mask.sum()/self.n_users*100))

        # An item is "cold" when its CSC column contains no stored value
        self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

        if self._cold_item_mask.any():
            self._print("URM Detected {} ({:4.1f}%) items with no interactions.".format(
                self._cold_item_mask.sum(), self._cold_item_mask.sum()/self.n_items*100))


    def _get_cold_user_mask(self):
        return self._cold_user_mask

    def _get_cold_item_mask(self):
        return self._cold_item_mask


    def _print(self, string):
        if self.verbose:
            print("{}: {}".format(self.RECOMMENDER_NAME, string))

    def fit(self):
        pass

    def get_URM_train(self):
        return self.URM_train.copy()

    def set_URM_train(self, URM_train_new, **kwargs):
        """
        Replace the training URM with a new one having the same shape and refresh the cold masks.
        :param URM_train_new: new user-item matrix
        :param kwargs: not supported by this class, a message is printed if any is received
        :raises AssertionError: if the shape of the new URM differs from the current one
        """

        assert self.URM_train.shape == URM_train_new.shape, "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

        if len(kwargs)>0:
            self._print("set_URM_train keyword arguments not supported for this recommender class. Received: {}".format(kwargs))

        self.URM_train = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
        self.URM_train.eliminate_zeros()

        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        if self._cold_user_mask.any():
            self._print("Detected {} ({:4.1f}%) users with no interactions.".format(
                self._cold_user_mask.sum(), self._cold_user_mask.sum()/len(self._cold_user_mask)*100))

        # The item mask must be refreshed too, otherwise _get_cold_item_mask would describe the old URM
        self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

        if self._cold_item_mask.any():
            self._print("Detected {} ({:4.1f}%) items with no interactions.".format(
                self._cold_item_mask.sum(), self._cold_item_mask.sum()/len(self._cold_item_mask)*100))


    def set_items_to_ignore(self, items_to_ignore):
        self.items_to_ignore_flag = True
        self.items_to_ignore_ID = np.array(items_to_ignore, dtype=int)

    def reset_items_to_ignore(self):
        self.items_to_ignore_flag = False
        self.items_to_ignore_ID = np.array([], dtype=int)


    #########################################################################################################
    ##########                                                                                     ##########
    ##########                     COMPUTE AND FILTER RECOMMENDATION LIST                          ##########
    ##########                                                                                     ##########
    #########################################################################################################


    def _remove_TopPop_on_scores(self, scores_batch):
        scores_batch[:, self.filterTopPop_ItemsID] = -np.inf
        return scores_batch


    def _remove_custom_items_on_scores(self, scores_batch):
        scores_batch[:, self.items_to_ignore_ID] = -np.inf
        return scores_batch


    def _remove_seen_on_scores(self, user_id, scores):
        """
        Set to -inf the score of every item stored in the URM_train row of the user.
        :param user_id: index of the user
        :param scores: array with the item scores of the user, modified in place
        :return: the same scores array
        """

        assert self.URM_train.getformat() == "csr", "Recommender_Base_Class: URM_train is not CSR, this will cause errors in filtering seen items"

        seen = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id + 1]]

        scores[seen] = -np.inf
        return scores


    def _compute_item_score(self, user_id_array, items_to_compute = None):
        """
        :param user_id_array:       array containing the user indices whose recommendations need to be computed
        :param items_to_compute:    array containing the items whose scores are to be computed.
                                        If None, all items are computed, otherwise discarded items will have as score -np.inf
        :return:                    array (len(user_id_array), n_items) with the score.
        """
        raise NotImplementedError("BaseRecommender: compute_item_score not assigned for current recommender, unable to compute prediction scores")


    def recommend(self, user_id_array, cutoff = None, remove_seen_flag=True, items_to_compute = None,
                  remove_top_pop_flag = False, remove_custom_items_flag = False, return_scores = False):
        """
        Compute the ranked recommendation list for one or more users.
        :param user_id_array:               scalar user index or array of user indices
        :param cutoff:                      maximum length of each list, capped at n_items - 1 (also the default)
        :param remove_seen_flag:            if True, items stored in the URM_train row of the user are never recommended
        :param items_to_compute:            forwarded to _compute_item_score
        :param remove_top_pop_flag:         if True, the items in filterTopPop_ItemsID are never recommended
        :param remove_custom_items_flag:    if True, the items in items_to_ignore_ID are never recommended
        :param return_scores:               if True, the filtered score matrix is returned as well
        :return:                            list of item indices for a scalar user, list of lists otherwise.
                                                If return_scores is True a tuple (ranking, scores_batch)
        """

        # If is a scalar transform it in a 1-cell array
        single_user = np.isscalar(user_id_array)

        if single_user:
            user_id_array = np.atleast_1d(user_id_array)

        max_cutoff = self.URM_train.shape[1] - 1
        cutoff = max_cutoff if cutoff is None else min(cutoff, max_cutoff)

        # Compute the scores using the model-specific function
        # Vectorize over all users in user_id_array
        scores_batch = self._compute_item_score(user_id_array, items_to_compute=items_to_compute)

        if remove_seen_flag:
            for user_index, user_id in enumerate(user_id_array):
                scores_batch[user_index,:] = self._remove_seen_on_scores(user_id, scores_batch[user_index, :])

        if remove_top_pop_flag:
            scores_batch = self._remove_TopPop_on_scores(scores_batch)

        if remove_custom_items_flag:
            scores_batch = self._remove_custom_items_on_scores(scores_batch)

        # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
        # - Partition the data to extract the set of relevant items
        # - Sort only the relevant items
        # - Get the original item index
        # relevant_items_partition is block_size x cutoff
        relevant_items_partition = np.argpartition(-scores_batch, cutoff-1, axis=1)[:,0:cutoff]

        # Get original value and sort it
        # [:, None] adds 1 dimension to the array, from (block_size,) to (block_size,1)
        # This is done to correctly get scores_batch value as [row, relevant_items_partition[row,:]]
        row_selector = np.arange(scores_batch.shape[0])[:, None]
        relevant_items_partition_original_value = scores_batch[row_selector, relevant_items_partition]
        relevant_items_partition_sorting = np.argsort(-relevant_items_partition_original_value, axis=1)
        ranking = relevant_items_partition[row_selector, relevant_items_partition_sorting]

        # Remove from the recommendation list any item that has a -inf score
        # Since -inf is a flag to indicate an item to remove
        ranking_list = []

        for user_index, user_recommendation_list in enumerate(ranking):
            user_item_scores = scores_batch[user_index, user_recommendation_list]
            not_inf_scores_mask = np.logical_not(np.isinf(user_item_scores))
            ranking_list.append(user_recommendation_list[not_inf_scores_mask].tolist())

        # Return single list for one user, instead of list of lists
        if single_user:
            ranking_list = ranking_list[0]

        if return_scores:
            return ranking_list, scores_batch

        return ranking_list



    #########################################################################################################
    ##########                                                                                     ##########
    ##########                                LOAD AND SAVE                                        ##########
    ##########                                                                                     ##########
    #########################################################################################################



    def save_model(self, folder_path, file_name = None):
        raise NotImplementedError("BaseRecommender: save_model not implemented")


    def load_model(self, folder_path, file_name = None):
        """
        Load a saved model and set every loaded entry as an attribute of this object.
        :param folder_path: folder containing the saved model
        :param file_name: name of the file, RECOMMENDER_NAME is used if None
        """

        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        self._print("Loading model from file '{}'".format(folder_path + file_name))

        dataIO = DataIO(folder_path=folder_path)
        data_dict = dataIO.load_data(file_name=file_name)

        for attrib_name, attrib_data in data_dict.items():
            setattr(self, attrib_name, attrib_data)

        self._print("Loading complete")

In [None]:
class BaseSimilarityMatrixRecommender(BaseRecommender):
    """
    Base class for KNN recommenders that rely on a similarity matrix stored in W_sparse.
    It checks once that URM_train and W_sparse are in CSR format and it saves W_sparse to file.
    """

    def __init__(self, URM_train, verbose=True):
        super().__init__(URM_train, verbose=verbose)

        # Each flag becomes True after the format of the corresponding matrix has been checked
        self._URM_train_format_checked = False
        self._W_sparse_format_checked = False


    def _check_format(self):
        """
        Print a performance alert for URM_train and for W_sparse if they are not CSR.
        Each matrix is checked only the first time this method is called.
        """

        expected_format = "csr"

        for matrix_name in ("URM_train", "W_sparse"):

            checked_flag_name = "_{}_format_checked".format(matrix_name)

            if getattr(self, checked_flag_name):
                continue

            current_format = getattr(self, matrix_name).getformat()

            if current_format != expected_format:
                self._print("PERFORMANCE ALERT compute_item_score: {} is not {}, this will significantly slow down the computation.".format(matrix_name, expected_format))

            setattr(self, checked_flag_name, True)


    def save_model(self, folder_path, file_name = None):
        """
        Save W_sparse via DataIO.
        :param folder_path: destination folder
        :param file_name: name of the file, RECOMMENDER_NAME is used if None
        """

        target_file_name = self.RECOMMENDER_NAME if file_name is None else file_name

        self._print("Saving model in file '{}'".format(folder_path + target_file_name))

        DataIO(folder_path=folder_path).save_data(file_name=target_file_name,
                                                  data_dict_to_save={"W_sparse": self.W_sparse})

        self._print("Saving complete")

In [None]:
class BaseItemSimilarityMatrixRecommender(BaseSimilarityMatrixRecommender):
    """Item-based recommender whose scores are the product of the user profiles and W_sparse."""

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """
        Compute the item scores of the given users as URM_train[users].dot(W_sparse).
        URM_train and W_sparse must have the same format, CSR
        :param user_id_array:       indices of the users whose scores are computed
        :param items_to_compute:    indices of the items whose score is kept, all the other items
                                        are given score -np.inf. If None, all items are kept
        :return:                    dense array (len(user_id_array), n_items) with the scores
        """

        self._check_format()

        profiles = self.URM_train[user_id_array]

        # No item restriction: the full score matrix is the result
        if items_to_compute is None:
            return profiles.dot(self.W_sparse).toarray()

        # Start from a matrix filled with -inf and copy only the requested columns
        restricted_scores = np.full((len(user_id_array), self.n_items), -np.inf, dtype=np.float32)
        full_scores = profiles.dot(self.W_sparse).toarray()
        restricted_scores[:, items_to_compute] = full_scores[:, items_to_compute]

        return restricted_scores

In [None]:
class P3alphaRecommender(BaseItemSimilarityMatrixRecommender):
    """
    P3alpha recommender.

    The item-item matrix W_sparse is built from the product of the row-normalized
    transposed boolean URM (Piu) and the row-normalized URM (Pui), both raised to alpha.
    """
    # NOTE(review): `normalize` and `Incremental_Similarity_Builder` are not defined nor imported
    # in the visible part of this file. `normalize` is presumably sklearn.preprocessing.normalize,
    # confirm and make both available in the namespace before calling fit.

    RECOMMENDER_NAME = "P3alphaRecommender"

    def __init__(self, URM_train, verbose = True):
        super(P3alphaRecommender, self).__init__(URM_train, verbose = verbose)


    def __str__(self):
        return "P3alpha(alpha={}, min_rating={}, topk={}, implicit={}, normalize_similarity={})".format(self.alpha,
                                                                            self.min_rating, self.topK, self.implicit,
                                                                            self.normalize_similarity)

    def fit(self, topK=100, alpha=1., min_rating=0, implicit=False, normalize_similarity=False):
        """
        Build the similarity matrix W_sparse.
        :param topK:                    number of similarity values kept for each item
        :param alpha:                   exponent applied element-wise to the two normalized matrices
        :param min_rating:              if > 0, the URM values lower than this are removed
        :param implicit:                if True the remaining URM values are set to 1.
                                            It is applied only when min_rating > 0
        :param normalize_similarity:    if True, each row of W_sparse is l1-normalized
        """

        self.topK = topK
        self.alpha = alpha
        self.min_rating = min_rating
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

        #Pui is the row-normalized urm
        Pui = normalize(self.URM_train, norm='l1', axis=1)

        #Piu is the column-normalized, "boolean" urm transposed
        X_bool = self.URM_train.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)
        #ATTENTION: axis is still 1 because i transposed before the normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del(X_bool)

        # Alfa power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        n_items = Pui.shape[1]

        # np.argpartition raises if kth is not lower than the array length,
        # which would happen when topK is greater than the number of items
        row_topK = min(self.topK, n_items)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200

        similarity_builder = Incremental_Similarity_Builder(n_items, initial_data_block=n_items*self.topK, dtype = np.float32)

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, n_items, block_dim):

            # The last block may contain less than block_dim rows
            current_block_end_row = min(current_block_start_row + block_dim, n_items)

            # .dot is the matrix product for scipy sparse matrices
            similarity_block = Piu[current_block_start_row:current_block_end_row, :].dot(Pui)
            similarity_block = similarity_block.toarray()

            for row_in_block in range(current_block_end_row - current_block_start_row):
                item_index = current_block_start_row + row_in_block

                row_data = similarity_block[row_in_block, :]
                # The similarity of an item with itself is discarded
                row_data[item_index] = 0

                relevant_items_partition = np.argpartition(-row_data, row_topK-1, axis=0)[:row_topK]
                row_data = row_data[relevant_items_partition]

                # Incrementally build sparse matrix, do not add zeros
                if np.any(row_data == 0.0):
                    non_zero_mask = row_data != 0.0
                    relevant_items_partition = relevant_items_partition[non_zero_mask]
                    row_data = row_data[non_zero_mask]

                # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
                similarity_builder.add_data_lists(row_list_to_add=np.full(len(row_data), item_index, dtype = int),
                                                  col_list_to_add=relevant_items_partition,
                                                  data_list_to_add=row_data)


            # Print the progress every 300 seconds and after the last block
            if time.time() - start_time_printBatch > 300 or current_block_end_row == n_items:
                elapsed_time = time.time() - start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                # Guard against a zero elapsed time, possible with a coarse clock and a tiny matrix
                self._print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                    current_block_end_row,
                    100.0 * float(current_block_end_row) / n_items,
                    float(current_block_end_row) / max(elapsed_time, 1e-9),
                    new_time_value, new_time_unit))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()


        self.W_sparse = similarity_builder.get_SparseMatrix()

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')