<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

# Deep Autoencoder Deep Dive

In [58]:
import sys
sys.path.append("../../")
import os
import numpy as np
import pandas as pd
import torch
import copy

from reco_utils.common.gpu_utils import get_number_gpus, get_cuda_version
from reco_utils.common.general_utils import get_number_processors, invert_dictionary
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var
from reco_utils.recommender.deep_autoencoder.autoencoder import AutoEncoder
from reco_utils.recommender.deep_autoencoder.data import UserItemRecDataProvider
from reco_utils.recommender.deep_autoencoder.utils import add_gpu, init_optimizer, MSEloss

import logging
log = logging.getLogger(__name__)

# Report the runtime environment (platform, interpreter, PyTorch build and the
# CPU/GPU/CUDA details returned by the reco_utils helpers).
print("OS:", sys.platform)
print("Python: ", sys.version)
print("PyTorch:", torch.__version__)
print("Number of CPU processors:", get_number_processors())
print("Number of GPUs:", get_number_gpus())
print(get_cuda_version())

%load_ext autoreload
%autoreload 2

OS: linux
Python:  3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) 
[GCC 7.3.0]
PyTorch: 1.0.0
Number of CPU processors: 6
Number of GPUs: 1
CUDA Version 9.2.148
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Paths of the ratings CSV files for the train / validation / test splits.
train_path, valid_path, test_path = (
    "ratings_train.csv",
    "ratings_valid.csv",
    "ratings_test.csv",
)


In [7]:
# Settings dictionary for the training data provider.
data_params = dict(
    batch_size=128,
    major='users',    # major position is the first column id of input data
    itemIdInd=1,      # items are in the second column (index 1)
    userIdInd=0,      # users/customers are in the first column (index 0)
    delimiter=',',
    header=True,
    src_file=train_path,
)
# Seed PyTorch's random number generator.
torch.manual_seed(42)

<torch._C.Generator at 0x7fb7e70acf30>

In [11]:
# Build the training data provider from the settings in data_params.
data_layer = UserItemRecDataProvider(params=data_params)
# Tip: dir(data_layer) lists the attributes the provider exposes.

In [37]:
# Show only the first mini-batch produced by the provider, then stop.
for mb in data_layer.iterate_one_epoch():
    print(mb)
    break

tensor(indices=tensor([[   0,    0,    0,  ...,  127,  127,  127],
                       [ 424,   54, 2049,  ..., 1430, 1439, 2230]]),
       values=tensor([5., 3., 3.,  ..., 5., 5., 5.]),
       size=(128, 6375), nnz=9835, layout=torch.sparse_coo)


In [14]:
# Validation provider: an independent copy of the training settings with only
# the source file swapped, sharing the training provider's user/item id maps.
eval_params = dict(copy.deepcopy(data_params), src_file=valid_path)
validation_layer = UserItemRecDataProvider(
    params=eval_params,
    user_id_map=data_layer.user_id_map,
    item_id_map=data_layer.item_id_map,
)
# Expose the training provider's data to the validation provider as src_data.
validation_layer.src_data = data_layer.data


In [19]:
# Layer sizes passed to the autoencoder: the provider's vector_dim followed by
# the sizes listed here.
hidden_layers = [1024, 512, 512, 128]
model = AutoEncoder(
    layer_sizes=[data_layer.vector_dim, *hidden_layers],
    nl_type="selu",
    is_constrained=False,
    dp_drop_prob=0.8,
)

  weight_init.xavier_uniform(w)
  weight_init.xavier_uniform(w)


In [23]:
# Pass the model through add_gpu with device id "0"
# (presumably places it on that GPU — verify against reco_utils' add_gpu).
model = add_gpu(model, "0")

In [25]:
# Optimizer and scheduler both come from the project helper.
# NOTE(review): "wd" is presumably weight decay — confirm in init_optimizer.
optimizer, scheduler = init_optimizer(
    model,
    optimization_method="momentum",
    lr=5e-3,
    wd=1e-5,
)

In [52]:
from torch.autograd import Variable
from math import sqrt


# Send tensors to CUDA only when a GPU is actually usable; a hard-coded True
# makes every `.cuda()` call in the functions below fail on CPU-only machines.
cuda_availability = torch.cuda.is_available()
def train_loop(rencoder, optimizer, scheduler=None, num_epochs=20,
               noise_prob=0.0, aug_step=1, patience=10, restore_best=False):
    """
    Train the autoencoder with early stopping on the validation loss.

    Mini-batches are read from the notebook-level ``data_layer``. After every
    epoch the model is scored with ``_evaluate_on_validation_set`` and that
    score decides which epoch is the best one.

    :param rencoder: Model to train; it is called as ``rencoder(inputs)``.
    :param optimizer: Optimizer, stepped once per forward/backward pass.
    :param scheduler: Optional learning-rate scheduler, stepped once at the
        start of every epoch.
    :param num_epochs: Maximum number of epochs to run.
    :param noise_prob: Dropout probability applied to the re-fed outputs
        during the augmentation passes (0 disables the noise).
    :param aug_step: Number of extra passes per mini-batch in which the
        model's own output is fed back as input (0 disables them).
    :param patience: Stop once this many consecutive epochs fail to improve
        on the best validation loss.
    :param restore_best: When True, keep a copy of the best weights and load
        them back into ``rencoder`` before returning. The default leaves the
        model with the weights of the last epoch.
    :return: None. Progress is reported with ``print``.
    """
    best_loss = float("inf")
    best_epoch = 0
    best_model_wts = None
    epoch = 0
    losing_patience = 0

    # Use the fully qualified torch.nn: only `torch` itself is imported at the
    # top of the notebook, so a bare `nn` would raise NameError.
    dp = torch.nn.Dropout(p=noise_prob) if noise_prob > 0.0 else None

    # Train until the epochs run out or early stopping fires
    while epoch < num_epochs and losing_patience < patience:
        print('Doing epoch {} of {}'.format(epoch, num_epochs))
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        if scheduler is not None:
            # NOTE(review): PyTorch >= 1.1 expects scheduler.step() after
            # optimizer.step(); the call is kept here so the learning-rate
            # schedule stays exactly as it was. Confirm before upgrading.
            scheduler.step()
        for mb in data_layer.iterate_one_epoch():
            inputs = mb.cuda().to_dense() if cuda_availability else mb.to_dense()
            optimizer.zero_grad()
            loss, outputs = _backprop(rencoder, inputs, optimizer)
            total_epoch_loss += loss.item()
            denom += 1

            # Data augmentation: feed the (detached) reconstruction back in as
            # a new training example, optionally corrupted by dropout noise.
            for _ in range(aug_step):
                inputs = outputs.detach()
                if dp is not None:
                    inputs = dp(inputs)
                optimizer.zero_grad()
                loss, outputs = _backprop(rencoder, inputs, optimizer)

        # Training loss: square root of the mean per-batch loss
        epoch_loss = sqrt(total_epoch_loss / denom)
        print("Epoch {} - Training loss: {}".format(epoch, epoch_loss))

        # The validation loss replaces the training loss as selection metric
        epoch_loss = _evaluate_on_validation_set(rencoder)
        print("Epoch {} - Validation loss: {}".format(epoch, epoch_loss))

        if epoch_loss < best_loss:
            losing_patience = 0
            best_loss = epoch_loss
            best_epoch = epoch
            if restore_best:
                # Snapshot only when it will be used; deep-copying the whole
                # state dict on every improvement is otherwise wasted work.
                best_model_wts = copy.deepcopy(rencoder.state_dict())
        else:
            # One more epoch without improvement counts towards early stopping
            losing_patience += 1
        epoch += 1

    print("Best loss {} in epoch {}".format(best_loss, best_epoch))
    if restore_best and best_model_wts is not None:
        rencoder.load_state_dict(best_model_wts)

def _backprop(rencoder, inputs, optimizer):
    """
    Run one forward/backward pass and apply one optimizer step.

    The model output is compared against its own input with ``MSEloss``; the
    first value MSEloss returns is divided by the second before
    backpropagation (presumably summed error over number of ratings — verify
    against MSEloss). Gradients are not cleared here; the caller is expected
    to do that beforehand.

    :param rencoder: Model, called as ``rencoder(inputs)``.
    :param inputs: Batch used both as model input and as target.
    :param optimizer: Optimizer whose ``step()`` is invoked after backward.
    :return: Tuple ``(normalised loss, model outputs)``.
    """
    reconstruction = rencoder(inputs)
    raw_loss, rating_count = MSEloss(reconstruction, inputs)
    normalised_loss = raw_loss / rating_count
    normalised_loss.backward()
    optimizer.step()
    return normalised_loss, reconstruction

def _evaluate_on_validation_set(rencoder):
    """
    Score the model on the notebook-level ``validation_layer``.

    For every (target, user profile) pair yielded by the provider, the model
    is run on the user profile and ``MSEloss`` compares the output with the
    target. The two values MSEloss returns are accumulated separately and the
    result is ``sqrt(sum of first / sum of second)`` (an RMSE, assuming
    MSEloss returns a summed squared error and a rating count — verify).

    Side effect: the model is switched to eval mode and left there.

    :param rencoder: Model to evaluate.
    :return: The validation score as a Python float.
    """
    rencoder.eval()
    total_epoch_loss = 0.0
    denom = 0.0
    # Scoring needs no gradients; disabling autograd avoids building a graph
    # for every batch, which saves memory and time without changing results.
    with torch.no_grad():
        for target_mb, user_profile in validation_layer.iterate_one_epoch_eval():
            if cuda_availability:
                user_profile = user_profile.cuda()
                target_mb = target_mb.cuda()
            inputs = user_profile.to_dense()
            targets = target_mb.to_dense()
            outputs = rencoder(inputs)
            loss, num_ratings = MSEloss(outputs, targets)
            total_epoch_loss += loss.item()
            denom += num_ratings.item()
    return sqrt(total_epoch_loss / denom)

In [53]:
# Train the model; per-epoch training and validation losses are printed.
train_loop(model, optimizer, scheduler)

Doing epoch 0 of 20
Epoch 0 - Training loss: 0.8397684036877522
Epoch 0 - Validation loss: 1.0088240287902315
Doing epoch 1 of 20
Epoch 1 - Training loss: 0.8367286859629405
Epoch 1 - Validation loss: 1.0082671629657438
Doing epoch 2 of 20
Epoch 2 - Training loss: 0.8378885122419276
Epoch 2 - Validation loss: 1.007014279433431
Doing epoch 3 of 20
Epoch 3 - Training loss: 0.8360690931328426
Epoch 3 - Validation loss: 1.005681530466913
Doing epoch 4 of 20
Epoch 4 - Training loss: 0.8298117032809726
Epoch 4 - Validation loss: 1.0047572805108538
Doing epoch 5 of 20
Epoch 5 - Training loss: 0.828885582064895
Epoch 5 - Validation loss: 1.0038280099692707
Doing epoch 6 of 20
Epoch 6 - Training loss: 0.8288188187116898
Epoch 6 - Validation loss: 1.0027213862829305
Doing epoch 7 of 20
Epoch 7 - Training loss: 0.825830375487737
Epoch 7 - Validation loss: 1.001512954053215
Doing epoch 8 of 20
Epoch 8 - Training loss: 0.8246885506528209
Epoch 8 - Validation loss: 1.000410581692314
Doing epoch 9 of

In [56]:
def recommend_k_items(model, data_layer, k=10):
    """
    Return the top-scored items each user has not rated yet.

    :param model: Trained model, called on a dense batch of user profiles.
    :param data_layer: Data provider exposing ``user_id_map``,
        ``item_id_map`` and ``iterate_one_epoch(shuffle_data=False)``; the
        latter is expected to yield ``(mini-batch, major indices)`` pairs.
    :param k: Maximum number of items to recommend per user.
    :return: DataFrame with columns ``customerID``, ``itemID`` and ``rating``
        (all stored as floats), holding up to ``k`` rows per user ordered by
        decreasing predicted rating. A user with fewer than ``k`` unrated
        items gets fewer rows, never an already-rated item.
    """
    model.eval()

    # Inverse mappings: internal indices -> ids used in the input data
    inv_user_id_map = invert_dictionary(data_layer.user_id_map)
    inv_item_id_map = invert_dictionary(data_layer.item_id_map)

    # Rows are collected per user and stacked at the end. Writing into a
    # preallocated array at `batch_index * current_batch_size` puts a smaller
    # final batch at the wrong offset, overwriting earlier rows.
    collected = []
    with torch.no_grad():
        for mb, major_indices in data_layer.iterate_one_epoch(shuffle_data=False):
            # Given the user profiles, compute the ratings of all items
            inputs = mb.cuda().to_dense() if cuda_availability else mb.to_dense()
            outputs = model(inputs).detach().cpu().numpy()

            # Ids of the users in this batch, as they appear in the input data
            user_ids = [inv_user_id_map[idx] for idx in major_indices]

            # Already-rated items get -inf so they can never outrank an
            # unseen item, even when the predicted ratings are negative.
            seen = mb.to_dense().numpy() != 0
            scores = np.where(seen, -np.inf, outputs)

            # Best-scored item indices first, truncated to k per user
            top_items = np.fliplr(np.argsort(scores, axis=1))[:, :k]

            for b in range(outputs.shape[0]):
                # Drop rated items that slipped in because fewer than k
                # unseen items were available for this user.
                candidates = top_items[b]
                candidates = candidates[~seen[b, candidates]]
                if candidates.size == 0:
                    continue
                user_rows = np.empty((candidates.size, 3))
                user_rows[:, 0] = user_ids[b]
                user_rows[:, 1] = [inv_item_id_map[item] for item in candidates]
                user_rows[:, 2] = outputs[b, candidates]
                collected.append(user_rows)

    results = np.vstack(collected) if collected else np.zeros((0, 3))
    rec_df = pd.DataFrame(
        results, columns=['customerID', 'itemID', 'rating'])
    return rec_df

# Recommend items (default k=10) for the users of the training provider and
# preview the first rows.
df_ranking = recommend_k_items(model, data_layer)
df_ranking.head()

Unnamed: 0,customerID,itemID,rating
0,1.0,858.0,4.007101
1,1.0,6077.0,4.002754
2,1.0,876.0,3.978678
3,1.0,98491.0,3.950896
4,1.0,6461.0,3.812881


In [60]:
# Test provider: an independent copy of the training settings with only the
# source file swapped, sharing the training provider's user/item id maps.
test_params = dict(copy.deepcopy(data_params), src_file=test_path)
test_data_layer = UserItemRecDataProvider(
    params=test_params,
    user_id_map=data_layer.user_id_map,
    item_id_map=data_layer.item_id_map,
)
# Expose the training provider's data to the test provider as src_data.
test_data_layer.src_data = data_layer.data


def predict_regression(model, eval_data_layer, cols=('customerID', 'itemID', 'rating')):
    """
    Predict ratings only for the items rated in the evaluation set.

    :param model: Trained model, called on a dense user profile.
    :param eval_data_layer: Data provider exposing ``user_id_map``,
        ``item_id_map`` and ``iterate_one_epoch_eval(for_inf=True)``; the
        latter is expected to yield ``((targets, user_profile), major_ind)``.
    :param cols: Names of the three output columns, in the order
        (user id, item id, predicted rating).
    :return: DataFrame with one row per non-zero entry of the targets,
        holding the user id, the item id and the predicted rating.
    """
    model.eval()

    # Inverse mappings: internal indices -> ids used in the input data
    inv_user_id_map = invert_dictionary(eval_data_layer.user_id_map)
    inv_item_id_map = invert_dictionary(eval_data_layer.item_id_map)

    # FIXME: optimize iterate_one_epoch_eval to yield data of batch_size.
    # Only the first row of every batch is read below, so a batch size
    # greater than 1 would silently drop users.
    results = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for (targets, user_profile), major_ind in \
                eval_data_layer.iterate_one_epoch_eval(for_inf=True):
            # Given a user profile, compute the ratings of all items
            inputs = user_profile.cuda().to_dense() if cuda_availability else user_profile.to_dense()
            outputs = model(inputs).detach().cpu().numpy()[0, :]

            # Id of this user as it appears in the input data
            major_key = inv_user_id_map[major_ind]

            # Most targets are zero; keep the indices of the non-zero ones
            targets_np = targets.to_dense().numpy()[0, :]
            non_zeros = targets_np.nonzero()[0].tolist()

            # One output row per item scored in the evaluation set
            results.extend(
                [major_key, inv_item_id_map[ind], outputs[ind]]
                for ind in non_zeros)

    test_pred = pd.DataFrame(results, columns=list(cols))
    return test_pred

# Predict ratings for the user-item pairs of the test provider and preview
# the first rows.
df_rating = predict_regression(model, test_data_layer)
df_rating.head()

Unnamed: 0,customerID,itemID,rating
0,1,2193,2.498287
1,1,2968,2.879097
2,1,1405,2.303481
3,1,2150,2.84881
4,1,1172,3.486091
