# Using Neural Networks for Content-Based Recommendation Systems

```python

import tensorflow as tf


# Feature columns for the content-based model.
# NOTE(review): `content_ids_list`, `authors_list`, `months_since_epoch_boundaries`
# and `hub` are not defined in this snippet; `hub` is presumably
# `import tensorflow_hub as hub` -- confirm against the full notebook.

# Content id: hashed into as many buckets as there are known ids, then embedded.
content_id_column = tf.feature_column.categorical_column_with_hash_bucket(
  key="content_id",
  hash_bucket_size=len(content_ids_list))

embedded_content_column = tf.feature_column.embedding_column(
  categorical_column=content_id_column,
  dimension=10)

# Title: embedded with a TF-Hub text module; trainable=False leaves the
# module's weights untouched during training.
embedded_title_column = hub.text_embedding_column(
  key="title",
  module_spec="https://tfhub.dev/google/nnlm-de-dim50/1",
  trainable=False
)

# Author: hashed into one bucket more than the number of known authors
# (presumably to leave room for unseen authors -- TODO confirm), then embedded.
author_column = tf.feature_column.categorical_column_with_hash_bucket(
  key="author",
  hash_bucket_size=len(authors_list) + 1)

# Fixed: the original called the non-existent `embedding_columm`.
embedded_author_column = tf.feature_column.embedding_column(
  categorical_column=author_column,
  dimension=3
)

# Publication time: a numeric column that is then bucketized at the given boundaries.
months_since_epoch_column = tf.feature_column.numeric_column(
  key="months_since_epoch"
)

months_since_epoch_bucketized = tf.feature_column.bucketized_column(
  source_column=months_since_epoch_column,
  boundaries=months_since_epoch_boundaries)


# Turn the raw feature dict into a single dense input tensor using the
# feature columns supplied through params.
net = tf.feature_column.input_layer(
  features=features,
  feature_columns=params['feature_columns'])

# Stack one fully connected ReLU layer per entry in params['hidden_units'].
for units in params['hidden_units']:
  net = tf.layers.dense(inputs=net, units=units, activation=tf.nn.relu)

# Final linear layer: one logit per class, no activation applied.
logits = tf.layers.dense(inputs=net, units=params['n_classes'], activation=None)


```

# WALS Collaborative Filtering Implementation in TensorFlow Estimator

```python
# Because WALS requires whole rows or columns, the data has to be preprocessed to provide SparseTensors of rows/columns

import tensorflow as tf

# Group the ratings by item so that each record holds everything for one item.
grouped_by_items = mapped_df.groupby('itemId')

# Write one tf.train.Example per item:
#   'key'     -> the item id
#   'indices' -> ids of the users who rated the item
#   'values'  -> the corresponding ratings
# Fixed: the writer lives at tf.python_io.TFRecordWriter (the original spelled
# both the module and the class name wrong).
with tf.python_io.TFRecordWriter('data/users_for_item') as ofp:
  for item, grouped in grouped_by_items:
    example = tf.train.Example(features=tf.train.Features(feature={
      'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[item])),
      'indices': tf.train.Feature(int64_list=tf.train.Int64List(value=grouped['userId'].values)),
      'values': tf.train.Feature(float_list=tf.train.FloatList(value=grouped['rating'].values))
    }))

    ofp.write(example.SerializeToString())



```

```python
# Because WALS requires whole rows or columns, the data has to be preprocessed to provide SparseTensors of rows/columns

import tensorflow as tf

# Group the ratings by user so that each record holds everything for one user.
grouped_by_users = mapped_df.groupby('userId')

# Write one tf.train.Example per user:
#   'key'     -> the user id
#   'indices' -> ids of the items the user rated
#   'values'  -> the corresponding ratings
# Fixed: the writer is tf.python_io.TFRecordWriter, and the output goes to
# 'data/items_for_user' -- the file name the input function reads for the
# rows -- instead of overwriting the per-item file 'data/users_for_item'.
with tf.python_io.TFRecordWriter('data/items_for_user') as ofp:
  for user, grouped in grouped_by_users:
    example = tf.train.Example(features=tf.train.Features(feature={
      'key': tf.train.Feature(int64_list=tf.train.Int64List(value=[user])),
      'indices': tf.train.Feature(int64_list=tf.train.Int64List(value=grouped['itemId'].values)),
      'values': tf.train.Feature(float_list=tf.train.FloatList(value=grouped['rating'].values))
    }))

    ofp.write(example.SerializeToString())
```


# WALS Matrix Factorization Estimator

```python

# Wire the WALS estimator, its input functions and the export strategy into a
# tf.contrib.learn Experiment.
# NOTE(review): `args`, `train_steps`, `steps_in_epoch`, `read_dataset` and
# `create_serving_input_fn` come from outside this snippet.
tf.contrib.learn.Experiment(
  estimator=tf.contrib.factorization.WALSMatrixFactorization(
    num_rows=args['nusers'],
    num_cols=args['nitems'],
    embedding_dimension=args['n_embeds'],
    model_dir=args['output_dir'],
  ),
  train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
  eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
  train_steps=train_steps,
  eval_steps=1,
  min_eval_frequency=steps_in_epoch,
  export_strategies=tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
    serving_input_fn=create_serving_input_fn(args)),
)

```
# The input function has to read the files and create sparse tensors for the rows and for the columns

```python

def parse_tfrecords(filename, vocab_size):
  """Read TFRecord files and return the next batch as a remapped SparseTensor.

  Reads `args` and `mode` from the enclosing scope; neither is defined in this
  snippet (presumably this function is nested inside `read_dataset(mode, args)`
  -- TODO confirm).

  Args:
    filename: file name or glob pattern, resolved relative to args['input_path'].
    vocab_size: forwarded to decode_example as the sparse_merge vocab size.

  Returns:
    The `get_next()` tensor of a one-shot iterator over the batched dataset.
  """
  files = tf.gfile.Glob(os.path.join(args['input_path'], filename))
  dataset = tf.data.TFRecordDataset(files)
  dataset = dataset.map(lambda x: decode_example(x, vocab_size))

  if mode == tf.estimator.ModeKeys.TRAIN:
    num_epochs = None  # indefinitely
  else:
    num_epochs = 1  # end-of-input after this

  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(args['batch_size'])
  # Fixed: the original lambda bound `xZ` but used `x`, so it raised NameError.
  dataset = dataset.map(lambda x: remap_keys(x))
  return dataset.make_one_shot_iterator().get_next()


def _input_fn():
  """Build the feature dict consumed by WALSMatrixFactorization.

  Reads `args` from the enclosing scope (not defined in this snippet).

  Returns:
    A `(features, None)` tuple; no labels are supplied.
  """
  features = {
    # Rows come from the per-user file, so their width is the number of items.
    WALSMatrixFactorization.INPUT_ROWS: parse_tfrecords('items_for_user', args['nitems']),
    # Columns come from the per-item file, so their width is the number of users.
    # Fixed: the key was misspelled 'nuers'; every other block uses 'nusers'.
    WALSMatrixFactorization.INPUT_COLS: parse_tfrecords('users_for_item', args['nusers']),
    WALSMatrixFactorization.PROJECT_ROW: tf.constant(True)
  }

  return features, None
```


# Decode the TF Record files and invoke sparse_merge to create the necessary SparseTensor
```python

def decode_example(protos, vocab_size):
  """Parse one serialized Example into a SparseTensor with its key appended.

  Args:
    protos: a serialized tf.train.Example holding 'key', 'indices' and 'values'.
    vocab_size: vocab size handed to tf.sparse_merge.

  Returns:
    A tf.SparseTensor whose indices carry the record key as one extra trailing
    entry (with a 0.0 value), so the key can be recovered after batching.
  """
  # Schema of the features that were saved in the TFRecord file.
  feature_spec = {
    'key': tf.FixedLenFeature([1], tf.int64),
    'indices': tf.VarLenFeature(dtype=tf.int64),
    'values': tf.VarLenFeature(dtype=tf.float32)
  }
  parsed = tf.parse_single_example(protos, feature_spec)

  # Merge the separate index and value features into a single SparseTensor.
  merged = tf.sparse_merge(parsed['indices'], parsed['values'], vocab_size=vocab_size)

  # Key (itemId or userId), kept so it can be remapped once batching is done.
  key = parsed['key']

  # Splice the key onto the end of the indices, with a 0.0 value to match.
  indices_with_key = tf.concat([merged.indices, [key]], axis=0)
  values_with_pad = tf.concat([merged.values, [0.0]], axis=0)

  return tf.SparseTensor(
    indices=indices_with_key,
    values=values_with_pad,
    dense_shape=merged.dense_shape)

```

# Remap keys to SparseTensor to fix re-indexing after batching

```python

def remap_keys(sparse_tensor):
  """Restore the original keys as row indices of a batched SparseTensor.

  decode_example appends each example's key as one extra index entry. After
  batching, the first index column holds the position within the batch; this
  function pulls the appended keys back out, drops those extra entries, and
  writes the keys into the first index column instead.

  Args:
    sparse_tensor: a batched tf.SparseTensor built from decode_example outputs.

  Returns:
    A tf.SparseTensor with the same dense_shape, holding only the real data
    entries, whose first index column is the example key.
  """
  # Current indices of our SparseTensor that we need to fix
  bad_indices = sparse_tensor.indices
  # Current values of our SparseTensor that we need to fix
  bad_values = sparse_tensor.values

  # Group by the batch_indices and get the count for each; subtract 1 to
  # discount the appended key entry. (Fixed typo: tf.segment_sum.)
  size = tf.segment_sum(data=tf.ones_like(bad_indices[:, 0], dtype=tf.int64), segment_ids=bad_indices[:, 0]) - 1

  # The number of batch_indices (this should be batch_size unless it is a partially full batch)
  length = tf.shape(size, out_type=tf.int64)[0]

  # Finds the cumulative sum which we can use for indexing later
  cum = tf.cumsum(size)

  # The offsets between each example in the batch due to concatenation of the keys in decode_example
  length_range = tf.range(start=0, limit=length, delta=1, dtype=tf.int64)

  # Indices of the SparseTensor of the rows added by concatenation of keys in decode_example
  cum_range = cum + length_range

  # The keys that we have extracted back out of our concatenated SparseTensor
  gathered_indices = tf.squeeze(tf.gather(bad_indices, cum_range)[:, 1])

  # The enumerated row indices of the SparseTensor's indices member
  sparse_indices_range = tf.range(tf.shape(bad_indices, out_type=tf.int64)[0], dtype=tf.int64)

  # We want the row indices that are actual data and not the concatenated key
  # rows, so we find the intersection of the two sets and then negate it.
  x = sparse_indices_range
  s = cum_range

  # Number of multiples we are going to tile x, which is our sparse_indices_range
  tile_multiples = tf.concat([tf.ones(tf.shape(tf.shape(x)), dtype=tf.int64), tf.shape(s, out_type=tf.int64)], axis=0)

  # Expands x, our sparse_indices_range, into a rank 2 tensor
  # Then multiplies the rows by 1 (no copying) and the columns by the number of examples in the batch
  # (Fixed syntax error: `-1_,` -> `-1),`.)
  x_tile = tf.tile(tf.expand_dims(x, -1), tile_multiples)

  # Essentially a vectorized logical or, that we then negate
  x_not_in_s = ~tf.reduce_any(tf.equal(x_tile, s), -1)

  # The SparseTensor's indices that are our actual data, selected with the
  # boolean mask built above
  selected_indices = tf.boolean_mask(tensor=bad_indices, mask=x_not_in_s, axis=0)

  # Apply the same boolean_mask to the entire values member of our SparseTensor
  # Gets the actual values data
  selected_values = tf.boolean_mask(tensor=bad_values, mask=x_not_in_s, axis=0)

  # Need to replace the first column of selected_indices with keys
  # So we first need to tile gathered_indices
  # (Fixed syntax error: `-1_,` -> `-1),`.)
  tiling = tf.tile(input=tf.expand_dims(gathered_indices[0], -1), multiples=tf.expand_dims(size[0], -1))

  # We have to repeatedly apply the tiling to each example in the batch
  # Since it is jagged we cannot use tf.map_fn due to the stacking of the
  # TensorArray, so we have to create our own custom version
  def loop_body(i, tensor_grow):
    # Fixed: the list is tf.concat's `values` argument; the original passed an
    # undefined name `values` positionally.
    return i + 1, tf.concat(values=[tensor_grow,
              tf.tile(input=tf.expand_dims(gathered_indices[i], -1),
              multiples=tf.expand_dims(size[i], -1))], axis=0)

  # Start at 1 because example 0 is already covered by the initial `tiling`.
  _, result = tf.while_loop(lambda i, tensor_grow: i < length, loop_body, [tf.constant(1, dtype=tf.int64), tiling])

  # Concatenate tiled keys with the 2nd column of selected_indices
  selected_indices_fixed = tf.concat([tf.expand_dims(result, -1),
                                      tf.expand_dims(selected_indices[:, 1], -1)], axis=1)

  # Combine everything together back into a SparseTensor
  remapped_sparse_tensor = tf.SparseTensor(indices=selected_indices_fixed, values=selected_values, dense_shape=sparse_tensor.dense_shape)

  return remapped_sparse_tensor
```


# The train_and_evaluate loop is typical of tf.contrib Estimators

```python
def train_and_evaluate(args):
  """Build the WALS Experiment and run it through learn_runner.

  Args:
    args: dict read for 'num_epochs', 'nusers', 'nitems', 'batch_size',
      'n_embeds' and 'output_dir'; it is also passed on to read_dataset and
      create_serving_input_fn.
  """
  # epochs * users / batch size, rounded to the nearest whole step.
  # Fixed: the original line was missing its closing parenthesis.
  train_steps = int(0.5 + (1.0 * args['num_epochs'] * args['nusers']) / args['batch_size'])

  # Fixed: the function was defined as `experiement_fn` but passed to
  # learn_runner as `experiment_fn`, and the class name was misspelled
  # `Experiement`.
  def experiment_fn(output_dir):
    return tf.contrib.learn.Experiment(
      tf.contrib.factorization.WALSMatrixFactorization(
        num_rows=args['nusers'], num_cols=args['nitems'],
        embedding_dimension=args['n_embeds'],
        model_dir=args['output_dir']),
      train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
      eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
      train_steps=train_steps,
      eval_steps=1,
      # NOTE(review): steps_in_epoch is not defined in this snippet -- confirm
      # it exists in the enclosing module.
      min_eval_frequency=steps_in_epoch,
      export_strategies=tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(serving_input_fn=create_serving_input_fn(args)))

  # NOTE(review): learn_runner is not imported in this snippet -- confirm.
  learn_runner.run(experiment_fn, args['output_dir'])

def find_top_k(user_factors, item_factors, k):
  """Return the int64 indices of the k highest-scoring items for one user.

  Args:
    user_factors: factor vector of a single user.
    item_factors: factor matrix of all items.
    k: how many items to return.
  """
  # Add a leading axis to the user vector and multiply it against the
  # transposed item factors to get one score per item.
  user_row = tf.expand_dims(user_factors, 0)
  scores = tf.matmul(user_row, tf.transpose(item_factors))

  best = tf.nn.top_k(scores, k=k)

  return tf.cast(best.indices, dtype=tf.int64)
```

# Finding top K items for all users can be done as a batch prediction job

```python

def batch_predict(args):
  """Write the top-K item indices for every user to <output_dir>/batch_pred.txt.

  Each output line is the comma-separated list of item indices for one user.

  Args:
    args: dict read for 'nusers', 'nitems', 'n_embeds', 'output_dir' and 'topk'.
  """
  with tf.Session() as sess:
    estimator = tf.contrib.factorization.WALSMatrixFactorization(
                        num_rows=args['nusers'], num_cols=args['nitems'],
                        embedding_dimension=args['n_embeds'],
                        model_dir=args['output_dir'])

    user_factors = tf.convert_to_tensor(estimator.get_row_factors()[0]) # (nusers, nembeds)
    item_factors = tf.convert_to_tensor(estimator.get_col_factors()[0]) # (nitems, nembeds)

    # for each user, find the top K items
    topk = tf.squeeze(tf.map_fn(lambda user: find_top_k(user, item_factors, args['topk']), user_factors, dtype=tf.int64))

    # Fixed: the closing parenthesis of os.path.join was misplaced, which was a
    # syntax error and put mode='w' inside the join call.
    # NOTE(review): file_io is not imported in this snippet -- confirm.
    with file_io.FileIO(os.path.join(args['output_dir'], 'batch_pred.txt'), mode='w') as f:
      for best_items_for_user in topk.eval():
        f.write(','.join(str(x) for x in best_items_for_user) + '\n')


```