In [2]:
!pip install sh

Collecting sh
  Downloading https://files.pythonhosted.org/packages/4a/22/17b22ef5b049f12080f5815c41bf94de3c229217609e469001a8f80c1b3d/sh-1.12.14-py2.py3-none-any.whl
[31mgoogle-cloud-pubsub 0.26.0 has requirement google-cloud-core<0.26dev,>=0.25.0, but you'll have google-cloud-core 0.28.1 which is incompatible.[0m
[31mtensorflow-transform 0.8.0 has requirement protobuf<4,>=3.6.0, but you'll have protobuf 3.5.2 which is incompatible.[0m
[31mgapic-google-cloud-pubsub-v1 0.15.4 has requirement oauth2client<4.0dev,>=2.0.0, but you'll have oauth2client 4.1.2 which is incompatible.[0m
[31mproto-google-cloud-pubsub-v1 0.15.4 has requirement oauth2client<4.0dev,>=2.0.0, but you'll have oauth2client 4.1.2 which is incompatible.[0m
[31mapache-airflow 1.9.0 has requirement bleach==2.1.2, but you'll have bleach 3.0.2 which is incompatible.[0m
[31mapache-airflow 1.9.0 has requirement jinja2<2.9.0,>=2.7.3, but you'll have jinja2 2.10 which is incompatible.[0m
[31mapache-airflow 1.9.0 h

In [3]:
%writefile wals.py
import math

import numpy as np
import tensorflow as tf
from tensorflow.contrib.factorization.python.ops import factorization_ops


def get_rmse(output_row, output_col, actual):
  """Compute rmse between predicted and actual ratings.

  Args:
    output_row: evaluated numpy array of row_factor
    output_col: evaluated numpy array of col_factor
    actual: coo_matrix of actual (test) values

  Returns:
    rmse
  """
  mse = 0
  for i in xrange(actual.data.shape[0]):
    row_pred = output_row[actual.row[i]]
    col_pred = output_col[actual.col[i]]
    err = actual.data[i] - np.dot(row_pred, col_pred)
    mse += err * err
  mse /= actual.data.shape[0]
  rmse = math.sqrt(mse)
  return rmse


def simple_train(model, input_tensor, num_iterations):
  """Helper function to train model on input for num_iterations.

  Args:
    model:            WALSModel instance
    input_tensor:     SparseTensor for input ratings matrix
    num_iterations:   number of row/column updates to run

  Returns:
    tensorflow session, for evaluating results
  """
  sess = tf.Session(graph=input_tensor.graph)

  with input_tensor.graph.as_default():
    row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
    col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

    sess.run(model.initialize_op)
    sess.run(model.worker_init)
    for _ in xrange(num_iterations):
      sess.run(model.row_update_prep_gramian_op)
      sess.run(model.initialize_row_update_op)
      sess.run(row_update_op)
      sess.run(model.col_update_prep_gramian_op)
      sess.run(model.initialize_col_update_op)
      sess.run(col_update_op)

  return sess

LOG_RATINGS = 0
LINEAR_RATINGS = 1
LINEAR_OBS_W = 100.0


def make_wts(data, wt_type, obs_wt, feature_wt_exp, axis):
  """Generate observed item weights.

  Args:
    data:             coo_matrix of ratings data
    wt_type:          weight type, LOG_RATINGS or LINEAR_RATINGS
    obs_wt:           linear weight factor
    feature_wt_exp:   logarithmic weight factor
    axis:             axis to make weights for, 1=rows/users, 0=cols/items

  Returns:
    vector of weights for cols (items) or rows (users)
  """
  # recipricol of sum of number of items across rows (if axis is 0)
  frac = np.array(1.0/(data > 0.0).sum(axis))

  # filter any invalid entries
  frac[np.ma.masked_invalid(frac).mask] = 0.0

  # normalize weights according to assumed distribution of ratings
  if wt_type == LOG_RATINGS:
    wts = np.array(np.power(frac, feature_wt_exp)).flatten()
  else:
    wts = np.array(obs_wt * frac).flatten()

  # check again for any numerically unstable entries
  assert np.isfinite(wts).sum() == wts.shape[0]
  return wts


def wals_model(data, dim, reg, unobs, weights=False,
               wt_type=LINEAR_RATINGS, feature_wt_exp=None,
               obs_wt=LINEAR_OBS_W):
  """Create the WALSModel and input, row and col factor tensors.

  Args:
    data:           scipy coo_matrix of item ratings
    dim:            number of latent factors
    reg:            regularization constant
    unobs:          unobserved item weight
    weights:        True: set obs weights, False: obs weights = unobs weights
    wt_type:        feature weight type: linear (0) or log (1)
    feature_wt_exp: feature weight exponent constant
    obs_wt:         feature weight linear factor constant

  Returns:
    input_tensor:   tensor holding the input ratings matrix
    row_factor:     tensor for row_factor
    col_factor:     tensor for col_factor
    model:          WALSModel instance
  """
  row_wts = None
  col_wts = None

  num_rows = data.shape[0]
  num_cols = data.shape[1]

  if weights:
    assert feature_wt_exp is not None
    row_wts = np.ones(num_rows)
    col_wts = make_wts(data, wt_type, obs_wt, feature_wt_exp, 0)

  row_factor = None
  col_factor = None

  with tf.Graph().as_default():

    input_tensor = tf.SparseTensor(indices=zip(data.row, data.col),
                                   values=(data.data).astype(np.float32),
                                   dense_shape=data.shape)

    model = factorization_ops.WALSModel(num_rows, num_cols, dim,
                                        unobserved_weight=unobs,
                                        regularization=reg,
                                        row_weights=row_wts,
                                        col_weights=col_wts)

    # retrieve the row and column factors
    row_factor = model.row_factors[0]
    col_factor = model.col_factors[0]

  return input_tensor, row_factor, col_factor, model

Writing wals.py


In [113]:
import datetime
import numpy as np
import os
import pandas as pd
from scipy.sparse import coo_matrix
import sh
import tensorflow as tf

import wals

# ratio of train set size to test set size
TEST_SET_RATIO = 10

# default hyperparameters
DEFAULT_PARAMS = {
    'weights': True,
    'latent_factors': 5,
    'num_iters': 20,
    'regularization': 0.07,
    'unobs_weight': 0.01,
    'wt_type': 0,
    'feature_wt_factor': 130.0,
    'feature_wt_exp': 0.08,
    'delimiter': '\t'
}

# parameters optimized with hypertuning for the MovieLens data set
OPTIMIZED_PARAMS = {
    'latent_factors': 34,
    'regularization': 9.83,
    'unobs_weight': 0.001,
    'feature_wt_factor': 189.8,
}

# parameters optimized with hypertuning for the included web views data set
OPTIMIZED_PARAMS_WEB = {
    'latent_factors': 30,
    'regularization': 7.27,
    'unobs_weight': 0.01,
    'feature_wt_exp': 5.05,
}


def create_test_and_train_sets():
  
  return _ratings_train_and_test()

def _ratings_train_and_test():
  headers = ['userId', 'movieId', 'rating', 'timestamp']
  #header_row = 0 if use_headers else None
  
  ratings_df = pd.read_pickle("../data/rating.pkl")
 # ratings_df = pd.read_csv(input_file,
 #                          sep=delimiter,
 #                          names=headers,
 #                          header=header_row,
 #                          dtype={
 #                              'user_id': np.int32,
 #                              'item_id': np.int32,
 #                              'rating': np.float32,
 #                              'timestamp': np.int32,
 #                          })

  np_users = ratings_df.userId.as_matrix()
  np_items = ratings_df.movieId.as_matrix()
  unique_users = np.unique(np_users)
  unique_items = np.unique(np_items)

  n_users = unique_users.shape[0]
  n_items = unique_items.shape[0]

  # make indexes for users and items if necessary
  max_user = unique_users[-1]
  max_item = unique_items[-1]
  if n_users != max_user or n_items != max_item:
    # make an array of 0-indexed unique user ids corresponding to the dataset
    # stack of user ids
    z = np.zeros(max_user+1, dtype=int)
    z[unique_users] = np.arange(n_users)
    u_r = z[np_users]

    # make an array of 0-indexed unique item ids corresponding to the dataset
    # stack of item ids
    z = np.zeros(max_item+1, dtype=int)
    z[unique_items] = np.arange(n_items)
    i_r = z[np_items]

    # construct the ratings set from the three stacks
    np_ratings = ratings_df.rating.as_matrix()
    ratings = np.zeros((np_ratings.shape[0], 3), dtype=object)
    ratings[:, 0] = u_r
    ratings[:, 1] = i_r
    ratings[:, 2] = np_ratings
  else:
    ratings = ratings_df.as_matrix(['userId', 'movieId', 'rating'])
    # deal with 1-based user indices
    ratings[:, 0] -= 1
    ratings[:, 1] -= 1

  tr_sparse, test_sparse = _create_sparse_train_and_test(ratings,
                                                         n_users, n_items)

  return ratings[:, 0], ratings[:, 1], tr_sparse, test_sparse

def _create_sparse_train_and_test(ratings, n_users, n_items):
  """Given ratings, create sparse matrices for train and test sets.

  Args:
    ratings:  list of ratings tuples  (u, i, r)
    n_users:  number of users
    n_items:  number of items

  Returns:
     train, test sparse matrices in scipy coo_matrix format.
  """
  # pick a random test set of entries, sorted ascending
  test_set_size = len(ratings) / TEST_SET_RATIO
  test_set_idx = np.random.choice(xrange(len(ratings)),
                                  size=test_set_size, replace=False)
  test_set_idx = sorted(test_set_idx)

  # sift ratings into train and test sets
  ts_ratings = ratings[test_set_idx]
  tr_ratings = np.delete(ratings, test_set_idx, axis=0)

  # create training and test matrices as coo_matrix's
  u_tr, i_tr, r_tr = zip(*tr_ratings)
  tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape=(n_users, n_items))

  u_ts, i_ts, r_ts = zip(*ts_ratings)
  test_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape=(n_users, n_items))

  return tr_sparse, test_sparse


def train_model(args, tr_sparse):
  dim = args['latent_factors']
  num_iters = args['num_iters']
  reg = args['regularization']
  unobs = args['unobs_weight']
  wt_type = args['wt_type']
  feature_wt_exp = args['feature_wt_exp']
  obs_wt = args['feature_wt_factor']

  tf.logging.info('Train Start: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

  # generate model
  input_tensor, row_factor, col_factor, model = wals.wals_model(tr_sparse,
                                                                dim,
                                                                reg,
                                                                unobs,
                                                                args['weights'],
                                                                wt_type,
                                                                feature_wt_exp,
                                                                obs_wt)

  # factorize matrix
  session = wals.simple_train(model, input_tensor, num_iters)

  tf.logging.info('Train Finish: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

  # evaluate output factor matrices
  output_row = row_factor.eval(session=session)
  output_col = col_factor.eval(session=session)

  # close the training session now that we've evaluated the output
  session.close()

  return output_row, output_col, model


def save_model(output_dir, user_map, item_map, row_factor, col_factor):
  """Save the user map, item map, row factor and column factor matrices in numpy format.

  These matrices together constitute the "recommendation model."

  Args:
    args:         input args to training job
    user_map:     user map numpy array
    item_map:     item map numpy array
    row_factor:   row_factor numpy array
    col_factor:   col_factor numpy array
  """
  model_dir = os.path.join(output_dir, 'model')

  # if our output directory is a GCS bucket, write model files to /tmp,
  # then copy to GCS
  gs_model_dir = None
  if model_dir.startswith('gs://'):
    gs_model_dir = model_dir
    model_dir = '/tmp/{0}'.format(args['job_name'])

  os.makedirs(model_dir)
  np.save(os.path.join(model_dir, 'user'), user_map)
  np.save(os.path.join(model_dir, 'item'), item_map)
  np.save(os.path.join(model_dir, 'row'), row_factor)
  np.save(os.path.join(model_dir, 'col'), col_factor)

  if gs_model_dir:
    sh.gsutil('cp', '-r', os.path.join(model_dir, '*'), gs_model_dir)


def generate_recommendations(user_idx, user_rated, row_factor, col_factor, k):
  """Generate recommendations for a user.

  Args:
    user_idx: the row index of the user in the ratings matrix,

    user_rated: the list of item indexes (column indexes in the ratings matrix)
      previously rated by that user (which will be excluded from the
      recommendations)

    row_factor: the row factors of the recommendation model
    col_factor: the column factors of the recommendation model

    k: number of recommendations requested

  Returns:
    list of k item indexes with the predicted highest rating, excluding
    those that the user has already rated
  """

  # bounds checking for args
  assert (row_factor.shape[0] - len(user_rated)) >= k

  # retrieve user factor
  user_f = row_factor[user_idx]

  # dot product of item factors with user factor gives predicted ratings
  pred_ratings = col_factor.dot(user_f)

  # find candidate recommended item indexes sorted by predicted rating
  k_r = k + len(user_rated)
  candidate_items = np.argsort(pred_ratings)[-k_r:]

  # remove previously rated items and take top k
  recommended_items = [i for i in candidate_items if i not in user_rated]
  recommended_items = recommended_items[-k:]

  # flip to sort highest rated first
  recommended_items.reverse()

  return recommended_items


In [115]:
user_map, item_map, tr_sparse, test_sparse = create_test_and_train_sets()

# train model
output_row, output_col, md = train_model(DEFAULT_PARAMS,tr_sparse)

INFO:tensorflow:Train Start: 2018-11-15 15:07:37
INFO:tensorflow:Train Finish: 2018-11-15 15:16:18


In [116]:
row  = np.array([0, 0, 0, 0])
col  = np.array([50, 153, 671, 1002])
data = np.array([4.5, 5.0, 4.5, 4.9], dtype=np.float32)
new_user = coo_matrix(data, (row, col))

In [117]:
new_user_tensor = tf.SparseTensor(indices=zip(new_user.row, new_user.col),
                                   values=(new_user.data).astype(np.float32),
                                   dense_shape=new_user.shape)

AttributeError: 'module' object has no attribute 'project_row_factors'

In [120]:
movies_embds = output_col

In [127]:
from sklearn.neighbors import KDTree

In [128]:
tree = KDTree(movies_embds, leaf_size=2) 

In [135]:
user_representation = sum([movies_embds[i] for i in [140, 340, 5002]])

In [136]:
user_representation

array([  8.8227005, -20.886433 ,   6.42346  ,  12.361511 ,  -3.4397814],
      dtype=float32)

In [139]:
movies_embds[:1]

array([[ 11.313215 , -23.997679 ,   8.53643  ,  13.413322 ,   5.4243336]],
      dtype=float32)

In [145]:
_, recs = tree.query(user_representation.reshape(1, -1), k=5)

In [146]:
for i in [140, 340, 5002]:
  print(df_movies.iloc[item_map[i]].title)
print('RECOMMENDED:')
for i in recs:
  print(df_movies.iloc[item_map[i]].title)

Harry Potter and the Chamber of Secrets (2002)
Lost in Space (1998)
Monty Python and the Holy Grail (1975)
RECOMMENDED:
1375    Last of the Mohicans, The (1992)
1934                        Bambi (1942)
120                     Boomerang (1992)
721                     Rock, The (1996)
793              American Buffalo (1996)
Name: title, dtype: object


In [104]:
display(output_row[0:5])
display(output_col[0:5])

array([[ 0.08462361, -0.00880599, -0.12086756, -0.05155351,  0.00057303],
       [ 0.01294671, -0.013276  , -0.13062036, -0.07056771,  0.06378343],
       [ 0.01071315, -0.06041512, -0.1710389 , -0.11614319,  0.07454851],
       [-0.04837557,  0.07757223,  0.02727203, -0.09036379,  0.05118196],
       [-0.0612067 ,  0.05296154,  0.01687476, -0.15498507,  0.05184331]],
      dtype=float32)

array([[ 1.4422902e+01,  1.0490119e+01, -3.4396281e+00, -2.9438894e+01,
         1.2982123e+01],
       [ 8.9928703e+00,  1.3298990e+01, -2.9393589e+00, -1.9970472e+01,
         1.1342815e+01],
       [ 6.2391849e+00,  1.3416862e+01, -2.7574584e+00, -1.5324193e+01,
         1.2780069e+01],
       [ 2.9167972e+00,  6.9432101e+00, -1.7683459e-02, -5.2384443e+00,
         1.3022038e+01],
       [ 5.2895060e+00,  1.3856513e+01, -2.5004339e+00, -1.4203545e+01,
         1.2202555e+01]], dtype=float32)

In [32]:
save_model('wals_out', user_map, item_map, output_row, output_col)

In [38]:
# log results
train_rmse = wals.get_rmse(output_row, output_col, tr_sparse)
test_rmse = wals.get_rmse(output_row, output_col, test_sparse)
print('Train RMSE: %s / Test RMSE: %s' % (train_rmse,test_rmse))

Train RMSE: 1.27013637583 / Test RMSE: 1.2963350423


In [43]:
display(user_map[600:650])

array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6], dtype=object)

In [42]:
display(item_map[0:50])

array([1, 28, 31, 46, 49, 110, 149, 220, 250, 257, 290, 293, 315, 333,
       363, 537, 583, 587, 645, 902, 907, 990, 1017, 1057, 1058, 1067,
       1068, 1075, 1113, 1169, 1171, 1173, 1175, 1176, 1182, 1188, 1189,
       1191, 1193, 1196, 1212, 1215, 1218, 1221, 1230, 1231, 1233, 1234,
       1238, 1250], dtype=object)

In [53]:
len(output_row)

138493

In [54]:
len(output_col)

26744

In [55]:
len(user_map)

20000263

In [56]:
len(item_map)

20000263

In [59]:
# retrieve user factor
user_f = output_row[6]
len(user_f)

5

In [91]:
user_idx = np.searchsorted(user_map,77)
print(user_idx)

8595


In [92]:
# dot product of item factors with user factor gives predicted ratings
pred_ratings = output_col.dot(user_idx)

In [98]:
display(len(pred_ratings))

26744

In [101]:
recommendations = generate_recommendations(user_idx, np.array(['1']),
                                                 output_row,
                                                 output_col,
                                                 6)

# map article indexes back to article ids
article_recommendations = [item_map[i] for i in recommendations]



In [102]:
print(article_recommendations)

[2597, 352, 1839, 2602, 3707, 3157]


In [73]:
df_movies = pd.read_pickle("../data/movies.pkl")

In [103]:
display(df_movies[df_movies['movieId'].isin(article_recommendations)])

Unnamed: 0,movieId,title,genres,movie_year,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
348,352,Crooklyn (1994),Comedy|Drama,1994.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1757,1839,My Giant (1998),Comedy,1998.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2512,2597,Lost & Found (1999),Comedy|Romance,1999.0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2517,2602,Mighty Peking Man (a.k.a. Goliathon) (Xing xin...,Action|Adventure|Horror|Sci-Fi,1977.0,0,1,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3070,3157,Stuart Little (1999),Children|Comedy|Fantasy,1999.0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3616,3707,9 1/2 Weeks (Nine 1/2 Weeks) (1986),Drama|Romance,1986.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
def 