# Download dataset

In [None]:
# Fetch the Amazon Electronics 5-core reviews and product metadata into raw_data/.
# -p keeps the cell re-runnable when raw_data/ already exists.
!mkdir -p raw_data/
%cd raw_data
# wget -c resumes a partial download; gzip -f overwrites a previously
# decompressed copy instead of refusing, so a second run does not fail.
!wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
!gzip -df reviews_Electronics_5.json.gz
!wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz
!gzip -df meta_Electronics.json.gz

%cd ..

/content/raw_data
--2022-06-06 21:27:53--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: ‘reviews_Electronics_5.json.gz’


2022-06-06 21:28:08 (33.4 MB/s) - ‘reviews_Electronics_5.json.gz’ saved [495854086/495854086]

--2022-06-06 21:28:19--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 186594679 (178M) [application/x-gzip]
Saving to: ‘meta_Electronics.json.gz’


2022-06-06 21:28:26 (27.3 MB/s) - ‘meta_Electronics.json.gz’ saved [186594679/186594679]


# Switch TensorFlow to version 1.x

In [None]:
# The model code below uses the TensorFlow 1.x graph API, so pin a 1.x release.
# -y answers pip's confirmation prompt so the cell cannot stall waiting for input.
!pip uninstall -y tensorflow
!pip install tensorflow==1.13.1

TensorFlow 1.x selected.


# Convert dataset to pandas DataFrames

In [None]:
import pickle
import pandas as pd
import numpy as np
import random

def to_df(file_path):
  """Load a file holding one dict literal per line into a DataFrame.

  Args:
    file_path: path to a text file where every non-empty line is a Python
      literal dict (the format `eval` was previously used to read).

  Returns:
    A pandas DataFrame with one row per record, indexed 0..n-1 in file order.

  Raises:
    ValueError / SyntaxError: if a line is not a plain Python literal.
  """
  # Local import so this function stays self-contained within the cell.
  import ast

  records = {}
  with open(file_path, 'r') as fin:
    for line in fin:
      line = line.strip()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      # SECURITY: the input is downloaded data, so it must not be executed.
      # literal_eval accepts only literals (dict/list/str/number/...) and
      # rejects arbitrary expressions that eval() would have run.
      records[len(records)] = ast.literal_eval(line)
  return pd.DataFrame.from_dict(records, orient='index')

# Reviews: parse the raw dump once and cache the DataFrame as a pickle.
reviews_df = to_df('raw_data/reviews_Electronics_5.json')
with open('raw_data/reviews.pkl', 'wb') as f:
  pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)

# Metadata: keep only the products that appear in the reviews, renumber the
# rows from 0, then cache the result as well.
meta_df = to_df('raw_data/meta_Electronics.json')
reviewed_asins = reviews_df['asin'].unique()
meta_df = meta_df[meta_df['asin'].isin(reviewed_asins)].reset_index(drop=True)
with open('raw_data/meta.pkl', 'wb') as f:
  pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL)

# Remap dataset

In [None]:
# Reload the cached frames and keep only the columns the model needs.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook.
with open('raw_data/reviews.pkl', 'rb') as f:
  reviews_df = pickle.load(f)
# .copy() detaches the column subset from the loaded frame so the in-place
# column assignments done later do not raise SettingWithCopyWarning.
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']].copy()

with open('raw_data/meta.pkl', 'rb') as f:
  meta_df = pickle.load(f)
meta_df = meta_df[['asin', 'categories']].copy()
# 'categories' is a list of category paths; keep the last entry of the last path.
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])


def build_map(df, col_name):
  """Replace the values of `df[col_name]` with dense integer ids, in place.

  Ids are assigned by the sorted order of the distinct values in the column.

  Args:
    df: DataFrame to modify.
    col_name: name of the column to re-encode.

  Returns:
    (mapping, ordered_values): `mapping` is a dict from original value to id,
    `ordered_values` is the sorted list of distinct values, so
    `ordered_values[id]` recovers the original value.
  """
  ordered_values = df[col_name].unique().tolist()
  ordered_values.sort()
  mapping = {value: idx for idx, value in enumerate(ordered_values)}
  df[col_name] = df[col_name].map(mapping.__getitem__)
  return mapping, ordered_values

# Re-encode product ids, categories and reviewer ids as dense integers.
asin_map, asin_key = build_map(meta_df, 'asin')
cate_map, cate_key = build_map(meta_df, 'categories')
revi_map, revi_key = build_map(reviews_df, 'reviewerID')

user_count = len(revi_map)
item_count = len(asin_map)
cate_count = len(cate_map)
example_count = reviews_df.shape[0]
print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' %
      (user_count, item_count, cate_count, example_count))

# Order the metadata by item id so that row i describes item i.
meta_df = meta_df.sort_values('asin').reset_index(drop=True)

# Reviews reference products by raw asin; translate them with the same map,
# then order each reviewer's history chronologically.
reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])
reviews_df = (reviews_df
              .sort_values(['reviewerID', 'unixReviewTime'])
              .reset_index(drop=True))
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]

# cate_list[i] is the category id of item i.
cate_list = np.array(
    [meta_df['categories'][i] for i in range(len(asin_map))],
    dtype=np.int32)


# The objects are written back-to-back into one file; readers must call
# pickle.load in this same order.
with open('raw_data/remap.pkl', 'wb') as f:
  for payload in (
      reviews_df,
      cate_list,
      (user_count, item_count, cate_count, example_count),
      (asin_key, cate_key, revi_key),
  ):
    pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

user_count: 192403	item_count: 63001	cate_count: 801	example_count: 1689188


# Build dataset

In [None]:
import random
import pickle
import numpy as np

# Fixed seed so negative sampling and shuffling are reproducible.
random.seed(1234)

# Objects were pickled back-to-back; load them in the order they were written.
with open('raw_data/remap.pkl', 'rb') as f:
  reviews_df = pickle.load(f)
  cate_list = pickle.load(f)
  user_count, item_count, cate_count, example_count = pickle.load(f)

# Each example is a (user, positive item, sampled negative item) triple.
# Per user, the last review goes to the test set and the rest to training.
train_set = []
test_set = []
for reviewerID, hist in reviews_df.groupby('reviewerID'):
  pos_list = hist['asin'].tolist()
  # Set copy for O(1) membership tests; the list scan made sampling
  # quadratic in the length of a user's history.
  pos_set = set(pos_list)

  def gen_neg():
    """Draw a random item id this user has not reviewed (rejection sampling)."""
    neg = pos_list[0]
    while neg in pos_set:
      neg = random.randint(0, item_count-1)
    return neg

  neg_list = [gen_neg() for _ in range(len(pos_list))]
  rid_list = [reviewerID] * len(pos_list)
  hist = list(zip(rid_list, pos_list, neg_list))

  train_set.extend(hist[:-1])
  test_set.append(hist[-1])

random.shuffle(train_set)
random.shuffle(test_set)

# Sanity checks: one test example per user, and no example lost or duplicated.
assert len(test_set) == user_count
assert len(test_set) + len(train_set) == example_count

train_set = np.array(train_set, dtype=np.int32)
test_set = np.array(test_set, dtype=np.int32)


with open('dataset.pkl', 'wb') as f:
  pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
  pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
  pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)
  pickle.dump((user_count, item_count, cate_count), f, pickle.HIGHEST_PROTOCOL)

# Define model

In [None]:
import tensorflow as tf

class Model(object):
  """Pairwise (BPR-style) matrix-factorization ranker built as a TF 1.x graph.

  An item is represented by the concatenation of a 64-d item embedding and
  the 64-d embedding of its category, matched against a 128-d user embedding.
  For a triple (u, i, j) the model scores how much user u prefers item i
  over item j.
  """

  def __init__(self, user_count, item_count, cate_count, cate_list):
    """Build placeholders, variables, loss and the training op.

    Args:
      user_count: number of rows in the user embedding table.
      item_count: number of rows in the item embedding / bias tables.
      cate_count: number of rows in the category embedding table.
      cate_list: array-like indexed by item id giving that item's category id.
    """
    # Batches of user ids, preferred item ids, and contrasted item ids.
    self.u = tf.placeholder(tf.int32, [None,])
    self.i = tf.placeholder(tf.int32, [None,])
    self.j = tf.placeholder(tf.int32, [None,])
    # Learning rate is fed per step so the caller can schedule it.
    self.lr = tf.placeholder(tf.float64, [])

    user_emb_w = tf.get_variable("user_emb_w", [user_count, 128])
    item_emb_w = tf.get_variable("item_emb_w", [item_count, 64])
    item_b = tf.get_variable("item_b", [item_count])
    cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, 64])
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)

    u_emb = tf.nn.embedding_lookup(user_emb_w, self.u)

    # Item i: [item embedding ; category embedding] plus a per-item bias.
    ic = tf.gather(cate_list, self.i)
    i_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.i),
        tf.nn.embedding_lookup(cate_emb_w, ic),
        ], 1)
    i_b = tf.gather(item_b, self.i)

    # Item j: same construction.
    jc = tf.gather(cate_list, self.j)
    j_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.j),
        tf.nn.embedding_lookup(cate_emb_w, jc),
        ], 1)
    j_b = tf.gather(item_b, self.j)

    # MF predict: u_i > u_j
    x = i_b - j_b + tf.reduce_sum(tf.multiply(u_emb, (i_emb - j_emb)), 1)
    # Despite the attribute name, this holds sigmoid outputs, not raw scores.
    self.logits = tf.sigmoid(x)

    # AUC for one user:
    # reasonable iff all (u,i,j) pairs are from the same user
    # average AUC = mean( auc for each user in test set)
    self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))

    # logits for all item:
    all_emb = tf.concat([
        item_emb_w,
        tf.nn.embedding_lookup(cate_emb_w, cate_list)
        ], axis=1)
    self.logits_all = tf.sigmoid(
        item_b + tf.matmul(u_emb, all_emb, transpose_b=True))

    # L2 penalty on the embeddings touched by this batch only.
    l2_norm = tf.add_n([
        tf.nn.l2_loss(u_emb),
        tf.nn.l2_loss(i_emb),
        tf.nn.l2_loss(j_emb),
        ])

    reg_rate = 5e-5
    # log_sigmoid(x) equals log(sigmoid(x)) mathematically but stays finite
    # when x is strongly negative, where sigmoid underflows to 0 and
    # log() would return -inf and poison the gradients with NaN.
    self.bprloss = reg_rate * l2_norm - tf.reduce_mean(tf.log_sigmoid(x))

    opt = tf.train.GradientDescentOptimizer
    self.train_op = opt(self.lr).minimize(self.bprloss)

  def train(self, sess, uij, l):
    """Run one SGD step.

    Args:
      sess: tf.Session holding the model variables.
      uij: 2-D array whose columns 0, 1, 2 are user, item i, item j ids.
      l: learning rate for this step.

    Returns:
      The loss value computed for this batch.
    """
    loss, _ = sess.run([self.bprloss, self.train_op], feed_dict={
        self.u: uij[:, 0],
        self.i: uij[:, 1],
        self.j: uij[:, 2],
        self.lr: l,
        })
    return loss

  def eval(self, sess, test_set):
    """Return the fraction of triples in `test_set` where item i outscores j.

    `test_set` has the same column layout as the `uij` argument of `train`.
    """
    return sess.run(self.mf_auc, feed_dict={
        self.u: test_set[:, 0],
        self.i: test_set[:, 1],
        self.j: test_set[:, 2],
        })

  def test(self, sess, uid):
    """Return sigmoid scores of every item for each user id in `uid`."""
    return sess.run(self.logits_all, feed_dict={
        self.u: uid,
        })

  def _get_saver(self):
    """Return a cached tf.train.Saver, creating it on first use.

    Building a new Saver on every call adds a fresh set of save/restore ops
    to the graph each time a checkpoint is written.
    """
    saver = getattr(self, '_saver', None)
    if saver is None:
      saver = self._saver = tf.train.Saver()
    return saver

  def save(self, sess, path):
    """Write a checkpoint of the session's variables to `path`."""
    self._get_saver().save(sess, save_path=path)

  def restore(self, sess, path):
    """Load variables from the checkpoint at `path` into `sess`."""
    self._get_saver().restore(sess, save_path=path)

# Data input 

In [None]:
class DataInput:
  """Single-pass iterator that walks `data` in consecutive mini-batches.

  Each step yields `(batch_number, batch)`, where `batch_number` counts from 1
  and the final batch may hold fewer than `batch_size` rows. The position is
  never reset, so an exhausted instance yields nothing further.
  """

  def __init__(self, data, batch_size):
    self.data = data
    self.batch_size = batch_size
    total_rows = self.data.shape[0]
    full_batches = total_rows // self.batch_size
    # One extra, shorter batch covers whatever the full batches leave over.
    has_remainder = full_batches * self.batch_size < total_rows
    self.epoch_size = full_batches + 1 if has_remainder else full_batches
    self.i = 0

  def __iter__(self):
    return self

  def __next__(self):
    if self.i == self.epoch_size:
      raise StopIteration

    start = self.i * self.batch_size
    stop = min(start + self.batch_size, self.data.shape[0])
    self.i += 1
    # The counter is returned after the increment, hence 1-based.
    return self.i, self.data[start:stop]

# Train and evaluate model

In [None]:
import os
import pickle
import numpy as np
import tensorflow as tf

# Start from an empty graph so re-running this cell does not collide with
# variables created by a previous run.
tf.reset_default_graph()

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
np.random.seed(1234)
tf.set_random_seed(1234)

train_batch_size = 32

# Objects were pickled back-to-back; load them in the order they were written.
with open('dataset.pkl', 'rb') as f:
  train_set = pickle.load(f)
  test_set = pickle.load(f)
  cate_list = pickle.load(f)
  user_count, item_count, cate_count = pickle.load(f)

# The checkpoint is written under save_path/; make sure the directory exists
# before the first save.
os.makedirs('save_path', exist_ok=True)

gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(
    config=tf.ConfigProto(gpu_options=gpu_options)
    ) as sess:

  model = Model(user_count, item_count, cate_count, cate_list)
  sess.run(tf.global_variables_initializer())
  sess.run(tf.local_variables_initializer())

  best_auc = 0.0
  lr = 1.0
  for epoch in range(50):

    # NOTE(review): with 50 epochs this halving (every 100 epochs) never
    # fires, so lr stays at 1.0 -- confirm whether that is intended.
    if epoch % 100 == 0 and epoch != 0:
      lr *= 0.5

    # Count the batches actually consumed: DataInput also yields a final
    # partial batch, so train_set.shape[0] // train_batch_size undercounts
    # (and is 0 when the training set is smaller than one batch).
    loss_sum = 0.0
    batch_count = 0
    for _, uij in DataInput(train_set, train_batch_size):
      loss_sum += model.train(sess, uij, lr)
      batch_count += 1

    # Report 1-based epoch numbers without mutating the loop variable.
    print('epoch: %d\ttrain_loss: %.2f\tlr: %.2f' %
          (epoch + 1, loss_sum / max(batch_count, 1), lr), end='\t')

    test_auc = model.eval(sess, test_set)
    print('test_auc: %.4f' % test_auc, flush=True)

    # Keep only the checkpoint with the best test AUC seen so far.
    if best_auc < test_auc:
      best_auc = test_auc
      model.save(sess, 'save_path/ckpt')

  print('best test_auc:', best_auc)