In [1]:
from collections import deque
from six import next

import tensorflow as tf
import numpy as np
import pandas as pd

import time
import json

In [2]:
reviews = []

with open('reviews_Musical_Instruments_5.json', 'r') as f:
    raw_json = f.readlines()
    for record in raw_json:
        reviews.append(eval(record))

In [3]:
json_info = json.dumps(reviews)

df = pd.read_json(json_info)
df = df[['reviewerID', 'asin', 'overall', 'unixReviewTime']].copy()
df.sample(10)

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
9751,A1DVUFG2QSJ6IK,B007XH9432,5,1381104000
4692,A1365RYO0BLEMI,B000EELFI8,5,1348012800
2624,A1C92SAQFUBJSZ,B0002GXZK4,4,1348876800
6878,A1GMWTGXW682GB,B001PGXKC8,4,1342137600
4595,A24AQ24CD6865K,B000EEJJ5Y,5,1399334400
6793,A3JXFUXN03IUT5,B001PGXHX0,5,1395446400
2336,A2UYE434RFINE,B0002GLDQM,3,1360195200
5981,AA1ZYCEKJGG3A,B000ZJTPLG,2,1354320000
10102,A2U1Z3TZ4P76JB,B00BTGMI5O,4,1393977600
9038,AU2TEZYYXKLL,B005A09I7Q,3,1374451200


In [4]:
np.random.seed(42)

u_num = len(df['reviewerID'].unique())
i_num = len(df['asin'].unique())

batch_size = 100 
dims = 5          
max_epochs = 50   

place_device = "/cpu:0"

In [5]:
def get_data(df):
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)

    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    
    return df_train, df_test

def clip(x):
    return np.clip(x, 1.0, 5.0)

In [6]:
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        with tf.variable_scope('lsi',reuse=tf.AUTO_REUSE):
            bias_global = tf.get_variable("bias_global", shape=[])

            w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
            w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])

            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")

            w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
            w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))

            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    
    with tf.device(device):
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), 
                             name="svd_regularizer")
    return infer, regularizer

In [7]:
def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op

In [8]:
df_train, df_test = get_data(df)

samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" % 
      (len(df_train), len(df_test), samples_per_batch))

Number of train samples 9234, test samples 1027, samples per batch 92


In [9]:
print(df_train["reviewerID"].head()) 
print(df_test["reviewerID"].head())

0     AKAVVQMXSAIGX
1     A6KYDNP84GGGJ
2    A2MPM6M93OXIJT
3     ASJAKT8DJIAC5
4     AJH2W783HOXZV
Name: reviewerID, dtype: object
0    A3E6FHS8DV037H
1    A1W3CEEQBJ4GTN
2    A3TOND09136H4A
3    A3W2VF6D09B2RN
4    A1N82BBPB5816P
Name: reviewerID, dtype: object


In [10]:
print(df_train["asin"].head())
print(df_test["asin"].head())

0    B0002GWFEQ
1    B000NGVQKO
2    B0002D0CLM
3    B0002DV7U2
4    B0002GXZK4
Name: asin, dtype: object
0    B000KW2YEI
1    B00AK7SKL4
2    B001PGXHX0
3    B00064TZYW
4    B001FB5Z44
Name: asin, dtype: object


In [11]:
print(df_train["overall"].head())
print(df_test["overall"].head())

0    5
1    4
2    3
3    5
4    5
Name: overall, dtype: int64
0    4
1    3
2    5
3    5
4    5
Name: overall, dtype: int64


In [12]:
class ShuffleIterator(object):
    """
    Randomly generate batches
    """
    def __init__(self, inputs, batch_size=10):
        self.inputs = inputs
        self.batch_size = batch_size
        self.num_cols = len(self.inputs)
        self.len = len(self.inputs[0])
        self.inputs = np.transpose(np.vstack([np.array(self.inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]

In [13]:
class OneEpochIterator(ShuffleIterator):
    """
    Sequentially generate one-epoch batches, typically for test data
    """
    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            self.idx_group = np.array_split(np.arange(self.len), np.ceil(self.len / batch_size))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]

In [14]:
iter_train = ShuffleIterator([df_train["reviewerID"],
                                     df_train["asin"],
                                     df_train["overall"]],
                                     batch_size=batch_size)


iter_test = OneEpochIterator([df_test["reviewerID"],
                                     df_test["asin"],
                                     df_test["overall"]],
                                     batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)

W0705 00:05:22.203510 140619911264064 deprecation.py:506] From /home/eloise/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        print(type(users[0]))
        print(type(items[0]))
        print(type(rates[0]))
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end

    saver.save(sess, './save/')

Epoch	Train Error	Val Error	Elapsed Time
<class 'str'>
<class 'str'>
<class 'int'>


ValueError: invalid literal for int() with base 10: 'A3SS4Y0BDSPDB0'