## Embedding experiment

In [2]:
import pandas as pd
import tensorflow as tf

from tensorflow.python.lib.io import file_io
from tensorflow.contrib.factorization import WALSMatrixFactorization

import numpy as np
import shutil
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from google.datalab.ml import TensorBoard
print(tf.__version__)

tf.logging.set_verbosity(tf.logging.INFO)

1.8.0


### Variables

In [3]:
TEST_SET_RATIO = 0.1
OUTDIR = 'embedding_experiment'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

### Reading data using Pandas

In [4]:
df_movies = pd.read_pickle("./data/movies.pkl")
df_rating = pd.read_pickle("./data/rating.pkl")
df_tags = pd.read_pickle("./data/tags.pkl")
df_links = pd.read_pickle("./data/links.pkl")
df_genome_scores = pd.read_pickle("./data/genome-scores.pkl")
df_genome_tags = pd.read_pickle("./data/genome-tags.pkl")

### Basic test & training set construction

In [5]:
df_dataset = df_rating.join(df_movies, on=['movieId'], lsuffix='_rating', rsuffix='_movies',how='inner')
df_dataset = df_dataset.reset_index(drop=True)
df_dataset.columns

Index(['movieId', 'userId', 'movieId_rating', 'rating', 'timestamp',
       'movieId_movies', 'title', 'genres', 'movie_year', '(no genres listed)',
       'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [6]:
msk = np.random.rand(len(df_rating)) < 0.8
df_train = df_rating[msk]
df_test = df_rating[~msk]

print("Test length: %s , Training length: %s" % (len(df_test),len(df_train)))

Test length: 4000935 , Training length: 15999328


In [7]:
#FEATURES = list(df_dataset.columns)
#FEATURES.remove('movieId_rating')
#FEATURES.remove('timestamp')
##FEATURES.remove('movieId_movies')
#FEATURES.remove('(no genres listed)')
#FEATURES.remove('title') ## for now, no textual features without tokenizers
#FEATURES.remove('genres')
#LABEL = FEATURES.pop(1)

In [8]:
#print("Features: %s \r\n" % (FEATURES))
#print("Label: %s \r\n" % (LABEL))
#print("Feature Col types: %s \r\n" % df_train[FEATURES].dtypes)
#print("Label Col types: %s \r\n" % df_train[LABEL].dtypes)

### Create Sparse Tensor

In [9]:
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
5,1,112,3.5,2004-09-10 03:09:00
6,1,151,4.0,2004-09-10 03:08:54


In [10]:
np_items = df_train.movieId.as_matrix()
np_users = df_train.userId.as_matrix()
unique_users = np.unique(np_users)
unique_items = np.unique(np_items)
n_users = unique_users.shape[0]
n_items = unique_items.shape[0]
max_item = unique_items[-1]
max_user = unique_users[-1]

if n_users != max_user or n_items != max_item:
  z = np.zeros(max_user + 1, dtype=int)
  z[unique_users] = np.arange(n_users)
  u_r = z[np_users]
  z = np.zeros(max_item + 1, dtype=int)
  z[unique_items] = np.arange(n_items)
  i_r = z[np_items]
  np_ratings = df_train.rating.as_matrix()
  ratings = np.zeros((np_ratings.shape[0], 3), dtype=object)
  ratings[:, 0] = u_r
  ratings[:, 1] = i_r
  ratings[:, 2] = np_ratings
else:
    ratings = df_train.as_matrix(['userId', 'itemId', 'rating'])
    ratings[:, 0] -= 1
    ratings[:, 1] -= 1

In [11]:
#print('Number of items: {} and max item {} / Number of users: {}').format(n_items,max_item,n_users)
print('Number of items: %s and max item %s / Number of users: %s' % (n_items,max_item,n_users))

Number of items: 25832 and max item 131260 / Number of users: 138493


In [14]:
test_set_size = int(len(ratings) * TEST_SET_RATIO)
test_set_idx = np.random.choice(range(len(ratings)),
                                size=test_set_size, replace=False)
test_set_idx = sorted(test_set_idx)

ts_ratings = ratings[test_set_idx]
tr_ratings = np.delete(ratings, test_set_idx, axis=0)


u_tr, i_tr, r_tr = zip(*tr_ratings)
u_ts, i_ts, r_ts = zip(*ts_ratings)

tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape=(n_users, n_items))
test_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape=(n_users, n_items))

In [17]:
tr_sparse.shape

(138493, 25832)

In [24]:
input_tensor = tf.SparseTensor(indices=list(zip(tr_sparse.row, tr_sparse.col)),
                                values=(tr_sparse.data).astype(np.float32),
                                dense_shape=tr_sparse.shape)

### Create embedding

In [22]:
import tensorflow as tf
import numpy as np

example = input_tensor #tf.SparseTensor(indices=[[0], [1], [2]], values=[3, 6, 9], dense_shape=[3])

vocabulary_size = 10
embedding_size = 1
var = np.array([0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0])
embeddings = tf.Variable(var)

embed = tf.nn.embedding_lookup_sparse(embeddings, example, None)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    print(sess.run(embed)) # prints [  9.  36.  81.]



TypeError: Value passed to parameter 'indices' has DataType float32 not in list of allowed values: int32, int64