## Wals MatrixFactorization for embedding

In [3]:
import pandas as pd
import tensorflow as tf

from tensorflow.python.lib.io import file_io
from tensorflow.contrib.factorization import WALSMatrixFactorization

import numpy as np
import shutil
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from google.datalab.ml import TensorBoard
print(tf.__version__)

tf.logging.set_verbosity(tf.logging.INFO)

  from ._conv import register_converters as _register_converters


1.8.0


### Setting variables

In [4]:
TEST_SET_RATIO = 0.1
OUTDIR = 'wals_experiment'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

In [5]:
DEFAULT_PARAMS = {
    'weights': True,
    'latent_factors': 5,
    'num_iters': 20,
    'regularization': 0.07,
    'unobs_weight': 0.01,
    'wt_type': 0,
    'feature_wt_factor': 130.0,
    'feature_wt_exp': 0.08,
    'delimiter': '\t'
}

OPTIMIZED_PARAMS = {
    'latent_factors': 34,
    'regularization': 9.83,
    'unobs_weight': 0.001,
    'feature_wt_factor': 189.8,
}

for key in OPTIMIZED_PARAMS:
  DEFAULT_PARAMS[key] = OPTIMIZED_PARAMS[key]

### Reading data with Pandas

In [6]:
df_movies = pd.read_pickle("./data/movies.pkl")
df_rating = pd.read_pickle("./data/rating.pkl")
df_tags = pd.read_pickle("./data/tags.pkl")
df_links = pd.read_pickle("./data/links.pkl")
df_genome_scores = pd.read_pickle("./data/genome-scores.pkl")
df_genome_tags = pd.read_pickle("./data/genome-tags.pkl")

In [7]:
df_rating.describe(include = 'all')

Unnamed: 0,userId,movieId,rating,timestamp
count,20000260.0,20000260.0,20000260.0,20000263
unique,,,,15351121
top,,,,1996-03-01 00:00:00
freq,,,,643
first,,,,1995-01-09 11:46:44
last,,,,2015-03-31 06:40:02
mean,69045.87,9041.567,3.525529,
std,40038.63,19789.48,1.051989,
min,1.0,1.0,0.5,
25%,34395.0,902.0,3.0,


In [8]:
df_rating['rating'] = pd.to_numeric(df_rating['rating'], errors='coerce')
df_rating['userId'] = pd.to_numeric(df_rating['userId'], errors='coerce')
df_rating['movieId'] = pd.to_numeric(df_rating['movieId'], errors='coerce')
df_rating = df_rating.dropna(subset=['rating','userId','movieId'])
df_rating['rating'] = df_rating['rating'].astype(float)
df_rating['userId'] = df_rating['userId'].astype(int)
df_rating['movieId'] = df_rating['movieId'].astype(int)
del df_rating['timestamp']
df_rating.describe(include = 'all')

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


### Prepare dataset

In [9]:
ratings = df_rating.as_matrix(['userId', 'movieId', 'rating'])

In [10]:
ratings[:10]

array([[  1. ,   2. ,   3.5],
       [  1. ,  29. ,   3.5],
       [  1. ,  32. ,   3.5],
       [  1. ,  47. ,   3.5],
       [  1. ,  50. ,   3.5],
       [  1. , 112. ,   3.5],
       [  1. , 151. ,   4. ],
       [  1. , 223. ,   4. ],
       [  1. , 253. ,   4. ],
       [  1. , 260. ,   4. ]])

In [11]:
U_ratings = df_rating.groupby(['userId'])['rating'].apply(np.array)

In [12]:
U_movies= df_rating.groupby(['userId'])['movieId'].apply(np.array)

In [13]:
U = pd.concat([U_ratings, U_movies], axis=1)

In [14]:
del U_ratings
del U_movies

In [47]:
U['rating'][1]

array([3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 4. , 4. , 4. , 4. , 4. , 4. , 4. ,
       3.5, 3.5, 4. , 3.5, 3.5, 3. , 3.5, 3.5, 3.5, 4. , 4. , 3.5, 3.5,
       4. , 4. , 3.5, 3.5, 4.5, 4.5, 4. , 3. , 3.5, 4. , 4. , 3.5, 4. ,
       3.5, 4. , 3. , 3.5, 4. , 4. , 4. , 3.5, 3.5, 4. , 4. , 3.5, 3. ,
       4. , 4. , 3.5, 3.5, 4. , 3. , 4. , 4. , 3. , 3.5, 3.5, 3.5, 3.5,
       4. , 3.5, 3.5, 4. , 4. , 4. , 4. , 4. , 4. , 4. , 4. , 4. , 3.5,
       3.5, 4. , 4. , 4. , 4. , 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3. , 4. ,
       3.5, 4. , 3.5, 4. , 3.5, 4. , 4. , 3.5, 3. , 3.5, 4. , 4. , 3.5,
       3.5, 3.5, 4. , 4. , 4. , 4. , 3. , 4. , 3.5, 4. , 4. , 3.5, 4. ,
       3. , 3.5, 4. , 3.5, 4. , 4. , 3.5, 4. , 3.5, 4. , 4. , 3. , 3.5,
       3.5, 5. , 4. , 4. , 3. , 3.5, 4. , 4. , 3.5, 4. , 4. , 3.5, 5. ,
       3.5, 4. , 3.5, 4. , 3.5, 4. , 4. , 3.5, 4. , 3.5, 3.5, 3. , 3.5,
       3.5, 4. , 5. , 3.5, 3.5, 3.5, 4. , 4. , 3.5, 4. , 3. , 4. , 4. ,
       3.5, 5. , 4.5, 3.5, 4. , 4. ])

In [68]:
a1 = np.array(U.rating[1])
a2 = np.array(U.movieId[1])
print(a1)
print(a2)
print()
a3 = take_topN(a1, a2, 3)
print(a3)

[3.5 3.5 3.5 3.5 3.5 3.5 4.  4.  4.  4.  4.  4.  4.  3.5 3.5 4.  3.5 3.5
 3.  3.5 3.5 3.5 4.  4.  3.5 3.5 4.  4.  3.5 3.5 4.5 4.5 4.  3.  3.5 4.
 4.  3.5 4.  3.5 4.  3.  3.5 4.  4.  4.  3.5 3.5 4.  4.  3.5 3.  4.  4.
 3.5 3.5 4.  3.  4.  4.  3.  3.5 3.5 3.5 3.5 4.  3.5 3.5 4.  4.  4.  4.
 4.  4.  4.  4.  4.  3.5 3.5 4.  4.  4.  4.  3.5 3.5 3.5 3.5 3.5 3.5 3.
 4.  3.5 4.  3.5 4.  3.5 4.  4.  3.5 3.  3.5 4.  4.  3.5 3.5 3.5 4.  4.
 4.  4.  3.  4.  3.5 4.  4.  3.5 4.  3.  3.5 4.  3.5 4.  4.  3.5 4.  3.5
 4.  4.  3.  3.5 3.5 5.  4.  4.  3.  3.5 4.  4.  3.5 4.  4.  3.5 5.  3.5
 4.  3.5 4.  3.5 4.  4.  3.5 4.  3.5 3.5 3.  3.5 3.5 4.  5.  3.5 3.5 3.5
 4.  4.  3.5 4.  3.  4.  4.  3.5 5.  4.5 3.5 4.  4. ]
[    2    29    32    47    50   112   151   223   253   260   293   296
   318   337   367   541   589   593   653   919   924  1009  1036  1079
  1080  1089  1090  1097  1136  1193  1196  1198  1200  1201  1208  1214
  1215  1217  1219  1222  1240  1243  1246  1249  1258  1259  1261  1262
  

In [97]:
print(take_topN( np.array([3.5, 4.5, 2.5, 6.7]), np.array([100, 50, 150, 2500]), 3))

[150 100  50]


In [93]:
def take_topN(array1, array2, n=3):
  indices = np.argsort(array1)#[::-1]
  return array2[indices[0:n]]

In [98]:
U['col2'] = U.apply(lambda row: list(take_topN(row['rating'],row['movieId'],3)),axis=1)

In [None]:
U['col2'] = U.apply(lambda row: list(take_topN(row['rating'],row['movieId'],3)),axis=1)

In [99]:
U.head(20)

Unnamed: 0_level_0,rating,movieId,col2
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 4.0, 4.0, 4.0, ...","[2, 29, 32, 47, 50, 112, 151, 223, 253, 260, 2...",1525
2,"[4.0, 5.0, 5.0, 4.0, 3.0, 5.0, 5.0, 3.0, 5.0, ...","[3, 62, 70, 110, 242, 260, 266, 469, 480, 541,...",1971
3,"[4.0, 3.0, 4.0, 5.0, 3.0, 2.0, 5.0, 3.0, 5.0, ...","[1, 24, 32, 50, 160, 173, 175, 196, 223, 260, ...",2676
4,"[3.0, 4.0, 3.0, 1.0, 3.0, 3.0, 4.0, 4.0, 3.0, ...","[6, 10, 19, 32, 165, 329, 350, 356, 367, 368, ...",32
5,"[3.0, 5.0, 3.0, 3.0, 5.0, 2.0, 4.0, 2.0, 5.0, ...","[2, 11, 17, 60, 62, 104, 110, 140, 141, 150, 2...",1073
6,"[5.0, 3.0, 5.0, 5.0, 5.0, 5.0, 3.0, 4.0, 5.0, ...","[1, 3, 7, 17, 52, 62, 135, 140, 141, 260, 494,...",1073
7,"[3.0, 3.0, 4.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, ...","[3, 7, 11, 15, 16, 17, 24, 105, 122, 151, 252,...",4848
8,"[4.0, 5.0, 3.0, 4.0, 1.0, 4.0, 3.0, 5.0, 4.0, ...","[1, 3, 6, 10, 19, 21, 39, 47, 48, 110, 150, 15...",19
9,"[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 2.0, 3.0, ...","[356, 858, 1219, 1911, 1923, 1997, 2279, 2605,...",4502
10,"[4.0, 4.0, 4.0, 4.0, 3.0, 5.0, 5.0, 4.0, 4.0, ...","[1, 11, 25, 260, 356, 527, 858, 912, 969, 1094...",3107


In [None]:
#U['topN'] = U.apply(lambda row: take_topN(row['rating'], row['movieId']))

In [None]:
#U_total = pd.merge(U_ratings.to_frame(), U_movies.to_frame(), on='userId', how='inner')

In [None]:
del U_ra

In [8]:
np_items = df_rating.movieId.as_matrix()
np_users = df_rating.userId.as_matrix()
unique_users = np.unique(np_users)
unique_items = np.unique(np_items)
n_users = unique_users.shape[0]
n_items = unique_items.shape[0]
max_item = unique_items[-1]
max_user = unique_users[-1]

if n_users != max_user or n_items != max_item:
  z = np.zeros(max_user + 1, dtype=int)
  z[unique_users] = np.arange(n_users)
  u_r = z[np_users]
  z = np.zeros(max_item + 1, dtype=int)
  z[unique_items] = np.arange(n_items)
  i_r = z[np_items]
  np_ratings = df_rating.rating.as_matrix()
  ratings = np.zeros((np_ratings.shape[0], 3), dtype=object)
  ratings[:, 0] = u_r
  ratings[:, 1] = i_r
  ratings[:, 2] = np_ratings
else:
    ratings = df_rating.as_matrix(['user_id', 'item_id', 'rating'])
    ratings[:, 0] -= 1
    ratings[:, 1] -= 1

In [9]:
print('Number of items: {} and max item {} / Number of users: {}').format(n_items,max_item,n_users)

Number of items: 26744 and max item 131262 / Number of users: 138493


In [10]:
test_set_size = int(len(ratings) * TEST_SET_RATIO)
test_set_idx = np.random.choice(xrange(len(ratings)),
                                size=test_set_size, replace=False)
test_set_idx = sorted(test_set_idx)

ts_ratings = ratings[test_set_idx]
tr_ratings = np.delete(ratings, test_set_idx, axis=0)

In [11]:
u_tr, i_tr, r_tr = zip(*tr_ratings)
u_ts, i_ts, r_ts = zip(*ts_ratings)

In [12]:
print(len(r_tr))
print(len(u_tr))
print(len(i_tr))
print(n_users * n_items)

18000237
18000237
18000237
3703856792


In [13]:
tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape=(n_users, n_items))
test_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape=(n_users, n_items))

In [36]:
user_sparse_tensor = tf.SparseTensor(indices=tr_sparse.row, dense_shape=(tr_sparse.shape[0],1))

TypeError: __init__() takes exactly 4 arguments (3 given)

In [14]:
input_tensor = tf.SparseTensor(indices=zip(tr_sparse.row, tr_sparse.col),
                                values=(tr_sparse.data).astype(np.float32),
                                dense_shape=tr_sparse.shape)

In [None]:
input_tensor.

In [19]:
print(input_tensor.get_shape())

(138493, 26744)


In [35]:
limit = input_tensor.indices
print(limit)

Tensor("SparseTensor/indices:0", shape=(18000237, 2), dtype=int64)


In [110]:
def train_and_evaluate(args):
    train_steps = int(0.5 + (1.0 * args['num_epochs'] * args['nusers']) / args['batch_size'])
    steps_in_epoch = int(0.5 + args['nusers'] / args['batch_size'])
    print('Will train for {} steps, evaluating once every {} steps'.format(train_steps, steps_in_epoch))
    def experiment_fn(output_dir):
        return tf.contrib.learn.Experiment(
            tf.contrib.factorization.WALSMatrixFactorization(
                         num_rows=args['nusers'], num_cols=args['nitems'],
                         embedding_dimension=args['n_embeds'],
                         model_dir=args['output_dir']),
            train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
            eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
            train_steps=train_steps,
            eval_steps=1,
            min_eval_frequency=steps_in_epoch
        )

    from tensorflow.contrib.learn.python.learn import learn_runner
    learn_runner.run(experiment_fn, args['output_dir'])
    
    batch_predict(args)

In [None]:
import shutil
shutil.rmtree(OUTDIR, ignore_errors=True)
train_and_evaluate({
    'output_dir': OUTDIR,
    #'input_path': 'gs://{}/wals/preproc_tft'.format(BUCKET),
    'num_epochs': 0.05,
    'nitems': 5668,
    'nusers': 82802,

    'batch_size': 512,
    'n_embeds': 10,
    'topk': 3
  })