## Embedding experiment

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
from google.datalab.ml import TensorBoard
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

# Emit INFO-level TensorFlow log messages (training / evaluation progress).
tf.logging.set_verbosity(tf.logging.INFO)

  from ._conv import register_converters as _register_converters


1.8.0


### Variables

In [2]:
# Output directory: handed to train_and_evaluate() as the estimator's model
# directory and to TensorBoard as the log directory.
OUTDIR = 'embedding_experiment'
# Remove any previous run; ignore_errors=True makes this a no-op when the
# directory does not exist yet.
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

### Reading data using Pandas

In [3]:
# Load the six pre-pickled tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
# NOTE(review): unpickling executes arbitrary code, so these files must come
# from a trusted source -- presumably they were produced by an earlier
# preprocessing step; confirm.
(
    df_movies,
    df_rating,
    df_tags,
    df_links,
    df_genome_scores,
    df_genome_tags,
) = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "./data/movies.pkl",
        "./data/rating.pkl",
        "./data/tags.pkl",
        "./data/links.pkl",
        "./data/genome-scores.pkl",
        "./data/genome-tags.pkl",
    )
)

### Basic test & training set construction

In [4]:
# Attach the movie attributes to every rating row and renumber the rows 0..n-1.
# NOTE(review): DataFrame.join(on=...) matches the caller's column against the
# *index* of df_movies, not against a column of it. The resulting columns
# include 'movieId_movies', so df_movies also carries movieId as a regular
# column -- confirm that df_movies is actually indexed by movieId, otherwise
# ratings are paired with the wrong movies.
df_dataset = (
    df_rating
    .join(df_movies, on=['movieId'], lsuffix='_rating', rsuffix='_movies', how='inner')
    .reset_index(drop=True)
)
df_dataset.columns

Index(['movieId', 'userId', 'movieId_rating', 'rating', 'timestamp',
       'movieId_movies', 'title', 'genres', 'movie_year', '(no genres listed)',
       'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [5]:
# Random ~80/20 row-wise split of df_dataset into training and test frames.
# A dedicated, seeded generator makes the split identical on every
# Restart & Run All and independent of any other use of np.random.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(df_dataset)) < TRAIN_FRACTION
df_train = df_dataset[msk]
df_test = df_dataset[~msk]

# The two frames must partition the dataset exactly.
assert len(df_train) + len(df_test) == len(df_dataset)

print("Test length: %s , Training length: %s" % (len(df_test),len(df_train)))

Test length: 3548125 , Training length: 14177840


In [6]:
# Columns of df_dataset that are never fed to the model.
EXCLUDED_COLUMNS = [
    'movieId_rating',
    'timestamp',
    'movieId_movies',
    '(no genres listed)',
    'title',   # for now, no textual features without tokenizers
    'genres',
]

# The label is named explicitly instead of being popped by position, so a
# change in column order cannot silently select a different target.
# NOTE(review): 'userId' is what the positional pop(1) selected before; an id
# is an unusual regression target -- confirm this is intended (vs. 'rating').
LABEL = 'userId'

# Fail fast if the dataset schema changed underneath us.
_missing_columns = [col for col in EXCLUDED_COLUMNS + [LABEL] if col not in df_dataset.columns]
assert not _missing_columns, "columns missing from df_dataset: %s" % (_missing_columns,)

# Everything that is left, in the original column order, is a feature.
FEATURES = [col for col in df_dataset.columns if col != LABEL and col not in EXCLUDED_COLUMNS]

In [7]:
# Summarise the feature / label selection and the dtypes seen in df_train.
for template, value in (
    ("Features: %s \r\n", FEATURES),
    ("Label: %s \r\n", LABEL),
    ("Feature Col types: %s \r\n", df_train[FEATURES].dtypes),
    ("Label Col types: %s \r\n", df_train[LABEL].dtypes),
):
    print(template % value)

Features: ['movieId', 'rating', 'movie_year', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] 

Label: userId 

Feature Col types: movieId          int64
rating         float64
movie_year     float64
Action           int64
Adventure        int64
Animation        int64
Children         int64
Comedy           int64
Crime            int64
Documentary      int64
Drama            int64
Fantasy          int64
Film-Noir        int64
Horror           int64
IMAX             int64
Musical          int64
Mystery          int64
Romance          int64
Sci-Fi           int64
Thriller         int64
War              int64
Western          int64
dtype: object 

Label Col types: int64 



### Input functions

In [8]:
def make_train_input_fn(df, num_epochs):
  """Build a shuffling pandas input function for training.

  Args:
    df: DataFrame containing the FEATURES columns and the LABEL column.
    num_epochs: forwarded to pandas_input_fn as `num_epochs`.

  Returns:
    The input function created by tf.estimator.inputs.pandas_input_fn
    (batch size 128, shuffled, queue capacity 1000).
  """
  feature_frame = df[FEATURES]
  label_series = df[LABEL]
  train_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=feature_frame,
      y=label_series,
      batch_size=128,
      num_epochs=num_epochs,
      shuffle=True,
      queue_capacity=1000)
  return train_input_fn


In [9]:
def make_eval_input_fn(df):
  """Build a non-shuffling pandas input function for evaluation.

  Args:
    df: DataFrame containing the FEATURES columns and the LABEL column.

  Returns:
    The input function created by tf.estimator.inputs.pandas_input_fn
    (batch size 128, not shuffled, queue capacity 1000). No epoch count is
    passed, so pandas_input_fn's own default applies.
  """
  feature_frame = df[FEATURES]
  label_series = df[LABEL]
  eval_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=feature_frame,
      y=label_series,
      batch_size=128,
      shuffle=False,
      queue_capacity=1000)
  return eval_input_fn

In [10]:
def serving_input_fn(df):
  """Build a label-free, non-shuffling pandas input function.

  NOTE(review): despite its name this returns an ordinary pandas input
  function (features only, y=None), which looks like a prediction input
  rather than a serving-input-receiver function -- confirm before wiring it
  into an exporter.

  Args:
    df: DataFrame containing the FEATURES columns.

  Returns:
    The input function created by tf.estimator.inputs.pandas_input_fn
    (batch size 128, not shuffled, queue capacity 1000).
  """
  feature_frame = df[FEATURES]
  features_only_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=feature_frame,
      y=None,
      batch_size=128,
      shuffle=False,
      queue_capacity=1000)
  return features_only_input_fn

### Create feature columns

In [50]:
def make_feature_cols(hash_bucket_size = 20**2, embedding_dimension = 5):
  """Build the model's feature columns: one embedding over the crossed genre flags.

  Each genre flag becomes an identity categorical column with two buckets
  (0 / 1, defaulting to 0). All flags are crossed into a single categorical
  column, which is then wrapped in an embedding column.

  Args:
    hash_bucket_size: number of buckets passed to crossed_column for the
      genre cross. Defaults to 20**2, the value previously hard-coded.
    embedding_dimension: dimension passed to embedding_column. Defaults to 5,
      the value previously hard-coded.

  Returns:
    A one-element list containing the embedding column.
  """
  ## The binary fields:
  binary_fields = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                   'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

  binary_columns = [tf.feature_column.categorical_column_with_identity(k, num_buckets=2, default_value=0) for k in binary_fields]

  ## Embedding the crossed category fields
  crossed_genres = tf.feature_column.crossed_column(binary_columns, hash_bucket_size)

  return [tf.feature_column.embedding_column(crossed_genres, dimension=embedding_dimension)]

In [47]:
# Sanity check: show the feature column definitions the model will receive.
print(make_feature_cols())

[_EmbeddingColumn(categorical_column=_CrossedColumn(keys=(_IdentityCategoricalColumn(key='Action', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Adventure', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Animation', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Children', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Comedy', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Crime', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Documentary', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Drama', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Fantasy', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Film-Noir', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Horror', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='IMAX', num_buckets=2, default_value=0), _IdentityCategoricalColumn(key='Musical',

### Define model train & evaluate function

In [51]:
def train_and_evaluate(output_dir, num_train_steps):
    """Train a DNNRegressor on df_train and evaluate it on df_test.

    Args:
      output_dir: directory used as the estimator's model_dir.
      num_train_steps: passed to TrainSpec as max_steps.

    Returns:
      The DNNRegressor that was trained, so callers can keep using it
      (for example with print_rmse). Existing callers that ignore the
      return value are unaffected.
    """
    # One interval (in seconds) drives both checkpointing and evaluation.
    EVAL_INTERVAL = 300
    run_config = tf.estimator.RunConfig(save_checkpoints_secs = EVAL_INTERVAL,
                                        keep_checkpoint_max = 3)

    estimator = tf.estimator.DNNRegressor(
                       model_dir = output_dir,
                       feature_columns = make_feature_cols(),
                       hidden_units = [64, 32, 16, 8],
                       config = run_config)

    train_spec = tf.estimator.TrainSpec(
                       input_fn = make_train_input_fn(df_train, num_epochs = 10),
                       max_steps = num_train_steps)

    # No exporter is configured: evaluation only, no model export.
    eval_spec = tf.estimator.EvalSpec(
                       input_fn = make_eval_input_fn(df_test),
                       steps = None,
                       start_delay_secs = 1, # start evaluating after N seconds
                       throttle_secs = EVAL_INTERVAL)  # evaluate every N seconds

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return estimator

### Run!

In [52]:
# Run training
# The output directory is wiped first so this cell can be re-run on its own
# without picking up checkpoints from a previous run.
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
train_and_evaluate(OUTDIR, num_train_steps = 1000)

INFO:tensorflow:Using config: {'_master': '', '_keep_checkpoint_max': 3, '_evaluation_master': '', '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_task_id': 0, '_num_ps_replicas': 0, '_save_summary_steps': 100, '_train_distribute': None, '_tf_random_seed': None, '_service': None, '_num_worker_replicas': 1, '_model_dir': 'embedding_experiment', '_log_step_count_steps': 100, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7d75b937f0>, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 300}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 300 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:te

In [37]:
def print_rmse(model, df):
  """Evaluate `model` on `df` and print the square root of its average loss.

  Args:
    model: object exposing evaluate(input_fn=...) that returns a mapping
      with an 'average_loss' entry.
    df: DataFrame handed to make_eval_input_fn.
  """
  eval_input_fn = make_eval_input_fn(df)
  eval_metrics = model.evaluate(input_fn = eval_input_fn)
  rmse = np.sqrt(eval_metrics['average_loss'])
  print('RMSE on dataset = {}'.format(rmse))
#print_rmse(model, df_valid)

### TensorBoard for debugging purposes

In [54]:
# Launch TensorBoard on the training output directory.
TensorBoard().start(OUTDIR)

912900

### Shutting down TensorBoard

In [53]:
# List the running TensorBoard instances (the result is displayed as the cell output).
TensorBoard().list()

In [40]:
# Stop every TensorBoard instance that is currently listed, reporting each pid.
pids_df = TensorBoard.list()
running_pids = [] if pids_df.empty else pids_df['pid']
for pid in running_pids:
    TensorBoard().stop(pid)
    print('Stopped TensorBoard with pid {}'.format(pid))

Stopped TensorBoard with pid 783168
Stopped TensorBoard with pid 905119
