In [1]:
# Some functions are from or based on work done by Kevin Liao in the below notebook
# https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/master/movie_recommender/movie_recommendation_using_ALS.ipynb

# Intialization
import os
import time
import sys
import datetime as df

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.types import StringType, ArrayType
from zoo.models.recommendation import *
from zoo.models.recommendation.utils import *
from zoo.common.nncontext import init_nncontext

from bigdl.dataset.transformer import *
from bigdl.dataset.base import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.nn.layer import *

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

init_engine()

In [2]:
data_path = 'hdfs:///user/andrew/'

In [3]:
%%time
# Read in data through spark since the data is sored in hadoop and format the columns
# Convert to pandas dataframes for easier and faster manipulation
sqlContext = SQLContext(sc)
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *

movies = sqlContext.read.parquet(data_path + 'movie_20m_metadata_OHE_subset')
movies_df = movies.toPandas()
movies_df = movies_df.set_index(movies_df.item_id) # set index so no sorting errors occur

movies_gp = movies.drop('title', 'imdb_id', 'imdb_rating', 'imdb_votes', 'metascore', 'runtime', 'year')

users_full_sdf = sqlContext.read.parquet(data_path + 'users_metadata_20m') 
users_full_sdf = users_full_sdf.na.fill(0)

ratings_initial = sqlContext.read.parquet(data_path + 'ratings_20m')
ratings_initial = ratings_initial.drop('timestamp')
ratings_initial = ratings_initial.withColumn("userId", ratings_initial["userId"].cast("int"))
#Multiply ratings by 2 so that values are whole numbers -> values 1 to 10
ratings_initial = ratings_initial.withColumn("label", ratings_initial["rating"] * 2) 
ratings = ratings_initial.select("userId", "movieId", "label").toDF("userId", "itemId", "label")

CPU times: user 2.62 s, sys: 151 ms, total: 2.77 s
Wall time: 13.2 s


In [4]:
# User input function - takes user input data, strpis it down, and calls other functions on that data
# Takes in user age, gender, occupation (of 20 options - may drop this), list of favorite movies
# All movies in the list of favorite movies will be rated 5 stars
def new_user_input(fav_movies, all_ratings, movies, spark_context, 
                   sqlContext = None, num_recs = 10, movies_gp = None, movies_df = None):
    # collect favorite movie ids
    print 'Collecting favorite movie IDs'
    movieIds = get_movieId(movies_df, fav_movies)
    # print 'Favorite movies in the available set'
    # print movies_df[['item_id', 'title', 'year']].loc[movieIds]
    
    print 'Adding ratings to full set'
    # add new user movie ratings to all ratings dataframe
    # all_ratings_updated, user_ratings, user_ratings_binary = add_new_user_to_data(all_ratings, movieIds, spark_context)
    all_ratings_updated, new_user_ratings = add_new_user_to_data(all_ratings, movieIds, spark_context)
    del all_ratings
    
    print 'Creating prediction set'
    # get all unrated movies for user (unnecessary in Spark 2.2+, instead use the recommendForAllUsers(num_to_rec) method)
    all_user_unrated = get_inference_data(all_ratings_updated, movieIds)
    
    print 'Formatting training and prediction dataframes for NCF'
    # Fornat ratings data into RDD Samples (the format needed for Analytics Zoo models)
    trainPairFeatureRdds = all_ratings_updated.rdd.map(lambda x: build_sample(x[0], x[1], x[2]))
    predPairFeatureRdds = all_user_unrated.rdd.map(lambda x: build_sample(x[0], x[1], x[2]))

    train_rdd = trainPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
    pred_rdd = predPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
    
    print 'Training NCF Model'
    # train NCF model, then predict movie ratings    
    batch_size = 46080 #16 executors, 16 cores each
    max_user_id = all_ratings_updated.agg({'userId': 'max'}).collect()[0]['max(userId)']
    max_movie_id = all_ratings_updated.agg({'itemId': 'max'}).collect()[0]['max(itemId)']

    ncf = NeuralCF(user_count = max_user_id, item_count = max_movie_id, 
                   class_num = 10, hidden_layers = [20, 10], include_mf = False)

    optimizer = Optimizer(
        model=ncf,
        training_rdd=train_rdd,
        criterion=ClassNLLCriterion(),
        end_trigger=MaxEpoch(10),
        batch_size=batch_size, # 16 executors, 16 cores each
        optim_method=Adam(learningrate=0.001))

    optimizer.optimize()
    # del all_ratings_updated
    
    print 'Making Predictions'
    # keep top 30 predictions
    full_predictions_sorted = ncf.recommend_for_user(predPairFeatureRdds, 3*num_recs).toDF().sort(desc('prediction'))
    ncf_top_n_predictions = full_predictions_sorted.take(num_recs)
    # extract movie ids
    ncf_top_n_ids = [r[1] for r in ncf_top_n_predictions]
    
    ncf_movie_recs = movies.filter(movies.item_id.isin(ncf_top_n_ids)).select('title', 'year')
    print ''
    print 'NCF Recommendations'
    print ncf_movie_recs.toPandas()
    
    
    # import WnD model input data format
    # Create user_id x item_id matrix need to get data in the form of user_id, item_id, label, then pivot
    # filter movies_gp dataframe by the movieIds. pivot new_user_ratings into a vector, 
    # then multiply by the filtered movies_gp dataframe; divide by binarized user ratings; 
    # this should now be a vector of user preferences. 
    # join a OHE age, gender, and possibly occupation, to the user preferences
    user_summary_sdf = get_user_preferences(user_ratings = new_user_ratings, movieIds = movieIds, 
                                            movies_gp = movies_gp, sqlContext = sqlContext)

    ncf_top_3xn_ids = full_predictions_sorted.select('item_id')
    all_user_unrated_top_3xn = all_user_unrated.join(ncf_top_3xn_ids, all_user_unrated.itemId == ncf_top_3xn_ids.item_id, 
                                                     'inner').drop(ncf_top_3xn_ids.item_id)
    top_3xn_movies_metadata = movies.join(ncf_top_3xn_ids, movies.item_id == ncf_top_3xn_ids.item_id, 
                                          'inner').drop(ncf_top_3xn_ids.item_id)
        
    # lastly, replicate the user pref rows for each rated movieId, then join with the filtered movies dataframe
    # (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
    unrated_with_movie_metadata = all_user_unrated_top_3xn \
                                    .join(top_3xn_movies_metadata, 
                                          all_user_unrated_top_3xn.itemId == top_3xn_movies_metadata.item_id, 
                                          how = 'left') \
                                    .drop(top_3xn_movies_metadata.item_id)
    unrated_with_full_metadata = unrated_with_movie_metadata \
                                    .join(user_summary_sdf, on = 'userId', how = 'left')
    # Create lists of columns sets
    identifier_fields = ['userId', 'itemId', 'label', 'title', 'imdb_id']
    continuous_base_fields = ['imdb_rating', 'imdb_votes', 'metascore', 'runtime', 'year']
    all_base_fields = identifier_fields + continuous_base_fields

    user_avgs = [col_name for col_name in unrated_with_full_metadata.columns if col_name[-11:] == '_avg_rating']
    movie_metadata = [col_name for col_name in unrated_with_full_metadata.columns 
                      if (col_name[-11:] != '_avg_rating' and col_name not in all_base_fields)]

    user_avgs_genres = [genre for genre in user_avgs if 'genre' in genre]
    user_avgs_ml_genres = [genre for genre in user_avgs if genre[:8] == 'ml_genre']
    user_avgs_imdb_genres = [genre for genre in user_avgs if genre[:10] == 'imdb_genre']
    user_avgs_directors = [director for director in user_avgs if director[0:9] == 'director_']
    user_avgs_actors = [actor for actor in user_avgs if actor[0:6] == 'actor_']

    movie_genres = [genre for genre in movie_metadata if 'genre' in genre]
    movie_ml_genres = [genre for genre in movie_metadata if genre[:8] == 'ml_genre']
    movie_imdb_genres = [genre for genre in movie_metadata if genre[:10] == 'imdb_genre']
    movie_directors = [director for director in movie_metadata if director[0:9] == 'director_']
    movie_actors = [actor for actor in movie_metadata if actor[0:6] == 'actor_']
    
    # Determine embedding dimmensions
    max_user_id = unrated_with_full_metadata.agg({"userId": "max"}).collect()[0][0]
    max_movie_id = unrated_with_full_metadata.agg({"itemId": "max"}).collect()[0][0]
    # num_rating_labels = unrated_with_full_metadata.select('label').distinct().count()
    
    # Create column_info for feature formatting
    bucket_size = 100
    wide_base_cols = indicator_base_fields + movie_genres + user_avgs_genres
    wide_base_dims = [3 for i in (indicator_base_fields + movie_genres)] + [6 for i in user_avgs_genres]
    indicator_cols = indicator_base_fields + movie_genres
    indicator_dims = [3 for i in indicator_cols]
    continuous_cols = continuous_base_fields + user_avgs_genres
    column_info = ColumnFeatureInfo(
                wide_base_cols = wide_base_cols,
                wide_base_dims = wide_base_dims,
                # wide_cross_cols = ["age-gender"],
                # wide_cross_dims = [bucket_size],
                indicator_cols = indicator_cols,
                indicator_dims = indicator_dims,
                embed_cols = ["userId", "itemId"],
                embed_in_dims = [max_user_id, max_movie_id],
                embed_out_dims = [500, 500],
                continuous_cols = continuous_cols)
    
    # format coumns to feature
    wnd_pred_rdd = unrated_with_full_metadata.rdd.map(lambda row: to_user_item_feature(row, column_info))
    
    # import the WideAndDeep model
    WnDModel = WideAndDeep.load_model(path = data_path + 'WnD_Model_20m.bigdl', 
                                      weight_path = data_path + 'WnD_Model_20m_weights.h5')
    
    # recommend items for the new user
    wnd_user_recs = WnDModel.recommend_for_user(wnd_pred_rdd, num_recs)
    # extract the item_ids for the recommended items
    user_recs = [user_rec.item_id for user_rec in wnd_user_recs.take(num_recs)]

    # filter the movies sdf for only the recommended items
    wnd_movie_recs = movies.filter(col('item_id').isin(user_recs)).select('title', 'year')
    print ' '
    print 'Wind&Deep Recommendations'
    print wnd_movie_recs.toPandas()

In [5]:
def get_movieId(movies_df, fav_movie_list):
    """
    return all movieId(s) of user's favorite movies
    
    Parameters
    ----------
    df_movies: spark Dataframe, movies data
    
    fav_movie_list: list, user's list of favorite movies
    
    Return
    ------
    movieId_list: list of movieId(s)
    """
    movieId_list = []
    for movie in fav_movie_list:
        if movie[0:4] == 'The ':
            movie = movie[4:]
        elif movie[0:3] == 'An ':
            movie = movie[3:]
        elif movie[0:3] == 'La ':
            movie = movie[3:]
        elif movie[0:2] == 'A ':
            movie = movie[3:]

        if movie[-6:-5] == '(':
            year = int(movie[-5:-1])
            movie = movie[0:-7]
            movieIds = movies_df.item_id[(movies_df.title.str.contains(movie)) & (movies_df.year == year)]
            movieId_list.extend(movieIds)
        elif len(movie.split(' ')) == 1:
            movieIds = movies_df.item_id[movies_df.title == movie]
            movieId_list.extend(movieIds)
        else:
            movieIds = movies_df.item_id[movies_df.title.str.contains(movie)]
            movieId_list.extend(movieIds)
    return movieId_list

In [6]:
def add_new_user_to_data(train_data, movieIds, spark_context):
    """
    add new rows with new user, user's movie and ratings to
    existing train data

    Parameters
    ----------
    train_data: Spark DataFrame, ratings data
    
    movieIds: spark DataFrame, single column of movieId(s)

    spark_context: Spark Context object
    
    Return
    ------
    new train data with the new user's rows
    """
    # get new user id
    new_id = train_data.agg({"userId": "max"}).collect()[0][0] + 1
    # get max rating
    max_rating = train_data.agg({"label": "max"}).collect()[0][0]
    # create new user sdf for max rating
    user_rows_max = [(new_id, movieId, max_rating) for movieId in movieIds]
    new_sdf_max = spark_context.parallelize(user_rows_max).toDF(['userId', 'itemId', 'label'])
    # return new train data
    return train_data.union(new_sdf_max), new_sdf_max # , new_sdf_binary

In [7]:
def get_inference_data(train_data, movieIds):
    """
    return a rdd with the userid and all movies (except ones in movieId_list)

    Parameters
    ----------
    train_data: spark RDD, ratings data

    df_movies: spark Dataframe, movies data
    
    movieId_list: list, list of movieId(s)

    Return
    ------
    inference data: Spark RDD
    """
    # get new user id
    new_id = train_data.agg({"userId": "max"}).collect()[0][0]
    
    distinct_unrated_items = ratings.select('itemId').distinct().filter(~col('itemId').isin(movieIds))

    user_unrated = distinct_unrated_items.withColumn('userId', lit(new_id)).select('userId', 'itemId')
    user_unrated = user_unrated.withColumn('label', lit(0))
    return user_unrated

In [8]:
def build_sample(user_id, item_id, rating):
    sample = Sample.from_ndarray(np.array([user_id, item_id]), np.array([rating]))
    return UserItemFeature(user_id, item_id, sample)

In [9]:
def get_user_preferences(user_ratings, movieIds, movies_gp, sqlContext):        
    #new_user_ratings
    # pivoted_user_ratings = user_ratings.groupBy('user_id').pivot('item_id').agg(avg('label'))
    # pivoted_new_user_ratings_binary = user_ratings_binary.groupBy('user_id').pivot('item_id').agg(avg('label')).drop('user_id')
    pivoted_user_ratings_df = user_ratings.toPandas() \
                                            .pivot(index='userId', 
                                                   columns='itemId',
                                                   values='label') \
                                            .fillna(0)
    pivoted_user_ratings_df_binary = pivoted_user_ratings_df / pivoted_user_ratings_df
    
    movies_gp_filtered = movies_gp.filter(col('item_id').isin(movieIds))
    movies_gp_filtered_df = movies_gp_filtered.toPandas()
    # movies_gp_filtered_df.item_id = movies_gp_filtered_df.item_id.astype(str) only necessary when pivot was done on spark df
    movies_gp_filtered_df = movies_gp_filtered_df.set_index('item_id')
    
    user_summary_total = pivoted_user_ratings_df.dot(movies_gp_filtered_df)
    user_summary_count = pivoted_user_ratings_df_binary.dot(movies_gp_filtered_df)
    user_summary_avg = (user_summary_total / user_summary_count).fillna(0)
    user_summary_avg = user_summary_avg.add_suffix('_avg_rating').reset_index()
    
    sorted_columns = list(user_summary_avg.columns.sort_values())
    user_summary_sdf = sqlContext.createDataFrame(user_summary_avg[sorted_columns])
    return user_summary_sdf

### Step by Step Walkthrough of Main Function (to show runtime)

In [10]:
%%time
fav_movies = ['Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings']
# collect favorite movie ids
print 'Collecting favorite movie IDs'
movieIds = get_movieId(movies_df, fav_movies)
if movies_df is not None:
    print 'Favorite movies in the available set'
    print movies_df[['item_id', 'title', 'year']].loc[movieIds]

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                                              title  year
item_id                                                                  
89753      89753                   Tinker Tailor Soldier Spy (2011)  2011
318          318                   Shawshank Redemption, The (1994)  1994
2116        2116                      Lord of the Rings, The (1978)  1978
7153        7153  Lord of the Rings: The Return of the King, The...  2003
4993        4993  Lord of the Rings: The Fellowship of the Ring,...  2001
5952        5952      Lord of the Rings: The Two Towers, The (2002)  2002
CPU times: user 44.7 ms, sys: 4.33 ms, total: 49 ms
Wall time: 48 ms


In [11]:
%%time
print 'Adding ratings to full set'
# add new user movie ratings to all ratings dataframe
# all_ratings_updated, user_ratings, user_ratings_binary = add_new_user_to_data(all_ratings, movieIds, spark_context)
all_ratings_updated, user_ratings = add_new_user_to_data(ratings, movieIds, sc)

Adding ratings to full set
CPU times: user 23.3 ms, sys: 4.66 ms, total: 28 ms
Wall time: 9.5 s


In [12]:
%%time
print 'Creating prediction set'
# get all unrated movies for user (unnecessary in Spark 2.2+, instead use the recommendForAllUsers(num_to_rec) method)
all_user_unrated = get_inference_data(all_ratings_updated, movieIds)

Creating prediction set
CPU times: user 15.9 ms, sys: 900 µs, total: 16.8 ms
Wall time: 3.88 s


In [13]:
%%time
print 'Formatting training and prediction dataframes for NCF'
# Fornat ratings data into RDD Samples (the format needed for Analytics Zoo models)
trainPairFeatureRdds = all_ratings_updated.rdd.map(lambda x: build_sample(x[0], x[1], x[2]))
predPairFeatureRdds = all_user_unrated.rdd.map(lambda x: build_sample(x[0], x[1], x[2]))

train_rdd = trainPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
# pred_rdd = predPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)

Formatting training and prediction dataframes for NCF
CPU times: user 1.88 ms, sys: 0 ns, total: 1.88 ms
Wall time: 140 ms


In [14]:
%%time
print 'Training NCF Model'
# train NCF model, then predict movie ratings    
batch_size = 46080 #16 executors, 16 cores each
max_user_id = all_ratings_updated.agg({'userId': 'max'}).collect()[0]['max(userId)']
max_item_id = all_ratings_updated.agg({'itemId': 'max'}).collect()[0]['max(itemId)']

ncf = NeuralCF(user_count = max_user_id, item_count = max_item_id, 
               class_num = 10, hidden_layers = [20, 10], include_mf = False)

optimizer = Optimizer(
    model=ncf,
    training_rdd=train_rdd,
    criterion=ClassNLLCriterion(),
    end_trigger=MaxEpoch(10),
    batch_size=batch_size, # 16 executors, 16 cores each
    optim_method=Adam(learningrate=0.001))

optimizer.optimize()
# del all_ratings_updated

Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
CPU times: user 62.3 ms, sys: 15.6 ms, total: 77.9 ms
Wall time: 8min 32s


In [15]:
%%time
print 'Making Predictions'
# keep top 15 predictions
num_recs = 15
full_predictions_sorted = ncf.recommend_for_user(predPairFeatureRdds, 3*num_recs).toDF().sort(desc('prediction'))
ncf_top_n_predictions = full_predictions_sorted.take(num_recs)
# extract movie ids
ncf_top_n_ids = [r[1] for r in ncf_top_n_predictions]

ncf_movie_recs = movies.filter(movies.item_id.isin(ncf_top_n_ids)).select('title', 'year')
print ''
print 'NCF Recommendations'
print ncf_movie_recs.toPandas()

Making Predictions

NCF Recommendations
                                        title  year
0                         Love You You (2011)  2011
1                  Usual Suspects, The (1995)  1995
2   First Name: Carmen (PrAnom Carmen) (1983)  1983
3                 How to Die in Oregon (2011)  2011
4             History Is Made at Night (1937)  1937
5                 Ween Live in Chicago (2004)  2004
6    Death on the Staircase (SoupAons) (2004)     0
7                Car Bonus (Autobonus) (2001)  2001
8                     India's Daughter (2015)  2015
9          Witness for the Prosecution (1957)  1957
10                Murder on Flight 502 (1975)  1975
11                    Dark Knight, The (2008)  2008
12               The Salt of the Earth (2014)  2014
13                   Symbol (Shinboru) (2009)  2009
14                    Schindler's List (1993)  1993
CPU times: user 74.5 ms, sys: 10.6 ms, total: 85.2 ms
Wall time: 26.3 s


In [16]:
%%time
# import WnD model input data format
# Create user_id x item_id matrix need to get data in the form of user_id, item_id, label, then pivot
# filter movies_gp dataframe by the movieIds. pivot new_user_ratings into a vector, 
# then multiply by the filtered movies_gp dataframe; divide by binarized user ratings; 
# this should now be a vector of user preferences. 
# join a OHE age, gender, and possibly occupation, to the user preferences
user_summary_sdf = get_user_preferences(user_ratings = user_ratings, movieIds = movieIds, 
                                        movies_gp = movies_gp, sqlContext = sqlContext)

CPU times: user 104 ms, sys: 30.2 ms, total: 134 ms
Wall time: 4.34 s


In [17]:
%%time
ncf_top_3xn_ids = full_predictions_sorted.select('item_id')
all_user_unrated_top_3xn = all_user_unrated.join(ncf_top_3xn_ids, all_user_unrated.itemId == ncf_top_3xn_ids.item_id, 
                                                 'inner').drop(ncf_top_3xn_ids.item_id)
top_3xn_movies_metadata = movies.join(ncf_top_3xn_ids, movies.item_id == ncf_top_3xn_ids.item_id, 
                                      'inner').drop(ncf_top_3xn_ids.item_id)

CPU times: user 2.81 ms, sys: 1.37 ms, total: 4.18 ms
Wall time: 140 ms


In [18]:
%%time
# lastly, replicate the user pref rows for each rated movieId, then join with the filtered movies dataframe
# (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
unrated_with_movie_metadata = all_user_unrated_top_3xn \
                                .join(top_3xn_movies_metadata, 
                                      all_user_unrated_top_3xn.itemId == top_3xn_movies_metadata.item_id, 
                                      how = 'left') \
                                .drop(top_3xn_movies_metadata.item_id)
unrated_with_full_metadata = unrated_with_movie_metadata \
                                .join(user_summary_sdf, on = 'userId', how = 'left')

CPU times: user 7.1 ms, sys: 482 µs, total: 7.58 ms
Wall time: 550 ms


In [19]:
identifier_fields = ['userId', 'itemId', 'label', 'title', 'imdb_id']
continuous_base_fields = ['imdb_rating', 'imdb_votes', 'metascore', 'runtime', 'year']
all_base_fields = identifier_fields + continuous_base_fields

user_avgs = [col_name for col_name in unrated_with_full_metadata.columns if col_name[-11:] == '_avg_rating']
movie_metadata = [col_name for col_name in unrated_with_full_metadata.columns 
                  if (col_name[-11:] != '_avg_rating' and col_name not in all_base_fields)]

user_avgs_genres = [genre for genre in user_avgs if 'genre' in genre]
user_avgs_ml_genres = [genre for genre in user_avgs if genre[:8] == 'ml_genre']
user_avgs_imdb_genres = [genre for genre in user_avgs if genre[:10] == 'imdb_genre']
user_avgs_directors = [director for director in user_avgs if director[0:9] == 'director_']
user_avgs_actors = [actor for actor in user_avgs if actor[0:6] == 'actor_']

movie_genres = [genre for genre in movie_metadata if 'genre' in genre]
movie_ml_genres = [genre for genre in movie_metadata if genre[:8] == 'ml_genre']
movie_imdb_genres = [genre for genre in movie_metadata if genre[:10] == 'imdb_genre']
movie_directors = [director for director in movie_metadata if director[0:9] == 'director_']
movie_actors = [actor for actor in movie_metadata if actor[0:6] == 'actor_']

In [20]:
max_user_id = unrated_with_full_metadata.agg({"userId": "max"}).collect()[0][0]
max_movie_id = unrated_with_full_metadata.agg({"itemId": "max"}).collect()[0][0]
num_rating_labels = unrated_with_full_metadata.select('label').distinct().count()

In [21]:
bucket_size = 100
wide_base_cols = movie_genres + user_avgs_genres
wide_base_dims = [3 for i in movie_genres] + [11 for i in user_avgs_genres]
# indicator_cols = indicator_base_fields + movie_genres
# indicator_dims = [3 for i in indicator_cols]
continuous_cols = continuous_base_fields + user_avgs_genres
column_info = ColumnFeatureInfo(
            wide_base_cols = wide_base_cols,
            wide_base_dims = wide_base_dims,
            # wide_cross_cols = ["age-gender"],
            # wide_cross_dims = [bucket_size],
            # indicator_cols = indicator_cols,
            # indicator_dims = indicator_dims,
            embed_cols = ["userId", "itemId"],
            embed_in_dims = [max_user_id, max_movie_id],
            embed_out_dims = [500, 500],
            continuous_cols = continuous_cols)

In [22]:
%%time
wnd_pred_rdd = unrated_with_full_metadata.rdd.map(lambda row: to_user_item_feature(row, column_info))

CPU times: user 6.58 ms, sys: 6.21 ms, total: 12.8 ms
Wall time: 2.06 s


In [23]:
%%time
WnDModel = WideAndDeep.load_model(path = data_path + 'WnD_Model_20m.bigdl', 
                                  weight_path = data_path + 'WnD_Model_20m_weights.h5')

CPU times: user 4.44 ms, sys: 2.11 ms, total: 6.55 ms
Wall time: 52.4 s


In [None]:
%%time
wnd_user_recs = WnDModel.recommend_for_user(wnd_pred_rdd, num_recs)
# user_recs = [user_rec.item_id for user_rec in wnd_user_recs.take(num_recs)]

In [None]:
%%time
wnd_movie_recs = movies.filter(col('item_id').isin(user_recs)).select('title', 'year')
print 'Wind&Deep Recommendations'
print wnd_movie_recs.toPandas()

### Full Function Recommendation Examples

In [None]:
%%time
fav_movies = ['Iron Man', 'Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings', 'Harry Potter',
             'The Family Stone', 'Shaun of the Dead', 'Up', 'A View to a Kill']
new_user_input(fav_movies = fav_movies, all_ratings = ratings, 
               movies = movies, spark_context = sc, sqlContext = sqlContext,
               num_recs = 10, movies_gp = movies_gp, movies_df = movies_df)

In [None]:
%%time
fav_movies = ['Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings']
new_user_input(fav_movies = fav_movies, all_ratings = ratings, 
               movies = movies, spark_context = sc, sqlContext = sqlContext,
               num_recs = 10, movies_gp = movies_gp, movies_df = movies_df)

In [26]:
%%time
fav_movies = ['Frozen', 'Tangled', 'Oceans Eleven', 'Toy Story', 'The Princess Bride',  
              'The Incredibles', 'Castle in the Sky', 'Monsters, Inc']
new_user_input(fav_movies = fav_movies, all_ratings = ratings, 
               movies = movies, spark_context = sc, sqlContext = sqlContext,
               num_recs = 10, movies_gp = movies_gp, movies_df = movies_df)

Collecting favorite movie IDs
Adding ratings to full set
Creating prediction set
Formatting training and prediction dataframes for NCF
Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
Making Predictions

NCF Recommendations
                                               title  year
0  Heimat - A Chronicle of Germany (Heimat - Eine...     0
1      Zero Motivation (Efes beyahasei enosh) (2014)  2014
2                               North & South (2004)  2004
3                                 Connections (1978)     0
4                              Shepard & Dark (2012)  2012
5                   Shawshank Redemption, The (1994)  1994
6                         My Future Boyfriend (2011)  2011
7  Lord of the Rings: The Fellowship of the Ring,...  2001
8                    TT3D: Closer to the Edge (2011)  2011
9                            Band of Brothers (2001)  2001


NameError: global name 'indicator_base_fields' is not defined

In [None]:
%%time
fav_movies = ['The Sound of Music', 'Blackhawk Down', 'Pearl Harbor', 'Toy Story', 'The Princess Bride',  
              'Foreign Student', 'Star Wars', 'The Shining', 'Rear Window', 'Groundhog Day', 'Ghostbusters', 
              'Robin Hood (1993)', 'Die Hard']
new_user_input(fav_movies = fav_movies, all_ratings = ratings, 
               movies = movies, spark_context = sc, sqlContext = sqlContext,
               num_recs = 10, movies_gp = movies_gp, movies_df = movies_df)