In [1]:
# Some functions are from or based on work done by Kevin Liao in the below notebook
# https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/master/movie_recommender/movie_recommendation_using_ALS.ipynb

# Initialization
import os
import time
import sys
import datetime as df

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.types import StringType, ArrayType
from zoo.models.recommendation import *
from zoo.models.recommendation.utils import *
from zoo.common.nncontext import init_nncontext

from bigdl.dataset.transformer import *
from bigdl.dataset.base import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.nn.layer import *

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Start the BigDL engine before any model is created.
# NOTE(review): init_engine presumably comes from the `bigdl.util.common` star import above -- confirm.
init_engine()

In [None]:
# Base HDFS directory; the parquet metadata, ratings.dat and the saved
# Wide&Deep model files are all read relative to this prefix.
data_path = 'hdfs:///user/andrew/'

In [2]:
%%time
# Load the inputs through Spark, since the data is stored in Hadoop, and format the columns.
# The movie metadata is also converted to a pandas dataframe for easier and faster manipulation.
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import *
sqlContext = SQLContext(sc)

# Movie metadata: Spark dataframe plus a pandas copy indexed by item_id
# (the item_id column itself is kept) so no sorting errors occur
movies = sqlContext.read.parquet(data_path + 'movie_metadata_OHE_subset')
movies_df = movies.toPandas()
movies_df = movies_df.set_index(movies_df.item_id)

# Same movies with the descriptive / continuous columns removed
movies_gp = movies.drop('title', 'imdb_id', 'imdb_rating', 'imdb_votes', 'metascore', 'runtime', 'year')

# User metadata with missing values replaced by 0
users_full_sdf = sqlContext.read.parquet(data_path + 'users_metadata').na.fill(0)

# Ratings: keep the first three '::'-separated integer fields of each line
# (user, item, rating); the trailing timestamp is ignored
Rating = Row("userId", "itemId", "label")
ratings = sc.textFile(data_path + 'ratings.dat')\
    .map(lambda line: Rating(*map(int, line.split("::")[0:3])))
ratings = sqlContext.createDataFrame(ratings)

CPU times: user 404 ms, sys: 41.3 ms, total: 445 ms
Wall time: 10.1 s


In [26]:
# User input function - takes user input data, strips it down, and calls other functions on that data
# Takes in user age, gender, list of favorite movies
# All movies in the list of favorite movies will be rated 5 stars
def new_user_input(fav_movies, all_ratings, movies, spark_context, 
                   sqlContext = None, num_recs = 10, age = None, 
                   gender = None, movies_gp = None, movies_df = None):
    """
    Print NCF and Wide&Deep movie recommendations for a brand new user.

    The new user is appended to the ratings with every favorite movie given
    the highest rating found in all_ratings. An NCF model is trained on the
    result and used to pick 3*num_recs candidates; the saved Wide&Deep model
    then re-ranks those candidates using the user's profile.

    Parameters
    ----------
    fav_movies: list of str, e.g. ["Iron Man", "Robin Hood (1993)"]. A year in
        parentheses restricts the title search to that year.

    all_ratings: Spark DataFrame with columns userId, itemId, label

    movies: Spark DataFrame, movie metadata (needs item_id, title, year)

    spark_context: Spark Context object

    sqlContext: SQLContext used to build the user profile dataframe (required)

    num_recs: int, number of recommendations to print for each model

    age: int or None, user age; passed to get_user_preferences for binning

    gender: 'M', 'F' or None; passed to get_user_preferences

    movies_gp: Spark DataFrame, item_id plus the one-hot movie features (required)

    movies_df: pandas DataFrame version of the movie metadata (required)

    Return
    ------
    None; the recommendations are printed.

    Raises
    ------
    ValueError if a required input is missing or no favorite movie is found.
    """
    # Fail before the (slow) training step rather than deep inside it
    if sqlContext is None or movies_gp is None or movies_df is None:
        raise ValueError('sqlContext, movies_gp and movies_df must all be provided')

    # Collect favorite movie ids
    print('Collecting favorite movie IDs')
    movieIds = get_movieId(movies_df, fav_movies)
    if len(movieIds) == 0:
        raise ValueError('None of the favorite movies were found in movies_df')
    print('Favorite movies in the available set')
    print(movies_df[['item_id', 'title', 'year']].loc[movieIds])

    print('Adding ratings to full set')
    # Add new user movie ratings to all ratings dataframe
    all_ratings_updated, new_user_ratings = add_new_user_to_data(all_ratings, movieIds, spark_context)

    print('Creating prediction set')
    all_user_unrated = get_inference_data(all_ratings_updated, movieIds)

    print('Formatting training and prediction dataframes for NCF')
    # Format ratings data into RDD Samples (the format needed for Analytics Zoo models)
    trainPairFeatureRdds = all_ratings_updated.rdd.map(lambda x: build_sample(x[0], x[1], x[2]))
    predPairFeatureRdds = all_user_unrated.rdd.map(lambda x: build_sample(x[0], x[1], x[2]))
    train_rdd = trainPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)

    print('Training NCF Model')
    # Train NCF model, then recommend new movies
    batch_size = 46080
    max_user_id = all_ratings_updated.agg({'userId': 'max'}).collect()[0]['max(userId)']
    max_movie_id = all_ratings_updated.agg({'itemId': 'max'}).collect()[0]['max(itemId)']

    ncf = NeuralCF(user_count = max_user_id, item_count = max_movie_id, 
                   class_num = 5, hidden_layers = [20, 10], include_mf = False)

    optimizer = Optimizer(
        model=ncf,
        training_rdd=train_rdd,
        criterion=ClassNLLCriterion(),
        end_trigger=MaxEpoch(10),
        batch_size=batch_size,
        optim_method=Adam(learningrate=0.001))

    optimizer.optimize()

    print('Making Predictions')
    # Determine top 3*num_recs recommendations
    full_predictions_sorted = ncf.recommend_for_user(predPairFeatureRdds, 3*num_recs).toDF().sort(desc('prediction'))
    # Keep only num_recs from the sorted set for the recommendations
    ncf_top_n_predictions = full_predictions_sorted.take(num_recs)
    # Extract movie ids (second field of each prediction row)
    ncf_top_n_ids = [r[1] for r in ncf_top_n_predictions]

    ncf_movie_recs = movies.filter(movies.item_id.isin(ncf_top_n_ids)).select('title', 'year')
    print('')
    print('NCF Recommendations')
    print(ncf_movie_recs.toPandas())

    # Build the new user's profile (average rating per movie feature plus
    # one-hot age group and gender) from the caller-supplied age and gender
    user_summary_sdf = get_user_preferences(user_ratings = new_user_ratings, movieIds = movieIds, 
                                            movies_gp = movies_gp, age = age, gender = gender, 
                                            sqlContext = sqlContext)

    # Restrict both the unrated set and the movie metadata to the NCF candidates
    ncf_top_3xn_ids = full_predictions_sorted.select('item_id')
    all_user_unrated_top_3xn = all_user_unrated.join(ncf_top_3xn_ids, all_user_unrated.itemId == ncf_top_3xn_ids.item_id, 
                                                     'inner').drop(ncf_top_3xn_ids.item_id)
    top_3xn_movies_metadata = movies.join(ncf_top_3xn_ids, movies.item_id == ncf_top_3xn_ids.item_id, 
                                          'inner').drop(ncf_top_3xn_ids.item_id)

    # Attach the movie metadata to every candidate row, then the user profile
    # (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
    unrated_with_movie_metadata = all_user_unrated_top_3xn \
                                    .join(top_3xn_movies_metadata, 
                                          all_user_unrated_top_3xn.itemId == top_3xn_movies_metadata.item_id, 
                                          how = 'left') \
                                    .drop(top_3xn_movies_metadata.item_id)
    unrated_with_full_metadata = unrated_with_movie_metadata \
                                    .join(user_summary_sdf, on = 'userId', how = 'left')

    # Create lists of column sets
    identifier_fields = ['userId', 'itemId', 'label', 'title', 'imdb_id']
    continuous_base_fields = ['imdb_rating', 'imdb_votes', 'metascore', 'runtime', 'year']
    indicator_base_fields = ['gender_F', 'gender_M', 'age_group_1', 
                             'age_group_18', 'age_group_25', 'age_group_35', 
                             'age_group_45', 'age_group_50', 'age_group_56']
    all_base_fields = identifier_fields + continuous_base_fields + indicator_base_fields

    user_avgs = [col_name for col_name in unrated_with_full_metadata.columns if col_name[-11:] == '_avg_rating']
    movie_metadata = [col_name for col_name in unrated_with_full_metadata.columns 
                      if (col_name[-11:] != '_avg_rating' and col_name not in all_base_fields)]
    user_avgs_genres = [genre for genre in user_avgs if 'genre' in genre]
    movie_genres = [genre for genre in movie_metadata if 'genre' in genre]

    # Determine embedding dimensions
    max_user_id = unrated_with_full_metadata.agg({"userId": "max"}).collect()[0][0]
    max_movie_id = unrated_with_full_metadata.agg({"itemId": "max"}).collect()[0][0]

    # Specify column sets to be used for different parts of the Wide&Deep model
    # Column dims values need to be greater than the dims of the columns. 
    # For indicator columns the dim is 3 because the column has two levels, 
    # and for avgs the dim is 6 because the column has values up to 5
    wide_base_cols = indicator_base_fields + movie_genres + user_avgs_genres
    wide_base_dims = [3 for i in (indicator_base_fields + movie_genres)] + [6 for i in user_avgs_genres]
    indicator_cols = indicator_base_fields + movie_genres
    indicator_dims = [3 for i in indicator_cols]
    continuous_cols = continuous_base_fields + user_avgs_genres
    column_info = ColumnFeatureInfo(
                wide_base_cols = wide_base_cols,
                wide_base_dims = wide_base_dims,
                indicator_cols = indicator_cols,
                indicator_dims = indicator_dims,
                embed_cols = ["userId", "itemId"],
                embed_in_dims = [max_user_id, max_movie_id],
                embed_out_dims = [100, 100],
                continuous_cols = continuous_cols)

    # Format columns to features
    wnd_pred_rdd = unrated_with_full_metadata.rdd.map(lambda row: to_user_item_feature(row, column_info))

    # Import the WideAndDeep model
    WnDModel = WideAndDeep.load_model(path = data_path + 'WnD_Model.bigdl', 
                                      weight_path = data_path + 'WnD_Model_weights.h5')

    # Recommend items for the new user
    wnd_user_recs = WnDModel.recommend_for_user(wnd_pred_rdd, num_recs)
    # Extract the item_ids for the recommended items
    user_recs = [user_rec.item_id for user_rec in wnd_user_recs.take(num_recs)]

    # Filter the movies sdf for only the recommended items
    wnd_movie_recs = movies.filter(movies.item_id.isin(user_recs)).select('title', 'year')
    print(' ')
    print('Wide&Deep Recommendations')
    print(wnd_movie_recs.toPandas())

In [4]:
def get_movieId(movies_df, fav_movie_list):
    """
    return all movieId(s) of user's favorite movies

    Parameters
    ----------
    movies_df: pandas DataFrame, movies data with item_id, title and year columns

    fav_movie_list: list, user's list of favorite movies. A title may end with
        a four digit year in parentheses, e.g. "Robin Hood (1993)", to restrict
        the search to that year.

    Return
    ------
    movieId_list: list of movieId(s), in order of first match and without duplicates
    """
    # Leading words that may be stored at the end of the title in the data
    # ("Shawshank Redemption, The"), so they are removed before searching.
    leading_articles = ('The ', 'An ', 'La ', 'A ')

    movieId_list = []
    seen_ids = set()
    for movie in fav_movie_list:
        for article in leading_articles:
            if movie.startswith(article):
                # Strip exactly the article, whatever its length
                movie = movie[len(article):]
                break

        # If a year is provided by the user, remove it from the title and use it in the filtering.
        has_year = (movie[-6:-5] == '(' and movie[-1:] == ')' and movie[-5:-1].isdigit())
        if has_year:
            year = int(movie[-5:-1])
            movie = movie[0:-7]
            # regex=False: titles are literal text, not patterns
            title_match = movies_df.title.str.contains(movie, regex=False, na=False)
            matches = movies_df.item_id[title_match & (movies_df.year == year)]
        # If no year is provided and the title is a single word, find an exact match
        elif len(movie.split(' ')) == 1:
            matches = movies_df.item_id[movies_df.title == movie]
        # Otherwise search for the movie title
        else:
            matches = movies_df.item_id[movies_df.title.str.contains(movie, regex=False, na=False)]

        # Two favorites can match the same movie ("Toy Story" also matches
        # "Toy Story 2"); keep each id only once.
        for movie_id in matches:
            if movie_id not in seen_ids:
                seen_ids.add(movie_id)
                movieId_list.append(movie_id)
    return movieId_list

In [5]:
def add_new_user_to_data(train_data, movieIds, spark_context):
    """
    Append a new user, who gives the highest existing rating to each of
    the supplied movies, to the ratings data.

    Parameters
    ----------
    train_data: Spark DataFrame, ratings data with userId, itemId and label columns

    movieIds: list of movieId(s) the new user rates

    spark_context: Spark Context object

    Return
    ------
    tuple of (ratings data including the new user's rows,
              Spark DataFrame holding only the new user's rows)
    """
    def column_max(column):
        # Largest value currently present in the given column
        return train_data.agg({column: "max"}).collect()[0][0]

    # The new user takes the next free id and rates everything with the top rating
    next_user_id = column_max("userId") + 1
    top_rating = column_max("label")

    new_rows = []
    for movie_id in movieIds:
        new_rows.append((next_user_id, movie_id, top_rating))

    new_user_sdf = spark_context.parallelize(new_rows).toDF(['userId', 'itemId', 'label'])
    combined = train_data.union(new_user_sdf)
    return combined, new_user_sdf

In [6]:
def get_inference_data(train_data, movieIds):
    """
    return a dataframe pairing the newest user with every movie in the
    ratings data except the ones in movieIds

    Parameters
    ----------
    train_data: Spark DataFrame, ratings data with userId and itemId columns.
        Assumed to already contain the new user's rows, so that the largest
        userId is the new user.

    movieIds: list, movieId(s) the user has already rated

    Return
    ------
    inference data: Spark DataFrame with columns userId, itemId and label,
        where label is the placeholder value 0
    """
    # Get new user id (the highest id in the data)
    new_id = train_data.agg({"userId": "max"}).collect()[0][0]
    # Get the set of movies that have not been rated by the user. Read from the
    # train_data argument instead of a notebook-level global so that the result
    # depends only on the inputs.
    distinct_unrated_items = train_data.select('itemId').distinct().filter(~col('itemId').isin(movieIds))
    # Create the dataset for the new user with their unrated movies
    user_unrated = distinct_unrated_items.withColumn('userId', lit(new_id)).select('userId', 'itemId')
    user_unrated = user_unrated.withColumn('label', lit(0))
    return user_unrated

In [7]:
def build_sample(user_id, item_id, rating):
    """
    Wrap one (user, item, rating) triple in a UserItemFeature.

    The Sample's feature array is [user_id, item_id] and its label array
    is [rating].
    """
    features = np.array([user_id, item_id])
    label = np.array([rating])
    return UserItemFeature(user_id, item_id, Sample.from_ndarray(features, label))

In [8]:
def get_user_preferences(user_ratings, movieIds, movies_gp, age, gender, sqlContext):
    """
    Build the user profile row(s): the average rating the user gave per movie
    feature, plus one-hot age group and gender indicators.

    Parameters
    ----------
    user_ratings: Spark DataFrame with userId, itemId and label columns

    movieIds: list of movieId(s) rated by the user

    movies_gp: Spark DataFrame with item_id plus numeric movie feature columns

    age: int or None. None leaves every age_group_* indicator at 0.

    gender: 'M' sets gender_M, None sets neither indicator, and any other
        value sets gender_F.

    sqlContext: SQLContext used to create the output dataframe

    Return
    ------
    Spark DataFrame with columns sorted by name: userId, one
    '<feature>_avg_rating' column per movie feature, and the indicators.
    """
    # (exclusive upper age bound, indicator column); 56 and over falls through
    age_bins = [(18, 'age_group_1'), (25, 'age_group_18'), (35, 'age_group_25'),
                (45, 'age_group_35'), (50, 'age_group_45'), (56, 'age_group_50')]
    demog = {'gender_M': 0, 'gender_F': 0, 'age_group_56': 0}
    for _, group_name in age_bins:
        demog[group_name] = 0

    # Bin user by age
    if age is not None:
        age_group = 'age_group_56'
        for upper_bound, group_name in age_bins:
            if age < upper_bound:
                age_group = group_name
                break
        demog[age_group] = 1
    # Binarize gender
    if gender is not None:
        demog['gender_M' if gender == 'M' else 'gender_F'] = 1
    # Build the frame from the finished dict instead of chained assignment
    user_demog = pd.DataFrame(demog, index = [0])

    # Create the user-item matrix for the new user. Duplicate (user, item)
    # rows would make pivot fail, so only the first of each is kept.
    ratings_df = user_ratings.toPandas().drop_duplicates(subset = ['userId', 'itemId'])
    pivoted_user_ratings_df = ratings_df.pivot(index='userId', 
                                               columns='itemId', 
                                               values='label') \
                                        .fillna(0)
    # 1.0 where a rating exists and 0.0 elsewhere (x / x would give NaN for 0)
    pivoted_user_ratings_df_binary = (pivoted_user_ratings_df != 0).astype(float)

    # Reduce the movie set to only the user's rated movies
    movies_gp_filtered = movies_gp.filter(movies_gp['item_id'].isin(movieIds))
    movies_gp_filtered_df = movies_gp_filtered.toPandas().set_index('item_id')

    # Create user profile: summed rating per feature / number of rated movies
    # having that feature; features with no rated movie become 0
    user_summary_total = pivoted_user_ratings_df.dot(movies_gp_filtered_df)
    user_summary_count = pivoted_user_ratings_df_binary.dot(movies_gp_filtered_df)
    user_summary_avg = (user_summary_total / user_summary_count).fillna(0)
    user_summary_avg = user_summary_avg.add_suffix('_avg_rating').reset_index()

    # Create Spark dataframe output for full user profile
    user_summary = pd.concat([user_summary_avg, user_demog], axis = 1)
    sorted_columns = list(user_summary.columns.sort_values())
    user_summary_sdf = sqlContext.createDataFrame(user_summary[sorted_columns])
    return user_summary_sdf

### Step by Step Walkthrough of Main Function (to show runtime)

In [9]:
%%time
fav_movies = ['Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings']
# Look up the item ids of the favorite movies
print('Collecting favorite movie IDs')
movieIds = get_movieId(movies_df, fav_movies)
if movies_df is not None:
    # Show which of the favorites were actually found in the movie set
    print('Favorite movies in the available set')
    print(movies_df.loc[movieIds, ['item_id', 'title', 'year']])

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                      title  year
item_id                                          
318          318  Shawshank Redemption, The  1994
2116        2116     Lord of the Rings, The  1978
CPU times: user 14.4 ms, sys: 4.5 ms, total: 18.9 ms
Wall time: 17 ms


In [10]:
%%time
print('Adding ratings to full set')
# Append the new user's ratings to the full ratings dataframe;
# the new user's own rows are also kept separately
all_ratings_updated, user_ratings = add_new_user_to_data(train_data=ratings,
                                                         movieIds=movieIds,
                                                         spark_context=sc)

Adding ratings to full set
CPU times: user 45.5 ms, sys: 5.5 ms, total: 51 ms
Wall time: 10.2 s


In [11]:
%%time
print('Creating prediction set')
# Pair the new user with every movie they have not rated
all_user_unrated = get_inference_data(train_data=all_ratings_updated, movieIds=movieIds)

Creating prediction set
CPU times: user 14.5 ms, sys: 4.35 ms, total: 18.8 ms
Wall time: 4.14 s


In [12]:
%%time
print('Formatting training and prediction dataframes for NCF')
# Format the ratings data into RDDs of Samples (the format needed for Analytics Zoo models);
# the first three fields of each row are user id, item id and label
trainPairFeatureRdds = all_ratings_updated.rdd.map(lambda row: build_sample(row[0], row[1], row[2]))
predPairFeatureRdds = all_user_unrated.rdd.map(lambda row: build_sample(row[0], row[1], row[2]))

# Training only needs the Sample held by each feature
train_rdd = trainPairFeatureRdds.map(lambda feature: feature.sample)

Formatting training and prediction dataframes for NCF
CPU times: user 815 µs, sys: 828 µs, total: 1.64 ms
Wall time: 102 ms


In [13]:
%%time
print('Training NCF Model')
# Fit the NCF model on all ratings (including the new user) before predicting movie ratings
batch_size = 46080
# Largest ids in the training data, passed to NeuralCF as the user / item counts
max_user_id = all_ratings_updated.agg({'userId': 'max'}).collect()[0][0]
max_item_id = all_ratings_updated.agg({'itemId': 'max'}).collect()[0][0]

ncf = NeuralCF(
    user_count=max_user_id,
    item_count=max_item_id,
    class_num=5,
    hidden_layers=[20, 10],
    include_mf=False)

# Keyword order is kept as-is so the objects are created in the same order
optimizer = Optimizer(model=ncf,
                      training_rdd=train_rdd,
                      criterion=ClassNLLCriterion(),
                      end_trigger=MaxEpoch(10),
                      batch_size=batch_size,
                      optim_method=Adam(learningrate=0.001))

optimizer.optimize()

Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
CPU times: user 35.3 ms, sys: 11.6 ms, total: 46.9 ms
Wall time: 50 s


In [14]:
%%time
print('Making Predictions')
# Request 3*num_recs candidates and keep the 15 highest-scoring for display
num_recs = 15
full_predictions_sorted = (ncf.recommend_for_user(predPairFeatureRdds, 3*num_recs)
                              .toDF()
                              .sort(desc('prediction')))
ncf_top_n_predictions = full_predictions_sorted.take(num_recs)
# The movie id is the second field of each prediction row
ncf_top_n_ids = [r[1] for r in ncf_top_n_predictions]

# Look up title and year for the recommended ids
ncf_movie_recs = movies.filter(movies['item_id'].isin(ncf_top_n_ids)).select('title', 'year')
print('')
print('NCF Recommendations')
print(ncf_movie_recs.toPandas())

Making Predictions

NCF Recommendations
                                                title  year
0                                 Usual Suspects, The  1995
1                                            Ridicule  1996
2                                 Wrong Trousers, The  1993
3                                      Third Man, The  1949
4                                     White Christmas  1954
5                                    Schindler's List  1993
6   Seven Samurai (The Magnificent Seven) (Shichin...  1954
7                                           Bluebeard  1944
8                                            Rushmore  1998
9                                     King and I, The  1956
10                                           Trekkies  1997
11                                     Close Shave, A  1995
12                                            Foxfire  1996
13                                     Godfather, The  1972
14                                Maltese Falcon, The  1941


In [15]:
%%time
# Prepare the user side of the Wide&Deep input.
# The user's ratings are pivoted into a user x item matrix and multiplied by the
# movies_gp rows of the rated movies; dividing by the binarized ratings gives a
# vector of user preferences. One-hot encoded age and gender (and possibly
# occupation) are then joined to those preferences.
user_summary_sdf = get_user_preferences(
    user_ratings=user_ratings,
    movieIds=movieIds,
    movies_gp=movies_gp,
    age=26,
    gender='M',
    sqlContext=sqlContext)

CPU times: user 57.4 ms, sys: 10.5 ms, total: 67.9 ms
Wall time: 5.47 s


In [16]:
%%time
# Item ids of the 3*num_recs NCF candidates
ncf_top_3xn_ids = full_predictions_sorted.select('item_id')

# Keep only the candidate rows of the user's unrated set ...
all_user_unrated_top_3xn = (
    all_user_unrated
    .join(ncf_top_3xn_ids, on=all_user_unrated.itemId == ncf_top_3xn_ids.item_id, how='inner')
    .drop(ncf_top_3xn_ids.item_id))

# ... and only the candidate rows of the movie metadata
top_3xn_movies_metadata = (
    movies
    .join(ncf_top_3xn_ids, on=movies.item_id == ncf_top_3xn_ids.item_id, how='inner')
    .drop(ncf_top_3xn_ids.item_id))

CPU times: user 3.61 ms, sys: 0 ns, total: 3.61 ms
Wall time: 142 ms


In [17]:
%%time
# Attach the movie metadata to each candidate row, then the user preference row
# (joined on userId, so it is repeated for every candidate)
# (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
unrated_with_movie_metadata = (
    all_user_unrated_top_3xn
    .join(top_3xn_movies_metadata,
          on=all_user_unrated_top_3xn.itemId == top_3xn_movies_metadata.item_id,
          how='left')
    .drop(top_3xn_movies_metadata.item_id))
unrated_with_full_metadata = unrated_with_movie_metadata.join(user_summary_sdf, on='userId', how='left')

CPU times: user 6.4 ms, sys: 1.45 ms, total: 7.85 ms
Wall time: 267 ms


In [18]:
# Fixed column groups
identifier_fields = ['userId', 'itemId', 'label', 'title', 'imdb_id']
continuous_base_fields = ['imdb_rating', 'imdb_votes', 'metascore', 'runtime', 'year']
indicator_base_fields = ['gender_F', 'gender_M',
                         'age_group_1', 'age_group_18', 'age_group_25', 'age_group_35',
                         'age_group_45', 'age_group_50', 'age_group_56']
all_base_fields = identifier_fields + continuous_base_fields + indicator_base_fields

# User profile columns end in '_avg_rating'; every other non-base column is movie metadata
user_avgs = [col_name for col_name in unrated_with_full_metadata.columns
             if col_name.endswith('_avg_rating')]
movie_metadata = [col_name for col_name in unrated_with_full_metadata.columns
                  if not col_name.endswith('_avg_rating') and col_name not in all_base_fields]

# Split the user averages by feature family
user_avgs_genres = [genre for genre in user_avgs if 'genre' in genre]
user_avgs_ml_genres = [genre for genre in user_avgs if genre.startswith('ml_genre')]
user_avgs_imdb_genres = [genre for genre in user_avgs if genre.startswith('imdb_genre')]
user_avgs_directors = [director for director in user_avgs if director.startswith('director_')]
user_avgs_actors = [actor for actor in user_avgs if actor.startswith('actor_')]

# Split the movie metadata the same way
movie_genres = [genre for genre in movie_metadata if 'genre' in genre]
movie_ml_genres = [genre for genre in movie_metadata if genre.startswith('ml_genre')]
movie_imdb_genres = [genre for genre in movie_metadata if genre.startswith('imdb_genre')]
movie_directors = [director for director in movie_metadata if director.startswith('director_')]
movie_actors = [actor for actor in movie_metadata if actor.startswith('actor_')]

In [19]:
# Largest user / item id in the prediction set (used as the embedding input dims)
max_user_id = unrated_with_full_metadata.agg({"userId": "max"}).collect()[0]['max(userId)']
max_movie_id = unrated_with_full_metadata.agg({"itemId": "max"}).collect()[0]['max(itemId)']
# Number of distinct label values in the prediction set
num_rating_labels = unrated_with_full_metadata.select('label').distinct().count()

In [20]:
bucket_size = 100

# Column sets for the different parts of the Wide&Deep model.
# Each dim must be greater than the number of levels in its column:
# 3 for the two-level indicator columns, 6 for averages with values up to 5.
indicator_cols = indicator_base_fields + movie_genres
indicator_dims = [3] * len(indicator_cols)
wide_base_cols = indicator_base_fields + movie_genres + user_avgs_genres
wide_base_dims = [3] * len(indicator_base_fields + movie_genres) + [6] * len(user_avgs_genres)
continuous_cols = continuous_base_fields + user_avgs_genres

column_info = ColumnFeatureInfo(wide_base_cols=wide_base_cols,
                                wide_base_dims=wide_base_dims,
                                indicator_cols=indicator_cols,
                                indicator_dims=indicator_dims,
                                embed_cols=["userId", "itemId"],
                                embed_in_dims=[max_user_id, max_movie_id],
                                embed_out_dims=[100, 100],
                                continuous_cols=continuous_cols)

In [21]:
%%time
# Convert every candidate row into a user-item feature according to column_info
wnd_pred_rdd = unrated_with_full_metadata.rdd.map(
    lambda metadata_row: to_user_item_feature(metadata_row, column_info))

CPU times: user 8.31 ms, sys: 3.17 ms, total: 11.5 ms
Wall time: 3.72 s


In [22]:
%%time
# Load the saved Wide&Deep model definition and weights from data_path
WnDModel = WideAndDeep.load_model(
    path=data_path + 'WnD_Model.bigdl',
    weight_path=data_path + 'WnD_Model_weights.h5')

CPU times: user 1.6 ms, sys: 630 µs, total: 2.23 ms
Wall time: 1.61 s


In [23]:
%%time
# Ask the Wide&Deep model for num_recs recommendations
wnd_user_recs = WnDModel.recommend_for_user(wnd_pred_rdd, num_recs)
# Collect the item id of each recommendation
user_recs = []
for user_rec in wnd_user_recs.take(num_recs):
    user_recs.append(user_rec.item_id)

CPU times: user 57.4 ms, sys: 16.4 ms, total: 73.8 ms
Wall time: 6.33 s


In [24]:
%%time
# Look up title and year of the Wide&Deep recommendations and print them.
# The label previously read "Wind&Deep"; the model is Wide&Deep.
wnd_movie_recs = movies.filter(movies.item_id.isin(user_recs)).select('title', 'year')
print('Wide&Deep Recommendations')
print(wnd_movie_recs.toPandas())

Wind&Deep Recommendations
                                 title  year
0                          Nine Months  1995
1        Bridge on the River Kwai, The  1957
2      Monty Python and the Holy Grail  1974
3                   Lawrence of Arabia  1962
4                              Yojimbo  1961
5                           Spaceballs  1987
6   Star Wars: Episode IV - A New Hope  1977
7                      American Beauty  1999
8                      King and I, The  1956
9                          Matrix, The  1999
10                        Jackie Brown  1997
11                      Close Shave, A  1995
12                      Godfather, The  1972
13                        Citizen Kane  1941
14                  Driving Miss Daisy  1989
CPU times: user 17.4 ms, sys: 2.18 ms, total: 19.5 ms
Wall time: 3.76 s


### Full Function Recommendation Examples

In [27]:
%%time
# Example: 26 year old male user with a mixed list of favorites
fav_movies = [
    'Iron Man', 'Tinker Tailor Soldier Spy', 'Shawshank Redemption',
    'Lord of the Rings', 'Harry Potter', 'The Family Stone',
    'Shaun of the Dead', 'Up', 'A View to a Kill',
]
new_user_input(fav_movies=fav_movies,
               all_ratings=ratings,
               movies=movies,
               spark_context=sc,
               sqlContext=sqlContext,
               num_recs=10,
               age=26,
               gender='M',
               movies_gp=movies_gp,
               movies_df=movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                      title  year
item_id                                          
318          318  Shawshank Redemption, The  1994
2116        2116     Lord of the Rings, The  1978
2376        2376          View to a Kill, A  1985
Adding ratings to full set
Creating prediction set
Formatting training and prediction dataframes for NCF
Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
Making Predictions

NCF Recommendations
                                               title  year
0                                Usual Suspects, The  1995
1                                       Verdict, The  1982
2                           Rescuers Down Under, The  1990
3                                      Forever Young  1992
4                                     Meet Joe Black  1998
5                        

In [28]:
%%time
# Example: the same three favorites as the step-by-step walkthrough
fav_movies = [
    'Tinker Tailor Soldier Spy',
    'Shawshank Redemption',
    'Lord of the Rings',
]
new_user_input(fav_movies=fav_movies,
               all_ratings=ratings,
               movies=movies,
               spark_context=sc,
               sqlContext=sqlContext,
               num_recs=10,
               age=26,
               gender='M',
               movies_gp=movies_gp,
               movies_df=movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                      title  year
item_id                                          
318          318  Shawshank Redemption, The  1994
2116        2116     Lord of the Rings, The  1978
Adding ratings to full set
Creating prediction set
Formatting training and prediction dataframes for NCF
Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
Making Predictions

NCF Recommendations
                      title  year
0             Cool Runnings  1993
1            Paths of Glory  1957
2                 Beautiful  2000
3               Blue Hawaii  1961
4         Cold Comfort Farm  1995
5                   Sanjuro  1962
6                  Trekkies  1997
7  Blair Witch Project, The  1999
8            Close Shave, A  1995
9            Godfather, The  1972
 
Wind&Deep Recommendations
                           

In [29]:
%%time
# Example: family / animation favorites, called with age 26 and gender 'M'
fav_movies = [
    'Frozen', 'Tangled', 'Oceans Eleven', 'Toy Story',
    'The Princess Bride', 'The Incredibles', 'Castle in the Sky',
    'Monsters, Inc',
]
new_user_input(fav_movies=fav_movies,
               all_ratings=ratings,
               movies=movies,
               spark_context=sc,
               sqlContext=sqlContext,
               num_recs=10,
               age=26,
               gender='M',
               movies_gp=movies_gp,
               movies_df=movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                title  year
item_id                                    
1              1            Toy Story  1995
3114        3114          Toy Story 2  1999
1197        1197  Princess Bride, The  1987
Adding ratings to full set
Creating prediction set
Formatting training and prediction dataframes for NCF
Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
Making Predictions

NCF Recommendations
                                               title  year
0                                Usual Suspects, The  1995
1                                Wrong Trousers, The  1993
2                              To Kill a Mockingbird  1962
3           Bicycle Thief, The (Ladri di biciclette)  1948
4  Seven Samurai (The Magnificent Seven) (Shichin...  1954
5                          Shawshank Redemption, The  1

In [30]:
%%time
# Example: the same family / animation favorites, called with age 8 and gender 'F'
fav_movies = [
    'Frozen', 'Tangled', 'Oceans Eleven', 'Toy Story',
    'The Princess Bride', 'The Incredibles', 'Castle in the Sky',
    'Monsters, Inc',
]
new_user_input(fav_movies=fav_movies,
               all_ratings=ratings,
               movies=movies,
               spark_context=sc,
               sqlContext=sqlContext,
               num_recs=10,
               age=8,
               gender='F',
               movies_gp=movies_gp,
               movies_df=movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                title  year
item_id                                    
1              1            Toy Story  1995
3114        3114          Toy Story 2  1999
1197        1197  Princess Bride, The  1987
Adding ratings to full set
Creating prediction set
Formatting training and prediction dataframes for NCF
Training NCF Model
creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
Making Predictions

NCF Recommendations
                       title  year
0        Usual Suspects, The  1995
1                  Cape Fear  1991
2              Cool Runnings  1993
3                  Beautiful  2000
4                Blue Hawaii  1961
5           Schindler's List  1993
6  Shawshank Redemption, The  1994
7                    Sanjuro  1962
8             Close Shave, A  1995
9             Godfather, The  1972
 
Wind&Deep Recommendati

In [31]:
%%time
# Example: a longer list, including one title restricted to a year ("Robin Hood (1993)")
fav_movies = [
    'The Sound of Music', 'Blackhawk Down', 'Pearl Harbor', 'Toy Story',
    'The Princess Bride', 'Foreign Student', 'Star Wars', 'The Shining',
    'Rear Window', 'Groundhog Day', 'Ghostbusters', 'Robin Hood (1993)',
    'Die Hard',
]
new_user_input(fav_movies=fav_movies,
               all_ratings=ratings,
               movies=movies,
               spark_context=sc,
               sqlContext=sqlContext,
               num_recs=10,
               age=40,
               gender='M',
               movies_gp=movies_gp,
               movies_df=movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                                           title  year
item_id                                                               
1035        1035                             Sound of Music, The  1965
1              1                                       Toy Story  1995
3114        3114                                     Toy Story 2  1999
1197        1197                             Princess Bride, The  1987
572          572                                 Foreign Student  1994
1196        1196  Star Wars: Episode V - The Empire Strikes Back  1980
1210        1210      Star Wars: Episode VI - Return of the Jedi  1983
260          260              Star Wars: Episode IV - A New Hope  1977
2628        2628       Star Wars: Episode I - The Phantom Menace  1999
904          904                                     Rear Window  1954
1265        1265                                   Groundhog Day  1993
2716      

In [None]:
# Stop the Spark context once the notebook is finished
sc.stop()