In [1]:
# Some functions are from or based on work done by Kevin Liao in the below notebook
# https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/master/movie_recommender/movie_recommendation_using_ALS.ipynb

# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.recommendation import ALS
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor, GBTRegressionModel

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Base location of every input file and saved model used in this notebook.
# Override it with the MOVIE_REC_DATA_PATH environment variable; the default is
# the original HDFS directory.
data_path = os.environ.get('MOVIE_REC_DATA_PATH', 'hdfs:///user/andrew/')
# Later cells build paths with `data_path + name`, so keep the trailing slash.
if not data_path.endswith('/'):
    data_path += '/'

In [2]:
%%time
# Read the data through Spark, since it is stored in Hadoop, and format the columns.
# Convert to pandas DataFrames where that makes manipulation easier and faster.
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import *
# SQL entry point built on the notebook's SparkContext (`sc`)
sqlContext = SQLContext(sc)

# Row factory for one rating; the timestamp field of ratings.dat is ignored
Rating = Row("user_id", "item_id", "label")

# Movie metadata as a Spark DataFrame, plus a pandas copy indexed by item_id.
# item_id is kept as a regular column too, so it can still be selected by name.
movies = sqlContext.read.parquet(data_path + 'movie_metadata_OHE')
movies_df = movies.toPandas()
movies_df = movies_df.set_index(movies_df.item_id)

# Alternative source, currently unused:
# movies_gp = sqlContext.read.parquet('hdfs:///user/andrew/movie_genre_and_people_metadata_ohe_subset')
# The movie table without the title, IMDb, runtime and year columns
movies_gp = movies.drop(*['title', 'imdb_id', 'imdb_rating', 'imdb_votes',
                          'metascore', 'runtime', 'year'])

# Each ratings.dat line is "::"-separated; keep the first three fields as ints
ratings = sqlContext.createDataFrame(
    sc.textFile(data_path + 'ratings.dat')
      .map(lambda line: Rating(*[int(field) for field in line.split("::")[0:3]]))
)

CPU times: user 59.6 s, sys: 2.17 s, total: 1min 1s
Wall time: 2min 2s


In [3]:
# User input function - takes the user's input data, strips it down, and calls the other functions on that data
# Takes in the user's age, gender, and list of favorite movies
# Every movie in the list of favorite movies is rated with the maximum rating in the data set
def new_user_input(fav_movies, all_ratings, movies, spark_context, 
                   sqlContext = None, num_recs = 10, age = None, 
                   gender = None, movies_gp = None, movies_df = None):
    """
    Print ALS and gradient-boosted-tree (GBT) recommendations for a new user.

    The user's favorite movies are added to the ratings as top-rated items of a
    new user id, an ALS model is trained on the result, and the best ALS
    candidates are re-scored with the pre-trained GBT model stored under the
    module-level `data_path`.

    Parameters
    ----------
    fav_movies: list of str, e.g. ["Iron Man", "The Shawshank Redemption"].
        A specific version can be requested with a year in parentheses,
        like "Robin Hood (1993)"

    all_ratings: Spark DataFrame with user_id, item_id and label columns

    movies: Spark DataFrame of movie metadata (needs item_id, title, year)

    spark_context: Spark Context object

    sqlContext: SQLContext used to build Spark DataFrames (required)

    num_recs: int, number of recommendations to print per model

    age: int, the user's age (intended range 1 - 100)

    gender: 'M' or 'F'

    movies_gp: Spark DataFrame of per-movie features keyed by item_id (required)

    movies_df: pandas DataFrame copy of `movies`, indexed by item_id (required)

    Return
    ------
    None; the recommendations are printed

    Raises
    ------
    ValueError: if a required argument is None or no favorite movie is found
    """
    # These default to None in the signature but the function cannot run without them
    if sqlContext is None or movies_gp is None or movies_df is None:
        raise ValueError('sqlContext, movies_gp and movies_df must all be provided')

    # collect favorite movie ids
    print('Collecting favorite movie IDs')
    movieIds = get_movieId(movies_df, fav_movies)
    if not movieIds:
        # Without any rated movie there is nothing to build the new user from
        raise ValueError('None of the favorite movies were found in the movie set')
    print('Favorite movies in the available set')
    print(movies_df[['item_id', 'title', 'year']].loc[movieIds])

    print('Adding ratings to full set')
    # add new user movie ratings to all ratings dataframe
    all_ratings_updated, new_user_ratings = add_new_user_to_data(all_ratings, movieIds, spark_context)

    print('Creating prediction set')
    # get all unrated movies for user (unnecessary in Spark 2.2+, instead use the recommendForAllUsers(num_to_rec) method)
    all_user_unrated = get_inference_data(all_ratings_updated, movieIds)

    print('Training ALS model')
    # train ALS model, then predict movie ratings
    als = ALS(seed = 42, regParam = 0.1, maxIter = 15, rank = 12,
              userCol = "user_id", itemCol = "item_id", ratingCol = "label")
    als_model = als.fit(all_ratings_updated)

    print('Making Predictions')
    full_predictions_sorted = als_model.transform(all_user_unrated).sort(desc('prediction'))
    # Fetch the top 3*num_recs once: the first num_recs are the ALS recommendations,
    # and the whole list is the candidate set re-scored by the GBT model below.
    # This avoids sorting the full prediction set a second time.
    als_top_3xn_predictions = full_predictions_sorted.take(3 * num_recs)
    als_top_3xn_ids = [r['item_id'] for r in als_top_3xn_predictions]
    als_top_n_ids = als_top_3xn_ids[:num_recs]

    als_movie_recs = movies.filter(movies.item_id.isin(als_top_n_ids)).select('title', 'year')
    print('')
    print('ALS Recommendations')
    print(als_movie_recs.toPandas())

    # Build the user's feature row for the GBT model: average rating per movie
    # feature over the favorite movies, plus one-hot encoded age group and gender
    user_summary = get_user_preferences(user_ratings = new_user_ratings, movieIds = movieIds, 
                                        movies_gp = movies_gp, age = age, gender = gender, sqlContext = sqlContext)

    # Restrict the unrated set and the movie metadata to the ALS candidates
    all_user_unrated_top_3xn = all_user_unrated.filter(all_user_unrated.item_id.isin(als_top_3xn_ids))
    top_3xn_movies_metadata = movies.filter(movies.item_id.isin(als_top_3xn_ids))

    # Attach the movie metadata and the user profile to every candidate row
    # (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
    unrated_with_movie_metadata = all_user_unrated_top_3xn \
                                    .join(top_3xn_movies_metadata, on = 'item_id', how = 'left')
    unrated_with_full_metadata = unrated_with_movie_metadata \
                                    .join(user_summary, on = 'user_id', how = 'left') \
                                    .drop('user_id', 'title', 'imdb_id')

    # the GBT model takes in the rows as vectors: the first column is kept as
    # item_id and every remaining column becomes part of the feature vector
    unrated_with_full_metadata_rdd = unrated_with_full_metadata.rdd.map(lambda x: (x[0], Vectors.dense(x[1:])))
    unrated_metadata_features = sqlContext.createDataFrame(unrated_with_full_metadata_rdd, schema = ['item_id', 'features'])

    # load the pre-trained GBT model (named for a tree depth of 10)
    GBTRegD10Model = GBTRegressionModel.load(data_path + 'GBTRegD10Model')

    # use pre-trained GBT model to predict movie ratings
    gbtr_preds = GBTRegD10Model.transform(unrated_metadata_features)

    # sort by predicted rating and keep the top num_recs
    gbtr_top_n_predictions = gbtr_preds.sort(desc('prediction')).take(num_recs)
    # extract movie ids
    gbtr_top_n_ids = [r['item_id'] for r in gbtr_top_n_predictions]

    gbtr_movie_recs = movies.filter(movies.item_id.isin(gbtr_top_n_ids)).select('title', 'year')
    print('')
    print('GBTR Recommendations')
    print(gbtr_movie_recs.toPandas())

In [4]:
def get_movieId(movies_df, fav_movie_list):
    """
    return all movieId(s) of user's favorite movies

    Parameters
    ----------
    movies_df: pandas DataFrame with item_id, title and year columns

    fav_movie_list: list, user's list of favorite movies. A title may end in
        a year in parentheses, e.g. "Robin Hood (1993)", to pick one version

    Return
    ------
    movieId_list: list of unique movieId(s), in the order they were first matched
    """
    # Leading words that the stored titles carry at the end instead
    # (e.g. "View to a Kill, A"), so they are dropped before searching.
    leading_articles = ('The ', 'An ', 'La ', 'A ')

    movieId_list = []
    for movie in fav_movie_list:
        for article in leading_articles:
            if movie.startswith(article):
                # Strip exactly the article; the old code cut 3 characters for 'A '
                movie = movie[len(article):]
                break

        # regex=False: titles are literal text ("Romeo + Juliet"), not patterns.
        # na=False: rows with a missing title never match.
        if movie[-6:-5] == '(':
            # A year was provided: remove " (YYYY)" and filter on the year as well
            year = int(movie[-5:-1])
            movie = movie[0:-7]
            matches = movies_df.item_id[
                movies_df.title.str.contains(movie, regex=False, na=False)
                & (movies_df.year == year)]
        elif len(movie.split(' ')) == 1:
            # Single-word titles must match exactly
            matches = movies_df.item_id[movies_df.title == movie]
        else:
            # Otherwise search for the title as a substring
            matches = movies_df.item_id[
                movies_df.title.str.contains(movie, regex=False, na=False)]
        movieId_list.extend(matches)

    # Drop repeated ids (two favorites can match the same movie) but keep order;
    # duplicates would give the new user the same rating twice.
    seen = set()
    unique_ids = []
    for movie_id in movieId_list:
        if movie_id not in seen:
            seen.add(movie_id)
            unique_ids.append(movie_id)
    return unique_ids

In [5]:
def add_new_user_to_data(train_data, movieIds, spark_context):
    """
    add new rows with new user, user's movie and ratings to
    existing train data

    Parameters
    ----------
    train_data: Spark DataFrame, ratings data with user_id, item_id, label columns

    movieIds: list of movieId(s) the new user has rated

    spark_context: Spark Context object

    Return
    ------
    tuple of (train data with the new user's rows appended,
              Spark DataFrame holding only the new user's rows)

    Raises
    ------
    ValueError: if movieIds is empty
    """
    movieIds = list(movieIds)
    if not movieIds:
        # Fail before any Spark job runs: a DataFrame cannot be built from zero rows
        raise ValueError('movieIds is empty: the new user needs at least one rated movie')

    # One aggregation pass for both maxima; a dict aggregation names its output
    # columns "<function>(<column>)"
    maxima = train_data.agg({"user_id": "max", "label": "max"}).collect()[0]
    # The new user gets the next free user id
    new_id = maxima["max(user_id)"] + 1
    # Every favorite movie receives the highest rating present in the data
    max_rating = maxima["max(label)"]

    user_rows_max = [(new_id, movieId, max_rating) for movieId in movieIds]
    new_sdf_max = spark_context.parallelize(user_rows_max).toDF(['user_id', 'item_id', 'label'])
    return train_data.union(new_sdf_max), new_sdf_max

In [6]:
def get_inference_data(train_data, movieIds):
    """
    return a Spark DataFrame pairing the new user with every movie in the
    ratings data except the ones in movieIds

    Parameters
    ----------
    train_data: Spark DataFrame, ratings data that already contains the new
        user's rows (the new user is the one with the highest user_id)

    movieIds: list of movieId(s) the new user has already rated

    Return
    ------
    Spark DataFrame with user_id and item_id columns
    """
    # The new user was appended with max(user_id) + 1, so it is now the maximum
    new_id = train_data.agg({"user_id": "max"}).collect()[0][0]
    # Movies the user has not rated. This is taken from train_data itself rather
    # than the notebook-level `ratings` variable, so the function only depends on
    # its arguments.
    distinct_unrated_items = train_data.select('item_id').distinct().filter(~col('item_id').isin(movieIds))
    # Create the dataset for the new user with their unrated movies
    user_unrated = distinct_unrated_items.withColumn('user_id', lit(new_id)).select('user_id', 'item_id')
    return user_unrated

In [7]:
def get_user_preferences(user_ratings, movieIds, movies_gp, age, gender, sqlContext):
    """
    build the one-row feature profile of the new user

    The profile holds, for every column of movies_gp, the user's average rating
    over the rated movies that have that feature (column suffix '_avg_rating'),
    plus one-hot encoded age group and gender columns and the user_id.

    Parameters
    ----------
    user_ratings: Spark DataFrame with the user's user_id, item_id, label rows

    movieIds: list of movieId(s) the user has rated

    movies_gp: Spark DataFrame of per-movie feature columns keyed by item_id

    age: int, the user's age

    gender: 'M' for male; any other value is encoded as female

    sqlContext: SQLContext used to create the returned DataFrame

    Return
    ------
    Spark DataFrame with one row and alphabetically sorted columns
    """
    # (exclusive upper age bound, indicator column), youngest group first;
    # ages of 56 and above fall into age_group_56
    age_bins = [(18, 'age_group_1'), (25, 'age_group_18'), (35, 'age_group_25'),
                (45, 'age_group_35'), (50, 'age_group_45'), (56, 'age_group_50')]
    demog = {'gender_M': 0, 'gender_F': 0, 'age_group_56': 0}
    for _, column in age_bins:
        demog[column] = 0

    age_column = 'age_group_56'
    for upper_bound, column in age_bins:
        if age < upper_bound:
            age_column = column
            break
    # Set the indicators in the dict before building the frame; chained
    # assignment such as `frame.column[0] = 1` is not reliable in pandas
    demog[age_column] = 1
    demog['gender_M' if gender == 'M' else 'gender_F'] = 1
    user_demog = pd.DataFrame(demog, index = [0])

    # User-item matrix for the new user (rows: user_id, columns: item_id)
    pivoted_user_ratings_df = user_ratings.toPandas() \
                                            .pivot(index='user_id', 
                                                   columns='item_id',
                                                   values='label') \
                                            .fillna(0)
    # 1 where a rating exists, 0 elsewhere. Dividing the matrix by itself would
    # turn the zeros into NaN and wipe out every count in the dot product below.
    pivoted_user_ratings_df_binary = (pivoted_user_ratings_df != 0).astype(float)

    # Reduce the movie set to only the user's rated movies
    movies_gp_filtered_df = movies_gp.filter(movies_gp.item_id.isin(movieIds)) \
                                     .toPandas() \
                                     .set_index('item_id')

    # Per feature: sum of ratings / number of rated movies with that feature.
    # Features with no rated movie give 0/0 = NaN, which is replaced by 0.
    user_summary_total = pivoted_user_ratings_df.dot(movies_gp_filtered_df)
    user_summary_count = pivoted_user_ratings_df_binary.dot(movies_gp_filtered_df)
    user_summary_avg = (user_summary_total / user_summary_count).fillna(0)
    user_summary_avg = user_summary_avg.add_suffix('_avg_rating').reset_index()

    # Create Spark dataframe output for full user profile
    user_summary = pd.concat([user_summary_avg, user_demog], axis = 1)
    sorted_columns = list(user_summary.columns.sort_values())
    user_summary_sdf = sqlContext.createDataFrame(user_summary[sorted_columns])
    return user_summary_sdf

### Step by Step Walkthrough of Main Function (to show runtime)

In [8]:
%%time
fav_movies = ['Tinker Tailor Soldier Spy', 'Shawshank Redemption', 'Lord of the Rings']

# Step 1: resolve the favorite titles to item ids
print('Collecting favorite movie IDs')
movieIds = get_movieId(movies_df, fav_movies)

# Show which of the favorites exist in the movie set
if movies_df is not None:
    print('Favorite movies in the available set')
    print(movies_df[['item_id', 'title', 'year']].loc[movieIds])

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                      title  year
item_id                                          
318          318  Shawshank Redemption, The  1994
2116        2116     Lord of the Rings, The  1978
CPU times: user 13.6 ms, sys: 2.06 ms, total: 15.7 ms
Wall time: 14.9 ms


In [9]:
%%time
# Step 2: append the new user's ratings to the full ratings DataFrame
print('Adding ratings to full set')
all_ratings_updated, user_ratings = add_new_user_to_data(
    ratings, movieIds, sc)

Adding ratings to full set
CPU times: user 43.4 ms, sys: 4.89 ms, total: 48.3 ms
Wall time: 12 s


In [10]:
%%time
# Step 3: pair the new user with every movie they have not rated
# (unnecessary in Spark 2.2+, instead use the recommendForAllUsers(num_to_rec) method)
print('Creating prediction set')
all_user_unrated = get_inference_data(
    all_ratings_updated, movieIds)

Creating prediction set
CPU times: user 14.8 ms, sys: 4.14 ms, total: 18.9 ms
Wall time: 4.19 s


In [11]:
%%time
# Step 4: train the ALS model on the ratings that include the new user
print('Training ALS model')
als = ALS(seed = 42, regParam = 0.1, maxIter = 15, rank = 12,
          userCol = "user_id", itemCol = "item_id", ratingCol = "label")
als_model = als.fit(all_ratings_updated)
# all_ratings_updated is deliberately not deleted here: removing the name makes
# this cell and the previous one fail with a NameError when they are re-run.

Training ALS model
CPU times: user 41 ms, sys: 11.5 ms, total: 52.5 ms
Wall time: 8.09 s


In [12]:
%%time
# Step 5: score every unrated movie with ALS and keep the num_recs best
print('Making Predictions')
num_recs = 15
full_predictions = als_model.transform(all_user_unrated)
als_top_n_predictions = (full_predictions
                         .sort(desc('prediction'))
                         .take(num_recs))
# item ids of the top predictions
als_top_n_ids = [row['item_id'] for row in als_top_n_predictions]

# Look up title and year of the recommended movies
als_movie_recs = (movies
                  .filter(movies.item_id.isin(als_top_n_ids))
                  .select('title', 'year'))
print('ALS Recommendations')
print(als_movie_recs.toPandas())

Making Predictions
ALS Recommendations
                                        title  year
0                            Ulysses (Ulisse)  1954
1          Schlafes Bruder (Brother of Sleep)  1995
2                             Foreign Student  1994
3                             Window to Paris  1994
4                      Across the Sea of Time  1995
5                         Usual Suspects, The  1995
6                                  Braveheart  1995
7                               Smashing Time  1967
8                    Very Thought of You, The  1998
9                       Saltmen of Tibet, The  1997
10                          Bewegte Mann, Der  1994
11  Midaq Alley (Callejn de los milagros, El)  1995
12         Star Wars: Episode IV - A New Hope  1977
13         Life Is Beautiful (La Vita  bella)  1997
14                         American History X  1998
CPU times: user 39.7 ms, sys: 4.22 ms, total: 43.9 ms
Wall time: 44 s


In [13]:
%%time
# Step 6: build the user's feature row in the input format of the GBT model.
# get_user_preferences pivots the user's ratings into a user x item matrix,
# multiplies it by the movies_gp rows of the rated movies and divides by the
# number of rated movies per feature, which gives the user's average rating
# per feature. One-hot encoded age group and gender columns are appended.
user_summary_sdf = get_user_preferences(
    user_ratings = user_ratings,
    movieIds = movieIds,
    movies_gp = movies_gp,
    age = 26,
    gender = 'M',
    sqlContext = sqlContext)

CPU times: user 2.2 s, sys: 1.08 s, total: 3.28 s
Wall time: 1min 6s


In [14]:
%%time
# Step 7: the 3*num_recs best ALS predictions are the candidates for the GBT model
als_top_3xn_predictions = (full_predictions
                           .sort(desc('prediction'))
                           .take(3 * num_recs))
als_top_3xn_ids = [row['item_id'] for row in als_top_3xn_predictions]

# Restrict the unrated set and the movie metadata to those candidates
all_user_unrated_top_3xn = all_user_unrated.filter(
    all_user_unrated.item_id.isin(als_top_3xn_ids))
top_3xn_movies_metadata = movies.filter(
    movies.item_id.isin(als_top_3xn_ids))

CPU times: user 40.2 ms, sys: 9.43 ms, total: 49.6 ms
Wall time: 5.55 s


In [15]:
%%time
# Step 8: attach the movie metadata and the user profile to every candidate row
# (MAKE SURE ALL COLUMNS ARE ORDERED AND NAMED CORRECTLY)
unrated_with_movie_metadata = all_user_unrated_top_3xn.join(
    top_3xn_movies_metadata, on = 'item_id', how = 'left')

# The identifier and text columns are dropped after the join
unrated_with_full_metadata = (unrated_with_movie_metadata
                              .join(user_summary_sdf, on = 'user_id', how = 'left')
                              .drop('user_id', 'title', 'imdb_id'))

CPU times: user 3.25 ms, sys: 937 µs, total: 4.19 ms
Wall time: 8.22 s


In [16]:
%%time
# Step 9: keep the first column as item_id and pack every remaining column
# into one dense "features" vector, the input format of the GBT model
unrated_with_full_metadata_rdd = unrated_with_full_metadata.rdd.map(
    lambda row: (row[0], Vectors.dense(row[1:])))
unrated_metadata_features = sqlContext.createDataFrame(
    unrated_with_full_metadata_rdd,
    schema = ['item_id', 'features'])

CPU times: user 59.1 ms, sys: 14.1 ms, total: 73.3 ms
Wall time: 5min 16s


In [17]:
%%time
# Step 10: load the saved GBT regression model and score the candidates
GBTRegD10Model = GBTRegressionModel.load(
    data_path + 'GBTRegD10Model')
gbtr_preds = GBTRegD10Model.transform(
    unrated_metadata_features)

CPU times: user 2.83 ms, sys: 1.25 ms, total: 4.08 ms
Wall time: 13.3 s


In [18]:
%%time
# Step 11: keep the num_recs candidates with the highest GBT prediction
gbtr_top_n_predictions = (gbtr_preds
                          .sort(desc('prediction'))
                          .take(num_recs))
# item ids of the top predictions
gbtr_top_n_ids = [row['item_id'] for row in gbtr_top_n_predictions]

# Look up title and year of the recommended movies
gbtr_movie_recs = (movies
                   .filter(movies.item_id.isin(gbtr_top_n_ids))
                   .select('title', 'year'))
print('GBTR Recommendations')
print(gbtr_movie_recs.toPandas())

GBTR Recommendations
                                 title  year
0                     Ulysses (Ulisse)  1954
1   Schlafes Bruder (Brother of Sleep)  1995
2                      Foreign Student  1994
3                      Window to Paris  1994
4               Across the Sea of Time  1995
5                             Lamerica  1994
6                        Smashing Time  1967
7                       Big Trees, The  1952
8                      Acid House, The  1998
9             Very Thought of You, The  1998
10                   Julien Donkey-Boy  1999
11               Saltmen of Tibet, The  1997
12                   Bewegte Mann, Der  1994
13                        Mother Night  1996
14               Dream With the Fishes  1997
CPU times: user 56.6 ms, sys: 9.68 ms, total: 66.3 ms
Wall time: 2min 33s


### Full Function Recommendation Examples

In [19]:
%%time
# Example: a 26-year-old male user with nine favorite titles
fav_movies = [
    'Iron Man',
    'Tinker Tailor Soldier Spy',
    'Shawshank Redemption',
    'Lord of the Rings',
    'Harry Potter',
    'The Family Stone',
    'Shaun of the Dead',
    'Up',
    'A View to a Kill',
]
new_user_input(
    fav_movies = fav_movies,
    all_ratings = ratings,
    movies = movies,
    spark_context = sc,
    sqlContext = sqlContext,
    num_recs = 10,
    age = 26,
    gender = 'M',
    movies_gp = movies_gp,
    movies_df = movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                      title  year
item_id                                          
318          318  Shawshank Redemption, The  1994
2116        2116     Lord of the Rings, The  1978
2376        2376          View to a Kill, A  1985
Adding ratings to full set
Creating prediction set
Training ALS model
Making Predictions

ALS Recommendations
                                       title  year
0                           Ulysses (Ulisse)  1954
1         Schlafes Bruder (Brother of Sleep)  1995
2                            Foreign Student  1994
3                            Window to Paris  1994
4                     Across the Sea of Time  1995
5                   Very Thought of You, The  1998
6                      Saltmen of Tibet, The  1997
7                          Bewegte Mann, Der  1994
8  Midaq Alley (Callejn de los milagros, El)  1995
9         Star Wars: Episode IV - A New Hope  1977

GBTR Recomm

In [20]:
%%time
# Example: the three favorites used in the step-by-step walkthrough
fav_movies = [
    'Tinker Tailor Soldier Spy',
    'Shawshank Redemption',
    'Lord of the Rings',
]
new_user_input(
    fav_movies = fav_movies,
    all_ratings = ratings,
    movies = movies,
    spark_context = sc,
    sqlContext = sqlContext,
    num_recs = 10,
    age = 26,
    gender = 'M',
    movies_gp = movies_gp,
    movies_df = movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                      title  year
item_id                                          
318          318  Shawshank Redemption, The  1994
2116        2116     Lord of the Rings, The  1978
Adding ratings to full set
Creating prediction set
Training ALS model
Making Predictions

ALS Recommendations
                                title  year
0                    Ulysses (Ulisse)  1954
1                     Foreign Student  1994
2                     Window to Paris  1994
3              Across the Sea of Time  1995
4                          Braveheart  1995
5                       Smashing Time  1967
6               Saltmen of Tibet, The  1997
7                   Bewegte Mann, Der  1994
8  Life Is Beautiful (La Vita  bella)  1997
9                  American History X  1998

GBTR Recommendations
                                title  year
0                    Ulysses (Ulisse)  1954
1  Schlafes Bruder (Brother o

In [21]:
%%time
# Example: family and animation favorites for a 26-year-old male user
fav_movies = [
    'Frozen',
    'Tangled',
    'Oceans Eleven',
    'Toy Story',
    'The Princess Bride',
    'The Incredibles',
    'Castle in the Sky',
    'Monsters, Inc',
]
new_user_input(
    fav_movies = fav_movies,
    all_ratings = ratings,
    movies = movies,
    spark_context = sc,
    sqlContext = sqlContext,
    num_recs = 10,
    age = 26,
    gender = 'M',
    movies_gp = movies_gp,
    movies_df = movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                title  year
item_id                                    
3114        3114          Toy Story 2  1999
1              1            Toy Story  1995
1197        1197  Princess Bride, The  1987
Adding ratings to full set
Creating prediction set
Training ALS model
Making Predictions

ALS Recommendations
                                             title  year
0                              Wrong Trousers, The  1993
1                          Raiders of the Lost Ark  1981
2                                 Schindler's List  1993
3                                  Foreign Student  1994
4  Wallace & Gromit: The Best of Aardman Animation  1996
5                                          Sanjuro  1962
6                        Leather Jacket Love Story  1997
7                                   Close Shave, A  1995
8               Star Wars: Episode IV - A New Hope  1977
9                        Shawshan

In [22]:
%%time
# Example: the same family and animation favorites for an 8-year-old female user
fav_movies = [
    'Frozen',
    'Tangled',
    'Oceans Eleven',
    'Toy Story',
    'The Princess Bride',
    'The Incredibles',
    'Castle in the Sky',
    'Monsters, Inc',
]
new_user_input(
    fav_movies = fav_movies,
    all_ratings = ratings,
    movies = movies,
    spark_context = sc,
    sqlContext = sqlContext,
    num_recs = 10,
    age = 8,
    gender = 'F',
    movies_gp = movies_gp,
    movies_df = movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                title  year
item_id                                    
3114        3114          Toy Story 2  1999
1              1            Toy Story  1995
1197        1197  Princess Bride, The  1987
Adding ratings to full set
Creating prediction set
Training ALS model
Making Predictions

ALS Recommendations
                                             title  year
0                              Wrong Trousers, The  1993
1                          Raiders of the Lost Ark  1981
2                                 Schindler's List  1993
3                                  Foreign Student  1994
4  Wallace & Gromit: The Best of Aardman Animation  1996
5                                          Sanjuro  1962
6                        Leather Jacket Love Story  1997
7                                   Close Shave, A  1995
8               Star Wars: Episode IV - A New Hope  1977
9                        Shawshan

In [23]:
%%time
# Example: a longer favorites list, including a title pinned to a year,
# for a 40-year-old male user
fav_movies = [
    'The Sound of Music',
    'Blackhawk Down',
    'Pearl Harbor',
    'Toy Story',
    'The Princess Bride',
    'Foreign Student',
    'Star Wars',
    'The Shining',
    'Rear Window',
    'Groundhog Day',
    'Ghostbusters',
    'Robin Hood (1993)',
    'Die Hard',
]
new_user_input(
    fav_movies = fav_movies,
    all_ratings = ratings,
    movies = movies,
    spark_context = sc,
    sqlContext = sqlContext,
    num_recs = 10,
    age = 40,
    gender = 'M',
    movies_gp = movies_gp,
    movies_df = movies_df)

Collecting favorite movie IDs
Favorite movies in the available set
         item_id                                           title  year
item_id                                                               
1035        1035                             Sound of Music, The  1965
3114        3114                                     Toy Story 2  1999
1              1                                       Toy Story  1995
1197        1197                             Princess Bride, The  1987
572          572                                 Foreign Student  1994
1196        1196  Star Wars: Episode V - The Empire Strikes Back  1980
1210        1210      Star Wars: Episode VI - Return of the Jedi  1983
2628        2628       Star Wars: Episode I - The Phantom Menace  1999
260          260              Star Wars: Episode IV - A New Hope  1977
904          904                                     Rear Window  1954
1265        1265                                   Groundhog Day  1993
2716      

In [None]:
# Stop the Spark context `sc` that every cell above relies on; run this cell last
sc.stop()