# Notebook for Collaborative Filtering with both ALS and NCF models for 20M rows

In this notebook, we implement ALS and NCF models for a movie recommendation system trained on 20M movie ratings. The 20M dataset contains 20 million ratings made by 138,000 users on 27,000 movies.

In [1]:
# Initialization
import os
import time
import datetime as dt

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark sql imports
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql.functions import UserDefinedFunction, explode, desc, rank, col, row_number
from pyspark.sql.types import *
from pyspark.sql.window import Window

# spark ml imports
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# spark bigdl, analytics zoo imports
from zoo.models.recommendation import UserItemFeature
from zoo.models.recommendation import NeuralCF
from zoo.common.nncontext import init_nncontext
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *

# data science imports
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from operator import itemgetter

In [None]:
# Root location of the input parquet files and of the saved models.
# Set the MOVIELENS_DATA_PATH environment variable to point the notebook at a
# different location; the default is the original HDFS directory.
data_path = os.environ.get('MOVIELENS_DATA_PATH', 'hdfs:///user/andrew/')
# Later cells build file paths by plain string concatenation (data_path + name),
# so the value must end with a slash.
if not data_path.endswith('/'):
    data_path += '/'

In [3]:
# Create the context object for this session under the application name
# "NCF Example"; `sc` is handed to SQLContext in the data-preparation cell.
sc = init_nncontext("NCF Example")

## Data Preparation

In [4]:
# SQLContext is the entry point used to read the parquet inputs as Spark dataframes
sqlContext = SQLContext(sc)

# Ratings drive both the modeling and the recommendations. After the steps below
# the dataframe has 3 columns - userId, movieId, rating:
#   * the timestamp column is dropped,
#   * userId is cast to int,
#   * rating is multiplied by 2 so that values are whole numbers -> values 1 to 10
ratings = (
    sqlContext.read.parquet(data_path + 'ratings_20m')
    .drop('timestamp')
    .withColumn("userId", col("userId").cast("int"))
    .withColumn("rating", col("rating") * 2)
)

# Movies are only needed in the final step, to put titles on the recommended
# items. After dropping imdbId the dataframe has 3 columns - movieId, title, genres
movies = sqlContext.read.parquet(data_path + 'movies_20m').drop('imdbId')

In [5]:
# Preview the first five rows of the prepared ratings dataframe
ratings.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|130432|   3003|  10.0|
|130432|   3006|  10.0|
|130432|   3010|   8.0|
|130432|   3052|  10.0|
|130432|   3055|  10.0|
+------+-------+------+
only showing top 5 rows



In [6]:
# Preview the first five rows of the prepared movies dataframe
movies.show(5)

+-------+--------------------+-------------+
|movieId|               title|       genres|
+-------+--------------------+-------------+
|  66509| Funny People (2009)| Comedy|Drama|
|  66511|Berlin Calling (2...| Comedy|Drama|
|  66513|Devil Hides in Do...|  Documentary|
|  66517|Against the Dark ...|Action|Horror|
|  66537|Letter for the Ki...|    Adventure|
+-------+--------------------+-------------+
only showing top 5 rows



In [7]:
# Random 80/20 split of the ratings into training and validation sets; the seed
# is fixed so that the same split is requested on every run.
split_weights = [0.8, 0.2]
ratings_train, ratings_val = ratings.randomSplit(split_weights, seed=42)

# Report how many reviews landed on each side of the split
n_train_reviews = ratings_train.count()
n_val_reviews = ratings_val.count()
print('The random split results in %s reviews in the training dataset and %s reviews in the validation dataset.'
      % (n_train_reviews, n_val_reviews))

# Show the first three training rows
ratings_train.take(3)

The random split results in 16003582 reviews in the training dataset and 3996681 reviews in the validation dataset.


[Row(userId=130432, movieId=3003, rating=10.0),
 Row(userId=130432, movieId=3055, rating=10.0),
 Row(userId=130432, movieId=3083, rating=6.0)]

In [8]:
# The training and validation datasets are converted into RDDs of Sample, the
# distributed format used in Analytics Zoo and BigDL to speed up processing time.
def build_sample(user_id, item_id, rating):
    """Wrap a single rating as a UserItemFeature.

    The wrapped Sample is built from the feature array [user_id, item_id] and
    the label array [rating].

    Args:
        user_id: id of the user who gave the rating.
        item_id: id of the rated item (a movieId in this notebook).
        rating: the rating value used as the label.

    Returns:
        UserItemFeature constructed from user_id, item_id and the Sample.
    """
    features = np.array([user_id, item_id])
    label = np.array([rating])
    return UserItemFeature(user_id, item_id, Sample.from_ndarray(features, label))

def _to_pair_features(ratings_frame):
    """Map every row of a ratings dataframe to a UserItemFeature.

    Columns are read by position: 0 = user id, 1 = item id, 2 = rating.
    """
    return ratings_frame.rdd.map(lambda row: build_sample(row[0], row[1], row[2]))


def _to_samples(pair_feature_rdd):
    """Keep only the Sample carried by each UserItemFeature."""
    return pair_feature_rdd.map(lambda feature: feature.sample)


# UserItemFeature RDDs for the full, training and validation ratings
fullPairFeatureRdds = _to_pair_features(ratings)
trainPairFeatureRdds = _to_pair_features(ratings_train)
valPairFeatureRdds = _to_pair_features(ratings_val)

# Matching RDDs of Sample, which are what the NCF model trains and evaluates on
full_rdd = _to_samples(fullPairFeatureRdds)
train_rdd = _to_samples(trainPairFeatureRdds)
val_rdd = _to_samples(valPairFeatureRdds)

In [9]:
# Show the first three records of the training data to see what an RDD of Sample
# looks like: each Sample holds a features tensor built from [userId, movieId]
# and a labels tensor built from [rating] (see build_sample).
train_rdd.take(3)

[Sample: features: [JTensor: storage: [130432.   3003.], shape: [2], float], labels: [JTensor: storage: [10.], shape: [1], float],
 Sample: features: [JTensor: storage: [130432.   3055.], shape: [2], float], labels: [JTensor: storage: [10.], shape: [1], float],
 Sample: features: [JTensor: storage: [130432.   3083.], shape: [2], float], labels: [JTensor: storage: [6.], shape: [1], float]]

## ALS and NCF Model Training and Validation on Training data
Train ALS and NCF models and compare the Mean Absolute Error (MAE) of each on the validation set. With the parameter settings below, the ALS model has a slightly lower validation error and also takes far less time to train. However, comparing the training and validation errors of each model shows that the ALS model is more overfit.

In [10]:
%%time
# MAE between the "rating" label and the model's "prediction" column; this
# evaluator is reused by the following cells.
evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="rating",
    predictionCol="prediction")

# ALS estimator: column mapping first, then the hyper-parameters
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=12,
    maxIter=15,
    regParam=0.1,
    seed=42)

# Fit on the training split only
als_model = als.fit(ratings_train)

CPU times: user 8.03 ms, sys: 2.06 ms, total: 10.1 ms
Wall time: 19.7 s


In [11]:
%%time
# Report the ALS error on the training and validation splits.
# print(... % ...) is used (as in the split cell) so the line is valid on both
# Python 2 and Python 3; the printed text is the same as with a print statement.
print('Training Error (MAE): %s' % evaluator.evaluate(als_model.transform(ratings_train)))
# fillna(0) replaces missing predictions with 0 before scoring, so each such row
# contributes an error equal to its rating. NOTE(review): these are presumably
# users/movies that do not occur in the training split - confirm.
print('Validation Error (MAE): %s' % evaluator.evaluate(als_model.transform(ratings_val).fillna(0)))

Training Error (MAE): 1.12183070622
Validation Error (MAE): 1.21714696856
CPU times: user 7.63 ms, sys: 1.86 ms, total: 9.48 ms
Wall time: 13.6 s


In [12]:
# Save the ALS model fitted on the training split (ratings_train), replacing
# any model already stored at this path.
als_model.write().overwrite().save(path = data_path + 'ALS_Model_test.h5')

# Load it back and re-evaluate: the errors should match those of the in-memory model.
als_model_test = ALSModel.load(path = data_path + 'ALS_Model_test.h5')

# print(... % ...) is valid on both Python 2 and Python 3.
print('Training Error (MAE): %s' % evaluator.evaluate(als_model_test.transform(ratings_train)))
# Missing validation predictions are scored as 0 via fillna(0), as in the previous cell.
print('Validation Error (MAE): %s' % evaluator.evaluate(als_model_test.transform(ratings_val).fillna(0)))

Training Error (MAE): 1.12183070622
Validation Error (MAE): 1.21714696856


In [13]:
%%time
# Mini-batch size used for training and validation here and reused by the later
# evaluation cells. Original note on this value: 16 executors, 16 cores each.
batch_size = 92160

# Largest user and movie ids in the full ratings data, fetched with a single
# aggregation; they are passed to the network as user_count and item_count.
id_maxima = ratings.agg({'userId': 'max', 'movieId': 'max'}).collect()[0]
max_user_id = id_maxima['max(userId)']
max_movie_id = id_maxima['max(movieId)']

# class_num = 10 matches the doubled ratings, which take the values 1 to 10.
ncf = NeuralCF(
    user_count=max_user_id,
    item_count=max_movie_id,
    class_num=10,
    hidden_layers=[20, 10],
    include_mf=False)

# Train on the training split for 10 epochs with Adam
optimizer = Optimizer(
    model=ncf,
    training_rdd=train_rdd,
    criterion=ClassNLLCriterion(),
    end_trigger=MaxEpoch(10),
    batch_size=batch_size,
    optim_method=Adam(learningrate=0.001))

# After every epoch, score the validation split with MAE and the NLL loss
optimizer.set_validation(
    batch_size=batch_size,
    val_rdd=val_rdd,
    trigger=EveryEpoch(),
    val_method=[MAE(), Loss(ClassNLLCriterion())])

optimizer.optimize()

creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
creating: createEveryEpoch
creating: createMAE
creating: createClassNLLCriterion
creating: createLoss
CPU times: user 43.4 ms, sys: 29 ms, total: 72.4 ms
Wall time: 5min 14s


In [14]:
%%time
# Score the trained NCF model on both splits. evaluate() returns one result per
# metric passed in, so index 0 is the MAE result.
train_res = ncf.evaluate(train_rdd, batch_size, [MAE()])
val_res = ncf.evaluate(val_rdd, batch_size, [MAE()])

# print(... % ...) is valid on both Python 2 and Python 3; the one-element tuple
# keeps the formatting safe whatever type the result object is.
print('Training Error (MAE): %s' % (train_res[0],))
print('Validation Error (MAE): %s' % (val_res[0],))

creating: createMAE
creating: createMAE
Training Error (MAE): Evaluated result: 1.23713159561, total_num: 44580, method: MAE
Validation Error (MAE): Evaluated result: 1.27953612804, total_num: 11238, method: MAE
CPU times: user 24.7 ms, sys: 3.41 ms, total: 28.1 ms
Wall time: 13.9 s


In [15]:
# Save the NCF model trained on the training split (train_rdd); the model and
# its weights are written to separate paths, overwriting existing files.
ncf.save_model(path = data_path + 'NCF_Model_test.bigdl', 
               weight_path = data_path + 'NCF_Model_test_weights.bin', 
               over_write = True)
# Load NCF model - compare loaded model results to trained model results
ncf_test = NeuralCF.load_model(path = data_path + 'NCF_Model_test.bigdl', 
                               weight_path = data_path + 'NCF_Model_test_weights.bin')

train_res = ncf_test.evaluate(train_rdd, batch_size, [MAE()])
val_res = ncf_test.evaluate(val_rdd, batch_size, [MAE()])

# print(... % ...) is valid on both Python 2 and Python 3; the one-element tuple
# keeps the formatting safe whatever type the result object is.
print('Training Error (MAE): %s' % (train_res[0],))
print('Validation Error (MAE): %s' % (val_res[0],))

creating: createMAE
creating: createMAE
Training Error (MAE): Evaluated result: 1.23713171482, total_num: 44580, method: MAE
Validation Error (MAE): Evaluated result: 1.27953600883, total_num: 11238, method: MAE


## ALS and NCF Model Training and Validation on the entire dataset

In [21]:
%%time
# Refit ALS on the full ratings data with the same hyper-parameters as before.
# Left disabled: coldStartStrategy = 'drop', which drops userIds/movieIds from the
# validation set or test set so that NaNs are not returned.
als = ALS(seed = 42, regParam = 0.1, maxIter = 15, rank = 12,
          userCol = "userId", itemCol = "movieId", ratingCol = "rating")
evaluator = RegressionEvaluator(metricName="mae", labelCol="rating",
                                predictionCol="prediction")
als_model = als.fit(ratings)
# The model is scored on the same rows it was fitted on, so this is an in-sample
# error. print(... % ...) is valid on both Python 2 and Python 3.
print('Model Error (MAE): %s' % evaluator.evaluate(als_model.transform(ratings)))

Model Error (MAE): 1.13562066895
CPU times: user 13.9 ms, sys: 3.29 ms, total: 17.2 ms
Wall time: 26.1 s


In [22]:
# Save the ALS model fitted on the full ratings data (all 20M reviews),
# replacing any model already stored at this path.
als_model.write().overwrite().save(path = data_path + 'ALS_Model_20m.h5')

In [23]:
%%time
# Largest user and movie ids, fetched with one aggregation instead of two
# separate jobs; they are passed to the network as user_count and item_count.
id_maxima = ratings.agg({'userId': 'max', 'movieId': 'max'}).collect()[0]
max_user_id = id_maxima['max(userId)']
max_movie_id = id_maxima['max(movieId)']
# Same architecture as the model trained on the training split
ncf = NeuralCF(user_count=max_user_id, item_count=max_movie_id, class_num=10, hidden_layers=[20, 10], include_mf = False)

# Train on the full data; no validation set is attached this time.
# batch_size comes from the earlier NCF training cell.
optimizer = Optimizer(
    model=ncf,
    training_rdd=full_rdd,
    criterion=ClassNLLCriterion(),
    end_trigger=MaxEpoch(10),
    batch_size=batch_size, # 16 executors, 16 cores each
    optim_method=Adam(learningrate=0.001))

optimizer.optimize()

# The model is scored on the same data it was trained on, so this is an
# in-sample error. print(... % ...) is valid on both Python 2 and Python 3.
full_res = ncf.evaluate(full_rdd, batch_size, [MAE()])
print('Model Error (MAE): %s' % (full_res[0],))

creating: createZooNeuralCF
creating: createClassNLLCriterion
creating: createMaxEpoch
creating: createAdam
creating: createDistriOptimizer
creating: createMAE
Model Error (MAE): Evaluated result: 1.24277722836, total_num: 55778, method: MAE
CPU times: user 41.8 ms, sys: 30.3 ms, total: 72.1 ms
Wall time: 5min 58s


In [24]:
# Save the NCF model trained on the full data (all 20M reviews); the model and
# its weights are written to separate paths, overwriting existing files.
ncf.save_model(path = data_path + 'NCF_Model_20m.bigdl', 
               weight_path = data_path + 'NCF_Model_20m_weights.bin', 
               over_write = True)

### Predictions Comparison

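Compare the predictions of ALS and NCF for one specific user. The user ID is set through `pred_userId` in the cell that selects the user's unrated movies; the final two cells display the top recommendations from each model.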

In [25]:
%%time
# Build a dense user x movie matrix holding every (userId, movieId) combination.
# Pairs without a rating are filled with 0; real ratings are 1 to 10 after the
# doubling step, so 0 unambiguously marks "not rated".
# WARNING: this collects all ratings to the driver and materializes
# (#users x #movies) cells in pandas, which needs a very large amount of memory.
ratings_df = ratings.toPandas()
ratings_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Melt the matrix back to a long dataframe of 3 columns: userId, movieId, rating.
# value_vars is deliberately omitted so that every non-id column (i.e. every
# movie) is melted. The previous code passed list(...).remove('userId'), which
# is always None because list.remove() mutates in place and returns nothing;
# it only worked because None falls back to this same default.
ratings_matrix['userId'] = ratings_matrix.index
ratings_df_2 = pd.melt(ratings_matrix, id_vars=['userId'],
                       var_name='movieId', value_name='rating')
ratings_df_2.shape

CPU times: user 3min 56s, sys: 50.1 s, total: 4min 46s
Wall time: 5min 3s


In [26]:
%%time
# User for whom recommendations are generated
pred_userId = 25643

# Select this user's (userId, movieId) pairs that do not have a rating yet
# (rating == 0 marks an unrated pair)
is_blank_for_user = (ratings_df_2.userId == pred_userId) & (ratings_df_2.rating == 0)
ratings_blanks_df = ratings_df_2[is_blank_for_user]

# Spark dataframe version of the unrated pairs (input to the ALS model)
ratings_blanks = sqlContext.createDataFrame(ratings_blanks_df)
# The same pairs as an RDD of UserItemFeature (input to the NCF model)
blankPairFeatureRdds = ratings_blanks.rdd.map(lambda row: build_sample(row[0], row[1], row[2]))

CPU times: user 17.9 s, sys: 8.83 s, total: 26.7 s
Wall time: 26.8 s


In [27]:
%%time
# Score the user's unrated pairs with both models.
# NOTE(review): the second argument of recommend_for_user (10) is presumably the
# number of recommendations kept per user - confirm against the Analytics Zoo docs.
als_pair_preds = als_model.transform(ratings_blanks)
ncf_pair_preds = ncf.recommend_for_user(blankPairFeatureRdds, 10).toDF()

# Bring both outputs to the same layout: userId, movieId, <model>_pred
als_preds = als_pair_preds.select(
    'userId',
    'movieId',
    col('prediction').alias('als_pred'))
ncf_preds_topN = ncf_pair_preds.select(
    col('user_id').alias('userId'),
    col('item_id').alias('movieId'),
    col('prediction').alias('ncf_pred'))

# The raw prediction dataframes are not needed any more
del als_pair_preds, ncf_pair_preds

CPU times: user 54.4 ms, sys: 15.1 ms, total: 69.5 ms
Wall time: 5.39 s


In [28]:
%%time
# Rank each user's ALS predictions from highest to lowest and keep the first 10.
window = Window.partitionBy(als_preds['userId']).orderBy(als_preds['als_pred'].desc())
als_preds_topN = (als_preds
                  .select(col('*'), row_number().over(window).alias('row_number'))
                  .where(col('row_number') <= 10))

# Attach movie titles; a left join keeps predictions whose movieId has no match in movies.
als_preds_topN_labeled = als_preds_topN.join(movies, how = 'left', on = 'movieId')
ncf_preds_topN_labeled = ncf_preds_topN.join(movies, how = 'left', on = 'movieId')

# Sort by user and then by descending prediction. Sorting by userId alone left
# the order of one user's rows unspecified after the join, so the displayed
# table was not guaranteed to be ranked by predicted rating.
als_final = (als_preds_topN_labeled
             .select('userId', 'movieId', 'als_pred', 'title')
             .sort(col('userId'), col('als_pred').desc())
             .toPandas())
ncf_final = (ncf_preds_topN_labeled
             .select('userId', 'movieId', 'ncf_pred', 'title')
             .sort(col('userId'), col('ncf_pred').desc())
             .toPandas())

# Drop the intermediate Spark objects; only the two pandas results are kept.
del window, als_preds, als_preds_topN, ncf_preds_topN, als_preds_topN_labeled, ncf_preds_topN_labeled

CPU times: user 52.9 ms, sys: 12.7 ms, total: 65.7 ms
Wall time: 10.9 s


In [29]:
# Top ALS recommendations for the selected user, with movie titles
als_final

Unnamed: 0,userId,movieId,als_pred,title
0,25643,77947,10.808342,Harishchandrachi Factory (2009)
1,25643,94222,10.600865,Don't Eat the Pictures: Sesame Street at the M...
2,25643,87164,10.402793,Henri-Georges Clouzot's Inferno (L'enfer d'Hen...
3,25643,113218,10.314117,Space Milkshake (2012)
4,25643,109786,10.275653,Carmina and Amen (Carmina y amén) (2014)
5,25643,82836,10.231801,"Life of Reilly, The (2006)"
6,25643,110669,10.168848,"Honest Liar, An (2014)"
7,25643,68273,10.152931,Amazing Journey: The Story of The Who (2007)
8,25643,116951,10.125403,Bo Burnham: what. (2013)
9,25643,43267,9.998287,On Probation (Tiempo de Valientes) (2005)


In [30]:
# Top NCF recommendations for the selected user, with movie titles
ncf_final

Unnamed: 0,userId,movieId,ncf_pred,title
0,25643,105250,9,"Century of the Self, The (2002)"
1,25643,101850,9,Death on the Staircase (Soupçons) (2004)
2,25643,86237,9,Connections (1978)
3,25643,77658,9,Cosmos (1980)
4,25643,318,9,"Shawshank Redemption, The (1994)"
5,25643,93040,9,"Civil War, The (1990)"
6,25643,7502,9,Band of Brothers (2001)
7,25643,26587,9,"Decalogue, The (Dekalog) (1989)"
8,25643,2019,9,Seven Samurai (Shichinin no samurai) (1954)
9,25643,114635,9,"Look of Silence, The (2014)"
