In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('amazon_reviews_sample').getOrCreate()

%matplotlib inline
import pyspark.sql.functions as f
#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '32g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','32g')])
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

#print spark configuration settings
spark.sparkContext.getConf().getAll()

[('spark.eventLog.enabled', 'true'),
 ('spark.yarn.jars',
  'local:/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/jars/*,local:/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/hive/*'),
 ('spark.yarn.appMasterEnv.MKL_NUM_THREADS', '1'),
 ('spark.sql.queryExecutionListeners',
  'com.cloudera.spark.lineage.NavigatorQueryListener'),
 ('spark.lineage.log.dir', '/var/log/spark/lineage'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
  'md01.rcc.local,md02.rcc.local'),
 ('spark.driver.port', '40763'),
 ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
 ('spark.executorEnv.PYTHONPATH',
  '/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.10.7-src.zip:/opt/cloudera/parcels/CDH/lib/spark/python/lib/pyspark.zip<CPS>/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/python/lib/py4j-0.10.7-src.zip<CPS>/opt/cloudera/parcels/CDH-6.3.0-1.cdh6.3.0.p0.1279813/lib/spark/python/lib/pyspark.zip'),


In [2]:
# Read the gzipped JSON review file and print the schema Spark inferred for it.
books_reviews_path = "/user/jianminb/data/books_5.json.gz"
books = spark.read.format("json").load(books_reviews_path)
books.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [3]:
# Narrow the reviews to the item id, the rating and the user id.
rating_columns = ["asin", "overall", "reviewerID"]
books = books.select(rating_columns)

In [3]:
# Show the (column name, Spark type) pairs of the reviews DataFrame.
books.dtypes

[('asin', 'string'),
 ('helpful', 'array<bigint>'),
 ('overall', 'double'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('reviewerID', 'string'),
 ('reviewerName', 'string'),
 ('summary', 'string'),
 ('unixReviewTime', 'bigint')]

In [4]:
from pyspark.sql.types import DoubleType
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [5]:
# ALS needs numeric ids, so encode each ASIN string as a numeric "asinIndex" column.
stringIndexer = StringIndexer().setParams(inputCol="asin", outputCol="asinIndex")
model = stringIndexer.fit(books)
books_index = model.transform(books)

In [6]:
# Confirm that the indexed frame gained the numeric asinIndex column.
books_index.dtypes

[('asin', 'string'),
 ('helpful', 'array<bigint>'),
 ('overall', 'double'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('reviewerID', 'string'),
 ('reviewerName', 'string'),
 ('summary', 'string'),
 ('unixReviewTime', 'bigint'),
 ('asinIndex', 'double')]

In [7]:
# Same encoding for users: reviewerID strings become a numeric "reviewerIDIndex" column.
stringIndexer = StringIndexer().setParams(inputCol="reviewerID", outputCol="reviewerIDIndex")
model = stringIndexer.fit(books_index)
books1_index = model.transform(books_index)

In [8]:
# Confirm that both index columns (asinIndex, reviewerIDIndex) are present.
books1_index.dtypes

[('asin', 'string'),
 ('helpful', 'array<bigint>'),
 ('overall', 'double'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('reviewerID', 'string'),
 ('reviewerName', 'string'),
 ('summary', 'string'),
 ('unixReviewTime', 'bigint'),
 ('asinIndex', 'double'),
 ('reviewerIDIndex', 'double')]

In [10]:
# Remove both original string id columns in a single call, leaving the numeric indices.
books2_index = books1_index.drop("reviewerID", "asin")

In [11]:
# Print the schema left after dropping the string id columns.
books2_index.printSchema()

root
 |-- overall: double (nullable = true)
 |-- asinIndex: double (nullable = false)
 |-- reviewerIDIndex: double (nullable = false)



In [9]:
# Hold out roughly 20% of the ratings for testing. The seed is fixed so that a
# Restart & Run All draws the same split and the reported RMSE values stay comparable.
# NOTE(review): the split is only repeatable while the input partitioning is unchanged.
training, test = books1_index.randomSplit([0.8, 0.2], seed=42)

In [61]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [10]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# The seed pins the random initialisation of the factor matrices so repeated fits
# (and every cross-validation candidate derived from this estimator) are reproducible.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="reviewerIDIndex",
    itemCol="asinIndex",
    ratingCol="overall",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)

In [62]:
# First search grid: 2 ranks x 2 regularisation strengths = 4 candidate models.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50])
    .addGrid(als.regParam, [.01, .05])
    .build()
)

In [70]:
# Second search grid: 2 ranks x 2 regularisation strengths, shifted to larger values.
param_grid2 = (
    ParamGridBuilder()
    .addGrid(als.rank, [50,100])
    .addGrid(als.regParam, [.05, 0.1])
    .build()
)

In [63]:
# RMSE evaluator comparing the "overall" rating with the model's "prediction" column,
# followed by the number of parameter combinations in the first grid.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall", predictionCol="prediction")
num_candidates = len(param_grid)
print ("Num models to be tested: ", num_candidates)

Num models to be tested:  4


In [64]:
# Build 3-fold cross validation over the first grid.
# The seed fixes how rows are assigned to folds, so the selected model is reproducible.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [66]:
# Run the cross-validated search over the first grid on the training split.
model_cv = cv.fit(training)

In [67]:
# Keep the model chosen by the first cross-validation run.
best_model = model_cv.bestModel

In [68]:
# Report the class of the winning model and the hyperparameters it was trained with.
print(type(best_model))
print("**Best Model**")

# The tuned values are read from the JVM-side parent estimator of the fitted model;
# fetch that handle once and query it for each parameter.
best_estimator = best_model._java_obj.parent()
print("  Rank:", best_estimator.getRank())
print("  MaxIter:", best_estimator.getMaxIter())
print("  RegParam:", best_estimator.getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 50
  MaxIter: 10
  RegParam: 0.05


In [69]:
# Score the held-out test split with the best model and print its RMSE.
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

1.0357154239881492


In [71]:
# Build 3-fold cross validation over the second grid.
# The seed fixes how rows are assigned to folds, so the selected model is reproducible.
cv2 = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid2,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [72]:
# Run the cross-validated search over the second grid on the training split.
model_cv2 = cv2.fit(training)

In [73]:
# Keep the model chosen by the second cross-validation run.
best_model2 = model_cv2.bestModel

In [74]:
# Report the class of the second search's winner and the hyperparameters it was trained with.
print(type(best_model2))
print("**Best Model**")

# The tuned values are read from the JVM-side parent estimator of the fitted model;
# fetch that handle once and query it for each parameter.
best_estimator2 = best_model2._java_obj.parent()
print("  Rank:", best_estimator2.getRank())
print("  MaxIter:", best_estimator2.getMaxIter())
print("  RegParam:", best_estimator2.getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 100
  MaxIter: 10
  RegParam: 0.1


In [75]:
# Score the held-out test split with the second search's best model and print its RMSE.
test_predictions2 = best_model2.transform(test)
RMSE2 = evaluator.evaluate(test_predictions2)
print(RMSE2)

0.9877977806109016


In [76]:
# Third search grid: 3 ranks x 4 regularisation strengths = 12 candidate models.
param_grid3 = (
    ParamGridBuilder()
    .addGrid(als.rank, [50,100,150])
    .addGrid(als.regParam, [0.01,0.05,0.1,0.15])
    .build()
)

In [77]:
# Build 3-fold cross validation over the third grid.
# The seed fixes how rows are assigned to folds, so the selected model is reproducible.
cv3 = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid3,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [78]:
# Run the cross-validated search over the third grid on the training split.
model_cv3 = cv3.fit(training)

In [79]:
# Keep the model chosen by the third cross-validation run.
best_model3 = model_cv3.bestModel

In [80]:
# Report the class of the third search's winner and the hyperparameters it was trained with.
print(type(best_model3))
print("**Best Model**")

# The tuned values are read from the JVM-side parent estimator of the fitted model;
# fetch that handle once and query it for each parameter.
best_estimator3 = best_model3._java_obj.parent()
print("  Rank:", best_estimator3.getRank())
print("  MaxIter:", best_estimator3.getMaxIter())
print("  RegParam:", best_estimator3.getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 150
  MaxIter: 10
  RegParam: 0.15


In [81]:
# Score the held-out test split with the third search's best model and print its RMSE.
test_predictions3 = best_model3.transform(test)
RMSE3 = evaluator.evaluate(test_predictions3)
print(RMSE3)

0.9824782504983577


In [82]:
# Fourth search grid: 2 ranks x 3 regularisation strengths = 6 candidate models.
param_grid4 = (
    ParamGridBuilder()
    .addGrid(als.rank, [150,200])
    .addGrid(als.regParam, [0.15,0.2,0.25])
    .build()
)

In [83]:
# Build 3-fold cross validation over the fourth grid.
# The seed fixes how rows are assigned to folds, so the selected model is reproducible.
cv4 = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid4,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [84]:
# Run the cross-validated search over the fourth grid on the training split.
# NOTE(review): in the recorded run this call failed — YARN killed an executor for
# exceeding its 1.5 GB physical memory limit — so model_cv4 was never produced there.
model_cv4 = cv4.fit(training)

Py4JJavaError: An error occurred while calling o758.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4787.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4787.0 (TID 43511, hd04.rcc.local, executor 561): ExecutorLostFailure (executor 561 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  1.5 GB of 1.5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary$lzycompute(RegressionMetrics.scala:57)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary(RegressionMetrics.scala:54)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr$lzycompute(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.meanSquaredError(RegressionMetrics.scala:100)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError(RegressionMetrics.scala:109)
	at org.apache.spark.ml.evaluation.RegressionEvaluator.evaluate(RegressionEvaluator.scala:86)
	at sun.reflect.GeneratedMethodAccessor233.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Keep the model chosen by the fourth cross-validation run (requires the cv4 fit to have succeeded).
best_model4 = model_cv4.bestModel

In [None]:
# Report the class of the fourth search's winner and the hyperparameters it was trained with.
print(type(best_model4))
print("**Best Model**")

# The tuned values are read from the JVM-side parent estimator of the fitted model;
# fetch that handle once and query it for each parameter.
best_estimator4 = best_model4._java_obj.parent()
print("  Rank:", best_estimator4.getRank())
print("  MaxIter:", best_estimator4.getMaxIter())
print("  RegParam:", best_estimator4.getRegParam())

In [None]:
# Score the held-out test split with the fourth search's best model and print its RMSE.
test_predictions4 = best_model4.transform(test)
RMSE4 = evaluator.evaluate(test_predictions4)
print(RMSE4)

In [11]:
# Fit ALS once with the estimator's own settings (no grid search) and predict on the test split.
# NOTE(review): `model` here is this plain fit, not one of the cross-validated best
# models — confirm it is the model intended for the recommendations generated below.
model = als.fit(training)
predictions = model.transform(test)

In [12]:
# Evaluate the plain ALS fit: RMSE between the "overall" rating and the prediction.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='overall',
)
rmse = evaluator.evaluate(predictions)
rmse_text = str(rmse)
print("Root-mean-square error = " + rmse_text)

Root-mean-square error = 1.3619927172186903


In [13]:
# Show the documented parameters of the fitted ALS model together with their current values.
model.explainParams()

'coldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: nan,drop. (default: nan, current: drop)\nitemCol: column name for item ids. Ids must be within the integer value range. (default: item, current: asinIndex)\npredictionCol: prediction column name (default: prediction)\nuserCol: column name for user ids. Ids must be within the integer value range. (default: user, current: reviewerIDIndex)'

In [14]:
# Display the first 10 rows of the learned item factors (id plus feature vector), untruncated.
model.itemFactors.show(10, truncate = False)

+---+---------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                             |
+---+---------------------------------------------------------------------------------------------------------------------+
|0  |[0.9989152, 0.6086671, 0.0, 1.1316842, 0.14719044, 0.4466637, 0.63492686, 0.708429, 0.8661433, 0.96513003]           |
|10 |[1.0328232, 0.35163817, 0.39595658, 0.5199836, 0.8044754, 0.9919952, 0.60510373, 0.6314512, 0.37546015, 1.0589483]   |
|20 |[0.9003094, 0.79917353, 0.69117737, 0.47700605, 0.70824647, 0.8046747, 0.8154053, 0.9168871, 0.56048584, 0.6843929]  |
|30 |[0.97663516, 0.62136, 0.4612496, 0.4295348, 0.96013474, 0.83717257, 0.924034, 0.12046359, 0.8095437, 0.95464647]     |
|40 |[0.75584763, 0.56663465, 0.7882819, 0.93283737, 0.7941211, 0.18517363, 0.33851552, 1.0515912, 0.7521167, 0.81481457] |
|50 |[0.

In [12]:
from pyspark.sql import functions as func
from pyspark.sql.functions import explode, col

In [13]:
# Generate the top 10 book recommendations for each user.
# Each row holds a reviewerIDIndex and a "recommendations" array of
# (item index, predicted rating) pairs; show the first 10 rows untruncated.
userRecs = model.recommendForAllUsers(10)
userRecs.show(10, truncate=False)

+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviewerIDIndex|recommendations                                                                                                                                                                                                   |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148            |[[326409, 10.956128], [267975, 10.946776], [311108, 10.835842], [344111, 10.823646], [207236, 10.767737], [153367, 10.761895], [274360, 10.692666], [287024, 10.681555], [257191, 10.679497], [152127, 10.637549]]|
|463            |[[154298, 11.324488], [66406, 10.42019], [255212, 10.123577], [1262

In [18]:
# `udf` and the Spark SQL types are imported here because, in notebook order, this
# cell comes before the cells that import them; without this it fails on a fresh kernel.
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, IntegerType

# Each exploded recommendation is an (item index, predicted rating) pair.
# The item index is an integer, so its UDF must declare IntegerType: declaring
# FloatType for a Python int makes Spark return null instead of the value
# (consistent with the empty asinIndex column in a recorded preview of this notebook).
udf_item_index = udf(lambda x: x[0], IntegerType())
udf_rating = udf(lambda x: x[1], FloatType())

In [15]:
from pyspark.sql.functions import udf

In [17]:
from pyspark.sql.types import StructType, FloatType, IntegerType

In [19]:
# Flatten the per-user recommendation arrays: one row per (user, recommended item) pair.
df_recommendation = userRecs.withColumn("recommendations",explode("recommendations"))

In [20]:
# UDFs that unpack one exploded recommendation: position 0 is the integer item
# index, position 1 the predicted rating.
udf_item_index = udf(lambda rec: rec[0], returnType=IntegerType())
udf_rating = udf(lambda rec: rec[1], returnType=FloatType())

In [27]:
# Preview the exploded recommendations with the item index and predicted rating
# unpacked into their own columns (the original struct column is kept for comparison).
(
    df_recommendation
    .withColumn('asinIndex', udf_item_index('recommendations'))
    .withColumn('overall', udf_rating('recommendations'))
)

reviewerIDIndex,recommendations,asinIndex,overall
148,"[324579, 11.239884]",,11.239884
148,"[294762, 10.652662]",,10.652662
148,"[218873, 10.610807]",,10.610807
148,"[134777, 10.592444]",,10.592444
148,"[337798, 10.329012]",,10.329012
148,"[253057, 10.318841]",,10.318841
148,"[360592, 10.291299]",,10.291299
148,"[316419, 10.264614]",,10.264614
148,"[185895, 10.260252]",,10.260252
148,"[128506, 10.239238]",,10.239238


In [21]:
# Unpack each exploded recommendation struct into flat columns.
# The struct produced by recommendForAllUsers has a field named after the ALS item
# column ("asinIndex") and a "rating" field, so both are read with native Spark SQL
# field access. This avoids a Python UDF round-trip per row and yields the same
# three columns: reviewerIDIndex, asinIndex, overall.
nrecommendations = df_recommendation.selectExpr(
    "reviewerIDIndex",
    "recommendations.asinIndex AS asinIndex",
    "recommendations.rating AS overall",
)
nrecommendations.limit(20).show()

+---------------+---------+----------+
|reviewerIDIndex|asinIndex|   overall|
+---------------+---------+----------+
|            134|   343863| 11.708134|
|            134|   274360| 11.634775|
|            134|   245530| 11.588929|
|            134|   188082| 11.539589|
|            134|   153367| 11.535437|
|            134|   227642| 11.518301|
|            134|   227242| 11.516125|
|            134|   188687| 11.504252|
|            134|   240691| 11.491187|
|            134|   257191| 11.477867|
|            584|   351360| 10.753606|
|            584|   292159| 10.599555|
|            584|   362007| 10.584541|
|            584|   188082| 10.328151|
|            584|   260682|10.3169365|
|            584|   257191| 10.299189|
|            584|   197354| 10.245995|
|            584|   188687| 10.210522|
|            584|   341812| 10.187984|
|            584|   281154| 10.186992|
+---------------+---------+----------+



In [22]:
# Read the gzipped JSON book metadata and print the schema Spark inferred for it.
meta_books_path = "/user/jianminb/data/Meta_Books.json.gz"
Meta_Books = spark.read.format("json").load(meta_books_path)
Meta_Books.printSchema()

root
 |-- also_buy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- also_view: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- asin: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: string (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- details: struct (nullable = true)
 |    |-- 
    Item Weight: 
    : string (nullable = true)
 |    |-- 
    Package Dimensions: 
    : string (nullable = true)
 |    |-- 
    Product Dimensions: 
    : string (nullable = true)
 |    |--  Date first listed on Amazon:: string (nullable = true)
 |    |-- 3.5" and 5.25" disks:: string (nullable = true)
 |    |-- 3.5" disk:: string (nullable = true)
 |    |-- 5.25" disk:: string (nullable = true)
 |    |-- ASIN:: string (nullable = true)
 |    |-- ASIN: : string (nullable = tru

In [23]:
# Keep only the metadata needed to describe a recommended book.
meta_columns = ["asin", "category", "price", "title"]
Meta_Books_df = Meta_Books.select(meta_columns)

In [69]:
# Attach book metadata to every indexed review row by joining on the ASIN.
books_meta_index = books1_index.join(Meta_Books_df, on='asin')

In [24]:
# Re-check the column types of the indexed reviews before joining them with metadata.
books1_index.dtypes

[('asin', 'string'),
 ('helpful', 'array<bigint>'),
 ('overall', 'double'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('reviewerID', 'string'),
 ('reviewerName', 'string'),
 ('summary', 'string'),
 ('unixReviewTime', 'bigint'),
 ('asinIndex', 'double'),
 ('reviewerIDIndex', 'double')]

In [79]:
# Preview the first 10 rows of the metadata columns of interest.
(
    Meta_Books
    .select(["asin", "category", 'price', 'title'])
    .show(10)
)

+----------+--------------------+-------+--------------------+
|      asin|            category|  price|               title|
+----------+--------------------+-------+--------------------+
|0000092878|                  []| $39.94|Biology Gods Livi...|
|000047715X|[Books, New, Used...|       |Mksap 16 Audio Co...|
|0000004545|[Books, Arts & Ph...|$199.99|Flex! Discography...|
|0000013765|[Books, Arts & Ph...|       |Heavenly Highway ...|
|0000000116|                  []|$164.10|Georgina Goodman ...|
|0000555010|[Books, New, Used...|       |Principles of Ana...|
|0000477141|[Books, Medical B...|       |MKSAP 15 Audio Co...|
|0000230022|[Books, New, Used...|       |The Simple Truths...|
|0000038504|[Books, Education...|$198.70|Double-Speak: Fro...|
|0000001589|                  []|       |LJ Classique Inte...|
+----------+--------------------+-------+--------------------+
only showing top 10 rows



In [78]:
# Preview how ASIN strings map to asinIndex values, alongside the reviewer id.
(
    books1_index
    .select(['asin', 'asinIndex', 'reviewerID'])
    .show(10)
)

+----------+---------+--------------------+
|      asin|asinIndex|          reviewerID|
+----------+---------+--------------------+
|000100039X|   4321.0|A10000012B7CGYKOM...|
|000100039X|   4321.0|      A2S166WSCFIFP5|
|000100039X|   4321.0|      A1BM81XB4QHOA3|
|000100039X|   4321.0|      A1MOSTXNIO5MPJ|
|000100039X|   4321.0|      A2XQ5LZHTD4AFT|
|000100039X|   4321.0|      A3V1MKC2BVWY48|
|000100039X|   4321.0|      A12387207U8U24|
|000100039X|   4321.0|      A29TRDMK51GKZR|
|000100039X|   4321.0|      A3FI0744PG1WYG|
|000100039X|   4321.0|      A2LBBQHYLEHM7P|
+----------+---------+--------------------+
only showing top 10 rows



In [25]:
# Slim copy of the indexed reviews: ids, rating and the numeric item index.
recommend_columns = ['asin', 'overall', 'reviewerID', 'asinIndex']
books1_index_recommend = books1_index.select(recommend_columns)

In [86]:
# Preview 10 distinct ASIN values present in the reviews.
books1_index.select('asin').distinct().show(10)

+----------+
|      asin|
+----------+
|0002216973|
|0006476155|
|0006544150|
|0006550479|
|0007163932|
|0023605103|
|0027861317|
|0028633784|
|0060087447|
|0060192097|
+----------+
only showing top 10 rows



In [26]:
# One row per book: the ASIN together with its numeric index (duplicates from
# multiple reviews of the same book are removed).
asin_index_pairs = books1_index.select(['asin', 'asinIndex'])
books1_index_distinct = asin_index_pairs.distinct()

In [27]:
# Per-book lookup table: asinIndex plus metadata, used to describe recommended items.
books_meta_index_recommend = books1_index_distinct.join(Meta_Books_df, on='asin')

In [89]:
# Preview 10 ASINs that survived the join with the metadata.
books_meta_index_recommend.select('asin').show(10)

+----------+
|      asin|
+----------+
|0002216973|
|0006476155|
|0006544150|
|0006550479|
|0007163932|
|0023605103|
|0027861317|
|0028633784|
|0060192097|
|0060392436|
+----------+
only showing top 10 rows



In [94]:
# Preview the first 10 rating values.
books1_index.select('overall').show(10)

+-------+
|overall|
+-------+
|    5.0|
|    5.0|
|    5.0|
|    5.0|
|    5.0|
|    5.0|
|    5.0|
|    5.0|
|    5.0|
|    5.0|
+-------+
only showing top 10 rows



In [95]:
# Number of review rows in the indexed dataset.
books1_index.select('asin').count()

8898041

In [96]:
# Number of rows in the book metadata.
Meta_Books.select('asin').count()

2934949

# User A10000012B7CGYKOMPQ4L ALS Recommendations

In [74]:
# Recommendations for the user with reviewerIDIndex 3, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '3'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,overall.1,reviewerID,category,price,title
0,324579,3,11.828129,979544114,5.0,A3NBC5OFMFP2MS,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
1,324579,3,11.828129,979544114,5.0,A22Y8RL2DOA57T,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
2,324579,3,11.828129,979544114,5.0,A105Q3YYDW6HQ1,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
3,324579,3,11.828129,979544114,5.0,AG8DPYVTLT2C2,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
4,324579,3,11.828129,979544114,5.0,A3Z9QN0RCOHEY,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
5,242874,3,11.528156,9706061371,5.0,A1ZB0MH8DLW843,"[Books, Self-Help]",.a-section.a-spacing-mini{margin-bottom:6px!im...,C&oacute;mo vencer 5 errores que nos arruinan ...
6,242874,3,11.528156,9706061371,5.0,A2D2TAWMGQJYCJ,"[Books, Self-Help]",.a-section.a-spacing-mini{margin-bottom:6px!im...,C&oacute;mo vencer 5 errores que nos arruinan ...
7,242874,3,11.528156,9706061371,5.0,A2SZGIUKIIVDMS,"[Books, Self-Help]",.a-section.a-spacing-mini{margin-bottom:6px!im...,C&oacute;mo vencer 5 errores que nos arruinan ...
8,242874,3,11.528156,9706061371,5.0,A2CSSHRXZC4EUH,"[Books, Self-Help]",.a-section.a-spacing-mini{margin-bottom:6px!im...,C&oacute;mo vencer 5 errores que nos arruinan ...
9,242874,3,11.528156,9706061371,5.0,A7NZ2ELAMGYAC,"[Books, Self-Help]",.a-section.a-spacing-mini{margin-bottom:6px!im...,C&oacute;mo vencer 5 errores que nos arruinan ...


In [91]:
# Recommendations for the user with reviewerIDIndex 101, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '101'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,339877,101,10.608191,9706061126,[],$11.70,"A veces, Dios dice ""NO!"" (Sometimes, God says ..."
1,324579,101,12.042072,979544114,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
2,242874,101,10.771022,9706061371,"[Books, Self-Help]",.a-section.a-spacing-mini{margin-bottom:6px!im...,C&oacute;mo vencer 5 errores que nos arruinan ...
3,242290,101,10.683022,9706061185,"[Books, Self-Help, Happiness]",.a-section.a-spacing-mini{margin-bottom:6px!im...,Amor A S&iacute; Mismo Despu&eacute;s de los 4...
4,308060,101,10.722486,9686636617,[],.a-section.a-spacing-mini{margin-bottom:6px!im...,Analice su Personalidad ( Analyze your Persona...
5,360592,101,10.757002,9706060863,[],$15.80,Un Clavel en el Fango (Spanish Edition)
6,335568,101,10.66779,9706061614,[],,Dietas Violentas para Valientes (Harsh Diets f...
7,253057,101,10.969269,9706061061,[],.a-section.a-spacing-mini{margin-bottom:6px!im...,Un Mundo Feliz: EL PROZAC
8,294762,101,11.023747,9686636412,[],$275.00,Lo Mejor de la Superacin Personal ( The Best o...


In [37]:
# Recommendations for the user with reviewerIDIndex 134, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '134'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,227642,134,11.518301,9687968257,"[Books, Politics & Social Sciences]",$9.66,El Derecho Prohibido...(A Baby:Forbiden Right)...
1,188687,134,11.504252,9687968281,[],$45.98,Antologa de Autoestima y Amor (The Best of Sel...
2,343863,134,11.708134,9686801693,"[Books, Health, Fitness &amp; Dieting]",,El estres es Vida (Spanish Edition)
3,257191,134,11.477867,9686801707,"[Books, Health, Fitness &amp; Dieting, Women's...",$70.33,Libro de Oro del Embarazo (Spanish Edition)
4,240691,134,11.491187,9706061789,"[Books, Health, Fitness &amp; Dieting, Exercis...",,Ejercicios Isomtricos (Isometric Exercises) (S...
5,245530,134,11.588929,9706061576,"[Books, Cookbooks, Food & Wine, Cooking Educat...",.a-section.a-spacing-mini{margin-bottom:6px!im...,Cocina Mexicana de los siglos XVI al XIX ( Mex...
6,227242,134,11.516125,1857910478,"[Books, Reference, Dictionaries & Thesauruses]",$16.60,Focloir Poca: English-Irish Irish-English Dict...
7,274360,134,11.634775,9706061681,[],$10.00,"Dios Mio ! &iexcl;Hazme Delgada!( Oh,Lord, Mak..."
8,153367,134,11.535437,9706061908,"[Books, Literature &amp; Fiction, History &amp...",$29.95,La Leona de Mxico (Mexico`s Lioness) (Spanish ...
9,188082,134,11.539589,9686801278,"[Books, Cookbooks, Food &amp; Wine, Regional &...",,Cocina para celebrar (Spanish Edition)


In [38]:
# Recommendations for the user with reviewerIDIndex 584, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '584'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,188687,584,10.210522,9687968281,[],$45.98,Antologa de Autoestima y Amor (The Best of Sel...
1,260682,584,10.316936,9706061193,"[Books, Health, Fitness & Dieting, Women's Hea...",.a-section.a-spacing-mini{margin-bottom:6px!im...,&iexcl;Lo que no Fue!:&iquest;Era tu Criatura?...
2,257191,584,10.299189,9686801707,"[Books, Health, Fitness &amp; Dieting, Women's...",$70.33,Libro de Oro del Embarazo (Spanish Edition)
3,362007,584,10.584541,9687968222,"[Books, Health, Fitness & Dieting, Psychology ...",,Tests de Amor y Sexualidad (Spanish Edition)
4,281154,584,10.186992,9706060480,"[Books, Self-Help, Self-Esteem]",,Amor A S Mismo : La Clave Dinmica de la Felicidad
5,197354,584,10.245995,9686801758,"[Books, Religion &amp; Spirituality, New Age &...",$125.43,Ultimas Profecas Y Revelaciones Para El Tercer...
6,188082,584,10.328151,9686801278,"[Books, Cookbooks, Food &amp; Wine, Regional &...",,Cocina para celebrar (Spanish Edition)


In [43]:
# Recommendations for the user with reviewerIDIndex 300, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '300'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,366581,300,9.464398,9706061312,"[Books, Reference, Foreign Language Study &amp...",$14.46,La Cruda (Spanish Edition)
1,227642,300,9.452726,9687968257,"[Books, Politics & Social Sciences]",$9.66,El Derecho Prohibido...(A Baby:Forbiden Right)...
2,343863,300,9.757562,9686801693,"[Books, Health, Fitness &amp; Dieting]",,El estres es Vida (Spanish Edition)
3,311108,300,9.628062,968663620X,"[Books, Reference, Foreign Language Study &amp...",,Hable Ingles Facilmente (Learn English Easily)...
4,185439,300,9.456269,9706060375,"[Books, Health, Fitness & Dieting, Diseases & ...",,Sindrome de Fatiga Cronica (Spanish Edition)
5,274360,300,9.643435,9706061681,[],$10.00,"Dios Mio ! &iexcl;Hazme Delgada!( Oh,Lord, Mak..."
6,347860,300,9.677858,9686801626,[],.a-section.a-spacing-mini{margin-bottom:6px!im...,Genio e Ingenio (Spanish Edition)
7,188082,300,9.670881,9686801278,"[Books, Cookbooks, Food &amp; Wine, Regional &...",,Cocina para celebrar (Spanish Edition)
8,309686,300,9.638183,9686801766,[],$20.99,Gimnasia emocional (Spanish Edition)
9,267975,300,9.535411,968680188X,"[Books, Self-Help, Relationships]",,El Libro del BUEN AMOR


In [46]:
# Recommendations for the user with reviewerIDIndex 149, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '149'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,227642,149,11.363262,9687968257,"[Books, Politics & Social Sciences]",$9.66,El Derecho Prohibido...(A Baby:Forbiden Right)...
1,188687,149,10.936651,9687968281,[],$45.98,Antologa de Autoestima y Amor (The Best of Sel...
2,326409,149,11.072326,9687968516,"[Books, Parenting &amp; Relationships, Parenting]",$12.95,Juegos de Talento para tu Hijo (Talent games f...
3,240691,149,10.925278,9706061789,"[Books, Health, Fitness &amp; Dieting, Exercis...",,Ejercicios Isomtricos (Isometric Exercises) (S...
4,245530,149,10.902111,9706061576,"[Books, Cookbooks, Food & Wine, Cooking Educat...",.a-section.a-spacing-mini{margin-bottom:6px!im...,Cocina Mexicana de los siglos XVI al XIX ( Mex...
5,204980,149,11.341427,9687968141,[],,El Gozo del Perdn
6,209640,149,11.3648,9706061711,"[Books, Cookbooks, Food & Wine, Special Diet]",,Noventa Recetas para Prevenir el C&aacute;ncer...
7,229039,149,10.931799,9686636307,"[Books, New, Used &amp; Rental Textbooks, Huma...",,Ingls en un mes (sin maestro) (Spanish Edition)
8,274360,149,11.01218,9706061681,[],$10.00,"Dios Mio ! &iexcl;Hazme Delgada!( Oh,Lord, Mak..."
9,153367,149,11.044093,9706061908,"[Books, Literature &amp; Fiction, History &amp...",$29.95,La Leona de Mxico (Mexico`s Lioness) (Spanish ...


In [50]:
# Recommendations for the user with reviewerIDIndex 142, enriched with book metadata
# and collected to pandas for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '142'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,188687,142,11.490227,9687968281,[],$45.98,Antologa de Autoestima y Amor (The Best of Sel...
1,343863,142,11.692266,9686801693,"[Books, Health, Fitness &amp; Dieting]",,El estres es Vida (Spanish Edition)
2,257191,142,11.486545,9686801707,"[Books, Health, Fitness &amp; Dieting, Women's...",$70.33,Libro de Oro del Embarazo (Spanish Edition)
3,240691,142,11.45563,9706061789,"[Books, Health, Fitness &amp; Dieting, Exercis...",,Ejercicios Isomtricos (Isometric Exercises) (S...
4,311108,142,11.469636,968663620X,"[Books, Reference, Foreign Language Study &amp...",,Hable Ingles Facilmente (Learn English Easily)...
5,245530,142,11.451735,9706061576,"[Books, Cookbooks, Food & Wine, Cooking Educat...",.a-section.a-spacing-mini{margin-bottom:6px!im...,Cocina Mexicana de los siglos XVI al XIX ( Mex...
6,362007,142,11.750139,9687968222,"[Books, Health, Fitness & Dieting, Psychology ...",,Tests de Amor y Sexualidad (Spanish Edition)
7,274360,142,11.46384,9706061681,[],$10.00,"Dios Mio ! &iexcl;Hazme Delgada!( Oh,Lord, Mak..."
8,153367,142,11.544732,9706061908,"[Books, Literature &amp; Fiction, History &amp...",$29.95,La Leona de Mxico (Mexico`s Lioness) (Spanish ...
9,188082,142,11.756474,9686801278,"[Books, Cookbooks, Food &amp; Wine, Regional &...",,Cocina para celebrar (Spanish Edition)


In [53]:
# Recommendation rows for reviewerIDIndex 99, joined on asinIndex and
# collected to a pandas DataFrame for display.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '99'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,331096,99,11.996804,B00H006TZS,"[Books, Parenting & Relationships, Family Rela...",,Emotional Abusive In Marriage
1,260682,99,11.788243,9706061193,"[Books, Health, Fitness & Dieting, Women's Hea...",.a-section.a-spacing-mini{margin-bottom:6px!im...,&iexcl;Lo que no Fue!:&iquest;Era tu Criatura?...
2,264383,99,11.80555,968536804X,[],$23.96,Amor a si mismo y A Los Dem&agrave;s
3,343863,99,11.994937,9686801693,"[Books, Health, Fitness &amp; Dieting]",,El estres es Vida (Spanish Edition)
4,257191,99,11.923459,9686801707,"[Books, Health, Fitness &amp; Dieting, Women's...",$70.33,Libro de Oro del Embarazo (Spanish Edition)
5,311108,99,11.78764,968663620X,"[Books, Reference, Foreign Language Study &amp...",,Hable Ingles Facilmente (Learn English Easily)...
6,185439,99,11.805047,9706060375,"[Books, Health, Fitness & Dieting, Diseases & ...",,Sindrome de Fatiga Cronica (Spanish Edition)
7,347860,99,12.058932,9686801626,[],.a-section.a-spacing-mini{margin-bottom:6px!im...,Genio e Ingenio (Spanish Edition)
8,188082,99,12.056048,9686801278,"[Books, Cookbooks, Food &amp; Wine, Regional &...",,Cocina para celebrar (Spanish Edition)
9,267975,99,11.805303,968680188X,"[Books, Self-Help, Relationships]",,El Libro del BUEN AMOR


In [80]:
# Recommendation rows for reviewerIDIndex 148, joined on asinIndex.
# NOTE(review): the saved output repeats each book once per reviewerID, so
# books_meta_index_recommend presumably held one row per review when this
# cell was run — confirm and de-duplicate on asinIndex if unintended.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '148'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,overall.1,reviewerID,category,price,title
0,324579,148,11.239884,979544114,5.0,A3NBC5OFMFP2MS,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
1,324579,148,11.239884,979544114,5.0,A22Y8RL2DOA57T,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
2,324579,148,11.239884,979544114,5.0,A105Q3YYDW6HQ1,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
3,324579,148,11.239884,979544114,5.0,AG8DPYVTLT2C2,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
4,324579,148,11.239884,979544114,5.0,A3Z9QN0RCOHEY,"[Books, Children's Books, Animals]",$30.81,Christmas at the Zoo: A Pop-Up Winter Wonderland
5,316419,148,10.264614,688112161,5.0,AFEZZWNH0IWKV,"[Books, Cookbooks, Food &amp; Wine, Regional &...",$13.68,Good Housekeeping Illustrated American Cookbook
6,316419,148,10.264614,688112161,5.0,A1BDH156XG7Y86,"[Books, Cookbooks, Food &amp; Wine, Regional &...",$13.68,Good Housekeeping Illustrated American Cookbook
7,316419,148,10.264614,688112161,5.0,A1NIE1ASE0MEWX,"[Books, Cookbooks, Food &amp; Wine, Regional &...",$13.68,Good Housekeeping Illustrated American Cookbook
8,316419,148,10.264614,688112161,5.0,A2WO0NZL5ZXU7E,"[Books, Cookbooks, Food &amp; Wine, Regional &...",$13.68,Good Housekeeping Illustrated American Cookbook
9,316419,148,10.264614,688112161,5.0,A2HPQ2H97EKBCH,"[Books, Cookbooks, Food &amp; Wine, Regional &...",$13.68,Good Housekeeping Illustrated American Cookbook


In [90]:
# Recommendation rows for reviewerIDIndex 463, joined on asinIndex and
# collected to pandas.
(
    nrecommendations
    .join(books_meta_index_recommend, on='asinIndex')
    .filter("reviewerIDIndex = '463'")
    .toPandas()
)

Unnamed: 0,asinIndex,reviewerIDIndex,overall,asin,category,price,title
0,93338,463,9.947626,1495325814,"[Books, Arts &amp; Photography, Performing Arts]",$6.99,Sherlock Lives!: 100+ Facts on Sherlock and th...
1,330384,463,9.992572,8181462920,"[Books, Humor & Entertainment]",,All About Nothing ( Hindi Edition)


In [41]:
# Print ten reviewerID values in full (no column truncation).
(
    books_index
    .select('reviewerID')
    .show(10, truncate=False)
)

+---------------------+
|reviewerID           |
+---------------------+
|A10000012B7CGYKOMPQ4L|
|A2S166WSCFIFP5       |
|A1BM81XB4QHOA3       |
|A1MOSTXNIO5MPJ       |
|A2XQ5LZHTD4AFT       |
|A3V1MKC2BVWY48       |
|A12387207U8U24       |
|A29TRDMK51GKZR       |
|A3FI0744PG1WYG       |
|A2LBBQHYLEHM7P       |
+---------------------+
only showing top 10 rows



# Actual preferences of user A10000012B7CGYKOMPQ4L

In [54]:
# Reviews written by A10000012B7CGYKOMPQ4L, enriched with Meta_Books_df on
# asin; the ten rows with the highest `overall` rating are printed.
(
    books_index
    .join(Meta_Books_df, on='asin')
    .filter("reviewerID == 'A10000012B7CGYKOMPQ4L'")
    .sort('overall', ascending=False)
    .limit(10)
    .show()
)

+----------+-------+-------+--------------------+-----------+--------------------+------------+----------+--------------+---------+--------------------+------+--------------------+--------------------+
|      asin|helpful|overall|          reviewText| reviewTime|          reviewerID|reviewerName|   summary|unixReviewTime|asinIndex|            category| price|               title|         description|
+----------+-------+-------+--------------------+-----------+--------------------+------------+----------+--------------+---------+--------------------+------+--------------------+--------------------+
|000100039X| [0, 0]|    5.0|Spiritually and m...|12 16, 2012|A10000012B7CGYKOM...|        Adam|Wonderful!|    1355616000|   4321.0|[Books, Literatur...|      |         The Prophet|[Kahlil Gibran wa...|
|0446691437| [0, 0]|    5.0|Excellent book! I...|12 16, 2012|A10000012B7CGYKOM...|        Adam| So Great.|    1355616000|   1007.0|[Books, Health, F...|$26.25|The War of Art: B...|[Steven Pres

In [29]:
# Narrow projections used by the "actual preference" look-ups:
# the review side keeps the reviewer identifiers and the rating ...
books_index_interpret = books1_index.select(
    'asin', 'reviewerID', 'overall', 'reviewerIDIndex'
)
# ... and the metadata side keeps the descriptive book columns.
Meta_Books_df_interpret = Meta_Books.select(
    "asin", "category", 'price', 'title', 'description'
)

In [75]:
# Ten highest-rated reviews of reviewerIDIndex 3 with their book metadata.
(
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '3'")
    .sort('overall', ascending=False)
    .limit(10)
    .toPandas()
)

Unnamed: 0,asin,reviewerID,overall,reviewerIDIndex,category,price,title,description
0,0061253758,A320TMDV6KCFU,5.0,3.0,"[Books, Literature & Fiction, United States]",$4.74,No One Left To Tell (No One Series),[&#34;Jordan Dane shows true brilliance with h...
1,0061456756,A320TMDV6KCFU,5.0,3.0,"[Books, Literature & Fiction, United States]",$7.99,Always a Scoundrel: The Notorious Gentlemen,"[, A native and current resident of Southern C..."
2,014314295X,A320TMDV6KCFU,5.0,3.0,"[Books, Literature & Fiction, Genre Fiction]",$11.79,The Seduction of the Crimson Rose,"[<DIV>, Lauren Willig is a law student and Ph...."
3,0446502049,A320TMDV6KCFU,5.0,3.0,"[Books, Literature &amp; Fiction, United States]",$7.99,Addicted to Love (Wedding Veil Wishes),"[""Entertaining and humorous...There's a seriou..."
4,0451226518,A320TMDV6KCFU,5.0,3.0,"[Books, Romance, Historical]",,The Secret Wedding,[Caro Hill mustdo one thing before she accepts...
5,0505527324,A320TMDV6KCFU,5.0,3.0,"[Books, Literature &amp; Fiction, United States]",$11.97,"Divorced, Desperate and Dating","[, ]"
6,0975453386,A320TMDV6KCFU,5.0,3.0,"[Books, Romance, Historical]",,Redemption,"[""Morgan Leshay makes a promising debut-a beau..."
7,141996920X,A320TMDV6KCFU,5.0,3.0,"[Books, Literature &amp; Fiction, United States]",,Shadow (New Spiecies),"[, ]"
8,159998105X,A320TMDV6KCFU,5.0,3.0,"[Books, Literature &amp; Fiction, Erotica]",$16.11,Loving Lies,"[, , , ]"
9,160202054X,A320TMDV6KCFU,5.0,3.0,"[Books, Literature &amp; Fiction, Erotica]",,A Few Good Men,"[<b>The Red Hot &amp; Blue Series</b>, , , ]"


In [30]:
# Up to 100 reviews of reviewerIDIndex 101, highest `overall` first,
# joined with the book metadata and collected to pandas.
df101 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '101'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)

In [31]:
# Display the rows collected for reviewerIDIndex 101.
df101

Unnamed: 0,asin,reviewerID,overall,reviewerIDIndex,category,price,title,description
0,0006530699,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Business & Money, Processes & Infrastr...",$12.25,Raving Fans : Revolutionary Approach to Custom...,[With a new foreword by Ken Blanchard A straig...
1,0440221048,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Mystery, Thriller & Suspense, Thriller...",$30.97,A Certain Justice (Abe Glitsky),[&quot;Catapults Lescroart into the top ranks ...
2,0967721318,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Health, Fitness &amp; Dieting, Alterna...",$19.95,Lyme Disease and Modern Chinese Medicine,[This is a comprehensive book on the cause and...
3,0061905267,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Parenting &amp; Relationships, Family ...",$13.09,Heroes for My Daughter,"[, An inspiring collection of heroes from whom..."
4,0451191137,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Literature &amp; Fiction, Genre Fiction]",$6.52,Anthem,"[Born February 2, 1905, <b>Ayn Rand</b> publis..."
5,0143145088,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Business &amp; Money, Business Culture]",$18.00,Drive: The Surprising Truth About What Motivat...,"[""Pink makes a convincing case that organizati..."
6,0471477532,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Reference, Writing, Research &amp; Pub...",,Edward R. Murrow and the Birth of Broadcast Jo...,"[Edwards, who has hosted NPR's <i>Morning Edit..."
7,1577310721,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Health, Fitness &amp; Dieting, Psychol...",$17.00,Small Graces: The Quiet Gifts of Everyday Life,"[, ]"
8,1612120512,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Cookbooks, Food & Wine, Beverages & Wine]",$14.95,The Fresh Honey Cookbook: 84 Recipes from a Be...,[<DIV><P>Honey through the seasons</P><P></P><...
9,0802717780,A2HM0BZWQRV1EF,5.0,101.0,"[Books, Sports &amp; Outdoors, Biographies]",$18.50,Bill Veeck: Baseball's Greatest Maverick,"[, Veeck was a one of a kind whose impact reac..."


In [36]:
# Frequency of each category list (compared by its string form) among the
# reviews collected for reviewerIDIndex 101.
(
    df101['category']
    .astype('str')
    .value_counts()
)

['Books', 'Mystery, Thriller &amp; Suspense', 'Thrillers &amp; Suspense']      6
['Books', 'Business &amp; Money', 'Management &amp; Leadership']               5
['Books', 'Business &amp; Money', 'Business Culture']                          5
['Books', 'Self-Help', 'Relationships']                                        4
['Books', 'Politics &amp; Social Sciences', 'Politics &amp; Government']       3
['Books', 'Health, Fitness &amp; Dieting', 'Psychology &amp; Counseling']      3
['Books', 'Biographies &amp; Memoirs', 'Arts &amp; Literature']                3
['Books', 'Business & Money', 'Skills']                                        2
['Books', 'Self-Help', 'Happiness']                                            2
['Books', 'Parenting &amp; Relationships', 'Family Relationships']             2
['Books', 'Politics & Social Sciences', 'Social Sciences']                     2
['Books', 'Humor & Entertainment', 'Humor']                                    2
['Books', 'Health, Fitness &

In [39]:
# Up to 100 reviews of reviewerIDIndex 134, highest `overall` first,
# joined with the book metadata and collected to pandas.
df134 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '134'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)

In [40]:
# Category frequencies for reviewerIDIndex 134 (category lists compared as strings).
(
    df134['category']
    .astype('str')
    .value_counts()
)

['Books', 'Literature &amp; Fiction', 'Genre Fiction']                             14
['Books', 'Mystery, Thriller &amp; Suspense', 'Thrillers &amp; Suspense']          11
['Books', 'Mystery, Thriller &amp; Suspense', 'Mystery']                            7
['Books', 'Science &amp; Math', 'Biological Sciences']                              7
['Books', 'Cookbooks, Food &amp; Wine', 'Regional &amp; International']             7
['Books', 'Literature & Fiction', 'Genre Fiction']                                  6
['Books', 'Mystery, Thriller & Suspense', 'Mystery']                                4
['Books', 'Literature &amp; Fiction', 'United States']                              3
['Books', 'Cookbooks, Food &amp; Wine', 'Cooking Education &amp; Reference']        3
['Books', 'Literature &amp; Fiction', 'Contemporary']                               3
['Books', 'Mystery, Thriller & Suspense', 'Thrillers & Suspense']                   3
['Books', 'Humor &amp; Entertainment', 'Humor']       

In [41]:
# Up to 100 reviews of reviewerIDIndex 584, highest `overall` first,
# joined with the book metadata and collected to pandas.
df584 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '584'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)

In [42]:
# Category frequencies for reviewerIDIndex 584 (category lists compared as strings).
(
    df584['category']
    .astype('str')
    .value_counts()
)

['Books', 'Mystery, Thriller &amp; Suspense', 'Thrillers &amp; Suspense']         14
['Books', 'Literature &amp; Fiction', 'Genre Fiction']                            10
['Books', 'Literature & Fiction', 'Genre Fiction']                                 9
['Books', 'Mystery, Thriller &amp; Suspense', 'Mystery']                           7
['Books', 'Mystery, Thriller & Suspense', 'Mystery']                               4
['Books', 'Mystery, Thriller & Suspense', 'Thrillers & Suspense']                  4
['Books', 'Politics &amp; Social Sciences', 'Politics &amp; Government']           4
['Books', 'Science Fiction &amp; Fantasy', 'Fantasy']                              3
['Books', 'Literature &amp; Fiction', 'Literary']                                  2
['Books', 'New, Used &amp; Rental Textbooks', 'Humanities']                        2
['Books', 'Literature & Fiction', 'Action & Adventure']                            2
['Books', 'Teen &amp; Young Adult', 'Literature &amp; Fiction']  

In [44]:
# Up to 100 reviews of reviewerIDIndex 300, highest `overall` first,
# joined with the book metadata and collected to pandas.
df300 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '300'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)

In [45]:
# Category frequencies for reviewerIDIndex 300 (category lists compared as strings).
(
    df300['category']
    .astype('str')
    .value_counts()
)

['Books', 'Mystery, Thriller &amp; Suspense', 'Thrillers &amp; Suspense']    20
['Books', 'Literature &amp; Fiction', 'Genre Fiction']                       13
['Books', 'Mystery, Thriller &amp; Suspense', 'Mystery']                      8
['Books', 'Biographies &amp; Memoirs', 'Arts &amp; Literature']               5
['Books', 'Mystery, Thriller & Suspense', 'Thrillers & Suspense']             4
['Books', 'Literature & Fiction', 'Genre Fiction']                            3
['Books', 'Literature &amp; Fiction', 'Action &amp; Adventure']               3
['Books', 'Literature &amp; Fiction', 'United States']                        3
['Books', 'Humor &amp; Entertainment', 'Humor']                               3
['Books', 'Biographies &amp; Memoirs', 'Specific Groups']                     2
['Books', 'Literature & Fiction', 'Contemporary']                             2
['Books', 'Reference', 'Writing, Research &amp; Publishing Guides']           2
['Books', 'Biographies &amp; Memoirs', '

In [47]:
# Up to 100 reviews of reviewerIDIndex 149, highest `overall` first,
# joined with the book metadata and collected to pandas.
df149 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '149'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)

In [48]:
# Category frequencies for reviewerIDIndex 149 (category lists compared as strings).
(
    df149['category']
    .astype('str')
    .value_counts()
)

['Books', 'Politics &amp; Social Sciences', 'Politics &amp; Government']           9
['Books', "Children's Books", 'Growing Up &amp; Facts of Life']                    5
['Books', 'Business &amp; Money', 'Business Culture']                              5
['Books', 'Business &amp; Money', 'Economics']                                     3
['Books', 'Biographies &amp; Memoirs', 'Historical']                               3
['Books', 'Education &amp; Teaching', 'Schools &amp; Teaching']                    3
['Books', 'Business &amp; Money', 'Investing']                                     3
['Books', 'Business &amp; Money', 'Management &amp; Leadership']                   2
['Books', 'Business &amp; Money', 'Finance']                                       2
['Books', 'Self-Help', 'Relationships']                                            2
['Books', 'Business & Money', 'Management & Leadership']                           2
['Books', 'History', 'Military']                                 

In [51]:
# Up to 100 reviews of reviewerIDIndex 142, highest `overall` first,
# joined with the book metadata and collected to pandas.
df142 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '142'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)
# The frame was originally (mis)named df145 although it holds reviewer 142;
# keep the old name as an alias so cells that still use it keep working.
df145 = df142

In [52]:
# Category frequencies for the frame stored in df145, which is built with the
# filter reviewerIDIndex == '142' (category lists compared as strings).
(
    df145['category']
    .astype('str')
    .value_counts()
)

['Books', 'Literature &amp; Fiction', 'Genre Fiction']                       20
['Books', 'Mystery, Thriller &amp; Suspense', 'Mystery']                     16
['Books', 'Science Fiction &amp; Fantasy', 'Science Fiction']                 7
['Books', 'Mystery, Thriller &amp; Suspense', 'Thrillers &amp; Suspense']     7
['Books', 'Literature & Fiction', 'Genre Fiction']                            5
['Books', 'Christian Books & Bibles', 'Literature & Fiction']                 3
['Books', 'Science Fiction & Fantasy', 'Science Fiction']                     3
['Books', 'Mystery, Thriller & Suspense', 'Mystery']                          3
['Books', 'Literature &amp; Fiction', 'History &amp; Criticism']              3
['Books', "Children's Books", 'Literature &amp; Fiction']                     2
['Books', 'Literature & Fiction', 'Action & Adventure']                       2
['Books', 'Literature &amp; Fiction', 'Literary']                             2
['Books', "Children's Books", 'Growing U

In [54]:
# Up to 100 reviews of reviewerIDIndex 99, highest `overall` first,
# joined with the book metadata and collected to pandas.
df99 = (
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '99'")
    .sort('overall', ascending=False)
    .limit(100)
    .toPandas()
)

In [55]:
# Category frequencies for reviewerIDIndex 99 (category lists compared as strings).
(
    df99['category']
    .astype('str')
    .value_counts()
)

['Books', 'Mystery, Thriller &amp; Suspense', 'Thrillers &amp; Suspense']       12
['Books', 'Literature &amp; Fiction', 'Genre Fiction']                           8
['Books', 'Mystery, Thriller & Suspense', 'Thrillers & Suspense']                6
['Books', 'Literature & Fiction', 'Genre Fiction']                               5
['Books', 'Mystery, Thriller &amp; Suspense', 'Mystery']                         4
['Books', "Children's Books", 'Animals']                                         3
['Books', 'Religion &amp; Spirituality', 'Judaism']                              3
['Books', "Children's Books", 'Geography &amp; Cultures']                        3
['Books', "Children's Books", 'Fairy Tales, Folk Tales &amp; Myths']             2
['Books', 'Politics & Social Sciences', 'Politics & Government']                 2
['Books', 'Crafts, Hobbies &amp; Home', 'Pets &amp; Animal Care']                2
['Books', 'History', 'World']                                                    2
['Bo

In [81]:
# Ten highest-rated reviews of reviewerIDIndex 148 with their book metadata.
(
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '148'")
    .sort('overall', ascending=False)
    .limit(10)
    .toPandas()
)

Unnamed: 0,asin,reviewerID,overall,reviewerIDIndex,category,price,title,description
0,193563920X,A8710RWTFG3IY,5.0,148.0,"[Books, Literature & Fiction, United States]",$9.05,Glaciers (A Tin House New Voice),"[<div>""<B><I>Glaciers</I> has all the things I..."
1,B00CPCYPQQ,A8710RWTFG3IY,5.0,148.0,"[Books, Politics & Social Sciences, Politics &...",,India Dishonoured: Behind a nation&#39;s war o...,[]
2,1250040698,A8710RWTFG3IY,5.0,148.0,"[Books, Biographies &amp; Memoirs, Historical]",$17.82,"The Woman Before Wallis: Prince Edward, the Pa...","[, Riveting[Rose] delivers a vivid account of ..."
3,1400069939,A8710RWTFG3IY,5.0,148.0,"[Books, Arts &amp; Photography, Graphic Design]",$16.79,"The Coat Route: Craft, Luxury, &amp; Obsession...",[A spirited tour of fashion history . . . <i>T...
4,1250006538,A8710RWTFG3IY,5.0,148.0,"[Books, Biographies &amp; Memoirs, Leaders &am...",$11.95,Defiant: The POWs Who Endured Vietnam's Most I...,"[, You told it exactly like it was. In this un..."
5,0812981693,A8710RWTFG3IY,5.0,148.0,"[Books, Biographies &amp; Memoirs, Specific Gr...",$13.08,Mother Daughter Me: A Memoir,"[The most raw, honest and engaging memoir Ive ..."
6,0763644765,A8710RWTFG3IY,5.0,148.0,"[Books, Children's Books, Growing Up &amp; Fac...",$6.99,Potty (Leslie Patricelli board books),"[PreSBoth books feature the same smiling, bald..."
7,B0081GLCAM,A8710RWTFG3IY,5.0,148.0,"[Books, Literature & Fiction, Genre Fiction]",,Warm Moonlight (Kindle Single) eBook,[]
8,1619272040,A8710RWTFG3IY,5.0,148.0,[],$25.95,Desperado's Wife: A Memoir,"[Smart, accomplished, attractive reporter goes..."
9,0399158774,A8710RWTFG3IY,5.0,148.0,"[Books, Biographies &amp; Memoirs, Memoirs]",$9.90,Jujitsu Rabbi and the Godless Blonde: A True S...,"[<a href=""http://www.amazon.com/gp/feature.htm..."


In [83]:
# Ten highest-rated reviews of reviewerIDIndex 463 with their book metadata.
(
    books_index_interpret
    .join(Meta_Books_df_interpret, on='asin')
    .filter("reviewerIDIndex == '463'")
    .sort('overall', ascending=False)
    .limit(10)
    .toPandas()
)

Unnamed: 0,asin,reviewerID,overall,reviewerIDIndex,category,price,title,description
0,193332063X,A2RPZTZP8W80W,5.0,463.0,"[Books, Romance, Paranormal]",$14.95,The Witch's Dream (The Order of the Black Swan...,[<div><i><b>Suggested Reading Order</b></i></d...
1,1484980662,A2RPZTZP8W80W,5.0,463.0,"[Books, Romance, Paranormal]",$8.99,Wickedest Witch: Paranormal Romance,"[Five Hearts ""There wasn't anything that I did..."
2,1612177379,A2RPZTZP8W80W,5.0,463.0,"[Books, Romance, Paranormal]",$15.99,Apocalyptic Moon,[<span>Eva Gordon writes genre bending paranor...
3,0547729960,A2RPZTZP8W80W,5.0,463.0,"[Books, Teen &amp; Young Adult, Literature &am...",$11.24,The Suburban Strange,"[Gr 9-11-Celia Balaustine, a new sophomore at ..."
4,0312872380,A2RPZTZP8W80W,5.0,463.0,"[Books, Science Fiction &amp; Fantasy, Fantasy]",,Kushiel's Dart (Kushiel's Legacy),"[HThis brilliant and daring debut, set in a sk..."
5,0385528213,A2RPZTZP8W80W,5.0,463.0,"[Books, Literature &amp; Fiction, Genre Fiction]",,The Rapture,[Starred Review. Apocalyptic global climate ch...
6,1475138733,A2RPZTZP8W80W,5.0,463.0,"[Books, Romance, Science Fiction]",,F814: Cyborgs: More Than Machines (Volume 2),[<b>Do you love sci-fi romance? Then check out...
7,1622660315,A2RPZTZP8W80W,5.0,463.0,"[Books, Humor &amp; Entertainment, Humor]",$14.99,Cinderella Screwed Me Over (Entangled Select),"[, , <b>Cindi Madsen</b> sits at her computer ..."
8,0786915897,A2RPZTZP8W80W,5.0,463.0,"[Books, Christian Books &amp; Bibles, Literatu...",$6.95,Dragons of Spring Dawning (Dragonlance Chronic...,[The final installment in Weis and Hickman's C...
9,0985148373,A2RPZTZP8W80W,5.0,463.0,"[Books, Romance, Science Fiction]",$11.99,Colonization: Book one of Paradise Reclaimed (...,"[, ]"


In [56]:
# For every book, ask the fitted model for its top 10 users,
# then print the first 10 result rows untruncated.
booksRecs = model.recommendForAllItems(10)
booksRecs.show(n=10, truncate=False)

+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|asinIndex|recommendations                                                                                                                                                                                                  |
+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148      |[[281543, 10.860487], [169881, 10.388498], [306332, 10.139483], [317620, 10.074364], [569432, 9.984281], [568940, 9.863988], [509337, 9.762174], [259625, 9.730604], [402846, 9.700727], [511421, 9.519455]]     |
|463      |[[240956, 14.545721], [593886, 13.560465], [458265, 13.12389], [530236, 12.64061], [495359, 12.594008

In [57]:
# Flatten the per-book recommendation arrays: one output row per
# (asinIndex, recommendation entry) pair.
df_booksRecs_recommendation = booksRecs.withColumn(
    "recommendations",
    explode("recommendations"),
)

In [58]:
# UDFs that pick one position out of a recommendation entry:
# position 1 returned as a float ...
udf_rating = udf(lambda rec: rec[1], FloatType())
# ... and position 0 returned as an integer.
udf_user_index = udf(lambda rec: rec[0], IntegerType())

In [59]:
# Split each exploded recommendation entry into its own columns
# (reviewerIDIndex, overall) and drop the original nested column.
nrecommendations_book = (
    df_booksRecs_recommendation
    .withColumn('reviewerIDIndex', udf_user_index('recommendations'))
    .withColumn('overall', udf_rating('recommendations'))
    .drop('recommendations')
)
# Preview the first 20 rows.
nrecommendations_book.limit(20).show()

+---------+---------------+---------+
|asinIndex|reviewerIDIndex|  overall|
+---------+---------------+---------+
|        1|         593886| 9.984209|
|        1|         240956| 9.229199|
|        1|         119415|  9.16442|
|        1|         568943| 8.809243|
|        1|         337349| 8.635262|
|        1|         562232| 8.606853|
|        1|         523376| 8.601137|
|        1|         325903| 8.496689|
|        1|         577831| 8.486034|
|        1|         208495| 8.474072|
|      206|         577831|10.757442|
|      206|         243444|10.316485|
|      206|         580874|10.271038|
|      206|         293768|10.243166|
|      206|         133281| 10.03422|
|      206|         348488| 9.946756|
|      206|         551891|  9.94592|
|      206|         281543|9.8957205|
|      206|         594111| 9.828285|
|      206|         257515| 9.716956|
+---------+---------------+---------+



In [17]:
# Generate top 10 book recommendations for a specified set of users.
# Three distinct user ids are taken from books1_index; no ordering is applied
# before limit(3), so which three users are picked is not pinned down here.
users = (
    books1_index
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(users, 10)

userSubsetRecs.show(n=10, truncate=False)

+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviewerIDIndex|recommendations                                                                                                                                                                                                   |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|414891         |[[159215, 9.758057], [185895, 9.056629], [194090, 8.618893], [131581, 8.494107], [325685, 8.25225], [233765, 8.196823], [321030, 8.182126], [358483, 8.174082], [181476, 8.171719], [319823, 8.166932]]           |
|137720         |[[191198, 15.173658], [88825, 13.913759], [275823, 13.401945], [201

In [18]:
# Generate top 10 user recommendations for a specified set of items.
# Three distinct item ids are taken from books1_index; no ordering is applied
# before limit(3), so which three items are picked is not pinned down here.
items = (
    books1_index
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
itemsSubSetRecs = model.recommendForItemSubset(items, 10)

itemsSubSetRecs.show(n=10, truncate=False)

+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|asinIndex|recommendations                                                                                                                                                                                                   |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|96304    |[[299491, 15.221222], [125350, 14.114204], [539288, 13.836814], [457624, 13.719978], [498638, 13.3598175], [447053, 13.310471], [590336, 13.086271], [313127, 12.95273], [566081, 12.916631], [161884, 12.913895]]|
|69457    |[[442820, 17.746065], [173868, 16.044271], [432846, 15.91729], [89282, 15.792759], [512904, 15.60

# User-based collaborative filtering

In [None]:
# Collect books2_index to the driver as a pandas DataFrame.
# NOTE(review): books2_index is not defined in the visible cells; if it holds
# the full review set, this collect may not fit in driver memory — confirm.
books_cf = books2_index.toPandas()

In [40]:
from pyspark.sql.functions import countDistinct
# Number of distinct reviewers in the raw `books` reviews.
(
    books
    .select(countDistinct("reviewerID"))
)

count(DISTINCT reviewerID)
603668


In [41]:
# Number of distinct books (asin values) in the raw `books` reviews.
(
    books
    .select(countDistinct("asin"))
)

count(DISTINCT asin)
367982


In [42]:
# Dimensions of the user-item matrix: the distinct reviewerID and asin counts
# of `books`, copied by hand from the two countDistinct results shown above.
n_users, n_items = 603668, 367982

In [None]:
# Build the user x item rating matrix for user-based CF.
# A dense np.zeros((n_users, n_items)) float64 array would need
# 603668 * 367982 * 8 bytes (about 1.78 TB), which cannot be allocated on the
# driver, so the matrix is stored sparsely instead. lil_matrix supports the
# same element assignment as the dense array, so the fill loop is unchanged.
from scipy.sparse import lil_matrix

data_matrix = lil_matrix((n_users, n_items), dtype=np.float64)
# NOTE(review): `ratings` is not defined in the visible cells; the loop assumes
# its first three columns are (user id, item id, rating) with 1-based integer
# ids — confirm (indices produced by StringIndexer start at 0).
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]