In [14]:
import os
import time

# spark imports
import findspark
findspark.init('C:/Users/Lenovo/Downloads/spark-3.0.0-preview2-bin-hadoop3.2')


from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
sc = SparkContext('local')
spark = SparkSession(sc)

In [3]:
books = spark.read.load(r"C:\Users\Lenovo\Documents\Springboard\Capstone\Dataset\BX-CSV-Dump\BX-Books.csv",
                     format="csv", sep=";", inferSchema="true", header="true", encoding ='latin1')

In [6]:
win_row_number = Window.orderBy("ISBN")
isbn = books.select(F.row_number().over(win_row_number).alias("book_id"), "ISBN")
isbn.cache().show(3)

+-------+----------+
|book_id|      ISBN|
+-------+----------+
|      1|0000913154|
|      2|0001010565|
|      3|0001046438|
+-------+----------+
only showing top 3 rows



In [8]:
ratings = spark.read.load(r"C:\Users\Lenovo\Documents\Springboard\Capstone\Dataset\BX-CSV-Dump\BX-Book-Ratings.csv",
                     format="csv", sep=";", inferSchema="true", header="true", encoding ='latin1')

In [9]:
ratings = ratings.withColumn("Book-Rating", ratings["Book-Rating"].cast("Float"))
ratings_with_bid = ratings.join(isbn, on=["ISBN"], how="inner")
ratings_with_bid.show(3)

+----------+-------+-----------+-------+
|      ISBN|User-ID|Book-Rating|book_id|
+----------+-------+-----------+-------+
|034545104X| 276725|        0.0|  46116|
|0155061224| 276726|        5.0|  22830|
|0446520802| 276727|        0.0|  92994|
+----------+-------+-----------+-------+
only showing top 3 rows



In [10]:
sample_ratings_with_bid = ratings_with_bid.sample(False, 0.0001, 3)
sample_ratings_with_bid.show(3)

+----------+-------+-----------+-------+
|      ISBN|User-ID|Book-Rating|book_id|
+----------+-------+-----------+-------+
|0375706038| 277107|       10.0|  62084|
|0515133302| 277523|       10.0| 109110|
|3453151933|   1172|       10.0| 257519|
+----------+-------+-----------+-------+
only showing top 3 rows



In [8]:
ratings_with_bid.count()

1031175

In [9]:
sample_ratings_with_bid.count()

90

In [11]:
(training, test) = sample_ratings_with_bid.randomSplit([0.8, 0.2])

In [12]:
training.show(3)

+----------+-------+-----------+-------+
|      ISBN|User-ID|Book-Rating|book_id|
+----------+-------+-----------+-------+
|0060175400|  65409|        0.0|   4205|
|0060830395|  98952|        0.0|   6846|
|0060936363|  80538|        8.0|   7657|
+----------+-------+-----------+-------+
only showing top 3 rows



In [15]:
als = ALS(userCol="User-ID", itemCol="book_id", ratingCol="Book-Rating", coldStartStrategy="drop", nonnegative=True)

In [16]:
param_grid = ParamGridBuilder().addGrid(als.rank, [3, 4, 5]).addGrid(als.maxIter, [8, 9, 10]).addGrid(als.regParam, [.17, .18, .19]).build()

In [17]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Book-Rating", predictionCol="prediction")

In [18]:
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator)

In [18]:
model = tvs.fit(training)

IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.