In [1]:
import os
import time

# spark imports
# findspark.init() is given an explicit Spark install directory and is called
# before any pyspark import below, so it has to stay ahead of those imports.
# NOTE(review): the path is an absolute, machine-specific location; on another
# machine it presumably has to point at that machine's Spark install — confirm.
import findspark
findspark.init('C:/Users/Lenovo/Downloads/spark-3.0.0-preview2-bin-hadoop3.2')


from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
# Build (or reuse) a local-mode Spark session.
# getOrCreate() keeps this cell safe to re-run: constructing
# SparkContext('local') a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available for any cell that uses the underlying context directly.
sc = spark.sparkContext

In [3]:
# Read the BX books CSV: ';'-separated, first row is the header, decoded as
# latin1, with column types inferred by Spark.
books = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema="true", header="true", encoding='latin1')
    .load(r"C:\Users\Lenovo\Documents\Springboard\Capstone\Dataset\BX-CSV-Dump\BX-Books.csv")
)

In [4]:
# Map every ISBN to an integer `book_id` (1, 2, 3, ... in ISBN order).
# The window has an ORDER BY but no PARTITION BY, so row_number() runs over
# the whole books table as a single window.
win_row_number = Window.orderBy(F.col("ISBN"))
isbn = books.select(
    F.row_number().over(win_row_number).alias("book_id"),
    F.col("ISBN"),
)
# Mark the mapping as cached (it is joined against the ratings later), then
# preview the first rows.
isbn.cache()
isbn.show(3)

+-------+----------+
|book_id|      ISBN|
+-------+----------+
|      1|0000913154|
|      2|0001010565|
|      3|0001046438|
+-------+----------+
only showing top 3 rows



In [5]:
# Read the BX ratings CSV with the same reader settings as the books file:
# ';'-separated, header row, latin1 encoding, inferred column types.
ratings = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema="true", header="true", encoding='latin1')
    .load(r"C:\Users\Lenovo\Documents\Springboard\Capstone\Dataset\BX-CSV-Dump\BX-Book-Ratings.csv")
)

In [12]:
# Cast the rating column to float. Re-running this cell is harmless because
# the cast is applied to a column of the same name.
ratings = ratings.withColumn("Book-Rating", F.col("Book-Rating").cast("Float"))

# Attach the integer book_id to each rating. The join is an inner join, so
# ratings whose ISBN has no entry in `isbn` are dropped.
ratings_with_bid = ratings.join(isbn, on="ISBN", how="inner")
ratings_with_bid.show(n=3)

+----------+-------+-----------+-------+
|      ISBN|User-ID|Book-Rating|book_id|
+----------+-------+-----------+-------+
|034545104X| 276725|        0.0|  46116|
|0155061224| 276726|        5.0|  22830|
|0446520802| 276727|        0.0|  92994|
+----------+-------+-----------+-------+
only showing top 3 rows



In [13]:
# 80/20 train/test split. A fixed seed makes the split repeatable across
# kernel restarts; without it every run trains and evaluates on different rows.
training, test = ratings_with_bid.randomSplit([0.8, 0.2], seed=42)

In [14]:
# ALS estimator on explicit ratings, keyed by User-ID and the integer book_id.
als = ALS(
    userCol="User-ID",
    itemCol="book_id",
    ratingCol="Book-Rating",
    # Drop rows whose prediction would be NaN (users/books unseen in training)
    # so the evaluation metric stays defined.
    coldStartStrategy="drop",
    nonnegative=True,
    # Fixed seed so the randomly initialised factors, and therefore the fitted
    # model, are reproducible between runs.
    seed=42,
)

In [15]:
# Hyper-parameter grid: 3 ranks x 3 iteration counts x 3 regularisation
# strengths = 27 candidate parameter maps.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [12, 13, 14])
    .addGrid(als.maxIter, [18, 19, 20])
    .addGrid(als.regParam, [0.17, 0.18, 0.19])
    .build()
)

In [16]:
# RMSE between the true rating and the model output.
# predictionCol must name the column the ALS model writes ("prediction", the
# default, since the estimator does not override it). Pointing it at
# "Book-Rating" compared the label with itself, which always gives RMSE 0 and
# makes every parameter combination look equally good.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Book-Rating",
    predictionCol="prediction",
)

In [17]:
# Hyper-parameter search with a single random train/validation split of the
# data passed to fit(). The seed pins that internal split so the selected
# parameters are reproducible between runs.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    seed=42,
)

In [None]:
# Run the train/validation search on the training split only; `test` is not
# touched here. One ALS model is fitted per entry in param_grid, so this is
# the expensive cell of the notebook.
# NOTE(review): the cell shows `In [None]`, i.e. no recorded execution count —
# confirm it has been run to completion before relying on `model`.
model = tvs.fit(training)