# Recommender system for hotel

In [67]:
import findspark
import pyspark
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as fun
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType

from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel 
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create the spark session

In [2]:
spark = SparkSession.builder.config("spark.jars", "postgresql-42.2.26.jar") \
        .appName("Recs").getOrCreate()

# Load data from database

In [34]:
offering = spark.read.format("jdbc").option("url", "jdbc:postgresql://localhost:5432/postgres") \
                                    .option("driver", "org.postgresql.Driver").option("dbtable", "offering") \
                                    .option("user", "postgres").option("password", "adm!@#").load()

In [4]:
review = spark.read.format("jdbc").option("url", "jdbc:postgresql://localhost:5432/postgres") \
                                    .option("driver", "org.postgresql.Driver").option("dbtable", "review") \
                                    .option("user", "postgres").option("password", "adm!@#").load()

# EDA

In [7]:
nb_ratings = review.select("offering_id", "ratings_overall")
nb_ratings = nb_ratings.groupBy("offering_id", "ratings_overall").count()
print("The number of ratings for each hotel.")
nb_ratings.sort(fun.col("count").desc()).show() 

The number of ratings for each hotel.
+-----------+---------------+-----+
|offering_id|ratings_overall|count|
+-----------+---------------+-----+
|     208454|            5.0| 1968|
|     113317|            5.0| 1892|
|     122005|            4.0| 1730|
|     214197|            1.0| 1698|
|      99352|            5.0| 1582|
|     214197|            3.0| 1578|
|     268533|            5.0| 1533|
|      93520|            4.0| 1524|
|     115644|            5.0| 1489|
|      93618|            5.0| 1449|
|      93352|            5.0| 1443|
|     126260|            5.0| 1399|
|      93346|            4.0| 1392|
|      93562|            5.0| 1383|
|      93516|            4.0| 1374|
|    1503474|            5.0| 1373|
|     292142|            5.0| 1371|
|      99766|            4.0| 1324|
|     249712|            5.0| 1317|
|     224221|            5.0| 1307|
+-----------+---------------+-----+
only showing top 20 rows



In [8]:
avg_ratings = review.select("offering_id", "ratings_overall")
print("Average ratings by hotel and the number of ratings.")
avg_ratings.groupBy("offering_id").agg(fun.round(fun.avg("ratings_overall"), 2).alias("ratings_avg"), 
                                       fun.count("offering_id").alias("number_ratings")) \
                                        .sort(fun.col("number_ratings").desc(), fun.col("ratings_avg").desc()).show()

Average ratings by hotel and the number of ratings.
+-----------+-----------+--------------+
|offering_id|ratings_avg|number_ratings|
+-----------+-----------+--------------+
|     214197|       2.47|          5456|
|      93520|        3.5|          4009|
|     122005|       4.01|          3726|
|      93618|        3.9|          3534|
|     223023|       3.17|          3385|
|      99766|       3.55|          3218|
|      93562|       4.13|          3170|
|      93437|       3.52|          3034|
|     611947|       3.76|          3004|
|     208454|       4.57|          2898|
|      93569|       4.15|          2886|
|      93450|       3.75|          2867|
|     249712|       4.15|          2865|
|      93507|       4.17|          2839|
|      93346|       4.19|          2816|
|      99307|       3.43|          2783|
|      93516|       3.97|          2782|
|     111501|       3.88|          2675|
|      99352|       4.49|          2657|
|      93358|       4.04|          2641|
+----

In [9]:
user_ratings = review.select("author_id", "offering_id", "ratings_overall")
user_ratings = user_ratings.groupBy("author_id").count()
print("The number of ratings by user.")
user_ratings.sort(fun.col("count").desc()).show(truncate=False)

The number of ratings by user.
+--------------------------------+-----+
|author_id                       |count|
+--------------------------------+-----+
|                                |77066|
|CATID_                          |299  |
|6562BBD4EA770FE84E579622F68FA181|63   |
|869E5453ED1ECD12EC2E210BEB6B922E|54   |
|6C62B19C4DB8C600576B763C68AF0759|52   |
|85223AA53CB0DE6A6EB7B3C1E99981B1|52   |
|7BC0AF07CC240F2F614A865BBE21B5AA|47   |
|F73CC60121EDAB8E6B1637FDE6A09177|43   |
|17EEAF9A079A2B75E9616F7864F7CA3E|42   |
|5A259EBC8C1AEBD6F20D4C48490255FF|41   |
|106952AB894E1236A6094B030DD73C8F|41   |
|5DFE96EC85C67F248DEFFA8B84891A6A|40   |
|D4B6BB9639B4DFFA8FFD419238D5B805|39   |
|0EED87FCB3D9A27898011055E0D05D34|38   |
|05D57A581E9DAE14FF2A054EB3709AB2|38   |
|E4BD9A4CD7872825F3585ECFFF4074B7|38   |
|B8308A3B1B78754D4A2FA8462AC647F3|37   |
|B5BB3E0C885910A11A102246B76B2728|36   |
|C7C61B1711AE7174C4403FA3045C0A78|35   |
|A5F51BA3CCAD57D198DD905A05A70430|35   |
+-------------------------

In [10]:
print("Number of anonymous ratings.")
user_ratings.where(user_ratings.author_id == "").show()

Number of anonymous ratings.
+---------+-----+
|author_id|count|
+---------+-----+
|         |77066|
+---------+-----+



In [11]:
print("Number of ratings with user.")
user_ratings.where(user_ratings.author_id != "").agg(fun.sum("count").alias("number_ratings")).show()

Number of ratings with user.
+--------------+
|number_ratings|
+--------------+
|        801495|
+--------------+



In [12]:
hotel_class_ratings = nb_ratings.join(offering, offering.id==nb_ratings.offering_id, "inner")
hotel_class_ratings = hotel_class_ratings.select("offering_id", "ratings_overall", "hotel_class")
hotel_class_ratings = hotel_class_ratings.groupBy("hotel_class").count()
print("Number of ratings by hotel class.")
hotel_class_ratings.sort(fun.col("hotel_class").desc()).show()

Number of ratings by hotel class.
+-----------+-----+
|hotel_class|count|
+-----------+-----+
|        5.0|  356|
|        4.5|  304|
|        4.0| 1949|
|        3.5| 1994|
|        3.0| 3588|
|        2.5| 2961|
|        2.0| 3408|
|        1.5|  251|
|        1.0|   56|
|       null| 2707|
+-----------+-----+



# Recommender system with ALS

In [13]:
def get_hotel_name(offering_id):
    return offering.select("name").where(offering.id == offering_id).collect()[0][0]
def hotel_name(offering_id):
    return offering.select("name").where(offering.id == offering_id)

Consider only the ratings with known author.

In [14]:
known_reviews = review.select("offering_id", "author_id", "author_num_id", "author_username", "ratings_overall", "date") \
                      .where((review.author_id != "") & (review.author_id != "CATID_"))

In [44]:
train, test = known_reviews.randomSplit([0.8, 0.2], seed=42)

In [39]:
als = ALS(userCol='author_num_id', itemCol='offering_id',
          ratingCol='ratings_overall',
         nonnegative=True,
         implicitPrefs=False,
         coldStartStrategy="drop")

In [41]:
param_grid = ParamGridBuilder() \
                .addGrid(als.rank, [10, 50, 100, 150]) \
                .addGrid(als.regParam, [.01, .05, .1, .15]) \
                .build()

In [42]:
evaluator = RegressionEvaluator(metricName="rmse",
                               labelCol="ratings_overall",
                               predictionCol="prediction")
print("Number of models to be tested: ", len(param_grid))

Number of models to be tested:  16


In [43]:
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, 
                   evaluator=evaluator, numFolds=5)

In [45]:
model = cv.fit(train)

In [46]:
best_model = model.bestModel

In [47]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

**Best Model**
  Rank: 10
  MaxIter: 10
  RegParam: 0.15


In [48]:
test_predictions = best_model.transform(test)
rmse = evaluator.evaluate(test_predictions)
print(rmse)

1.5090762812636798


## Checkpoint

In [61]:
best_model.write().overwrite().save("/models")

# Load model

In [68]:
model = ALSModel.load("/models/")

In [69]:
author_num_id = 10
userSchema = StructType([StructField('author_num_id', IntegerType(), True)])
users = spark.createDataFrame([[author_num_id,]], userSchema)

In [73]:
recommendations = model.recommendForUserSubset(users, 10) 
recommendations = recommendations.withColumn("rec_exp", fun.explode("recommendations")) \
                                .select("author_num_id", fun.col("rec_exp.offering_id"),
                                        fun.col("rec_exp.rating"))
recommendations.show()

+-------------+-----------+---------+
|author_num_id|offering_id|   rating|
+-------------+-----------+---------+
|           10|    1628583|6.1533813|
|           10|    1222239| 5.850977|
|           10|    1206970| 5.755678|
|           10|     674319| 5.696856|
|           10|    1863251|5.6358633|
|           10|     571427|5.5899653|
|           10|    2622936|5.5095763|
|           10|     258634|5.4626126|
|           10|    1630591|5.3669925|
|           10|    1889985|5.2939634|
+-------------+-----------+---------+



In [9]:
spark.stop()