In [1]:
# Execute the shared setup notebook inside this kernel. Later cells use `ex_df`
# and `hw_df` without defining them, so they are presumably created there — TODO confirm.
%run 00_setup.ipynb

23/11/21 13:50:37 WARN SSLSocketFactoryEx: Failed to load OpenSSL. Falling back to the JSSE default.
                                                                                

In [2]:
import time
from pyspark.sql.window import Window
from pyspark.sql.functions import to_date, sequence, explode, sum, month, year, col, row_number

In [3]:
# Inspect the column names and types of ex_df. Note that srch_ci / srch_co are
# strings, which is why later cells wrap them in to_date().
ex_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date_time: string (nullable = true)
 |-- site_name: integer (nullable = true)
 |-- posa_continent: integer (nullable = true)
 |-- user_location_country: integer (nullable = true)
 |-- user_location_region: integer (nullable = true)
 |-- user_location_city: integer (nullable = true)
 |-- orig_destination_distance: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- is_mobile: integer (nullable = true)
 |-- is_package: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- srch_ci: string (nullable = true)
 |-- srch_co: string (nullable = true)
 |-- srch_adults_cnt: integer (nullable = true)
 |-- srch_children_cnt: integer (nullable = true)
 |-- srch_rm_cnt: integer (nullable = true)
 |-- srch_destination_id: integer (nullable = true)
 |-- srch_destination_type_id: integer (nullable = true)
 |-- hotel_id: long (nullable = true)



In [4]:
# Keep only bookings whose check-in date is not later than the check-out date.
# Rows where either date is null are dropped as well, because a null comparison
# does not satisfy the predicate.
ex_clear = ex_df.where(
    to_date(col("srch_ci")) <= to_date(col("srch_co"))
)

In [5]:
# Expand each booking into one row per calendar day of the stay.
# sequence() is inclusive on both ends, so the check-out day itself is counted.
# NOTE(review): confirm that counting the check-out day as "occupied" is intended.
ex_df1 = ex_clear.select(
    col('hotel_id'),
    explode(
        sequence(to_date(col('srch_ci')), to_date(col('srch_co')))
    ).alias('date'),
    col('srch_rm_cnt'),
)

# Preview two rows, then report the total number of per-day rows.
ex_df1.show(2)
ex_df1.count()

                                                                                

+------------+----------+-----------+
|    hotel_id|      date|srch_rm_cnt|
+------------+----------+-----------+
|970662608899|2017-08-22|          1|
|970662608899|2017-08-23|          1|
+------------+----------+-----------+
only showing top 2 rows



                                                                                

8607595

In [6]:
# Total rooms requested per hotel per calendar day, with the year and month of
# that day exposed as separate columns (the monthly ranking partitions on them).
# `sum` here is pyspark.sql.functions.sum (imported above), not the Python builtin.
# year()/month() are plain column expressions rather than aggregates, so they are
# derived in a select after the aggregation instead of being passed to agg().
visits_df = (
    ex_df1
    .groupBy('hotel_id', 'date')
    .agg(sum('srch_rm_cnt').alias('used_rooms'))
    .select(
        'hotel_id',
        'date',
        year('date').alias('year'),
        month('date').alias('month'),
        'used_rooms',
    )
)

# Preview two rows, then report the number of (hotel, day) groups.
visits_df.show(2)
visits_df.count()

                                                                                

+-------------+----------+----+-----+----------+
|     hotel_id|      date|year|month|used_rooms|
+-------------+----------+----+-----+----------+
|2293512536074|2016-10-31|2016|   10|        31|
|2817498546177|2017-09-29|2017|    9|        37|
+-------------+----------+----+-----+----------+
only showing top 2 rows



                                                                                

302352

In [7]:
# Pick the 10 busiest rows within every (year, month).
# visits_df holds one row per hotel per day, so the ranking is over hotel-days
# and the same hotel can appear more than once within a month.
start_time = time.time()

TOP_N = 10  # number of rows kept per (year, month) partition

# used_rooms alone is not unique inside a month, and row_number() numbers tied
# rows in arbitrary order. hotel_id and date (unique per row of visits_df) are
# added as tie-breakers so that reruns always keep the same rows.
window = Window.partitionBy('year', 'month').orderBy(
    col('used_rooms').desc(),
    col('hotel_id'),
    col('date'),
)
result = (
    visits_df
    .withColumn("rank", row_number().over(window))
    .where(col("rank") <= TOP_N)
)

result.show(3)
# The reported time covers only the show(3) action above; count() runs afterwards.
print("Execution time is ", round(time.time() - start_time, 2), "sec.")
print("Result contains", result.count(), "records")

                                                                                

+-------------+----------+----+-----+----------+----+
|     hotel_id|      date|year|month|used_rooms|rank|
+-------------+----------+----+-----+----------+----+
|3289944948741|2016-10-24|2016|   10|        77|   1|
|2920577761282|2016-10-07|2016|   10|        75|   2|
|2293512536067|2016-10-23|2016|   10|        74|   3|
+-------------+----------+----+-----+----------+----+
only showing top 3 rows

Execution time is  6.05 sec.




Result contains 246 records


                                                                                

In [42]:
# Enrich the ranked rows with hotel details (name, city, country) from hw_df.
start_time = time.time()

# Left join keeps every ranked row even when hw_df has no matching id.
# NOTE(review): dropDuplicates() suggests hw_df carries several rows per hotel
# id, which would otherwise repeat each ranked row — confirm against the setup notebook.
result_ext = (
    result
    .join(hw_df, result['hotel_id'] == hw_df['id'], "left")
    .select(
        result['*'],
        hw_df['address'].alias('hotel_name'),
        'city',
        'country',
    )
    .dropDuplicates()
)

result_ext.show(3)
# The reported time covers only the show(3) action above; count() runs afterwards.
print("Execution time is ", round(time.time() - start_time, 2), "sec.")
print("Result contains", result_ext.count(),  "records")

                                                                                

+-------------+----------+----+-----+----------+----+-------------------+------+-------+
|     hotel_id|      date|year|month|used_rooms|rank|         hotel_name|  city|country|
+-------------+----------+----+-----+----------+----+-------------------+------+-------+
|3289944948741|2016-10-24|2016|   10|        77|   1|The Stafford London|London|     GB|
|2920577761282|2016-10-07|2016|   10|        75|   2|Hotel Malte Astotel| Paris|     FR|
|2293512536067|2016-10-23|2016|   10|        74|   3|        Hotel Oscar| Paris|     FR|
+-------------+----------+----+-----+----------+----+-------------------+------+-------+
only showing top 3 rows

Execution time is  4.75 sec.


                                                                                

Result contains 246 records


In [43]:
# Print the physical plan of the enriched result in the "formatted" layout
# (operator tree, as shown in the output below) to inspect how Spark executes it.
result_ext.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (27)
+- HashAggregate (26)
   +- HashAggregate (25)
      +- Project (24)
         +- BroadcastHashJoin LeftOuter BuildRight (23)
            :- Filter (16)
            :  +- Window (15)
            :     +- WindowGroupLimit (14)
            :        +- Sort (13)
            :           +- Exchange (12)
            :              +- WindowGroupLimit (11)
            :                 +- Sort (10)
            :                    +- HashAggregate (9)
            :                       +- Exchange (8)
            :                          +- HashAggregate (7)
            :                             +- Project (6)
            :                                +- Generate (5)
            :                                   +- Filter (4)
            :                                      +- InMemoryTableScan (1)
            :                                            +- InMemoryRelation (2)
            :                                              