In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SparkSQL example")
    .getOrCreate()
)

25/11/18 17:42:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Read the listings CSV into a DataFrame. Reader options are set up front:
# the first row is the header, column types are inferred, and the double
# quote serves as both quote and escape character. multiLine is enabled so a
# quoted field may span several physical lines.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/listings.csv")
)

                                                                                

In [4]:
# Read the reviews CSV into a DataFrame with the same reader options used for
# the listings file: header row, inferred column types, double quote as both
# quote and escape character, and multiLine so quoted text may contain
# newlines.
reviews = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/reviews.csv")
)

                                                                                

In [5]:
# Pair every review with its listing. An inner join keeps only the rows where
# listings.id matches reviews.listing_id.
listings_reviews = listings.join(
    other=reviews,
    on=(listings["id"] == reviews["listing_id"]),
    how="inner",
)

In [6]:
# Count reviews per listing (grouped by listing id and name), most-reviewed
# first.
#
# The DataFrame is assigned first and displayed in a separate statement:
# DataFrame.show() only prints and returns None, so ending the chain with
# .show() would bind `reviews_per_listing` to None instead of the result.
reviews_per_listing = (
    listings_reviews
    .groupBy(listings.id, listings.name)
    .agg(F.count(reviews.id).alias('num_reviews'))
    .orderBy('num_reviews', ascending=False)
)

# truncate=False prints full column values rather than shortening long names.
reviews_per_listing.show(truncate=False)

                                                                                

+--------+--------------------------------------------------+-----------+
|id      |name                                              |num_reviews|
+--------+--------------------------------------------------+-----------+
|30760930|Double Garden View room - London House Hotel***   |1682       |
|47408549|Double Room+ Ensuite                              |1650       |
|43120947|Private double room with en suite facilities      |1553       |
|19670926|Designer Studio Apartment in Central London       |1382       |
|1436172 |Cosy Double in Kings Cross Houseshare nr Eurostar |1134       |
|45006692|Budget Double Room In Colliers Hotel.             |1132       |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1071       |
|1436177 |En-suite Double in Kings Cross Houseshare Eurostar|943        |
|3855375 |Double in Kings Cross Houseshare nr Eurostar      |935        |
|2659707 |Large Room + Private Bathroom, E3.                |893        |
|42081759|Micro Studio at Locke at Bro

In [7]:
# Register both DataFrames as temporary views so they can be referenced by
# name ("reviews", "listings") from Spark SQL queries. createOrReplace makes
# the cell safe to re-run.
reviews.createOrReplaceTempView("reviews")
listings.createOrReplaceTempView("listings")

25/11/18 17:43:44 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [8]:
# Spark SQL version of the reviews-per-listing aggregation. It reads the
# `listings` and `reviews` temp views registered in the previous cell,
# inner-joins them on listings.id = reviews.listing_id, counts reviews.id per
# (listings.id, listings.name) group, and sorts by that count descending.
query = """
SELECT 
    listings.id, 
    listings.name, 
    COUNT(reviews.id) AS num_reviews
FROM 
    listings
INNER JOIN 
    reviews
ON 
    listings.id = reviews.listing_id
GROUP BY 
    listings.id, listings.name
ORDER BY 
    num_reviews DESC
"""

# spark.sql() returns a DataFrame; keep it in a variable and display it in a
# separate statement so the name stays bound to the DataFrame itself.
reviews_per_listing = spark.sql(query)
# truncate=False prints full column values rather than shortening long names.
reviews_per_listing.show(truncate=False)

[Stage 9:>                                                          (0 + 1) / 1]

+--------+--------------------------------------------------+-----------+
|id      |name                                              |num_reviews|
+--------+--------------------------------------------------+-----------+
|30760930|Double Garden View room - London House Hotel***   |1682       |
|47408549|Double Room+ Ensuite                              |1650       |
|43120947|Private double room with en suite facilities      |1553       |
|19670926|Designer Studio Apartment in Central London       |1382       |
|1436172 |Cosy Double in Kings Cross Houseshare nr Eurostar |1134       |
|45006692|Budget Double Room In Colliers Hotel.             |1132       |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1071       |
|1436177 |En-suite Double in Kings Cross Houseshare Eurostar|943        |
|3855375 |Double in Kings Cross Houseshare nr Eurostar      |935        |
|2659707 |Large Room + Private Bathroom, E3.                |893        |
|42081759|Micro Studio at Locke at Bro

                                                                                