In [33]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("local")
sparkConf.setAppName("airbnb")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

#  Google Storage File Path
listings_file_path = '../airbnbdata/df_listings.csv'
listing_city_file_path = '../airbnbdata/df_listing_city.csv'
owner_file_path = '../airbnbdata/df_owner.csv'
owner_listings_file_path = '../airbnbdata/df_owner_listings.csv'
renter_file_path = '../airbnbdata/df_renter.csv'
review_file_path = '../airbnbdata/df_review.csv'


#gsc_file_path = 'gs://assignment2_airbnb/df_listings.csv' 

df_listings = spark.read.format("csv").option("header", "true") \
       .load(listings_file_path)
df_listing_city = spark.read.format("csv").option("header", "true") \
       .load(listing_city_file_path)
df_owner = spark.read.format("csv").option("header", "true") \
       .load(owner_file_path)
df_owner_listings = spark.read.format("csv").option("header", "true") \
       .load(owner_listings_file_path)
df_renter = spark.read.format("csv").option("header", "true") \
       .load(renter_file_path)
df_review = spark.read.format("csv").option("header", "true") \
       .load(review_file_path)

df_listings.createOrReplaceTempView("listings")
df_listing_city.createOrReplaceTempView("listing_city")
df_owner.createOrReplaceTempView("owner")
df_owner_listings.createOrReplaceTempView("owner_listings")
df_renter.createOrReplaceTempView("renter")
df_review.createOrReplaceTempView("review")



In [52]:
min_nr_listings = 5

sql_query_highest_ratings = f"""SELECT
    owner.host_id as owner_id,
    round(avg(review_scores_rating), 2) as avg_rating,
    count(*) as num_listings
FROM
    listings, owner, owner_listings
WHERE
    listings.id = owner_listings.id
    and owner.host_id = owner_listings.host_id
GROUP BY
    owner.host_id
HAVING count(*) > {min_nr_listings}
ORDER BY
    avg_rating desc"""

results = spark.sql(sql_query_highest_ratings)

print(f"Found {results.count()} listings with at least {min_nr_listings} listings")

results.show()

Found 81 listings with at least 5 listings
+---------+----------+------------+
| owner_id|avg_rating|num_listings|
+---------+----------+------------+
|380678517|       5.0|           6|
|140775798|      4.96|           6|
|  9282300|      4.96|           9|
|303405414|       4.9|           7|
| 30890942|       4.9|           6|
|420783452|      4.88|           6|
|  4456680|      4.86|           6|
|198405490|      4.85|           7|
|430694992|      4.84|           9|
|408898089|      4.84|           7|
| 89688606|      4.83|           6|
|203731852|      4.82|          20|
| 46691672|      4.82|           9|
|244141635|      4.82|          12|
|  5796250|      4.82|          14|
| 67005410|       4.8|          17|
| 88108496|       4.8|           6|
|135487531|      4.79|           7|
|177701530|      4.78|          12|
|302893992|      4.78|          10|
+---------+----------+------------+
only showing top 20 rows



In [63]:
# Make a query to the Spark SQL engine
# Check which owners have at least one listing in each of the 3 cities

# Table names: listings, listing_city, owner, owner_listings, renter, review
# listings(id, name, price, review_scores_rating)
# listing_city(id, city)
# owner(host_id, host_name)
# owner_listings(id, host_id)
# renter(renter_id, name)
# review(renter_id, listing_id, review)

# Take owners that have listings in all of Amsterdam, Rotterdam and Den Haag

nr_cities = 2

query_owners_in_at_least_2_cities = f"""
SELECT
    owner.host_id,
    owner.host_name,
    count(distinct listing_city.city) as num_cities
FROM
    owner, owner_listings, listing_city
WHERE
    owner.host_id = owner_listings.host_id
    and owner_listings.id = listing_city.id
GROUP BY
    owner.host_id, owner.host_name
HAVING
    count(distinct listing_city.city) >= {nr_cities}

"""

results_owners_in_alledrie_steden = spark.sql(query_owners_in_at_least_2_cities)

print(f"Found {results_owners_in_alledrie_steden.count()} owners with listings in at least {nr_cities} cities")
results_owners_in_alledrie_steden.show()

Found 8 owners with listings in at least 2 cities
+---------+----------------+----------+
|  host_id|       host_name|num_cities|
+---------+----------------+----------+
|244520390|           Gunni|         2|
| 19894111|Arjen & Nathalie|         2|
| 10239880|           Lucas|         2|
|177701530| Hosted By Wendy|         2|
|121985032| Tess - BELVILLA|         2|
| 20465009|          Alette|         2|
|128826790| Natasja & Mylan|         2|
|115324475|             Lin|         2|
+---------+----------------+----------+



In [66]:
# Table names: listings, listing_city, owner, owner_listings, renter, review
# listings(id, name, price, review_scores_rating)
# listing_city(id, city)
# owner(host_id, host_name)
# owner_listings(id, host_id)
# renter(renter_id, name)
# review(renter_id, listing_id, review)

min_listings = 10

# Find owners with at least 10 listings
query_owners_with_at_least_10_listings = """
SELECT
    owner.host_id as owner_id,
    owner.host_name as owner_name,
    COUNT(*) as num_listings,
    ROUND(AVG(listings.price), 2) as avg_price,
    ROUND(AVG(listings.review_scores_rating), 1) as avg_rating
FROM
    listings, owner, owner_listings
WHERE
    listings.id = owner_listings.id
    and owner.host_id = owner_listings.host_id
GROUP BY
    owner.host_id, owner.host_name
HAVING COUNT(*) > 10
ORDER BY
    num_listings DESC
"""

results_owners_10_listings = spark.sql(query_owners_with_at_least_10_listings)
print(f"Found {results_owners_10_listings.count()} owners with at least {min_listings} listings")
results_owners_10_listings.show(20)

Found 24 owners with at least 10 listings
+---------+---------------+------------+---------+----------+
| owner_id|     owner_name|num_listings|avg_price|avg_rating|
+---------+---------------+------------+---------+----------+
|448782489|        Martijn|          36|   117.97|       4.6|
|  5285926|      Diederick|          29|   134.86|       4.5|
|  9165668|         Robert|          23|   176.87|       4.6|
| 10239880|          Lucas|          22|    104.0|       4.6|
| 14574533|Hotel Not Hotel|          21|   189.33|       4.4|
|   300966|          Elwin|          20|    226.1|       4.6|
|138369331|          Peter|          20|    139.2|       3.9|
|203731852|   SWEETS Hotel|          20|   351.45|       4.8|
|244520390|          Gunni|          19|   192.05|       4.5|
| 67005410|      Feliciano|          17|   149.76|       4.8|
|432320567|         Tijmen|          15|    146.4|       4.1|
|  5796250|          Remco|          14|   142.79|       4.8|
|364305280|         Arnold| 

In [67]:
# Calculate average price and rating over all listings
query_average_price_rating = """
SELECT
    ROUND(AVG(listings.price), 2) as avg_price,
    ROUND(AVG(listings.review_scores_rating), 1) as avg_rating
FROM
    listings
"""


spark.sql(query_average_price_rating).show(20)

+---------+----------+
|avg_price|avg_rating|
+---------+----------+
|   203.11|       4.8|
+---------+----------+

