In [16]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Assemble the Spark configuration in a single chained expression:
# master "local", application name "airbnb", 2g of driver memory, and one
# core each for the executor and the driver.
sparkConf = (
    SparkConf()
    .setMaster("local")
    .setAppName("airbnb")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; every read
# and SQL call below goes through it.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Tell Hadoop which filesystem classes handle the gs:// scheme.
# NOTE(review): this assumes the GCS connector classes named below are
# available to the JVM — confirm for the target environment.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _gs_setting, _gs_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_gs_setting, _gs_class)

# Locations of the input CSV files. These are local paths, relative to the
# notebook's working directory.
listings_file_path = '../airbnbdata/df_listings.csv'
listing_city_file_path = '../airbnbdata/df_listing_city.csv'
owner_file_path = '../airbnbdata/df_owner.csv'
owner_listings_file_path = '../airbnbdata/df_owner_listings.csv'
renter_file_path = '../airbnbdata/df_renter.csv'
review_file_path = '../airbnbdata/df_review.csv'

# Example of a Google Cloud Storage location that could be used instead:
# gsc_file_path = 'gs://assignment2_airbnb/df_listings.csv'


def _read_csv_with_header(csv_path):
    """Load the CSV file at ``csv_path`` with the "header" option set to "true".

    NOTE(review): no schema or inferSchema option is passed, so columns
    presumably load as strings — confirm before doing numeric work on them.
    """
    header_reader = spark.read.format("csv").option("header", "true")
    return header_reader.load(csv_path)


# One DataFrame per input file.
df_listings = _read_csv_with_header(listings_file_path)
df_listing_city = _read_csv_with_header(listing_city_file_path)
df_owner = _read_csv_with_header(owner_file_path)
df_owner_listings = _read_csv_with_header(owner_listings_file_path)
df_renter = _read_csv_with_header(renter_file_path)
df_review = _read_csv_with_header(review_file_path)

# Expose each DataFrame to Spark SQL under a short table name so that the
# queries below can refer to them directly.
for _view_name, _frame in (
    ("listings", df_listings),
    ("listing_city", df_listing_city),
    ("owner", df_owner),
    ("owner_listings", df_owner_listings),
    ("renter", df_renter),
    ("review", df_review),
):
    _frame.createOrReplaceTempView(_view_name)



In [48]:
# Rank hosts by the mean rating of their listings.
#
# The query combines the `listings`, `owner` and `owner_listings` views with
# an implicit (comma) join, matching listings.id to owner_listings.id and
# owner.host_id to owner_listings.host_id. It groups by owner.host_id, keeps
# only groups with more than 10 joined rows, and returns for each host:
#   owner_id     - owner.host_id
#   avg_rating   - avg(review_scores_rating), rounded to 2 decimals
#   num_listings - count(*) of joined rows in the group
# Rows are sorted by the rounded avg_rating, highest first.
#
# NOTE(review): review_scores_rating is not table-qualified; presumably it
# is a column of `listings` — confirm.
# NOTE(review): ORDER BY has a single key, so the relative order of hosts
# with the same avg_rating is not specified by the query.
SQL_QUERY = """Select
    owner.host_id as owner_id,
    round(avg(review_scores_rating), 2) as avg_rating,
    count(*) as num_listings
From
    listings, owner, owner_listings
Where
    listings.id = owner_listings.id
    and owner.host_id = owner_listings.host_id
Group by
    owner.host_id
HAVING count(*) > 10
Order by
    avg_rating desc"""

In [49]:
# Execute the host-rating query and print its first 20 rows as a text table.
ranked_owners = spark.sql(SQL_QUERY)
ranked_owners.show(n=20)

+---------+----------+------------+
| owner_id|avg_rating|num_listings|
+---------+----------+------------+
|244141635|      4.82|          12|
|203731852|      4.82|          20|
|  5796250|      4.82|          14|
| 67005410|       4.8|          17|
|177701530|      4.78|          12|
|436284753|       4.7|          11|
|243878598|      4.67|          12|
|448782489|      4.64|          36|
|   300966|      4.59|          20|
|  3040748|      4.59|          13|
| 10239880|      4.59|          22|
|  9165668|      4.59|          23|
|  5285926|      4.54|          29|
|364305280|      4.51|          14|
|244520390|      4.49|          19|
|241644101|      4.46|          14|
| 14574533|      4.45|          21|
|115403367|      4.45|          14|
|245267147|       4.4|          12|
| 49695547|      4.39|          11|
+---------+----------+------------+
only showing top 20 rows

