In [9]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration for this lab: cluster master, application name and the
# driver / executor resource settings. SparkConf setters return the conf itself,
# so they can be chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab6_DF_EX1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the file system implementations Hadoop should use for gs:// paths.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Cloud Storage path of the input file. Use your own GCP bucket name and upload
# restaurants_geo.csv to it first.
gsc_file_path = 'gs://dejads_input/restaurants_geo.csv'

# Restaurant master data: a semicolon-separated CSV with a header row.
restaurants_geo_df = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=";")
    .load(gsc_file_path)
)
restaurants_geo_df.printSchema()
restaurants_geo_df.show(5)

# Second data set, read from BigQuery. The table is addressed as
# project_id.dataset.table_name; use your own project id.
restaurants_df = (
    spark.read
    .format("bigquery")
    .load("de2021-324520.labdataset.restaurants")
)
restaurants_df.printSchema()
restaurants_df.show(4)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- type: string (nullable = true)

+---+--------------------+--------------------+-----------+------------+-----------+
| id|                name|             address|       city|       phone|       type|
+---+--------------------+--------------------+-----------+------------+-----------+
|  1|arnie morton's of...|435 s. la cienega...|los angeles|310/246-1501|   american|
|  2|arnie morton's of...|435 s. la cienega...|los angeles|310-246-1501|steakhouses|
|  3|  art's delicatessen| 12224 ventura blvd.|studio city|818/762-1221|   american|
|  4|          art's deli| 12224 ventura blvd.|studio city|818-762-1221|      delis|
|  5|       hotel bel-air|701 stone canyon rd.|    bel air|310/472-1211|californian|
+---+--------------------+--------------------+-----------+------------+-----------+
only showin

**Give me the contact details of the cheapest and the most expensive restaurant in each city of the USA**

In [17]:
from pyspark.sql.functions import *
from pyspark.sql import Row, Window

# Attach the BigQuery columns (avg_price, rating, contact) to every restaurant
# from the CSV. The join type is passed explicitly as "left" (Spark's default is
# "inner"), so restaurants without a matching `rid` are kept with null values.
joinExpression = restaurants_geo_df["id"] == restaurants_df["rid"]
restaurants_merged = restaurants_geo_df.join(restaurants_df, joinExpression, "left").drop("rid")
restaurants_merged.show(10)

# Only restaurants with a known price can be ranked. Without this filter the
# null prices produced by the left join sort first in ascending order and would
# be reported as the "Cheapest" restaurant of their city.
restaurants_priced = restaurants_merged.where(col("avg_price").isNotNull())

# Rank the restaurants by average price within each city, once from the most
# expensive down and once from the cheapest up. dense_rank keeps ties together,
# so several restaurants can share rank 1.
windowdesc = Window.partitionBy(col("city")).orderBy(col("avg_price").desc())
windowasc = Window.partitionBy(col("city")).orderBy(col("avg_price").asc())

restaurants_merged_windowed = (
    restaurants_priced
    .withColumn("rank_desc", dense_rank().over(windowdesc))
    .withColumn("rank_asc", dense_rank().over(windowasc))
)

# Keep only the cheapest and the most expensive restaurant(s) of each city.
cheap_expensive_city_table = restaurants_merged_windowed.where(
    (col("rank_desc") == 1) | (col("rank_asc") == 1)
)
cheap_expensive_city_table.show(10)

# Label the rows. The two categories are selected separately and unioned rather
# than derived with when/otherwise: a restaurant that is both extremes (the only
# priced restaurant in its city, or a tie across the whole city) must appear
# under both labels, otherwise that city would have no "Cheapest" row.
most_expensive = (
    cheap_expensive_city_table
    .where(col("rank_desc") == 1)
    .withColumn("price_category", lit("Most Expensive"))
)
cheapest = (
    cheap_expensive_city_table
    .where(col("rank_asc") == 1)
    .withColumn("price_category", lit("Cheapest"))
)
cheap_expensive_city_table2 = (
    most_expensive
    .unionByName(cheapest)
    .select("city", "price_category", "avg_price", "name", "contact", "phone")
)
cheap_expensive_city_table2.show(10)

+---+--------------------+--------------------+------------+------------+-------------+---------+------+--------------------+
| id|                name|             address|        city|       phone|         type|avg_price|rating|             contact|
+---+--------------------+--------------------+------------+------------+-------------+---------+------+--------------------+
|  1|arnie morton's of...|435 s. la cienega...| los angeles|310/246-1501|     american|    58.98|  2.49|lbuckeridge0@indi...|
|  2|arnie morton's of...|435 s. la cienega...| los angeles|310-246-1501|  steakhouses|    60.32|  3.69|wstrickland1@goog...|
|  3|  art's delicatessen| 12224 ventura blvd.| studio city|818/762-1221|     american|    61.94|  2.57|smiroy2@delicious...|
|  4|          art's deli| 12224 ventura blvd.| studio city|818-762-1221|        delis|    77.15|  4.34|radamoli3@google....|
|  5|       hotel bel-air|701 stone canyon rd.|     bel air|310/472-1211|  californian|    36.39|  3.78|  cwiersma4@et

A useful reference for a common need: keeping the other columns of a row when aggregating a Spark DataFrame with `groupBy`.
https://stackoverflow.com/questions/34409875/how-to-get-other-columns-when-using-spark-dataframe-groupby

In [19]:
# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "dejads_temp"
spark.conf.set('temporaryGcsBucket', bucket)

# Save the result to BigQuery. The writer chain is wrapped in parentheses instead
# of using backslash continuations: a comment line in the middle of a backslash
# chain ends the statement, which left `.save()` as an unexpectedly indented
# line and made the whole cell fail to parse.
(
    cheap_expensive_city_table2.write.format('bigquery')
    .option('table', 'de2021-324520.labdataset.resturants_selection')
    # .mode("append")  # uncomment to append to an existing table
    .save()
)

In [20]:
# Stop the Spark session (and with it the underlying Spark context).
spark.stop()