In [1]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# May take awhile locally
spark = SparkSession.builder.appName("aggregate").getOrCreate()

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


In [6]:
airbnb = spark.read.csv('Datasets/nyc_air_bnb.csv',inferSchema=True,header=True)

In [7]:
airbnb.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



In [9]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
df = airbnb.withColumn("price", airbnb["price"].cast(IntegerType()))         .withColumn("minimum_nights", airbnb["minimum_nights"].cast(IntegerType()))         .withColumn("number_of_reviews", airbnb["number_of_reviews"].cast(IntegerType()))         .withColumn("reviews_per_month", airbnb["reviews_per_month"].cast(IntegerType()))         .withColumn("calculated_host_listings_count", airbnb["calculated_host_listings_count"].cast(IntegerType()))

In [10]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: integer (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)



In [11]:
df.count()

49079

In [51]:
df.groupBy('host_id').agg(sum('number_of_reviews').alias('rev')).orderBy(sum('number_of_reviews').desc()).show(1)


+--------+----+
| host_id| rev|
+--------+----+
|37312959|2273|
+--------+----+
only showing top 1 row



In [54]:
df.agg(avg('minimum_nights').alias('avr_min')).show()


+------------------+
|           avr_min|
+------------------+
|7.1286126280910596|
+------------------+



In [62]:
result = df.groupBy("neighbourhood").agg(avg(df.price).alias('avg_price'))
result.orderBy(result.avg_price.desc()).show(1) 


+--------------+---------+
| neighbourhood|avg_price|
+--------------+---------+
|Fort Wadsworth|    800.0|
+--------------+---------+
only showing top 1 row

