In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from datetime import datetime
from pyspark.sql.functions import *
# Start (or reuse) the Spark session backing this notebook
spark = (
    SparkSession.builder
    .appName("AirbnbSearches")
    .getOrCreate()
)

# ----------------------------
# Create Airbnb Searches Table
# ----------------------------

# Rows are (user_id, date_searched, filter_room_types); the last field is a
# comma-separated list of the room-type filters used in that search.
airbnb_data = [
    (1, datetime(2022, 1, 1), 'entire home,private room'),
    (2, datetime(2022, 1, 2), 'entire home,shared room'),
    (3, datetime(2022, 1, 2), 'private room,shared room'),
    (4, datetime(2022, 1, 3), 'private room'),
]

# Explicit schema: every column is nullable
airbnb_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("date_searched", TimestampType(), True)
    .add("filter_room_types", StringType(), True)
)

# Build the DataFrame from the in-memory rows
airbnb_df = spark.createDataFrame(data=airbnb_data, schema=airbnb_schema)

# Expose the DataFrame to Spark SQL under the name `airbnb_searches`
airbnb_df.createOrReplaceTempView("airbnb_searches")

# ----------------------------
# Check your table!
# ----------------------------

# Print the full table, untruncated, in user_id order
airbnb_preview_df = spark.sql("SELECT * FROM airbnb_searches ORDER BY user_id")
airbnb_preview_df.show(truncate=False)


+-------+-------------------+------------------------+
|user_id|date_searched      |filter_room_types       |
+-------+-------------------+------------------------+
|1      |2022-01-01 00:00:00|entire home,private room|
|2      |2022-01-02 00:00:00|entire home,shared room |
|3      |2022-01-02 00:00:00|private room,shared room|
|4      |2022-01-03 00:00:00|private room            |
+-------+-------------------+------------------------+



In [None]:
### Count how many searches include each room type

In [5]:
# Count how many searches include each room type.
#
# The inner query splits the comma-separated `filter_room_types` string and
# explodes it into one row per room type; the outer query groups and counts.
#
# - Tokens are trimmed so values written as "a, b" do not create a separate
#   " b" group.
# - `type_of_room` is a secondary sort key: several room types can share the
#   same count, and without a tie-break their relative order is not
#   guaranteed to be stable between runs.
spark.sql(
    """
    SELECT
      trim(room_type) AS type_of_room,
      COUNT(*) AS count
    FROM (
      SELECT
        explode(split(filter_room_types, ',')) AS room_type
      FROM airbnb_searches
    )
    GROUP BY trim(room_type)
    ORDER BY count DESC, type_of_room
    """
).show()

+------------+-----+
|type_of_room|count|
+------------+-----+
|private room|    3|
| entire home|    2|
| shared room|    2|
+------------+-----+



In [7]:
# Turn each comma-separated filter string into one row per room type
room_type_col = explode(split(col("filter_room_types"), ","))
exploded_df = airbnb_df.select(room_type_col.alias("type_of_room"))

# Tally the number of rows for each distinct room type
rooms_grouped = exploded_df.groupBy("type_of_room")
result_df = rooms_grouped.count()