You are planning to list a property on Airbnb. To maximize profits, you need to analyze the Airbnb data for the month of January 2023 to determine the best room type for each location. The best room type is based on the maximum average occupancy during the given month.

Write an SQL query to find the best room type for each location based on the average number of occupancy days per listing. Round the average occupancy days to 2 decimal places and order the results by that average in descending order.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Get the active Spark session, or start one named for this example
spark = (
    SparkSession.builder
    .appName("Create DataFrame Example")
    .getOrCreate()
)

# Listings table layout as (column name, Spark type); every column is nullable
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("listing_id", IntegerType()),
        ("host_id", IntegerType()),
        ("location", StringType()),
        ("room_type", StringType()),
        ("price", FloatType()),
        ("minimum_nights", IntegerType()),
    ]
])

# One tuple per listing:
# (listing_id, host_id, location, room_type, price, minimum_nights)
data = [
    (1, 101, "Downtown", "Entire home/apt", 150.00, 2),
    (2, 101, "Downtown", "Private room", 80.00, 1),
    (3, 101, "Downtown", "Entire home/apt", 200.00, 3),
    (4, 102, "Downtown", "Entire home/apt", 120.00, 2),
    (5, 102, "Downtown", "Private room", 100.00, 1),
    (6, 102, "Midtown", "Entire home/apt", 250.00, 2),
    (7, 103, "Midtown", "Entire home/apt", 70.00, 1),
    (8, 103, "Midtown", "Private room", 90.00, 1),
    (9, 104, "Midtown", "Private room", 170.00, 1),
]

# Build the listings DataFrame from the rows and the explicit schema
listings_df = spark.createDataFrame(data=data, schema=schema)

# Preview the listings
listings_df.show()




# Bookings table layout: integer ids, with both dates kept as strings
bookings_schema = (
    StructType()
    .add("booking_id", IntegerType(), True)
    .add("listing_id", IntegerType(), True)
    .add("checkin_date", StringType(), True)
    .add("checkout_date", StringType(), True)
)

# One tuple per booking: (booking_id, listing_id, checkin_date, checkout_date)
bookings_data = [
    (1, 1, "2023-01-05", "2023-01-10"),
    (2, 1, "2023-01-11", "2023-01-13"),
    (3, 2, "2023-01-15", "2023-01-25"),
    (4, 3, "2023-01-10", "2023-01-17"),
    (5, 3, "2023-01-19", "2023-01-21"),
    (6, 3, "2023-01-22", "2023-01-23"),
    (7, 4, "2023-01-03", "2023-01-05"),
    (8, 5, "2023-01-10", "2023-01-12"),
    (9, 6, "2023-01-15", "2023-01-19"),
    (10, 6, "2023-01-20", "2023-01-22"),
    (11, 7, "2023-01-25", "2023-01-29"),
    (12, 8, "2023-01-05", "2023-01-17"),
    (13, 9, "2023-01-10", "2023-01-12"),
]

# Build the bookings DataFrame from the rows and the explicit schema
bookings_df = spark.createDataFrame(data=bookings_data, schema=bookings_schema)

# Preview the bookings
bookings_df.show()



+----------+-------+--------+---------------+-----+--------------+
|listing_id|host_id|location|      room_type|price|minimum_nights|
+----------+-------+--------+---------------+-----+--------------+
|         1|    101|Downtown|Entire home/apt|150.0|             2|
|         2|    101|Downtown|   Private room| 80.0|             1|
|         3|    101|Downtown|Entire home/apt|200.0|             3|
|         4|    102|Downtown|Entire home/apt|120.0|             2|
|         5|    102|Downtown|   Private room|100.0|             1|
|         6|    102| Midtown|Entire home/apt|250.0|             2|
|         7|    103| Midtown|Entire home/apt| 70.0|             1|
|         8|    103| Midtown|   Private room| 90.0|             1|
|         9|    104| Midtown|   Private room|170.0|             1|
+----------+-------+--------+---------------+-----+--------------+

+----------+----------+------------+-------------+
|booking_id|listing_id|checkin_date|checkout_date|
+----------+----------+---

In [0]:
# Display the listings column names as a quick reference for the analysis
listings_df.columns

Out[62]: ['listing_id', 'host_id', 'location', 'room_type', 'price', 'minimum_nights']

In [0]:
# Best room type per location by average occupied nights per listing in
# January 2023.
#
# Reads:  listings_df (listing_id, location, room_type, ...)
#         bookings_df (listing_id, checkin_date, checkout_date, ...)
# Writes: ans_df with columns location, room_type and the average rounded
#         to 2 decimals, sorted by the (unrounded) average, highest first.
#
# NOTE: bookings_df is deliberately left untouched (intermediate results get
# their own names) so this cell can be re-run without first rebuilding the
# bookings table.

# Reporting window as a half-open interval [month_start, month_end): a stay
# that checks out on 1 Feb still contributes the night of 31 Jan.
month_start = to_date(lit("2023-01-01"))
month_end = to_date(lit("2023-02-01"))

checkin = to_date(col("checkin_date"))
checkout = to_date(col("checkout_date"))

# Keep only bookings that overlap the window (rows with a NULL date fail the
# comparison and are dropped), then count just the nights inside the window.
january_bookings_df = (
    bookings_df
    .filter((checkout > month_start) & (checkin < month_end))
    .withColumn(
        "book_days",
        datediff(least(checkout, month_end), greatest(checkin, month_start)),
    )
)

# Total occupied nights per listing.
booking_days_df = january_bookings_df.groupBy("listing_id").agg(
    sum("book_days").alias("booking_days")
)

# Joining on the column name keeps a single listing_id column. The inner join
# means listings without any qualifying booking do not enter the average.
average_days_df = (
    listings_df
    .join(booking_days_df, on="listing_id", how="inner")
    .groupBy("location", "room_type")
    .agg(avg("booking_days").alias("average_booking_days"))
)

# Rank room types inside each location; room_type is a secondary sort key so
# a tie on the average always resolves to the same winner.
window_spec = Window.partitionBy("location").orderBy(
    col("average_booking_days").desc(), col("room_type").asc()
)

# Keep the top-ranked room type per location, sort by the average (location
# breaks ties), and round only in the final projection.
ans_df = (
    average_days_df
    .withColumn("Rank_By_Location", row_number().over(window_spec))
    .filter(col("Rank_By_Location") == 1)
    .drop("Rank_By_Location")
    .orderBy(col("average_booking_days").desc(), col("location").asc())
    .select("location", "room_type", round("average_booking_days", 2))
)

In [0]:
# Print the result: the top-ranked room type for each location
ans_df.show()

+--------+---------------+------------------------------+
|location|      room_type|round(average_booking_days, 2)|
+--------+---------------+------------------------------+
| Midtown|   Private room|                           7.0|
|Downtown|Entire home/apt|                          6.33|
+--------+---------------+------------------------------+

