In [43]:
!pip install pyspark



In [44]:
!pip install google-cloud-storage



In [45]:
from google.cloud import storage
import os

In [46]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "sp24-i535-dhchhed-airbnb-590edf853c39.json"

In [47]:
project_id = 'sp24-i535-dhchhed-airbnb'
bucket_name = 'i535_dhchhed_airbnb'

In [48]:
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

In [49]:
def list_blobs(bucket_name):
    storage_client = storage.Client()
    blobs = storage_client.list_blobs(bucket_name)
    for blob in blobs:
        print(blob.name)

In [50]:
list_blobs(bucket)

airbnb_dataset.csv
airbnb_preprocessed_dataset.csv


In [51]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
from pyspark.sql import SparkSession

First, we will create a Spark session with a custom application name ('Airbnb_Analysis') and specify that it should run on a local Spark instance with all available CPU cores (local[*]). This session allows us to perform various data operations using Apache Spark, a framework designed for distributed data processing.

Next, we will read a CSV file (airbnb_preprocessed_dataset.csv) from the specified path into a Spark DataFrame named 'airbnb_bookings'.

In [53]:
spark = SparkSession.builder.appName('Airbnb_Analysis').master('local[*]').getOrCreate()

dataset_path = 'airbnb_preprocessed_dataset.csv'
airbnb_bookings = spark.read.csv(dataset_path, inferSchema=True, header=True)

In [54]:
airbnb_bookings.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- arrival_date: date (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: double (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- country: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- distribution_channel: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullable

In [55]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.window import Window

In [56]:
# We will create a temporary view from a Spark DataFrame and then run a SQL query on it to 
# retrieve data about customer types and their cancellation counts
airbnb_bookings.createOrReplaceTempView('airbnb_booking')

queries = """
    select
        customer_type,
        sum(is_canceled) as total_canceled
    from airbnb_booking
    group by customer_type
    order by total_canceled desc
"""

customerTypeSQL = spark.sql(queries)

print("Customer Type and Total Cancellation")
customerTypeSQL.show()

Customer Type and Total Cancellation
+---------------+--------------+
|  customer_type|total_canceled|
+---------------+--------------+
|      Transient|         21637|
|Transient-Party|          1781|
|       Contract|           512|
|          Group|            53|
+---------------+--------------+



We can infer that transient customers, typically individual travelers, have the highest number of cancellations, with 21,637, suggesting that their bookings tend to be more flexible and subject to change. Transient-Party customers, representing small groups or families, follow with 1,781 cancellations. Contract customers, who generally have longer-term agreements with hotels, show fewer cancellations at 512, indicating more booking stability. Group bookings, often used for larger gatherings or events, have the lowest number of cancellations at 53, suggesting stricter cancellation policies or greater commitment to the booking. This distribution indicates that transient customers may require more flexible rebooking policies or targeted marketing to reduce cancellations, while the stability seen with contract and group bookings might offer insights for reducing cancellations in other segments.

#### Reservations by Year and Month

In [57]:
# Let's analyze the reservation trends over time, group by the year and month of arrival 
# to calculate the total number of reservations and then order the results chronologically
reservation_trends = airbnb_bookings \
    .groupBy("arrival_date_year", "arrival_date_month") \
    .agg(count("*").alias("total_reservations")) \
    .orderBy("arrival_date_year", "arrival_date_month")

reservation_trends.show()

+-----------------+------------------+------------------+
|arrival_date_year|arrival_date_month|total_reservations|
+-----------------+------------------+------------------+
|             2015|            August|              2435|
|             2015|          December|              1961|
|             2015|              July|              1661|
|             2015|          November|              1642|
|             2015|           October|              2676|
|             2015|         September|              2819|
|             2016|             April|              3745|
|             2016|            August|              4414|
|             2016|          December|              3119|
|             2016|          February|              2772|
|             2016|           January|              1815|
|             2016|              July|              3829|
|             2016|              June|              3504|
|             2016|             March|              3796|
|             

We can infer that the total reservations by year and month indicates a consistent trend of higher reservations during the summer and early fall. August stands out as a peak month for all three years, with reservations reaching 2,435 in 2015, 4,414 in 2016, and 4,380 in 2017. Other high-reservation months include July, September, and October, suggesting that these periods might align with popular travel seasons or holidays. The trends in these months can guide hotel staffing, promotions, and other operational strategies to accommodate increased bookings, while lower-reservation months like February and January might benefit from targeted marketing efforts to boost bookings.

#### Cancellation Rate by Hotel

In [58]:
# Let's analyze the cancellation rate for each hotel, along with the total number of reservations and the total cancellations
cancellation_rate = airbnb_bookings \
    .groupBy("hotel") \
    .agg(
        count("*").alias("total_reservations"),
        sum("is_canceled").alias("total_cancellations"),
        round((sum("is_canceled") / count("*") * 100), 2).alias("cancellation_rate")
    )

cancellation_rate.show()

+------------+------------------+-------------------+-----------------+
|       hotel|total_reservations|total_cancellations|cancellation_rate|
+------------+------------------+-------------------+-----------------+
|Resort Hotel|             33510|               7944|            23.71|
|  City Hotel|             53404|              16039|            30.03|
+------------+------------------+-------------------+-----------------+



We can infer that City Hotel has a significantly higher cancellation rate (30.03%) compared to Resort Hotel (23.71%), despite having more total reservations (53,404 versus 33,510). This indicates that guests at City Hotels are more likely to cancel their bookings. The higher cancellation rate could be due to the nature of city-based travel, which often involves business or shorter stays, providing more flexibility to change plans. In contrast, Resort Hotels, often used for leisure and vacation, have a lower cancellation rate, suggesting greater commitment from guests. These insights can guide hotel management in addressing higher cancellation rates, perhaps through stricter cancellation policies, improved customer engagement, or targeted promotions to reduce cancellations in City Hotels, while Resort Hotels can leverage their stability to attract more bookings.

#### Reservations by Market Segment

In [59]:
# Let's analyze the reservations by market segment, providing insights into which segments have the most bookings
reservations_by_market_segment = airbnb_bookings \
    .groupBy("market_segment") \
    .agg(count("*").alias("total_reservations")) \
    .orderBy("total_reservations", ascending=False)

reservations_by_market_segment.show()

+--------------+------------------+
|market_segment|total_reservations|
+--------------+------------------+
|     Online TA|             51534|
| Offline TA/TO|             13849|
|        Direct|             11645|
|        Groups|              4936|
|     Corporate|              4025|
| Complementary|               698|
|      Aviation|               227|
+--------------+------------------+



We can infer that "Online TA" (Online Travel Agency) is the largest market segment, with a total of 51,534 reservations, indicating the significant role online platforms play in hotel bookings. The "Offline TA/TO" (Offline Travel Agency/Tour Operator) segment ranks second with 13,849 reservations, suggesting that traditional booking channels remain relevant. The "Direct" segment, encompassing direct bookings with the hotel, has 11,645 reservations, showing a healthy demand for direct customer relationships. Other smaller segments include "Groups" with 4,936 reservations, "Corporate" with 4,025 reservations, "Complementary" with 698 reservations, and "Aviation" with 227 reservations. This data suggests that hotels should focus on maintaining strong relationships with online travel agencies while also exploring opportunities to increase direct bookings and other channels, such as corporate and group bookings, which can provide a diversified revenue stream.

#### Analyzing Customer Loyalty and Cancellation Patterns

In [60]:
# Let's examine customer loyalty by analyzing cancellation rates among different customer types and countries
customer_window = Window.partitionBy("customer_type", "country")

loyalty_analysis = airbnb_bookings \
    .withColumn("total_bookings", count("hotel").over(customer_window)) \
    .withColumn("total_cancellations", sum("is_canceled").over(customer_window)) \
    .withColumn("cancellation_rate", (col("total_cancellations") / col("total_bookings")) * 100) \
    .groupBy("customer_type", "country") \
    .agg(
        round(avg("cancellation_rate"), 2).alias("avg_cancellation_rate"),
        count("total_bookings").alias("total_customers")
    ) \
    .orderBy(col("avg_cancellation_rate").desc())

loyalty_analysis.show()

+---------------+-------+---------------------+---------------+
|  customer_type|country|avg_cancellation_rate|total_customers|
+---------------+-------+---------------------+---------------+
|      Transient|    FJI|                100.0|              1|
|      Transient|    BEN|                100.0|              3|
|      Transient|    KHM|                100.0|              1|
|      Transient|    GGY|                100.0|              1|
|      Transient|    GLP|                100.0|              1|
|      Transient|    HND|                100.0|              1|
|      Transient|    IMN|                100.0|              2|
|      Transient|    JEY|                100.0|              8|
|      Transient|    MAC|                100.0|              9|
|      Transient|    MNE|                100.0|              2|
|      Transient|    MYT|                100.0|              2|
|      Transient|    NIC|                100.0|              1|
|      Transient|    UMI|               

We can infer that a subset of transient customers from multiple countries exhibit a 100% cancellation rate. These countries include FJI, BEN, KHM, GGY, GLP, HND, IMN, JEY, MAC, MNE, MYT, NIC, UMI, and VGB, with most having a very low total customer count (1-3), indicating these are infrequent or one-time bookings. Other countries like GIB and HKG, despite having larger customer counts (12 and 23, respectively), also show high cancellation rates (91.67% and 91.3%). This pattern suggests that bookings from these countries or customer types might be unreliable, leading to significant cancellations. Additionally, the "Transient-Party" type in some countries, like GGY and MKD, also has a 100% cancellation rate. The high cancellation rates in these countries could stem from various factors, including travel restrictions, booking behavior, or specific cultural or regional trends. Understanding these patterns could guide hotels in developing targeted marketing strategies to mitigate cancellations or reconsidering how they engage with customers from high-cancellation regions. It also points to the need for flexibility and risk management when dealing with bookings from these segments.

#### Examining Room Utilization and Changes

In [61]:
# Let's analyze room assignment changes to understand how often guests receive a different room type from the one they reserved
room_change_analysis = airbnb_bookings \
    .groupBy("reserved_room_type", "assigned_room_type") \
    .agg(
        count("*").alias("total_assignments"),
        sum(expr("CAST(reserved_room_type != assigned_room_type AS INTEGER)")).alias("total_changes")
    ) \
    .withColumn("change_rate", (col("total_changes") / col("total_assignments")) * 100) \
    .orderBy(col("change_rate").desc())

room_change_analysis.show()

+------------------+------------------+-----------------+-------------+-----------+
|reserved_room_type|assigned_room_type|total_assignments|total_changes|change_rate|
+------------------+------------------+-----------------+-------------+-----------+
|                 D|                 H|                9|            9|      100.0|
|                 E|                 A|               15|           15|      100.0|
|                 H|                 D|                1|            1|      100.0|
|                 C|                 B|                2|            2|      100.0|
|                 F|                 E|               31|           31|      100.0|
|                 C|                 I|                9|            9|      100.0|
|                 A|                 G|              174|          174|      100.0|
|                 E|                 H|                4|            4|      100.0|
|                 H|                 I|                6|            6|     

We can infer that all reserved room types have experienced a 100% change rate when assigned different room types, indicating that every reservation resulted in a different room type assignment. This trend is consistent across various reserved-to-assigned room type pairs, with high numbers of total assignments in certain combinations. For example, the transition from reserved room type A to assigned room type E has a total of 1,003 changes, while reserved room type D to assigned room type E has 654 changes. The prevalence of changes across a wide range of room type combinations suggests potential challenges in meeting customer expectations, which can impact customer satisfaction and increase the risk of cancellations. This high change rate could be due to operational issues, overbooking, or mismanagement of room assignments. Addressing these issues may require a more efficient reservation system, better communication between departments, or improved forecasting to reduce the mismatch between reserved and assigned room types. Understanding these trends can guide hotels in implementing strategies to improve room assignment accuracy and customer satisfaction.

#### Exploring Seasonal Trends and Their Impact on Revenue

In [62]:
# Let's analyze revenue trends over time, focusing on seasonal variations
seasonal_revenue_analysis = airbnb_bookings \
    .groupBy(year("arrival_date").alias("arrival_year"), month("arrival_date").alias("arrival_month")) \
    .agg(
        round(sum("adr"), 2).alias("total_revenue"),
        round(avg("adr"), 2).alias("average_adr"),
        sum("is_canceled").alias("total_cancellations")
    ) \
    .orderBy("arrival_year", "arrival_month")

seasonal_revenue_analysis.show()

+------------+-------------+-------------+-----------+-------------------+
|arrival_year|arrival_month|total_revenue|average_adr|total_cancellations|
+------------+-------------+-------------+-----------+-------------------+
|        2015|            7|    186929.44|     112.54|                512|
|        2015|            8|    299487.73|     122.99|                566|
|        2015|            9|    281647.61|      99.91|                560|
|        2015|           10|    211872.95|      79.18|                437|
|        2015|           11|      97727.0|      59.52|                242|
|        2015|           12|    140899.17|      71.85|                372|
|        2016|            1|    114626.95|      63.16|                301|
|        2016|            2|    194989.86|      70.34|                527|
|        2016|            3|    303078.37|      79.84|                885|
|        2016|            4|     341831.6|      91.28|               1012|
|        2016|           

We can infer that seasonal fluctuations in total revenue and cancellations, with higher revenue generally observed during summer months. The most substantial total revenue is seen in August 2016, with 656,804.85, accompanied by a high average ADR of 148.80 and a high number of cancellations at 1,441. July 2016 also shows significant revenue (503,183.80) with a higher ADR (131.41) and over 1,000 cancellations (1,084). These peaks in summer months suggest increased travel during this season, possibly related to holidays or vacations. The correlation between higher revenue and increased cancellations during these months could indicate overbooking or customer uncertainty. Conversely, lower total revenue is observed in months like November 2015 (97,727.0) and January 2016 (114,626.95), with corresponding lower ADR and cancellations, suggesting a potential off-season with reduced travel activity. This pattern can guide operational strategies, emphasizing revenue generation during peak months while mitigating cancellations through targeted customer engagement, flexible rebooking policies, or improved customer service to reduce booking uncertainty and maintain high occupancy rates throughout the year.

#### Identifying the Impact of Special Requests on Cancellations

In [63]:
# Let's analyze the impact of special requests on reservation cancellations
special_requests_analysis = airbnb_bookings \
    .groupBy("total_of_special_requests") \
    .agg(
        sum("is_canceled").alias("total_cancellations"),
        count("*").alias("total_reservations"),
        round((sum("is_canceled") / count("*") * 100), 2).alias("cancellation_rate")
    ) \
    .orderBy("total_of_special_requests")

special_requests_analysis.show()

+-------------------------+-------------------+------------------+-----------------+
|total_of_special_requests|total_cancellations|total_reservations|cancellation_rate|
+-------------------------+-------------------+------------------+-----------------+
|                        0|              14548|             43585|            33.38|
|                        1|               6496|             28905|            22.47|
|                        2|               2509|             11764|            21.33|
|                        3|                395|              2307|            17.12|
|                        4|                 34|               319|            10.66|
|                        5|                  1|                34|             2.94|
+-------------------------+-------------------+------------------+-----------------+



We can infer that bookings with more special requests generally exhibit lower cancellation rates. Reservations with no special requests have the highest total cancellations (14,548) and the highest cancellation rate (33.38%), suggesting these bookings are more likely to be canceled due to lower customer commitment or flexibility. As the number of special requests increases, the total cancellations and cancellation rates tend to decrease, indicating a stronger commitment to the booking when special requests are involved. For instance, bookings with one special request have a lower cancellation rate (22.47%), with the rate progressively decreasing to 21.33% for two requests, 17.12% for three, 10.66% for four, and the lowest at 2.94% for five special requests. This trend suggests that customers who make more special requests might be more certain about their bookings, potentially due to personalized preferences or detailed planning. Hotels can use this insight to enhance customer satisfaction by promoting and accommodating special requests, which could lead to a decrease in cancellations and an increase in booking stability. Additionally, examining the reasons for higher cancellations in bookings with no special requests could guide hotels in implementing strategies to improve customer retention and reduce cancellations in this segment.

#### Percentage of guest that get the room they requested

In [64]:
# Let's analyze the percentage of reservations where the reserved room type matches the assigned room type, grouped by hotel
room_features = ["hotel","is_canceled","reserved_room_type","assigned_room_type","isRequestedRoom"]

reservedRoom = (
    airbnb_bookings
    .withColumn("isRequestedRoom", when(col("reserved_room_type") == col("assigned_room_type"), 1)
                                  .otherwise(0))
).select(*room_features)

hotelReservedRoom = (
    reservedRoom
    .groupBy("hotel")
    .agg(
        round((sum(col("isRequestedRoom")) * 100 / count(col("hotel"))),2).alias("getting_requested_room_per")
    )
)
hotelReservedRoom.show()

+------------+--------------------------+
|       hotel|getting_requested_room_per|
+------------+--------------------------+
|Resort Hotel|                     79.72|
|  City Hotel|                     88.59|
+------------+--------------------------+



We can infer that City Hotels have a higher rate of fulfilling room type requests compared to Resort Hotels. City Hotels have a "getting_requested_room_per" value of 88.59%, suggesting that a significant majority of bookings receive the room type they originally reserved. In contrast, Resort Hotels have a lower rate of 79.72%, indicating that about one-fifth of bookings are assigned a different room type from what was reserved. This difference might reflect variations in hotel operations, customer expectations, or booking patterns between city-based and resort-based hotels. City Hotels, typically catering to business travelers or urban tourists, might prioritize consistent room assignments, whereas Resort Hotels, serving vacationers or leisure travelers, might have more flexibility in room assignments due to higher demand during peak seasons. To improve customer satisfaction, Resort Hotels could focus on reducing room assignment discrepancies, which might involve optimizing booking processes or managing room inventory more effectively. Conversely, City Hotels can leverage their higher consistency to attract customers seeking reliability in their room reservations.

#### Relationship between Booking Cancellation and Not getting assigned room

In [65]:
# Let's analyze cancellations where the assigned room type differed from the reserved room type
canceledReservationRoomDiff = (
    reservedRoom
    .filter(col("is_canceled") == 1)
    .groupBy("hotel")
    .agg(
        (count("hotel").alias("total_cancellation") - sum(col("isRequestedRoom"))).alias("total_not_assinged_room"),
        count("hotel").alias("total_cancellation"),
        round(((count("hotel").alias("total_cancellation") - sum(col("isRequestedRoom"))) * 100 / count("hotel")),2).alias("canceled_not_requested_room_per")
    )
)

canceledReservationRoomDiff.show()

+------------+-----------------------+------------------+-------------------------------+
|       hotel|total_not_assinged_room|total_cancellation|canceled_not_requested_room_per|
+------------+-----------------------+------------------+-------------------------------+
|Resort Hotel|                    294|              7944|                            3.7|
|  City Hotel|                    315|             16039|                           1.96|
+------------+-----------------------+------------------+-------------------------------+



We can infer that Resort Hotels have a higher rate of room mismatches among canceled reservations compared to City Hotels. Resort Hotels have 294 room mismatches among 7,944 total cancellations, resulting in a "canceled_not_requested_room_per" value of 3.7%. This suggests that a small fraction of canceled reservations in Resort Hotels involves room assignment discrepancies. In contrast, City Hotels, despite having a higher total number of cancellations (16,039), have a lower rate of room mismatches (1.96%), with 315 total room mismatches. This difference could indicate operational challenges in Resort Hotels leading to room assignment errors, or it could suggest that Resort Hotels have a more flexible booking process, resulting in more cancellations where room types don't match. The relatively lower rate of mismatches in City Hotels might reflect a more rigid operational structure or better control over room assignments. These insights can help hotel management focus on reducing room assignment errors, particularly in Resort Hotels, which could further reduce cancellation rates and improve customer satisfaction by ensuring guests receive the room type they reserved.

In [66]:
# Let's analyze revenue metrics for each hotel, considering expected revenue, revenue lost due to cancellations, and the 
# impact of cancellations on overall revenue
hotelRevenue = (
    airbnb_bookings
    .groupBy('hotel')
    .agg(
        sum(col('stays_in_weekend_nights') + col('stays_in_week_nights')).alias('expected_revenue'),
        sum(expr(f'CASE WHEN is_canceled = 1 THEN stays_in_weekend_nights ELSE 0 END')).alias('revenue_lost'),
        (col('expected_revenue') - col('revenue_lost')).alias('revenue'),
        (col('revenue_lost') * 100 / col('expected_revenue')).alias('revenue_lost_percentage')
    )
    .select(
        col('hotel'),
        format_number(col('expected_revenue'), 2).alias('expected_revenue'),
        format_number(col('revenue'), 2).alias('revenue'),
        format_number(col('revenue_lost'), 2).alias('revenue_lost'),
        round(col('revenue_lost_percentage'), 2).alias('revenue_lost_percentage')
    )
)

print('Hotel Revenue Summary:')
hotelRevenue.show()

Hotel Revenue Summary:
+------------+----------------+----------+------------+-----------------------+
|       hotel|expected_revenue|   revenue|revenue_lost|revenue_lost_percentage|
+------------+----------------+----------+------------+-----------------------+
|Resort Hotel|      148,024.00|136,872.00|   11,152.00|                   7.53|
|  City Hotel|      168,039.00|152,645.00|   15,394.00|                   9.16|
+------------+----------------+----------+------------+-----------------------+



We can infer that both Resort Hotel and City Hotel experience a loss in revenue due to cancellations, with City Hotel exhibiting a higher revenue lost percentage. Resort Hotel has an expected revenue of 148,024.00 and an actual revenue of 136,872.00, resulting in a revenue loss of 11,152.00, which translates to a revenue lost percentage of 7.53%. In comparison, City Hotel's expected revenue is 168,039.00, with an actual revenue of 152,645.00, indicating a revenue loss of 15,394.00 and a revenue lost percentage of 9.16%. These figures suggest that City Hotel experiences a greater impact from revenue loss due to cancellations, with a higher proportion of its expected revenue being lost. This could be attributed to higher cancellation rates or larger booking volumes. Resort Hotel's lower revenue lost percentage may reflect more stable bookings or better management of cancellations. These insights can guide hotel management in developing strategies to minimize revenue loss, such as implementing stricter cancellation policies, offering incentives for non-cancellable bookings, or improving customer engagement to reduce the likelihood of cancellations. Additionally, understanding the factors contributing to revenue loss can help both types of hotels improve revenue recovery and reduce the financial impact of cancellations.

## Predict cancellations of an Airbnb using Logistic Regression model

In [67]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Select relevant features for prediction
features = ["lead_time", "stays_in_weekend_nights", "stays_in_week_nights", 
            "adults", "children", "babies", "total_of_special_requests", "hotel", "market_segment", "deposit_type"]

# Convert categorical features into numerical (indexing)
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_indexed") for col in ["hotel", "market_segment", "deposit_type"]]

# Combine features into a single vector for training
assembler = VectorAssembler(inputCols=[f"{col}_indexed" if col in ["hotel", "market_segment", "deposit_type"] else col for col in features], outputCol="features")

# Create a logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="is_canceled")

# Create a pipeline with all these stages
pipeline = Pipeline(stages=indexers + [assembler, lr])

# Split the data into training and test sets (80% training, 20% test)
train_data, test_data = airbnb_bookings.randomSplit([0.8, 0.2], seed=12345)

# Fit the pipeline to the training data
model = pipeline.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="is_canceled", rawPredictionCol="prediction", metricName="areaUnderROC")

# Calculate the AUC (Area Under the ROC Curve) as a measure of accuracy
auc = evaluator.evaluate(predictions)

print("Area Under ROC:", auc)

Area Under ROC: 0.5807432429176912


An AUC-ROC of 0.5807432429176912 suggests that the model's ability to distinguish between two classes is slightly better than random guessing.

In [68]:
sample_record = test_data.limit(1)
sample_record_without_label = sample_record.drop("is_canceled")

# Make predictions
prediction_result = model.transform(sample_record_without_label)

# Display the prediction result
prediction_result.select("prediction", "probability").show(truncate=False)

+----------+---------------------------------------+
|prediction|probability                            |
+----------+---------------------------------------+
|0.0       |[0.7075851307460941,0.2924148692539059]|
+----------+---------------------------------------+



This indicates that the model is predicting the outcome to be class 0, with a probability of approximately 70.76%.