In [24]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window
from datetime import datetime
print("starting feature engineering")

starting feature engineering


In [25]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("AirbnbPricePredictor") \
    .master("local[*]") \
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow") \
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow") \
    .getOrCreate()
print(f"spark session connected")


spark session connected


In [26]:
df = spark.read.parquet("../data/processed/listings_clean.parquet")
print(f"loaded: {df.count():,} rows")

loaded: 118,240 rows


## 1. Host Tenure Features

In [27]:
df = df.withColumn(
    "host_tenure_days",
    datediff(current_date(), to_date(col("host_since")))
)
df = df.withColumn(
    "host_tenure_years",
    (col("host_tenure_days") / 365.25).cast("float")
)
df.select("host_since", "host_tenure_days", "host_tenure_years").limit(10).show(truncate=False)

+----------+----------------+-----------------+
|host_since|host_tenure_days|host_tenure_years|
+----------+----------------+-----------------+
|2008-10-14|6238            |17.078712        |
|2009-02-14|6115            |16.741957        |
|2008-08-30|6283            |17.201916        |
|2009-07-29|5950            |16.290213        |
|2009-08-24|5924            |16.219028        |
|2009-10-01|5886            |16.11499         |
|2009-10-08|5879            |16.095825        |
|2009-10-28|5859            |16.041067        |
|2009-10-30|5857            |16.035591        |
|2009-11-30|5826            |15.950719        |
+----------+----------------+-----------------+



## 2. Review Density Features

In [28]:
df = df.withColumn(
    "review_density",
    when(col("availability_365") > 0,
         col("reviews_per_month") / (col("availability_365") / 30)
    ).otherwise(0)
)
df = df.withColumn(
    "reviews_per_year_hosting",
    when(col("host_tenure_years") > 0,
         col("number_of_reviews") / col("host_tenure_years")
    ).otherwise(0)
)
df.select("id", "reviews_per_month", "availability_365", "review_density").limit(10).show(truncate=False)
df.filter(col("id") == 9359).show(truncate=False)

+-----+-----------------+----------------+--------------------+
|id   |reviews_per_month|availability_365|review_density      |
+-----+-----------------+----------------+--------------------+
|3109 |0.05             |350             |0.004285714285714286|
|5396 |2.26             |77              |0.8805194805194804  |
|7397 |2.2              |199             |0.3316582914572864  |
|9359 |0.0              |358             |0.0                 |
|9952 |0.36             |250             |0.043199999999999995|
|11487|0.09             |246             |0.010975609756097562|
|11798|0.81             |344             |0.07063953488372093 |
|12452|0.77             |34              |0.6794117647058824  |
|12887|0.71             |68              |0.31323529411764706 |
|14903|0.05             |173             |0.008670520231213874|
+-----+-----------------+----------------+--------------------+

+----+-------+-----+--------+---------+----------------------+------------------+---------------+------

## 3. Capacity Features

In [29]:
df = df.withColumn(
    "occupancy_rate",
    1 - (col("availability_365") / 365)
)
df = df.withColumn(
    "people_per_bedroom",
    when(col("bedrooms") > 0, col("accommodates") / col("bedrooms"))
    .otherwise(col("accommodates"))
)
df = df.withColumn(
    "people_per_bed",
    when(col("beds") > 0, col("accommodates") / col("beds"))
    .otherwise(col("accommodates"))
)
df.select("id", "accommodates", "bedrooms", "beds", "people_per_bedroom", "people_per_bed").limit(10).show(truncate=False)

+-----+------------+--------+----+------------------+--------------+
|id   |accommodates|bedrooms|beds|people_per_bedroom|people_per_bed|
+-----+------------+--------+----+------------------+--------------+
|3109 |2           |1.0     |1   |2.0               |2.0           |
|5396 |2           |0.0     |1   |2.0               |2.0           |
|7397 |4           |2.0     |1   |2.0               |4.0           |
|9359 |1           |1.0     |1   |1.0               |1.0           |
|9952 |2           |1.0     |1   |2.0               |2.0           |
|11487|1           |0.0     |1   |1.0               |1.0           |
|11798|2           |1.0     |1   |2.0               |2.0           |
|12452|4           |2.0     |2   |2.0               |2.0           |
|12887|2           |1.0     |1   |2.0               |2.0           |
|14903|3           |1.0     |1   |3.0               |3.0           |
+-----+------------+--------+----+------------------+--------------+



## 4. Location-Based Features

In [None]:
city_centers = {
    "NYC": (40.7580, -73.9855), # times square
    "LA": (34.0928, -118.3287), # hollywood
    "Paris": (48.8584, 2.2945) # eiffel tower
}
from pyspark.sql.functions import udf
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    if lat1 is None or lon1 is None:
        return None
    R = 6371 # km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

    
distance_udf = udf(haversine_distance, FloatType())
for city, (center_lat, center_lon) in city_centers.items():
    df = df.withColumn(
        f"distance_to_center_{city.lower()}",
        when(col("city") == city,
             distance_udf(col("latitude"), col("longitude"),
                         lit(center_lat), lit(center_lon))
        )
    )
df = df.withColumn(
    "distance_to_center",
    coalesce(
        col("distance_to_center_nyc"),
        col("distance_to_center_la"),
        col("distance_to_center_paris")
    )
).drop("distance_to_center_nyc", "distance_to_center_la", "distance_to_center_paris")
df.select("city", "latitude", "longitude", "distance_to_center").limit(10).show(truncate=False)

+-----+-----------------+-----------------+------------------+
|city |latitude         |longitude        |distance_to_center|
+-----+-----------------+-----------------+------------------+
|Paris|48.83191         |2.3187           |3.4369042         |
|Paris|48.85247         |2.35835          |4.717698          |
|Paris|48.85909         |2.35315          |4.2913504         |
|Paris|48.85898         |2.34701          |3.842025          |
|Paris|48.86238         |2.36957          |5.5095263         |
|Paris|48.86351         |2.3711           |5.6323214         |
|Paris|48.82524108886719|2.367469072341919|6.4892554         |
|Paris|48.85974         |2.37932          |6.206934          |
|Paris|48.86196         |2.38216          |6.424974          |
|Paris|48.88277         |2.38478          |7.137474          |
+-----+-----------------+-----------------+------------------+



## 5. Price Per Person

In [31]:
df = df.withColumn(
    "price_per_person",
    col("price") / col("accommodates")
)
df = df.withColumn(
    "price_per_bedroom",
    when(col("bedrooms") > 0, col("price") / col("bedrooms"))
    .otherwise(col("price"))
)
df.select("id", "price_per_person", "price_per_bedroom", "price").limit(10).show(truncate=False)

+-----+-----------------+-----------------+-----+
|id   |price_per_person |price_per_bedroom|price|
+-----+-----------------+-----------------+-----+
|3109 |50.0             |100.0            |100.0|
|5396 |44.0             |88.0             |88.0 |
|7397 |34.0             |68.0             |136.0|
|9359 |75.0             |75.0             |75.0 |
|9952 |75.0             |150.0            |150.0|
|11487|80.0             |80.0             |80.0 |
|11798|60.0             |120.0            |120.0|
|12452|50.25            |100.5            |201.0|
|12887|44.0             |88.0             |88.0 |
|14903|66.66666666666667|200.0            |200.0|
+-----+-----------------+-----------------+-----+



## 6. Neighborhood Price Index

In [32]:
cols = df.columns
if ("neighborhood_avg_price" not in cols) or ("neighborhood_listing_count" not in cols):
    neighborhood_prices = df.groupBy("city", "neighbourhood_cleansed").agg(
        avg("price").alias("neighborhood_avg_price"),
        count("*").alias("neighborhood_listing_count")
    )
    df = df.drop("neighborhood_avg_price", "neighborhood_listing_count")
    df = df.join(neighborhood_prices, ["city", "neighbourhood_cleansed"], "left")
if "price_index" not in df.columns:
    df = df.withColumn(
        "price_index",
        col("price") / col("neighborhood_avg_price")
    )
df.select("id", "price_index", "neighborhood_avg_price", "neighborhood_listing_count").limit(10).show(truncate=False)


+-----+-------------------+----------------------+--------------------------+
|id   |price_index        |neighborhood_avg_price|neighborhood_listing_count|
+-----+-------------------+----------------------+--------------------------+
|3109 |0.4884951679705476 |204.71031559114462    |2123                      |
|5396 |0.2796446891029651 |314.68503937007875    |2032                      |
|7397 |0.4321781558864006 |314.68503937007875    |2032                      |
|9359 |0.23209406293306611|323.1448450347881     |1581                      |
|9952 |0.7235777326123513 |207.3032284429914     |4894                      |
|11487|0.38590812405992064|207.3032284429914     |4894                      |
|11798|0.7103884018220111 |168.92167677881963    |1813                      |
|12452|0.9695941617005506 |207.3032284429914     |4894                      |
|12887|0.4244989364659127 |207.3032284429914     |4894                      |
|14903|1.0232027774101262 |195.46467661691543    |3015          

## 7. Host Performance Score

In [33]:
df = df.withColumn(
    "host_performance_score",
    (col("host_response_rate") * 0.3 +
     col("host_is_superhost") * 0.4 +
     (col("host_tenure_years") / 10).cast("float") * 0.3)
)
df.select("host_response_rate", "host_is_superhost", "host_tenure_years", "host_performance_score").limit(10).show()

+------------------+-----------------+-----------------+----------------------+
|host_response_rate|host_is_superhost|host_tenure_years|host_performance_score|
+------------------+-----------------+-----------------+----------------------+
|               1.0|                0|        17.078712|    0.8123613595962524|
|               1.0|                0|        16.741957|    0.8022586941719054|
|               1.0|                1|        17.201916|    1.2160574793815613|
|               0.0|                0|        16.290213|   0.48870638608932493|
|               1.0|                0|        16.219028|    0.7865708470344543|
|               1.0|                1|         16.11499|    1.1834497213363646|
|               1.0|                0|        16.095825|    0.7828747630119324|
|               1.0|                0|        16.041067|    0.7812319993972778|
|               1.0|                0|        16.035591|    0.7810677409172058|
|               0.0|                0|  

## 8. Minimum Nights Category

In [35]:
df = df.withColumn(
    "min_nights_category",
    when(col("minimum_nights") == 1, "short_term")
    .when((col("minimum_nights") >= 2) & (col("minimum_nights") <= 7), "week")
    .when((col("minimum_nights") >= 8) & (col("minimum_nights") <= 29), "long_term")
    .when(col("minimum_nights") >= 30, "monthly")
    .otherwise("other")
)
df.groupBy("min_nights_category").count().orderBy(desc("count")).show()

+-------------------+-----+
|min_nights_category|count|
+-------------------+-----+
|               week|49116|
|            monthly|42058|
|         short_term|25134|
|          long_term| 1932|
+-------------------+-----+



## 9. Property Type Simplification

In [36]:
df = df.withColumn(
    "property_category",
    when(col("property_type").like("%Entire%"), "Entire Home")
    .when(col("property_type").like("Private room%"), "Private Room")
    .when(col("property_type").like("Shared room%"), "Shared Room")
    .when(col("property_type").rlike("(?i)hotel|resort|aparthotel|minsu|casa|kezhan|bed and breakfast"), "Hotel / Hospitality")
    .when(col("property_type").rlike("(?i)tiny|rv|yurt|hut|tipi|treehouse|dome|tent|bus|train|boat|castle|cave|barn|lighthouse|island|container|farm|ranch|tower|earthen"), "Unique Stay")
    .otherwise("Other")
)
df.groupBy("property_category").count().orderBy(desc("count")).show()

+-------------------+-----+
|  property_category|count|
+-------------------+-----+
|        Entire Home|93385|
|       Private Room|20102|
|Hotel / Hospitality| 3568|
|        Shared Room|  709|
|        Unique Stay|  465|
|              Other|   11|
+-------------------+-----+



## 10. Listing Popularity Score

In [37]:
df = df.withColumn(
    "popularity_score",
    (col("number_of_reviews") / 100) * 0.4 +
    (col("review_scores_rating") / 5) * 0.3 +
    col("occupancy_rate") * 0.3
)
df.select("id", "number_of_reviews", "review_scores_rating", "occupancy_rate", "popularity_score").limit(10).show(truncate=False)

+-----+-----------------+--------------------+--------------------+-------------------+
|id   |number_of_reviews|review_scores_rating|occupancy_rate      |popularity_score   |
+-----+-----------------+--------------------+--------------------+-------------------+
|3109 |4                |5.0                 |0.04109589041095896 |0.3283287671232877 |
|5396 |425              |4.62                |0.7890410958904109  |2.2139123287671234 |
|7397 |367              |4.73                |0.4547945205479452  |1.8882383561643836 |
|9359 |0                |4.87                |0.019178082191780854|0.2979534246575342 |
|9952 |52               |4.92                |0.31506849315068497 |0.5977205479452055 |
|11487|13               |4.92                |0.32602739726027397 |0.44500821917808214|
|11798|121              |4.84                |0.05753424657534245 |0.7916602739726027 |
|12452|62               |4.82                |0.9068493150684931  |0.8092547945205479 |
|12887|108              |4.63   

## 11. Feature Summary

In [38]:
print("=" * 70)
print("feature engineering summary")
print("=" * 70)
new_features = [
    "host_tenure_days", "host_tenure_years",
    "review_density", "reviews_per_year_hosting",
    "occupancy_rate", "people_per_bedroom", "people_per_bed",
    "distance_to_center",
    "price_per_person", "price_per_bedroom",
    "price_index", "neighborhood_avg_price",
    "host_performance_score",
    "min_nights_category", "property_category",
    "popularity_score"
]
print(f"\nnew features created: {len(new_features)}")
print(f"total columns: {len(df.columns)}")
print("\nsample of engineered features:")
df.select(
    "id", "city", "price", "distance_to_center",
    "price_per_person", "host_performance_score", "popularity_score"
).limit(10).show()

feature engineering summary

new features created: 16
total columns: 44

sample of engineered features:
+-----+-----+-----+------------------+-----------------+----------------------+-------------------+
|   id| city|price|distance_to_center| price_per_person|host_performance_score|   popularity_score|
+-----+-----+-----+------------------+-----------------+----------------------+-------------------+
| 3109|Paris|100.0|         3.4369042|             50.0|    0.8123613595962524| 0.3283287671232877|
| 5396|Paris| 88.0|          4.717698|             44.0|    0.8022586941719054| 2.2139123287671234|
| 7397|Paris|136.0|         4.2913504|             34.0|    1.2160574793815613| 1.8882383561643836|
| 9359|Paris| 75.0|          3.842025|             75.0|   0.48870638608932493| 0.2979534246575342|
| 9952|Paris|150.0|         5.5095263|             75.0|    0.7865708470344543| 0.5977205479452055|
|11487|Paris| 80.0|         5.6323214|             80.0|    1.1834497213363646|0.445008219178082

## 12. Save Engineered Dataset

In [39]:
output_path = "../data/processed/listings_features.parquet"
df.cache()
print("saving feature-engineered data...")
df.write.mode("overwrite").parquet(output_path)
print(f"data saved to {output_path}")
print(f"rows: {df.count():,}")
print(f"columns: {len(df.columns)}")


saving feature-engineered data...
data saved to ../data/processed/listings_features.parquet
rows: 118,240
columns: 44


25/11/13 00:10:24 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 638270 ms exceeds timeout 120000 ms
25/11/13 00:10:24 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/13 00:10:27 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$