In [1]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
import time
# Progress marker: confirms the import cell finished executing.
print(" starting ml pipeline")

 starting ml pipeline


In [2]:
from pyspark.sql import SparkSession

# The same JVM flag is passed to both the driver and the executors.
# NOTE(review): presumably needed so Spark starts on newer JDKs where the
# security manager is disallowed by default — confirm against the local JDK.
security_manager_flag = "-Djava.security.manager=allow"

# Local-mode session using every available core ("local[*]").
session_builder = (
    SparkSession.builder
    .appName("AirbnbPricePredictor")
    .master("local[*]")
)
for jvm_option_key in (
    "spark.driver.extraJavaOptions",
    "spark.executor.extraJavaOptions",
):
    session_builder = session_builder.config(jvm_option_key, security_manager_flag)

# getOrCreate() reuses an already-running session instead of starting a second one.
spark = session_builder.getOrCreate()
print(" spark session connected")


25/11/13 01:28:27 WARN Utils: Your hostname, MacBook-Pro-110.local resolves to a loopback address: 127.0.0.1; using 10.2.1.42 instead (on interface en0)
25/11/13 01:28:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/13 01:28:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


 spark session connected


25/11/13 01:28:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Read the pre-processed listings (path is relative to the notebook's folder).
listings_path = "../data/processed/listings_features_no_outliers.parquet"
df = spark.read.parquet(listings_path)

# count() triggers a Spark job, so this also verifies the file is readable.
loaded_row_count = df.count()
print("data loaded: {:,} rows (outliers removed)".format(loaded_row_count))

                                                                                

data loaded: 108,251 rows (outliers removed)


## 1. Feature Selection

In [4]:
# Candidate model inputs, split by how they are encoded downstream.
numeric_features = [
    "accommodates",
    "bedrooms",
    "beds",
    "bathrooms",
    "minimum_nights",
    "latitude",
    "longitude",
    "host_tenure_years",
    "host_response_rate",
    "host_listings_count",
    "number_of_reviews",
    "reviews_per_month",
    "review_scores_rating",
    "availability_365",
    "occupancy_rate",
    "distance_to_center",
    "people_per_bedroom",
    "host_performance_score",
    "popularity_score",
    "neighborhood_listing_count",
]
categorical_features = [
    "room_type",
    "property_category",
    "min_nights_category",
    "city",
    "host_is_superhost",
    "instant_bookable",
]

# Column the models are trained to predict.
target = "price"

# Silently drop any candidate that is not present in the loaded frame, so the
# later select() cannot fail on a missing column. Original order is preserved.
available_columns = set(df.columns)
numeric_features = [name for name in numeric_features if name in available_columns]
categorical_features = [name for name in categorical_features if name in available_columns]

for feature_kind, kept_features in (
    ("numeric", numeric_features),
    ("categorical", categorical_features),
):
    print(f"{feature_kind} features: {len(kept_features)}")

numeric features: 20
categorical features: 6


25/11/13 01:28:40 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## 2. Encode Categorical Variables

In [5]:
# Name of the index column produced for each categorical input.
indexed_features = [source_col + "_index" for source_col in categorical_features]

# One StringIndexer per categorical column. handleInvalid="keep" makes the
# indexer retain rows with invalid values rather than raising or dropping them.
# NOTE(review): these indices are later fed to the models as plain numeric
# inputs; OneHotEncoder is imported but never applied — confirm that is
# intended, especially for the linear regression baseline.
indexers = [
    StringIndexer(inputCol=source_col, outputCol=indexed_col, handleInvalid="keep")
    for source_col, indexed_col in zip(categorical_features, indexed_features)
]

# Fit all indexers in a single pipeline, then append the index columns to df.
indexer_pipeline = Pipeline(stages=indexers)
indexer_model = indexer_pipeline.fit(df)
df_indexed = indexer_model.transform(df)
print(" categorical features encoded")

                                                                                

 categorical features encoded


## 3. Assemble Feature Vector

In [6]:
# Final input ordering: numeric columns first, then the indexed categoricals.
# Later cells rely on this order to label feature importances.
all_features = [*numeric_features, *indexed_features]

# Keep only the model inputs plus the label, and discard any row that has a
# null in at least one of them.
model_columns = [*all_features, target]
df_complete = df_indexed.select(model_columns).dropna()
print("data after removing nulls: {:,} rows".format(df_complete.count()))

# Pack the inputs into a single vector column named "features" and keep just
# that column and the label for training.
assembler = VectorAssembler(inputCols=all_features, outputCol="features")
df_ml = assembler.transform(df_complete).select("features", target)
print(" feature vector assembled")

25/11/13 01:28:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


data after removing nulls: 106,822 rows
 feature vector assembled


## 4. Train/Test Split

In [7]:
# 80/20 split with a fixed seed so the partition is repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = df_ml.randomSplit(split_weights, seed=42)

# Mark both splits for caching; every model below re-reads them.
for split_df in (train_data, test_data):
    split_df.cache()

# count() forces evaluation, which also materialises the cached data.
for split_label, split_df in (("training", train_data), ("test", test_data)):
    print(f"{split_label} set: {split_df.count():,} rows")

training set: 85,444 rows
test set: 21,378 rows


## 5. Model 1: Linear Regression (Baseline)

In [8]:
separator = "=" * 70
print(separator)
print("model 1: linear regression")
print(separator)

# Baseline model. The timer covers estimator construction and fit only.
start_time = time.time()
lr_params = {
    "featuresCol": "features",
    "labelCol": target,
    "maxIter": 10,
    "regParam": 0.01,
    "elasticNetParam": 0.0,
}
lr = LinearRegression(**lr_params)
lr_model = lr.fit(train_data)
training_time = time.time() - start_time
print("\ntraining time: {:.2f} seconds".format(training_time))

# Score the held-out split; transform() adds a "prediction" column.
lr_predictions = lr_model.transform(test_data)

# Shared evaluators — the later model cells reuse these three objects.
evaluator_rmse, evaluator_mae, evaluator_r2 = [
    RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName=metric_name)
    for metric_name in ("rmse", "mae", "r2")
]
lr_rmse, lr_mae, lr_r2 = [
    evaluator.evaluate(lr_predictions)
    for evaluator in (evaluator_rmse, evaluator_mae, evaluator_r2)
]

print("\n**linear regression results:**")
print("rmse: ${:.2f}".format(lr_rmse))
print("mae:  ${:.2f}".format(lr_mae))
print("r²:   {:.4f}".format(lr_r2))

# Show a handful of actual vs. predicted prices as a sanity check.
print("\nsample predictions:")
lr_sample = lr_predictions.select(target, "prediction").limit(10)
lr_sample.show(truncate=False)

model 1: linear regression


25/11/13 01:28:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/11/13 01:28:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/11/13 01:28:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK



training time: 0.55 seconds

**linear regression results:**
rmse: $79.59
mae:  $59.19
r²:   0.3697

sample predictions:
+-----+------------------+
|price|prediction        |
+-----+------------------+
|300.0|157.09520447410102|
|150.0|159.83930852729952|
|160.0|152.09799452927155|
|112.0|149.75648039858066|
|112.0|162.73515810436692|
|410.0|200.1114091229424 |
|221.0|203.17362248674272|
|101.0|194.6090694044994 |
|129.0|157.13931191091342|
|52.0 |148.5594811683281 |
+-----+------------------+



## 6. Model 2: Random Forest Regressor

In [9]:
separator = "=" * 70
print(separator)
print("model 2: random forest")
print(separator)

# The timer covers estimator construction and fit only. The seed is fixed so
# repeated runs build the same forest.
start_time = time.time()
rf_params = {
    "featuresCol": "features",
    "labelCol": target,
    "numTrees": 50,
    "maxDepth": 10,
    "seed": 42,
}
rf = RandomForestRegressor(**rf_params)
rf_model = rf.fit(train_data)
training_time = time.time() - start_time
print("\ntraining time: {:.2f} seconds".format(training_time))

# Evaluate on the held-out split with the evaluators defined for model 1.
rf_predictions = rf_model.transform(test_data)
rf_rmse, rf_mae, rf_r2 = [
    evaluator.evaluate(rf_predictions)
    for evaluator in (evaluator_rmse, evaluator_mae, evaluator_r2)
]

print("\n**random forest results:**")
print("rmse: ${:.2f}".format(rf_rmse))
print("mae:  ${:.2f}".format(rf_mae))
print("r²:   {:.4f}".format(rf_r2))

# Pair each importance with the feature name at the same position in
# all_features, then rank from most to least important.
feature_importance = rf_model.featureImportances
importance_list = [
    (feature_name, float(feature_importance[position]))
    for position, feature_name in enumerate(all_features)
]
importance_schema = ["feature", "importance"]
importance_df = spark.createDataFrame(importance_list, importance_schema).orderBy(
    desc("importance")
)

print("\ntop 15 most important features:")
importance_df.limit(15).show(truncate=False)

model 2: random forest


25/11/13 01:29:03 WARN DAGScheduler: Broadcasting large task binary with size 1202.7 KiB
25/11/13 01:29:04 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/11/13 01:29:05 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
25/11/13 01:29:05 WARN DAGScheduler: Broadcasting large task binary with size 1042.6 KiB
25/11/13 01:29:06 WARN DAGScheduler: Broadcasting large task binary with size 7.2 MiB
25/11/13 01:29:07 WARN DAGScheduler: Broadcasting large task binary with size 1812.1 KiB
                                                                                


training time: 7.27 seconds

**random forest results:**
rmse: $64.89
mae:  $46.71
r²:   0.5810

top 15 most important features:
+-------------------------+--------------------+
|feature                  |importance          |
+-------------------------+--------------------+
|bedrooms                 |0.21570916998974685 |
|accommodates             |0.13245132332367032 |
|property_category_index  |0.0934478769494848  |
|bathrooms                |0.08968153862584256 |
|beds                     |0.05237194341597224 |
|distance_to_center       |0.05194085262048233 |
|room_type_index          |0.04113169315974795 |
|longitude                |0.03753677024170569 |
|city_index               |0.03416958627979777 |
|host_listings_count      |0.030582216863634226|
|people_per_bedroom       |0.02962633133832439 |
|latitude                 |0.02646326633373878 |
|minimum_nights           |0.026065539737205445|
|min_nights_category_index|0.02224698174109056 |
|review_scores_rating     |0.021556674

## 7. Model 3: Gradient Boosted Trees

In [10]:
separator = "=" * 70
print(separator)
print("model 3: gradient boosted trees")
print(separator)

# The timer covers estimator construction and fit only. The seed is fixed for
# repeatability.
start_time = time.time()
gbt_params = {
    "featuresCol": "features",
    "labelCol": target,
    "maxIter": 50,
    "maxDepth": 5,
    "seed": 42,
}
gbt = GBTRegressor(**gbt_params)
gbt_model = gbt.fit(train_data)
training_time = time.time() - start_time
print("\ntraining time: {:.2f} seconds".format(training_time))

# Evaluate on the held-out split with the evaluators defined for model 1.
gbt_predictions = gbt_model.transform(test_data)
gbt_rmse, gbt_mae, gbt_r2 = [
    evaluator.evaluate(gbt_predictions)
    for evaluator in (evaluator_rmse, evaluator_mae, evaluator_r2)
]

print("\n**gradient boosted trees results:**")
print("rmse: ${:.2f}".format(gbt_rmse))
print("mae:  ${:.2f}".format(gbt_mae))
print("r²:   {:.4f}".format(gbt_r2))

# Pair each importance with the feature name at the same position in
# all_features, then rank from most to least important.
gbt_feature_importance = gbt_model.featureImportances
gbt_importance_list = [
    (feature_name, float(gbt_feature_importance[position]))
    for position, feature_name in enumerate(all_features)
]
gbt_importance_schema = ["feature", "importance"]
gbt_importance_df = spark.createDataFrame(
    gbt_importance_list, gbt_importance_schema
).orderBy(desc("importance"))

print("\ntop 15 most important features (gbt):")
gbt_importance_df.limit(15).show(truncate=False)

model 3: gradient boosted trees

training time: 7.78 seconds

**gradient boosted trees results:**
rmse: $61.86
mae:  $43.66
r²:   0.6193

top 15 most important features (gbt):
+--------------------------+--------------------+
|feature                   |importance          |
+--------------------------+--------------------+
|bedrooms                  |0.11206675382719887 |
|distance_to_center        |0.10470007359276688 |
|longitude                 |0.10151934450171603 |
|latitude                  |0.09488215214704386 |
|minimum_nights            |0.07246659377456735 |
|review_scores_rating      |0.0658107763066241  |
|accommodates              |0.054752281837974426|
|host_response_rate        |0.05178471551622767 |
|host_listings_count       |0.04890012784792459 |
|city_index                |0.04498899071532918 |
|property_category_index   |0.04284766763384124 |
|people_per_bedroom        |0.027700732064641985|
|reviews_per_month         |0.026898818377831295|
|neighborhood_listing_co

## 8. Model Comparison

In [14]:
import pandas as pd

separator = "=" * 70
print(separator)
print("model comparison summary")
print(separator)

# One row per model, holding the test-set metrics computed in the cells above.
comparison_rows = [
    ("Linear Regression", lr_rmse, lr_mae, lr_r2),
    ("Random Forest", rf_rmse, rf_mae, rf_r2),
    ("Gradient Boosted Trees", gbt_rmse, gbt_mae, gbt_r2),
]
comparison = pd.DataFrame(
    comparison_rows,
    columns=["Model", "RMSE ($)", "MAE ($)", "R² Score"],
)
print("\n")
print(comparison.to_string(index=False))

# The "best" model is the one with the highest R² on the test split.
r2_column = comparison["R² Score"]
best_model_idx = r2_column.idxmax()
best_model_name = comparison.at[best_model_idx, "Model"]
best_r2 = comparison.at[best_model_idx, "R² Score"]
best_rmse = comparison.at[best_model_idx, "RMSE ($)"]
print("\n best model: {}".format(best_model_name))
print("   r² score: {:.4f}".format(best_r2))
print("   rmse: ${:.2f}".format(best_rmse))

model comparison summary


                 Model  RMSE ($)   MAE ($)  R² Score
     Linear Regression 79.586787 59.190476  0.369747
         Random Forest 64.888396 46.714432  0.581045
Gradient Boosted Trees 61.858598 43.656816  0.619256

 best model: Gradient Boosted Trees
   r² score: 0.6193
   rmse: $61.86


## 9. Prediction Error Analysis

In [15]:
# NOTE: abs, col, mean, expr and desc here are the Spark column functions
# brought in by the star import, not Python builtins.

# Per-row residuals of the GBT model on the test split:
#   error     = prediction - actual (negative means under-prediction)
#   abs_error = magnitude of the error
#   pct_error = abs_error as a percentage of the actual price
signed_error = col("prediction") - col(target)
error_analysis = (
    gbt_predictions
    .withColumn("error", signed_error)
    .withColumn("abs_error", abs(col("error")))
    .withColumn("pct_error", (abs(col("error")) / col(target)) * 100)
)

# Means of all three error columns, plus medians of the two magnitude columns.
mean_columns = [
    mean(error_col).alias(f"mean_{error_col}")
    for error_col in ("error", "abs_error", "pct_error")
]
median_columns = [
    expr(f"percentile({error_col}, 0.5)").alias(f"median_{error_col}")
    for error_col in ("abs_error", "pct_error")
]
print("error statistics:")
error_analysis.select(*mean_columns, *median_columns).show(truncate=False)

# Share of predictions that land within $50 of the actual price.
within_50_count = error_analysis.filter(col("abs_error") <= 50).count()
total_count = error_analysis.count()
if total_count > 0:
    percent_within_50 = (within_50_count / total_count) * 100
else:
    # Guard against an empty prediction set.
    percent_within_50 = 0.0
print("\npercentage of predictions within $50: {:.2f}%".format(percent_within_50))

# The ten rows with the largest absolute error.
print("\nworst 10 predictions (highest error):")
worst_predictions = (
    error_analysis
    .select(target, "prediction", "error", "abs_error")
    .orderBy(desc("abs_error"))
    .limit(10)
)
worst_predictions.show(truncate=False)


error statistics:
+--------------------+-----------------+------------------+------------------+-----------------+
|mean_error          |mean_abs_error   |mean_pct_error    |median_abs_error  |median_pct_error |
+--------------------+-----------------+------------------+------------------+-----------------+
|-0.26373672091718936|43.65681588637874|29.562648082475675|30.953707133511486|22.29909701791363|
+--------------------+-----------------+------------------+------------------+-----------------+


percentage of predictions within $50: 68.97%

worst 10 predictions (highest error):
+-----+------------------+-------------------+------------------+
|price|prediction        |error              |abs_error         |
+-----+------------------+-------------------+------------------+
|550.0|32.17890297866269 |-517.8210970213373 |517.8210970213373 |
|540.0|95.63878105557335 |-444.36121894442664|444.36121894442664|
|500.0|83.54068709937494 |-416.45931290062504|416.45931290062504|
|500.0|98.12947

## 10. Save Best Model

In [20]:
# Persist the best model (gradient boosted trees) next to the notebook outputs.
model_path = "../outputs/models/gbt_model"
try:
    # overwrite() replaces any model previously saved at this path.
    gbt_model.write().overwrite().save(model_path)
    print(f" model saved to: {model_path}")
except Exception as save_error:
    # Saving is best-effort: the notebook continues with the in-memory model.
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and report the real cause instead of assuming a permission
    # problem.
    print(
        f" could not save model to {model_path}: "
        f"{type(save_error).__name__}: {save_error}"
    )
    print("  model is still available in memory for this session")

# Summary of the saved model. These are the GBT test-set metrics, so the
# descriptive fields must describe the GBT estimator as well; the
# hyperparameters are read from the estimator rather than hard-coded so they
# cannot drift from what was actually trained.
metrics = {
    "model_type": "GradientBoostedTrees",
    "rmse": float(gbt_rmse),
    "mae": float(gbt_mae),
    "r2": float(gbt_r2),
    # maxIter is the number of boosting iterations configured on the estimator.
    "num_trees": gbt.getMaxIter(),
    "max_depth": gbt.getMaxDepth(),
}
print("\n model training complete")

 model saved to: ../outputs/models/gbt_model

 model training complete
