In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Build (or reuse) the Spark session for this notebook. The session is
# registered under the app name "UsedCarsPricePipeline" and targets the
# master at spark://spark-master:7077.
session_builder = SparkSession.builder.master("spark://spark-master:7077")
session_builder = session_builder.appName("UsedCarsPricePipeline")
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark



In [2]:
# Read the cleaned + enriched listings written by the first batch pipeline,
# then print the schema and a five-row preview as a sanity check.
clean_enriched_path = "/home/jovyan/data/used_cars_clean_enriched"
df_clean_enriched = spark.read.parquet(clean_enriched_path)

df_clean_enriched.printSchema()
df_clean_enriched.show(5)


root
 |-- state: string (nullable = true)
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- region: string (nullable = true)
 |-- price: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- condition: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- odometer: double (nullable = true)
 |-- posting_date: string (nullable = true)
 |-- region_group: string (nullable = true)

+-----+----------+--------------------+------------------+-------+----+-------------+--------------------+---------+------+--------+--------------------+------------+
|state|        id|                 url|            region|  price|year| manufacturer|               model|condition|  fuel|odometer|        posting_date|region_group|
+-----+----------+--------------------+------------------+-------+----+-------------+--------------------+---------+------+--------+--------------------+---

In [3]:
# Columns carried into the modelling stage: the label (price), the raw
# numeric inputs, the categorical descriptors and the posting date.
columns_for_model = [
    # label
    "price",
    # raw numeric inputs
    "year", "odometer",
    # categorical descriptors
    "manufacturer", "model", "condition", "fuel",
    "state", "region", "region_group",
    # date string, parsed during feature engineering
    "posting_date",
]

# Project the source frame down to just those columns.
df_model_base = df_clean_enriched.select(columns_for_model)


In [4]:
# Feature engineering: derive posting-date parts, car age and a
# log-scaled odometer from the base modelling columns.

# Reference year used to compute car age.
# NOTE(review): age is measured against this fixed year rather than the
# listing's posting year — confirm that is intended.
current_year = 2025

# Column expressions, named up front so the chain below reads as a recipe.
posting_ts_expr = F.to_timestamp("posting_date")
car_age_expr = F.lit(current_year) - F.col("year")
# Log transform to smooth the odometer distribution.
# NOTE(review): assumes odometer is strictly positive; verify, since rows
# without a usable log value would be skipped by the assembler downstream.
log_odometer_expr = F.log(F.col("odometer"))

df_features = (
    df_model_base
    .withColumn("posting_ts", posting_ts_expr)
    .withColumn("posting_year", F.year("posting_ts"))
    .withColumn("posting_month", F.month("posting_ts"))
    .withColumn("car_age", car_age_expr)
    .withColumn("log_odometer", log_odometer_expr)
)

# Preview the raw inputs next to the features derived from them.
preview_cols = [
    "price", "year", "car_age", "odometer", "log_odometer",
    "posting_year", "posting_month",
]
df_features.select(*preview_cols).show(5)


+-------+----+-------+--------+------------------+------------+-------------+
|  price|year|car_age|odometer|      log_odometer|posting_year|posting_month|
+-------+----+-------+--------+------------------+------------+-------------+
| 8000.0|2005|     20|435000.0|12.983101310070822|        2021|            5|
|18990.0|2020|      5|  6395.0| 8.763271714012943|        2021|            5|
|32590.0|2015|     10| 34811.0|10.457688707770052|        2021|            5|
|27590.0|2011|     14| 61445.0|11.025897744718698|        2021|            5|
|31990.0|2017|      8| 15498.0| 9.648466262323895|        2021|            5|
+-------+----+-------+--------+------------------+------------+-------------+
only showing top 5 rows



In [5]:
# String-valued columns that get indexed and one-hot encoded.
categorical_cols = [
    "manufacturer", "condition", "fuel",
    "state", "region", "region_group",
]

# Numeric columns passed to the assembler unchanged.
numeric_cols = ["car_age", "log_odometer", "posting_year", "posting_month"]

# Stage 1: one StringIndexer per categorical column, writing "<name>_index".
# handleInvalid="keep" retains rows with invalid labels instead of failing.
indexers = [
    StringIndexer(
        inputCol=name,
        outputCol=name + "_index",
        handleInvalid="keep",
    )
    for name in categorical_cols
]

# Stage 2: one OneHotEncoder per index column, writing "<name>_vec".
encoders = [
    OneHotEncoder(
        inputCols=[name + "_index"],
        outputCols=[name + "_vec"],
    )
    for name in categorical_cols
]

# Stage 3: assemble the encoded vectors followed by the numeric columns
# into a single "features" vector; rows with invalid values are skipped.
feature_cols = [name + "_vec" for name in categorical_cols]
feature_cols.extend(numeric_cols)

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip",
)



In [6]:
from pyspark.ml.regression import LinearRegression

# Elastic-net regularised linear regression that predicts "price" from
# the assembled "features" vector.
linear_model = LinearRegression(
    featuresCol="features",
    labelCol="price",
    elasticNetParam=0.5,  # 0 = ridge, 1 = lasso, 0.5 = even mix of both
    regParam=0.1,         # overall regularisation strength
    maxIter=20,           # cap on optimiser iterations
)

# End-to-end pipeline: index -> encode -> assemble -> linear regression.
stages = [*indexers, *encoders, assembler, linear_model]
price_pipeline = Pipeline(stages=stages)



In [7]:
# Use a very small random sample for modelling to avoid overloading the
# cluster: 5% of rows, capped at 50k rows.
#
# The sample is cached before it is split. randomSplit() yields two
# DataFrames that each re-evaluate their parent's lineage, and limit() does
# not guarantee WHICH rows it returns on each evaluation. Without pinning
# the sample, the train and test sets could be drawn from different row
# sets and overlap or miss rows. Caching also avoids re-reading and
# re-sampling the parquet data for every pass of the multi-stage fit.
df_sample = (
    df_features
    .sample(withReplacement=False, fraction=0.05, seed=42)
    .limit(50000)
    .cache()
)

# Train / test split (80% / 20%) on the pinned sample.
train_data, test_data = df_sample.randomSplit([0.8, 0.2], seed=42)

# Fit the full pipeline (indexers, encoders, assembler, model) on the
# training portion only.
price_model = price_pipeline.fit(train_data)

# Score the held-out portion.
test_predictions = price_model.transform(test_data)

# Evaluators comparing the "prediction" column against the "price" label.
evaluator_rmse = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse"
)
evaluator_r2 = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="r2"
)

rmse = evaluator_rmse.evaluate(test_predictions)
r2 = evaluator_r2.evaluate(test_predictions)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R^2:  {r2:.3f}")



Test RMSE: 10374.01
Test R^2:  0.468


In [8]:
# Score every row of the engineered feature frame with the fitted pipeline.
predictions_all = price_model.transform(df_features)

# Derived columns, expressed up front:
#   predicted_price - copy of the model's "prediction" column
#   price_diff      - listed price minus predicted price
#   price_level     - "overpriced" when the listing exceeds the prediction
#                     by more than 2000, "underpriced" when it falls short
#                     by more than 2000, otherwise "fair"
predicted_price_expr = F.col("prediction")
price_diff_expr = F.col("price") - F.col("predicted_price")
price_level_expr = (
    F.when(F.col("price_diff") > 2000, "overpriced")
    .when(F.col("price_diff") < -2000, "underpriced")
    .otherwise("fair")
)

# Columns kept in the exported table, in output order.
export_columns = [
    "manufacturer", "model", "year", "car_age", "odometer",
    "price", "predicted_price", "price_diff", "price_level",
]

predictions_export = (
    predictions_all
    .withColumn("predicted_price", predicted_price_expr)
    .withColumn("price_diff", price_diff_expr)
    .withColumn("price_level", price_level_expr)
    .select(*export_columns)
)

predictions_export.show(10)


+-------------+--------------------+----+-------+--------+-------+------------------+-------------------+-----------+
| manufacturer|               model|year|car_age|odometer|  price|   predicted_price|         price_diff|price_level|
+-------------+--------------------+----+-------+--------+-------+------------------+-------------------+-----------+
|         ford|f-250 super duty ...|2005|     20|435000.0| 8000.0|22349.919087441587|-14349.919087441587|underpriced|
|    chevrolet|spark activ hatch...|2020|      5|  6395.0|18990.0| 35225.72348846463|-16235.723488464631|underpriced|
|mercedes-benz|    gla-class gla 45|2015|     10| 34811.0|32590.0| 29587.08004707827| 3002.9199529217294| overpriced|
|mercedes-benz|     cl-class cl 550|2011|     14| 61445.0|27590.0|24449.742648672378|  3140.257351327622| overpriced|
|          ram|1500 crew cab exp...|2017|      8| 15498.0|31990.0| 32772.66142793298| -782.6614279329806|       fair|
|       nissan|frontier crew cab sv|2019|      6| 21939.

In [9]:
# Persist the full predictions table, replacing any previous output.
predictions_export.write.parquet(
    "/home/jovyan/data/used_cars_price_predictions",
    mode="overwrite",
)

# Persist the evaluation metrics as a one-row table with two
# non-nullable double columns: rmse and r2.
metrics_schema = (
    T.StructType()
    .add("rmse", T.DoubleType(), False)
    .add("r2", T.DoubleType(), False)
)

metrics_row = (float(rmse), float(r2))
metrics_df = spark.createDataFrame([metrics_row], schema=metrics_schema)

metrics_df.write.parquet(
    "/home/jovyan/data/used_cars_price_model_metrics",
    mode="overwrite",
)
