In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, length, when, log1p, expm1, lower, percentile_approx, count

# Local Spark session for the price-regression experiment: all local cores,
# 8g configured for the driver.
spark = (
    SparkSession.builder
    .appName("Amazon Price Prediction DT")
    .config("spark.driver.memory", "8g")
    .master("local[*]")
    .getOrCreate()
)

# Keep the notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")

# Parquet dataset location, relative to the notebook's working directory.
DATA_PATH = "../../data/regression_price"

In [2]:
# Load the raw dataset.
df = spark.read.parquet(DATA_PATH)

# Derive `features_count` from the optional `features` column.
# The column is looked up defensively: indexing into an empty list of matches
# would raise IndexError on datasets that have no `features` column at all.
if "features" in df.columns:
    features_type = df.schema["features"].dataType
    if features_type.typeName() == "array":
        # size() yields -1 for a NULL array under Spark's default (non-ANSI)
        # settings and NULL otherwise, so a later na.fill(0) would not reliably
        # turn missing arrays into 0. Map NULL to 0 explicitly instead.
        df = df.withColumn(
            "features_count",
            when(col("features").isNull(), 0).otherwise(size(col("features"))),
        )
    elif "features_count" not in df.columns:
        # Non-array column: fall back to the character length of the value.
        df = df.withColumn("features_count", length(col("features")))

# Rows missing any of these columns cannot be used for training.
required_cols = ["rating_number", "average_rating", "main_category", "price"]
df_clean = df.dropna(subset=required_cols)

# Frequency-encode the store: number of cleaned rows sharing the same store.
store_counts = df_clean.groupBy("store").agg(count("*").alias("store_freq"))
df_improved = df_clean.join(store_counts, on="store", how="left")

# Rows that found no match in the join (e.g. a NULL store, which an equi-join
# does not match) end up with a NULL frequency; treat those as 0. Only fill
# columns that actually exist, since `features_count` is optional and na.fill
# may reject unknown column names on newer Spark versions.
fill_cols = [c for c in ("store_freq", "features_count") if c in df_improved.columns]
df_improved = df_improved.na.fill(0, subset=fill_cols)

In [3]:
# Approximate first and third price quartiles, computed in a single aggregation.
price_stats = df_improved.agg(
    percentile_approx("price", 0.25).alias("q1"),
    percentile_approx("price", 0.75).alias("q3"),
).first()

q1 = price_stats["q1"]
q3 = price_stats["q3"]
iqr = q3 - q1

# Tukey fences at 1.5 * IQR; the lower fence is floored at 1.0.
whisker = 1.5 * iqr
lower_bound = max(1.0, q1 - whisker)
upper_bound = q3 + whisker

# Keep only rows whose price lies inside the fences (both ends inclusive).
df_featured = df_improved.filter(col("price").between(lower_bound, upper_bound))

# Title-derived features are optional: only built when the column is present.
if "title" in df_featured.columns:
    title_lower = lower(col("title"))
    # NOTE(review): rlike matches substrings, so "pro" also hits e.g. "product"
    # and "set" hits e.g. "headset" — confirm that this is intended.
    df_featured = (
        df_featured
        .withColumn("title_len", length(col("title")))
        .withColumn("is_premium", when(title_lower.rlike("premium|pro|deluxe"), 1).otherwise(0))
        .withColumn("is_bundle", when(title_lower.rlike("bundle|set|pack"), 1).otherwise(0))
    )

# log1p compresses the heavy-tailed counts; the regression label is log1p(price).
df_featured = (
    df_featured
    .withColumn("log_rating_number", log1p(col("rating_number")))
    .withColumn("log_store_freq", log1p(col("store_freq")))
    .withColumn("label", log1p(col("price")))
)

print(f"Ready for Decision Tree: {df_featured.count()} rows")

Ready for Decision Tree: 565678 rows


In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, HashingTF, IDF, StringIndexer, 
    VectorAssembler, VectorIndexer
)
from pyspark.ml.regression import DecisionTreeRegressor

# Assemble the ML pipeline: optional title TF-IDF -> category index ->
# feature vector -> categorical-feature indexing -> decision tree regressor.
stages = []

has_title = "title" in df_featured.columns

if has_title:
    # Title text -> tokens -> 1000 hashed term-frequency slots -> TF-IDF weights.
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=1000)
    idf = IDF(inputCol="raw_features", outputCol="title_features")
    stages.extend([tokenizer, hashingTF, idf])

# handleInvalid="keep" gives categories unseen during fit their own extra index
# instead of failing at transform time.
indexer = StringIndexer(inputCol="main_category", outputCol="category_idx", handleInvalid="keep")
stages.append(indexer)

# Scalar model inputs; the optional ones are included only if they were built.
numeric_cols = ["average_rating", "log_rating_number", "log_store_freq", "category_idx"]
optional_cols = ["title_len", "is_premium", "is_bundle", "features_count"]
numeric_cols.extend(c for c in optional_cols if c in df_featured.columns)

# Copy the list before adding the vector column: aliasing `numeric_cols` here
# would silently append "title_features" to the list of scalar columns as well.
input_cols = list(numeric_cols)
if has_title:
    input_cols.append("title_features")

assembler = VectorAssembler(inputCols=input_cols, outputCol="features_raw")
stages.append(assembler)

# NOTE(review): VectorIndexer treats every feature with at most `maxCategories`
# distinct values as categorical, which can include sparse TF-IDF slots —
# confirm that this is intended for the hashed title features.
feature_indexer = VectorIndexer(
    inputCol="features_raw",
    outputCol="features_vector",
    maxCategories=10,
    handleInvalid="keep"
)
stages.append(feature_indexer)

dt = DecisionTreeRegressor(
    featuresCol="features_vector",
    labelCol="label",
    maxDepth=12,
    minInstancesPerNode=2,
    maxBins=128,
    seed=42
)
stages.append(dt)

pipeline = Pipeline(stages=stages)

In [10]:
# 80/20 train/test split with a fixed seed.
train_data, test_data = df_featured.randomSplit([0.8, 0.2], seed=42)

# Cache both splits. Every estimator in the pipeline (IDF, StringIndexer,
# VectorIndexer, DecisionTreeRegressor) runs its own job over the training
# data, and the predictions are evaluated several times afterwards; without
# caching each of those jobs re-executes the whole read/join/filter lineage.
train_data = train_data.cache()
test_data = test_data.cache()

print("Training Decision Tree...")
model = pipeline.fit(train_data)
print("Done!")

# The model predicts log1p(price); expm1 maps predictions back to real prices.
predictions = model.transform(test_data)
predictions = predictions.withColumn("prediction_price", expm1(col("prediction")))

Training Decision Tree...
Done!


In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# R2 is measured on the log-scale label the tree was trained on; RMSE and MAE
# are measured on the back-transformed price.
r2_eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
rmse_eval = RegressionEvaluator(labelCol="price", predictionCol="prediction_price", metricName="rmse")
mae_eval = RegressionEvaluator(labelCol="price", predictionCol="prediction_price", metricName="mae")

print("Decision Tree Results:")
print(f"  R2 (Log Scale): {r2_eval.evaluate(predictions):.4f}")
print(f"  RMSE (Real $):  {rmse_eval.evaluate(predictions):.2f}")
print(f"  MAE (Real $):   {mae_eval.evaluate(predictions):.2f}")

# `title` is treated as optional everywhere else in the notebook, so only
# preview the columns that are actually present instead of failing on select.
preview_cols = [c for c in ("title", "price", "prediction_price") if c in predictions.columns]
predictions.select(*preview_cols).show(5, truncate=False)

Decision Tree Results:


  R2 (Log Scale): 0.1711
  RMSE (Real $):  21.77
  MAE (Real $):   14.53
+---------------------------------------------+-----+------------------+
|title                                        |price|prediction_price  |
+---------------------------------------------+-----+------------------+
|Warhammer 40k Adeptus Mechanicus Codex       |22.41|15.290051444866316|
|IA N64 Mem Asst.                             |10.49|28.579394862120477|
|Beetle Adventure Racing                      |43.98|28.579394862120477|
|Scubapro Quick Release Mouthpiece Clamp      |14.95|22.632618188801377|
|Replacement Lens (Screen) for Game Boy Pocket|12.94|18.178456447566855|
+---------------------------------------------+-----+------------------+
only showing top 5 rows



In [12]:
# The fitted tree is the final stage of the pipeline model.
dt_model = model.stages[-1]
tree_dump = dt_model.toDebugString

print(f"Learned regression tree depth: {dt_model.depth}")
print(f"Num nodes: {dt_model.numNodes}")
print("\nTree Structure (Top levels):")
# Only the first 1000 characters of the textual dump are shown.
print(tree_dump[:1000])

Learned regression tree depth: 12
Num nodes: 3291

Tree Structure (Top levels):
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_b64a5d93f8e1, depth=12, numNodes=3291, numFeatures=1008
  If (feature 174 in {1.0,2.0,3.0,4.0,5.0,6.0,7.0})
   If (feature 3 in {3.0,5.0,11.0,15.0,16.0,29.0})
    If (feature 3 in {5.0,16.0})
     If (feature 2 <= 2.8029010331479984)
      If (feature 489 in {0.0})
       If (feature 388 in {0.0})
        If (feature 4 <= 133.5)
         If (feature 482 in {0.0})
          If (feature 910 in {0.0})
           If (feature 741 in {0.0})
            If (feature 162 in {0.0})
             If (feature 888 in {0.0})
              Predict: 2.0362640180706943
             Else (feature 888 not in {0.0})
              Predict: 3.5051302991935245
            Else (feature 162 not in {0.0})
             Predict: 3.2638956279612708
           Else (feature 741 not in {0.0})
            If (feature 71 in {1.0})
             Predict: 2.0702966732627304
            El

In [13]:
# Persist the whole fitted pipeline, replacing any model already stored at
# this path.
model_path = "../../models/regression/dt_price_log_v1"
model_writer = model.write().overwrite()
model_writer.save(model_path)
print(f"Model saved to: {model_path}")

Model saved to: ../../models/regression/dt_price_log_v1
