In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, length, when, log1p, expm1, lower, percentile_approx, count

spark = SparkSession.builder \
    .appName("Amazon Price Prediction LR") \
    .config("spark.driver.memory", "8g") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

DATA_PATH = "../../data/regression_price"

In [2]:
df = spark.read.parquet(DATA_PATH)
print(f"Total rows: {df.count()}")

has_array_features = [f.dataType for f in df.schema.fields if f.name == "features"]
if str(has_array_features[0]).startswith("ArrayType"):
    df = df.withColumn("features_count", size(col("features")))
else:
    if "features_count" not in df.columns and "features" in df.columns:
         df = df.withColumn("features_count", length(col("features")))

required_cols = ["rating_number", "average_rating", "main_category", "price"]
df_clean = df.dropna(subset=required_cols)

store_counts = df_clean.groupBy("store").agg(count("*").alias("store_freq"))
df_improved = df_clean.join(store_counts, on="store", how="left")
df_improved = df_improved.na.fill(0, subset=["store_freq", "features_count"])

Total rows: 699283


In [3]:
price_stats = df_improved.select(
    percentile_approx("price", 0.25).alias("q1"),
    percentile_approx("price", 0.75).alias("q3")
).collect()[0]

q1, q3 = price_stats["q1"], price_stats["q3"]
iqr = q3 - q1
lower_bound = max(1.0, q1 - 1.5 * iqr)  
upper_bound = q3 + 1.5 * iqr

print(f"Price Bounds: {lower_bound:.2f} - {upper_bound:.2f}")

df_featured = df_improved.filter(
    (col("price") >= lower_bound) & (col("price") <= upper_bound)
)

if "title" in df_featured.columns:
    df_featured = df_featured.withColumn("title_len", length(col("title")))
    df_featured = df_featured.withColumn("is_premium", when(lower(col("title")).rlike("premium|pro|deluxe"), 1).otherwise(0))
    df_featured = df_featured.withColumn("is_bundle", when(lower(col("title")).rlike("bundle|set|pack"), 1).otherwise(0))

df_featured = df_featured.withColumn("log_rating_number", log1p(col("rating_number")))
df_featured = df_featured.withColumn("log_store_freq", log1p(col("store_freq")))

df_featured = df_featured.withColumn("label", log1p(col("price")))

print(f"Rows for training: {df_featured.count()}")

Price Bounds: 1.00 - 105.49
Rows for training: 565678


In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, HashingTF, StringIndexer, 
    OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml.regression import LinearRegression

stages = []

if "title" in df_featured.columns:
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    stages.append(tokenizer)
    hashingTF = HashingTF(inputCol="words", outputCol="title_features", numFeatures=300)
    stages.append(hashingTF)

indexer = StringIndexer(inputCol="main_category", outputCol="category_idx", handleInvalid="keep")
stages.append(indexer)
encoder = OneHotEncoder(inputCol="category_idx", outputCol="category_vec")
stages.append(encoder)

numeric_cols = ["average_rating", "log_rating_number", "log_store_freq"]
optional_cols = ["title_len", "is_premium", "is_bundle", "features_count"]

for c in optional_cols:
    if c in df_featured.columns:
        numeric_cols.append(c)

input_cols = numeric_cols + ["category_vec"]
if "title" in df_featured.columns:
    input_cols.append("title_features")

assembler = VectorAssembler(inputCols=input_cols, outputCol="features_raw")
stages.append(assembler)

scaler = StandardScaler(inputCol="features_raw", outputCol="features_vector", withStd=True, withMean=False)
stages.append(scaler)

lr = LinearRegression(
    featuresCol="features_vector", 
    labelCol="label",
    maxIter=50,
    regParam=0.1,
    elasticNetParam=0.5
)
stages.append(lr)

pipeline = Pipeline(stages=stages)

In [5]:
train_data, test_data = df_featured.randomSplit([0.8, 0.2], seed=42)

print("Training Linear Regression model...")
model = pipeline.fit(train_data)
print("Training Complete!")

predictions = model.transform(test_data)

predictions = predictions.withColumn("prediction_price", expm1(col("prediction")))

Training Linear Regression model...
Training Complete!


In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

r2_eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
rmse_eval = RegressionEvaluator(labelCol="price", predictionCol="prediction_price", metricName="rmse")
mae_eval = RegressionEvaluator(labelCol="price", predictionCol="prediction_price", metricName="mae")

print("Evaluation Results:")
print(f"  R2 (Log Scale): {r2_eval.evaluate(predictions):.4f}")
print(f"  RMSE (Real $):  {rmse_eval.evaluate(predictions):.2f}")
print(f"  MAE (Real $):   {mae_eval.evaluate(predictions):.2f}")

print("\nExample Predictions:")
predictions.select("title", "price", "prediction_price").show(5, truncate=False)

Evaluation Results:
  R2 (Log Scale): 0.0539
  RMSE (Real $):  22.82
  MAE (Real $):   15.49

Example Predictions:
+---------------------------------------------+-----+------------------+
|title                                        |price|prediction_price  |
+---------------------------------------------+-----+------------------+
|Warhammer 40k Adeptus Mechanicus Codex       |22.41|22.089285612794676|
|IA N64 Mem Asst.                             |10.49|22.089285612794676|
|Beetle Adventure Racing                      |43.98|22.089285612794676|
|Scubapro Quick Release Mouthpiece Clamp      |14.95|22.089285612794676|
|Replacement Lens (Screen) for Game Boy Pocket|12.94|22.089285612794676|
+---------------------------------------------+-----+------------------+
only showing top 5 rows



In [7]:
lr_model = model.stages[-1]

print(f"Intercept (Базова логарифмічна ціна): {lr_model.intercept:.4f}")

print("\nApproximate Coefficients for Numeric Features:")
coeffs = lr_model.coefficients
for i, name in enumerate(numeric_cols):
    if i < len(coeffs):
        print(f"  {name}: {coeffs[i]:.4f}")

Intercept (Базова логарифмічна ціна): 3.1394

Approximate Coefficients for Numeric Features:
  average_rating: 0.0000
  log_rating_number: 0.0000
  log_store_freq: 0.0000
  title_len: 0.0000
  is_premium: 0.0000
  is_bundle: 0.0000
  features_count: 0.0000


In [8]:
model_path = "../../models/regression/lr_price_log_v1"
model.write().overwrite().save(model_path)
print(f"Model saved to: {model_path}")

Model saved to: ../../models/regression/lr_price_log_v1
