In [31]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

In [32]:
spark = SparkSession.builder \
    .appName("modelling") \
    .config("spark.driver.memory", "6g") \
    .getOrCreate()

In [33]:
df = spark.read.parquet("../dataset/cooked/feature_sales")

In [34]:
df.printSchema()
df.show(5)

root
 |-- features: vector (nullable = true)
 |-- avg_sales: double (nullable = true)

+--------------------+-------------------+
|            features|          avg_sales|
+--------------------+-------------------+
|[0.0,4.0,2.0,2015...|0.26294343240651963|
|[1.0,0.0,12.0,201...|  2.123805256869773|
|[1.0,3.0,11.0,201...|  0.842948717948718|
|[1.0,4.0,9.0,2011...| 0.1284116331096197|
|[1.0,6.0,11.0,201...| 0.5502912621359223|
+--------------------+-------------------+
only showing top 5 rows



In [35]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=1)

In [36]:
gbt = GBTRegressor(
    labelCol="avg_sales",
    featuresCol="features",
    maxIter=100,
    maxDepth=6,
    stepSize=0.1,
    subsamplingRate=0.8
)

In [37]:
model = gbt.fit(train_df)

In [38]:
predic = model.transform(test_df)
predic.select("avg_sales", "prediction").show(5)

+------------------+------------------+
|         avg_sales|        prediction|
+------------------+------------------+
| 1.038231780167264|0.9268097888557923|
|1.7888291517323776|1.5988951820915114|
|1.4645061728395061|1.3644772097216993|
|1.4003086419753086|1.6271540592157534|
| 0.879783950617284|0.8622393481893971|
+------------------+------------------+
only showing top 5 rows



In [39]:
rmse = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="rmse"
)

mae = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="mae"
)

mse = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="mse"
)

r2 = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="r2"
)

rmse = rmse.evaluate(predic)
mae = mae.evaluate(predic)
mse = mse.evaluate(predic)
r2 = r2.evaluate(predic)

rmse, mae, mse, r2

(0.09295132803010996,
 0.05966771584220571,
 0.008639949382561105,
 0.9842046902612431)

In [40]:
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [4, 6, 8]) \
    .addGrid(gbt.maxIter, [50, 100]) \
    .addGrid(gbt.stepSize, [0.05, 0.1, 0.15]) \
    .addGrid(gbt.subsamplingRate, [0.7, 0.8, 0.9]) \
    .addGrid(gbt.minInstancesPerNode, [10, 50]) \
    .build()

In [41]:
evaluator_rmse = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="rmse"
)

# evaluator_mae = RegressionEvaluator(
#     labelCol="avg_sales",
#     predictionCol="prediction",
#     metricName="mae"
# )

# evaluator_mse = RegressionEvaluator(
#     labelCol="avg_sales",
#     predictionCol="prediction",
#     metricName="mse"
# )

# evaluator_r2 = RegressionEvaluator(
#     labelCol="avg_sales",
#     predictionCol="prediction",
#     metricName="r2"
# )

In [42]:
cv_rmse = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_rmse,
    numFolds=3,
    parallelism=2
)

# cv_mae = CrossValidator(
#     estimator=gbt,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator_mae,
#     numFolds=3,
#     parallelism=2
# )

# cv_mse = CrossValidator(
#     estimator=gbt,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator_mse,
#     numFolds=3,
#     parallelism=2
# )

# cv_r2 = CrossValidator(
#     estimator=gbt,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator_r2,
#     numFolds=3,
#     parallelism=2
# )

In [43]:
cv_model = cv_rmse.fit(train_df)

In [44]:
best_model = cv_model.bestModel

In [45]:
print("Best maxDepth:", best_model.getMaxDepth())
print("Best maxIter:", best_model.getMaxIter())
print("Best stepSize:", best_model.getStepSize())
print("Best subsamplingRate:", best_model.getSubsamplingRate())


Best maxDepth: 6
Best maxIter: 100
Best stepSize: 0.15
Best subsamplingRate: 0.9


In [46]:
predic = best_model.transform(test_df)

In [47]:
rmse = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="rmse"
).evaluate(predic)

mae = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="mae"
).evaluate(predic)

mse = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="mse"
).evaluate(predic)

r2 = RegressionEvaluator(
    labelCol="avg_sales",
    predictionCol="prediction",
    metricName="r2"
).evaluate(predic)

rmse, mae, mse, r2


(0.10310474509456531,
 0.06496936093912246,
 0.01063058846101529,
 0.9805654605123142)