In [1]:
import findspark
from common.const import DATASET, FILEPATH, STAGING_FILENAME
from common.utils import change_case
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (DecisionTreeRegressor, GBTRegressor,
                                   RandomForestRegressor)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

In [2]:
findspark.init("/home/ubuntu/spark-3.2.1-bin-hadoop2.7")
spark = SparkSession.builder.appName("basics").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/23 22:11:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [4]:
df_dict = {
    "dt_imp_df": spark.read.parquet(
        f"{FILEPATH.TEMP_STAGING_PATH}/{STAGING_FILENAME.DMA}_dt_importance.parquet"
    ),
    "gbt_imp_df": spark.read.parquet(
        f"{FILEPATH.TEMP_STAGING_PATH}/{STAGING_FILENAME.DMA}_gbt_importance.parquet"
    ),
    "rf_imp_df": spark.read.parquet(
        f"{FILEPATH.TEMP_STAGING_PATH}/{STAGING_FILENAME.DMA}_rf_importance.parquet"
    ),
    "03_df": spark.read.parquet(
        f"{FILEPATH.TEMP_STAGING_PATH}/{STAGING_FILENAME.DP}.parquet"
    ),
}

                                                                                

In [5]:
ran_for = RandomForestRegressor(featuresCol="features")
gbt = GBTRegressor(featuresCol="features")
dt = DecisionTreeRegressor(featuresCol="features")
ml_models = {
    "random_forest": ran_for,
    "gradient_boost": gbt,
    "decision_tree": dt,
}
param_grid = {
    "random_forest": ParamGridBuilder()
    .addGrid(ran_for.maxDepth, [6, 7])
    .addGrid(ran_for.maxBins, [16, 32])
    .addGrid(ran_for.numTrees, [20, 30])
    .addGrid(ran_for.subsamplingRate, [0.1, 1])
    .build(),
    "gradient_boost": ParamGridBuilder()
    .addGrid(gbt.maxDepth, [6, 7])
    .addGrid(gbt.maxBins, [16, 32])
    .addGrid(gbt.maxIter, [20])
    .build(),
    "decision_tree": ParamGridBuilder()
    .addGrid(dt.maxDepth, [6, 7])
    .addGrid(dt.maxBins, [16, 32])
    .build(),
}
best_models = {}
for df_name, df in df_dict.items():
    print(df_name)
    feature_columns = df.columns.copy()
    feature_columns.remove(change_case(DATASET.TARGET))
    vec_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    vec_df = vec_assembler.transform(df)
    vec_df = vec_df.withColumnRenamed(change_case(DATASET.TARGET), "label")
    train, test = vec_df.randomSplit([0.8, 0.2])
    best_models[df_name] = {}
    for model_name, ml_model in ml_models.items():
        print(model_name)
        cv = CrossValidator(
            estimator=ml_model,
            estimatorParamMaps=param_grid[model_name],
            evaluator=RegressionEvaluator(),
            parallelism=2,
            numFolds=3,
        )
        model = cv.fit(train)
        best_models[df_name][model_name] = {
            "model": model.bestModel,
        }
    for model_name, value in best_models[df_name].items():
        y_pred = value["model"].transform(test)
        valuesAndPreds = y_pred.select(["label", "prediction"])
        valuesAndPreds = valuesAndPreds.withColumn(
            "label", col("label").cast(DoubleType())
        )
        valuesAndPreds = valuesAndPreds.rdd.map(tuple)
        metrics = RegressionMetrics(valuesAndPreds)
        best_models[df_name][model_name]["score"] = {
            "r2": metrics.r2,
            "mse": metrics.meanSquaredError,
            "rmse": metrics.rootMeanSquaredError,
            "mae": metrics.meanAbsoluteError,
        }

dt_imp_df
random_forest


                                                                                

gradient_boost


24/05/23 22:14:14 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/23 22:14:14 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


decision_tree


                                                                                

gbt_imp_df
random_forest


24/05/23 22:16:30 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


gradient_boost


24/05/23 22:19:55 WARN BlockManager: Asked to remove block broadcast_12697, which does not exist


decision_tree


24/05/23 22:20:13 WARN BlockManager: Block rdd_19919_0 already exists on this machine; not re-adding it


rf_imp_df
random_forest
gradient_boost


24/05/23 22:22:22 WARN BlockManager: Block rdd_24694_0 already exists on this machine; not re-adding it


decision_tree
03_df
random_forest
gradient_boost
decision_tree


In [6]:
for data_name, model in best_models.items():
    print(data_name)
    for model_name, data in model.items():
        print(f"\t{model_name}")
        for key, value in data.items():
            if key == "model":
                # print(f"\t{key}: {value.explainParams()}")
                print(f"\t\t{key}: {value}")
            else:
                print(f"\t\t{key}:")
                for score_name, score_value in value.items():
                    print(f"\t\t\t{score_name}: {score_value}")

dt_imp_df
	random_forest
		model: RandomForestRegressionModel: uid=RandomForestRegressor_bd7a78c895d0, numTrees=30, numFeatures=7
		score:
			r2: 0.5166498879386539
			mse: 320017.85300358065
			rmse: 565.7012047040209
			mae: 408.944532935262
	gradient_boost
		model: GBTRegressionModel: uid=GBTRegressor_a4ee322a2040, numTrees=20, numFeatures=7
		score:
			r2: 0.5797021589551029
			mse: 348193.37101294973
			rmse: 590.0791226716547
			mae: 417.1171920268038
	decision_tree
		model: DecisionTreeRegressionModel: uid=DecisionTreeRegressor_0a946d3fbc42, depth=6, numNodes=127, numFeatures=7
		score:
			r2: 0.4991844028890783
			mse: 372402.58319812926
			rmse: 610.2479686145045
			mae: 437.5841602669964
gbt_imp_df
	random_forest
		model: RandomForestRegressionModel: uid=RandomForestRegressor_bd7a78c895d0, numTrees=20, numFeatures=15
		score:
			r2: 0.4344631200650574
			mse: 351928.608907014
			rmse: 593.2357110854117
			mae: 425.77634001048904
	gradient_boost
		model: GBTRegressionModel: ui