In [50]:
import findspark
import seaborn as sns
from common.const import DATASET, FILEPATH, STAGING_FILENAME
from common.utils import change_case, describe_dataframe_details
from matplotlib import pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (DecisionTreeRegressor, GBTRegressor,
                                   RandomForestRegressor)
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import abs, col, log, log10, row_number, sqrt
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

In [51]:
# Point findspark at the local Spark distribution, then create (or reuse)
# the session shared by every cell below.
# NOTE(review): the install path is machine-specific; consider reading it
# from configuration so the notebook runs on other hosts.
spark_home = "/home/ubuntu/spark-3.2.1-bin-hadoop2.7"
findspark.init(spark_home)

session_builder = SparkSession.builder.appName("basics")
spark = session_builder.getOrCreate()

In [52]:
# Switch on Spark's `spark.sql.repl.eagerEval.enabled` option for this session
# (eager evaluation of DataFrames when they are displayed in a REPL/notebook).
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [53]:
# Load the staged parquet dataset that the rest of the notebook analyses.
staging_parquet_path = f"{FILEPATH.TEMP_STAGING_PATH}/{STAGING_FILENAME.DP}.parquet"
df = spark.read.parquet(staging_parquet_path)

In [54]:
# Quick look at the first rows of the loaded staging data.
df.show()

+---------------+---------------+---------------+-----------------+---------------+---------------+-----------------+----------------------+---------------------+------------------+------------------+------------------+-----------------------------+---------+---+----+---------------------+---------------+---------+------------+---------+------------+-----------------------------+--------------+--------------------+---------------------------+----------------------+----------------------------+----------------------------+
|carbon_emission|recycling_metal|recycling_glass|recycling_plastic|recycling_paper|recycling_count|cooking_with_oven|cooking_with_microwave|cooking_with_airfryer|cooking_with_grill|cooking_with_stove|cooking_with_count|monthly_grocery_bill_quantile|body_type|sex|diet|heating_energy_source|social_activity|recycling|cooking_with|transport|vehicle_type|frequency_of_traveling_by_air|waste_bag_size|monthly_grocery_bill|vehicle_monthly_distance_km|waste_bag_weekly_count|how

In [55]:
# Every column except the target is used as a model input.
label_column = change_case(DATASET.TARGET)
feature_columns = list(df.columns)
feature_columns.remove(label_column)

# Pack the input columns into the single "features" vector column that the
# Spark ML regressors consume.
vec_assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
df_assembler = vec_assembler.transform(df.alias("df_assembler"))
df_assembler.show()

+---------------+---------------+---------------+-----------------+---------------+---------------+-----------------+----------------------+---------------------+------------------+------------------+------------------+-----------------------------+---------+---+----+---------------------+---------------+---------+------------+---------+------------+-----------------------------+--------------+--------------------+---------------------------+----------------------+----------------------------+----------------------------+--------------------+
|carbon_emission|recycling_metal|recycling_glass|recycling_plastic|recycling_paper|recycling_count|cooking_with_oven|cooking_with_microwave|cooking_with_airfryer|cooking_with_grill|cooking_with_stove|cooking_with_count|monthly_grocery_bill_quantile|body_type|sex|diet|heating_energy_source|social_activity|recycling|cooking_with|transport|vehicle_type|frequency_of_traveling_by_air|waste_bag_size|monthly_grocery_bill|vehicle_monthly_distance_km|waste

In [56]:
# Fit three tree-based regressors on the assembled features and collect each
# model's per-feature importance into one Spark DataFrame with the columns
# `feature`, `dt_importance`, `gbt_importance`, `rf_importance`.
ml_models = {
    "dt": DecisionTreeRegressor,
    "gbt": GBTRegressor,
    "rf": RandomForestRegressor,
}
target_column = change_case(DATASET.TARGET)

# NOTE(review): no explicit `seed` is passed to the estimators; confirm whether
# the importances need to be reproducible across kernel restarts.
feature_importances = {}
for model_name, estimator_cls in ml_models.items():
    estimator = estimator_cls(featuresCol="features", labelCol=target_column)
    fitted_model = estimator.fit(df_assembler)
    feature_importances[f"{model_name}_importance"] = fitted_model.featureImportances

# Derive the column order from `ml_models` once, so the row tuples and the
# schema below can never disagree about which score sits in which column.
importance_column_names = [f"{model_name}_importance" for model_name in ml_models]

# Densify each importance vector into plain Python floats (createDataFrame
# with DoubleType expects native floats, not numpy scalars).
importance_scores = {
    name: feature_importances[name].toArray().tolist()
    for name in importance_column_names
}

# zip() would silently truncate on a length mismatch and mis-attribute scores,
# so fail loudly if a vector does not line up with the feature columns.
for name, scores in importance_scores.items():
    if len(scores) != len(feature_columns):
        raise ValueError(
            f"{name} has {len(scores)} entries but there are "
            f"{len(feature_columns)} feature columns"
        )

feature_importances_list = [
    (column_name, *scores)
    for column_name, *scores in zip(
        feature_columns,
        *(importance_scores[name] for name in importance_column_names),
    )
]

fields = [StructField("feature", StringType(), True)]
fields.extend(
    StructField(name, DoubleType(), True) for name in importance_column_names
)
schema = StructType(fields)
feature_importances_df = spark.createDataFrame(feature_importances_list, schema=schema)

In [62]:
# List every feature, highest decision-tree importance first.
n_features = len(feature_importances_list)
feature_importances_df.orderBy(col("dt_importance").desc()).show(
    n_features, truncate=False
)

+-----------------------------+---------------------+---------------------+---------------------+
|feature                      |dt_importance        |gbt_importance       |rf_importance        |
+-----------------------------+---------------------+---------------------+---------------------+
|vehicle_type                 |0.5519874209196011   |0.2826818152375831   |0.36538048778067567  |
|frequency_of_traveling_by_air|0.3425347598272173   |0.14248726887960547  |0.3116564836068295   |
|body_type                    |0.059793460341281746 |0.06003489229426605  |0.04682219429212149  |
|sex                          |0.02420603804804404  |0.05384658189519959  |0.02427851703270535  |
|heating_energy_source        |0.014114709095020717 |0.06194783372754551  |0.017840733576496584 |
|waste_bag_size               |0.005608222059463289 |0.08650204341043664  |0.010229670238508562 |
|recycling                    |0.0017553897093718345|0.12231328887351613  |0.010168671695450255 |
|diet               

In [61]:
# List every feature, highest gradient-boosted-tree importance first.
n_features = len(feature_importances_list)
feature_importances_df.orderBy(col("gbt_importance").desc()).show(
    n_features, truncate=False
)

+-----------------------------+---------------------+---------------------+---------------------+
|feature                      |dt_importance        |gbt_importance       |rf_importance        |
+-----------------------------+---------------------+---------------------+---------------------+
|vehicle_type                 |0.5519874209196011   |0.2826818152375831   |0.36538048778067567  |
|frequency_of_traveling_by_air|0.3425347598272173   |0.14248726887960547  |0.3116564836068295   |
|recycling                    |0.0017553897093718345|0.12231328887351613  |0.010168671695450255 |
|waste_bag_size               |0.005608222059463289 |0.08650204341043664  |0.010229670238508562 |
|cooking_with                 |0.0                  |0.08195591890103371  |0.0057549037268010035|
|heating_energy_source        |0.014114709095020717 |0.06194783372754551  |0.017840733576496584 |
|body_type                    |0.059793460341281746 |0.06003489229426605  |0.04682219429212149  |
|sex                

In [59]:
# List every feature, highest random-forest importance first.
n_features = len(feature_importances_list)
feature_importances_df.orderBy(col("rf_importance").desc()).show(
    n_features, truncate=False
)

+-----------------------------+---------------------+---------------------+---------------------+
|feature                      |dt_importance        |gbt_importance       |rf_importance        |
+-----------------------------+---------------------+---------------------+---------------------+
|vehicle_type                 |0.5519874209196011   |0.2826818152375831   |0.36538048778067567  |
|frequency_of_traveling_by_air|0.3425347598272173   |0.14248726887960547  |0.3116564836068295   |
|transport                    |0.0                  |0.005040589001385832 |0.20012827082632106  |
|body_type                    |0.059793460341281746 |0.06003489229426605  |0.04682219429212149  |
|sex                          |0.02420603804804404  |0.05384658189519959  |0.02427851703270535  |
|heating_energy_source        |0.014114709095020717 |0.06194783372754551  |0.017840733576496584 |
|waste_bag_size               |0.005608222059463289 |0.08650204341043664  |0.010229670238508562 |
|recycling          

In [64]:
# Rank every feature within each model (0 = most important) and add the three
# ranks into a combined `rank`; the lowest total is the most consistently
# important feature.
#
# The ranks come from row_number() over an explicitly ordered window.
# monotonically_increasing_id() is only guaranteed to be increasing and unique,
# not consecutive: with more than one partition the ids jump by large
# partition-dependent offsets, which would make the summed rank meaningless.
# The un-partitioned window pulls all rows into one partition, which is fine
# for this table (one row per feature).
rank_sources = {
    "dt_rank": "dt_importance",
    "gbt_rank": "gbt_importance",
    "rf_rank": "rf_importance",
}

ranked_feature_importances_df = feature_importances_df
for rank_column, importance_column in rank_sources.items():
    # Ties (e.g. several features with 0.0 importance) are broken by feature
    # name so the ranking is deterministic from run to run.
    ordering = Window.orderBy(col(importance_column).desc(), col("feature"))
    ranked_feature_importances_df = ranked_feature_importances_df.withColumn(
        rank_column, row_number().over(ordering) - 1  # keep ranks 0-based
    )

ranked_feature_importances_df = ranked_feature_importances_df.withColumn(
    "rank", col("dt_rank") + col("gbt_rank") + col("rf_rank")
)
ranked_feature_importances_df.sort("rank", "feature").show(
    len(feature_importances_list), truncate=False
)

+-----------------------------+---------------------+---------------------+---------------------+-------+--------+-------+----+
|feature                      |dt_importance        |gbt_importance       |rf_importance        |dt_rank|gbt_rank|rf_rank|rank|
+-----------------------------+---------------------+---------------------+---------------------+-------+--------+-------+----+
|vehicle_type                 |0.5519874209196011   |0.2826818152375831   |0.36538048778067567  |0      |0       |0      |0   |
|frequency_of_traveling_by_air|0.3425347598272173   |0.14248726887960547  |0.3116564836068295   |1      |1       |1      |3   |
|body_type                    |0.059793460341281746 |0.06003489229426605  |0.04682219429212149  |2      |6       |3      |11  |
|heating_energy_source        |0.014114709095020717 |0.06194783372754551  |0.017840733576496584 |4      |5       |5      |14  |
|sex                          |0.02420603804804404  |0.05384658189519959  |0.02427851703270535  |3      

In [60]:
# For each model's importance column, keep only the features that model gave a
# non-zero importance (plus the target) and write that subset of `df` to its
# own parquet file, overwriting any previous export.
importance_columns = [
    "dt_importance",
    "gbt_importance",
    "rf_importance",
]
# The loop variable has its own name: reusing `importance_columns` would rebind
# the list to its last element, a plain string.
for importance_column in importance_columns:
    selected_features = [
        row.feature
        for row in feature_importances_df.where(col(importance_column) > 0)
        .select("feature")
        .collect()
    ]
    # `tranformed_df` (sic) keeps its original spelling because the following
    # cell reads it; after the loop it holds the last model's selection.
    tranformed_df = df.select(["carbon_emission", *selected_features])
    tranformed_df.write.mode("overwrite").parquet(
        f"{FILEPATH.TEMP_STAGING_PATH}/{STAGING_FILENAME.DT}_{importance_column}.parquet"
    )

In [None]:
# `tranformed_df` is whatever the export loop assigned last, i.e. the
# selection made for the final entry of its importance-column list
# ("rf_importance"), not the dt or gbt selections.
describe_dataframe_details(spark=spark, df=tranformed_df)

In [None]:
# Compare the raw distribution of one numeric column with its square-root,
# natural-log and log10 transforms, side by side.
column_name = "vehicle_monthly_distance_km"

# `abs` here is pyspark.sql.functions.abs (imported at the top of the
# notebook), not the Python builtin.
# NOTE(review): rows where the distance is 0 have no finite logarithm; confirm
# how those rows should be treated in the two log panels.
magnitude = abs(col(column_name))
tmp_df = (
    df.select(column_name, "carbon_emission")
    .withColumn("sqrt", sqrt(magnitude))
    .withColumn("log", log(magnitude))
    .withColumn("log10", log10(magnitude))
    .toPandas()
)

# Apply the theme BEFORE creating the figure and pass the size to subplots()
# directly. Previously plt.figure() opened a stray empty figure and the
# four-panel figure was created at the default size, because the rc figsize
# was only set afterwards.
sns.set_theme(context="talk", rc={"figure.figsize": (30, 5)})
fig, axs = plt.subplots(ncols=4, figsize=(30, 5))

panels = [
    (column_name, "Original"),
    ("sqrt", "Square Root"),
    ("log", "LogN"),
    ("log10", "Log10"),
]
for ax, (x_column, panel_title) in zip(axs, panels):
    sns.histplot(data=tmp_df, x=x_column, ax=ax).set(title=panel_title)