## Decision Tree

In [None]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses a
# session that already exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("TLC Decision Tree")
    .getOrCreate()
)

In [None]:
# Input locations (plain strings, not lists): the two datasets that are read
# as Parquet from HDFS further down.
fact_trip = "hdfs://10.128.0.59:8020/data_warehouse/fact_trip"
dim_datetime = "hdfs://10.128.0.59:8020/data_warehouse/dim_datetime"

# Destination for the evaluation metrics; passed as the "table" option of the
# BigQuery writer at the end of the notebook.
# NOTE(review): presumably "<project>.<dataset>.<table>" -- confirm.
output = "uber-analysis-439804.query_result.model_evaluation"

In [None]:
# Trip facts: loaded with every column.
df_fact = spark.read.parquet(fact_trip)

# Datetime dimension, reduced to the join key plus the four time columns
# that are used as model features later on.
df_datetime = spark.read.parquet(dim_datetime).select(
    "datetime_id",
    "pick_hour",
    "pick_weekday_id",
    "drop_hour",
    "drop_weekday_id",
)

# Attach the time columns to each trip. The inner join drops trips whose
# datetimestamp_id has no match in the dimension; both key columns are
# removed afterwards because they are not needed once the join is done.
df_joined = (
    df_fact
    .join(
        df_datetime,
        on=df_fact["datetimestamp_id"] == df_datetime["datetime_id"],
        how="inner",
    )
    .drop("datetimestamp_id", "datetime_id")
)

# Print the resulting schema as a quick sanity check.
df_joined.printSchema()

In [None]:
# Columns packed into the model's feature vector. The grouping comments are
# based on the column names only.
# NOTE(review): fare_amount, extra, mta_tax, tip_amount and tolls_amount look
# like components of total_amount, which is used as the label when the model
# is fitted -- confirm that feeding them in as features is intended.
selected_columns = [
    # *_id columns
    "vendor_id", "pu_location_id", "do_location_id", "ratecode_id", "payment_id",
    # trip size
    "passenger_count", "trip_distance",
    # charge columns
    "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount",
    # time columns joined in from the datetime dimension
    "pick_hour", "pick_weekday_id", "drop_hour", "drop_weekday_id",
]

# Combine the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=selected_columns, outputCol="features")
data_transformed = assembler.transform(df_joined)

In [None]:
# Randomly split the assembled data into a training part (weight 0.8) and a
# test part (weight 0.2), using a fixed seed for the sampling.
train_data, test_data = data_transformed.randomSplit(weights=[0.8, 0.2], seed=1234)

In [None]:
# Decision-tree regressor predicting "total_amount" from the "features"
# vector; no other hyperparameters are set here.
dt_model = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features")

# Fit on the training split, then score the held-out split. transform() adds
# the predictions as a new column (read back as "prediction" by the evaluators).
trained_model = dt_model.fit(train_data)
predictions = trained_model.transform(test_data)

In [None]:
# Evaluation
# One evaluator per metric; each compares the "prediction" column with the
# "total_amount" label column.
rmse_evaluator = RegressionEvaluator(
    labelCol="total_amount", predictionCol="prediction", metricName="rmse"
)
mae_evaluator = RegressionEvaluator(
    labelCol="total_amount", predictionCol="prediction", metricName="mae"
)
r2_evaluator = RegressionEvaluator(
    labelCol="total_amount", predictionCol="prediction", metricName="r2"
)

# `predictions` is a lazy DataFrame: without persisting it, each evaluate()
# call below would re-run its whole lineage (Parquet reads, join, feature
# assembly and model scoring). Persist it for the three passes and release
# the storage afterwards, even if one of the evaluations fails.
predictions.persist()
try:
    rmse = rmse_evaluator.evaluate(predictions)
    mae = mae_evaluator.evaluate(predictions)
    r2 = r2_evaluator.evaluate(predictions)
finally:
    predictions.unpersist()

In [None]:
# Report the three metrics, one per line, under a heading.
print(
    "Decision Tree Regression Model Performance:",
    f" RMSE: {rmse}",
    f" MAE: {mae}",
    f" R²: {r2}",
    sep="\n",
)

In [None]:
# `Row` is imported explicitly here: otherwise it is only reachable through
# the wildcard imports in the first cell, and whether those export it depends
# on the installed PySpark version.
from pyspark.sql import Row

# Single-row DataFrame holding this model's name and its three metrics.
evaluation_data = spark.createDataFrame(
    [Row(name="Decision Tree Regression", rmse=rmse, mae=mae, r2=r2)]
)

evaluation_data.show()

# Append the row to the BigQuery table named by `output`, so rows that are
# already in the table are kept.
# NOTE(review): "temporaryGcsBucket" is presumably the GCS staging location
# used by the BigQuery connector -- confirm the bucket/path is writable.
(
    evaluation_data.write
    .format("bigquery")
    .option("table", output)
    .option("temporaryGcsBucket", "uber-pyspark-jobs/temp")
    .mode("append")
    .save()
)