## Aggregation per Year and per Payment Type

In [1]:
# Import
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain the Spark session for this job; getOrCreate() hands back the session
# that is already running if there is one, otherwise it builds a new one.
session_builder = SparkSession.builder.appName("General Aggregation Per Year")
spark = session_builder.getOrCreate()

24/12/28 16:17:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Input locations on HDFS: one directory per warehouse table
warehouse_root = "hdfs://10.128.0.59:8020/data_warehouse"
fact_trip = f"{warehouse_root}/fact_trip"
dim_datetime = f"{warehouse_root}/dim_datetime"
dim_payment = f"{warehouse_root}/dim_payment"

# Output table identifiers: the shared "uber-analysis-439804.query_result"
# prefix followed by the table's name
result_prefix = "uber-analysis-439804.query_result"
output_year = f"{result_prefix}.agg_per_year"
output_payment_year = f"{result_prefix}.payment_per_year"
output_payment = f"{result_prefix}.agg_per_payment"

In [4]:
# Load the fact table and both dimension tables from their parquet directories
df_fact = spark.read.parquet(fact_trip)

# From the datetime dimension only the pickup year and the join key are kept
df_datetime = spark.read.parquet(dim_datetime).select("pick_year", "datetime_id")

# Payment dimension without the rows whose payment_type is 4 or 6
excluded_payment_types = [4, 6]
df_payment = spark.read.parquet(dim_payment).filter(
    ~F.col("payment_type").isin(excluded_payment_types)
)

                                                                                

In [5]:
# Join the trip facts with the datetime dimension (supplies pick_year) and the
# payment dimension (supplies payment_type_name). Both joins are inner joins,
# so fact rows without a match on either side are dropped. The payment
# dimension is passed with a broadcast hint.
#
# NOTE: every Spark function below is taken from the explicit `F` namespace.
# The bare names (`sum`, `count`, ...) only worked because of the wildcard
# import of pyspark.sql.functions, which shadows Python's builtin `sum`.
df_joined = (
    df_fact
    .join(
        df_datetime,
        df_fact.datetimestamp_id == df_datetime.datetime_id,
        "inner",
    )
    .join(
        F.broadcast(df_payment),
        df_fact.payment_id == df_payment.payment_type,
        "inner",
    )
)

# Shared aggregate: number of rows with a non-null trip_id in each group
trip_count = F.count("trip_id").alias("total_trips")

# Aggregation each year: totals and per-trip averages, one row per pickup year
df_year_result = (
    df_joined
    .groupBy("pick_year")
    .agg(
        trip_count,
        F.sum("passenger_count").alias("total_passengers"),
        F.sum("trip_distance").alias("total_distance"),
        F.sum("total_amount").alias("total_amount"),
        F.avg("passenger_count").alias("average_passengers_per_trip"),
        F.avg("trip_distance").alias("average_distance_per_trip"),
        F.avg("total_amount").alias("average_amount_per_trip"),
    )
    # Column order is already the desired one; only the key needs renaming
    .withColumnRenamed("pick_year", "year")
)

# Trip count for every (year, payment type) combination
df_payment_each_year_result = (
    df_joined
    .groupBy("pick_year", "payment_id", "payment_type_name")
    .agg(trip_count)
    .withColumnRenamed("pick_year", "year")
    .withColumnRenamed("payment_type_name", "payment_name")
)

# Trip count for every payment type across all years
df_payment_result = (
    df_joined
    .groupBy("payment_id", "payment_type_name")
    .agg(trip_count)
    .withColumnRenamed("payment_type_name", "payment_name")
)

In [6]:
# Save to BigQuery: every result frame overwrites its destination table, and
# all three writes pass the same temporaryGcsBucket option.
for result_df, target_table in (
    (df_year_result, output_year),
    (df_payment_each_year_result, output_payment_year),
    (df_payment_result, output_payment),
):
    bq_writer = result_df.write.format("bigquery")
    bq_writer = bq_writer.option("table", target_table)
    bq_writer = bq_writer.option("temporaryGcsBucket", "uber-pyspark-jobs/temp")
    bq_writer.mode("overwrite").save()

24/12/28 16:17:50 WARN DAGScheduler: Broadcasting large task binary with size 1049.8 KiB
24/12/28 16:26:27 WARN DAGScheduler: Broadcasting large task binary with size 1159.5 KiB
24/12/28 16:31:37 WARN DAGScheduler: Broadcasting large task binary with size 1387.6 KiB
24/12/28 16:31:46 WARN DAGScheduler: Broadcasting large task binary with size 1047.7 KiB
24/12/28 16:38:04 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
24/12/28 16:43:59 WARN DAGScheduler: Broadcasting large task binary with size 1343.9 KiB
24/12/28 16:44:06 WARN DAGScheduler: Broadcasting large task binary with size 1047.7 KiB
24/12/28 16:59:16 WARN DAGScheduler: Broadcasting large task binary with size 1121.3 KiB
24/12/28 17:11:14 WARN DAGScheduler: Broadcasting large task binary with size 1342.6 KiB
                                                                                

In [7]:
# Shut down the Spark session now that the notebook's work is done.
spark.stop()