## Number of trips per weekday for each year

In [1]:
# Import
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain the Spark session for this notebook; getOrCreate() hands back the
# already-running session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

24/12/25 16:10:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Storage locations used by this notebook.
#
# All HDFS paths share one namenode address, so it is defined once here;
# pointing the notebook at another cluster is a single-line change.
HDFS_ROOT = "hdfs://10.128.0.59:8020"
WAREHOUSE_DIR = f"{HDFS_ROOT}/data_warehouse"

# Raw input files
zone_lookup = f"{HDFS_ROOT}/raw_data/taxi_zone_lookup.csv"

# Data-warehouse tables (fact table plus its dimensions)
fact_trip = f"{WAREHOUSE_DIR}/fact_trip"
dim_vendor = f"{WAREHOUSE_DIR}/dim_vendor"
dim_datetime = f"{WAREHOUSE_DIR}/dim_datetime"
dim_rate_code = f"{WAREHOUSE_DIR}/dim_rate_code"
dim_pickup_location = f"{WAREHOUSE_DIR}/dim_pickup_location"
dim_dropoff_location = f"{WAREHOUSE_DIR}/dim_dropoff_location"
dim_payment = f"{WAREHOUSE_DIR}/dim_payment"

# BigQuery destination, in the form
# uber-analysis-439804.query_result.<table name>
output = "uber-analysis-439804.query_result.trips_per_day"

In [4]:
# Load the two warehouse tables as DataFrames, keeping only the columns
# the query below actually uses.

# Fact table: one trip id plus the key that links it to the datetime dimension
df_fact = (
    spark.read
    .parquet(fact_trip)
    .select("trip_id", "datetimestamp_id")
)

# Datetime dimension: pickup year / weekday plus its join key
df_datetime = (
    spark.read
    .parquet(dim_datetime)
    .select("pick_year", "pick_weekday", "datetime_id")
)

                                                                                

In [5]:
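# Equivalent SQL for the query in the next cell:
# SELECT pick_year AS year, pick_weekday AS day, COUNT(trip_id) AS total_trips
# FROM df_fact
#     INNER JOIN df_datetime
#         ON df_fact.datetimestamp_id = df_datetime.datetime_id
# GROUP BY pick_year, pick_weekday;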

In [6]:
# Attach the datetime attributes to each trip via the datetime key
df_joined = df_fact.join(
    df_datetime,
    on=df_fact.datetimestamp_id == df_datetime.datetime_id,
    how="inner",
)

# Count trips for every (pick_year, pick_weekday) pair, then expose the
# grouping columns under the shorter names "year" and "day".
# Resulting column order: year, day, total_trips.
trips_by_year_weekday = df_joined.groupBy("pick_year", "pick_weekday")
df_result = (
    trips_by_year_weekday
    .agg(F.count("trip_id").alias("total_trips"))
    .withColumnRenamed("pick_year", "year")
    .withColumnRenamed("pick_weekday", "day")
)

In [7]:
# Persist the aggregated result to the BigQuery table named by `output`.
# "overwrite" mode replaces whatever the destination table currently holds.
bigquery_writer = (
    df_result.write
    .format("bigquery")
    .option("table", output)
    .option("temporaryGcsBucket", "uber-pyspark-jobs/temp")
    .mode("overwrite")
)
bigquery_writer.save()

                                                                                

In [8]:
# Shut down the Spark session; this is the last step of the notebook.
spark.stop()