## Number of trips per dropoff location

In [1]:
# Import
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain the SparkSession for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Trip per dropoff location")
    .getOrCreate()
)

24/12/25 15:44:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Path lists
# The namenode address is defined once so that moving to another cluster is a
# single-line change instead of an edit to every path below.
HDFS_ROOT = "hdfs://10.128.0.59:8020"
WAREHOUSE_DIR = HDFS_ROOT + "/data_warehouse"

# Raw reference data
zone_lookup = HDFS_ROOT + "/raw_data/updated_zone_lookup.csv"

# Fact and dimension table locations inside the warehouse directory
fact_trip = WAREHOUSE_DIR + "/fact_trip"
dim_vendor = WAREHOUSE_DIR + "/dim_vendor"
dim_datetime = WAREHOUSE_DIR + "/dim_datetime"
dim_rate_code = WAREHOUSE_DIR + "/dim_rate_code"
dim_pickup_location = WAREHOUSE_DIR + "/dim_pickup_location"
dim_dropoff_location = WAREHOUSE_DIR + "/dim_dropoff_location"
dim_payment = WAREHOUSE_DIR + "/dim_payment"

# BigQuery destination: the shared "uber-analysis-439804.query_result" prefix
# followed by the name of the table this notebook produces.
BQ_DATASET = "uber-analysis-439804.query_result"
output = BQ_DATASET + ".trips_per_do_location"

In [4]:
# Load the three parquet inputs used by the join below.

# Trip facts: keep only the trip id and the two keys used to join the dimensions.
df_fact = spark.read.parquet(fact_trip).select(
    "trip_id", "datetimestamp_id", "do_location_id"
)

# Datetime dimension: the join key plus the pickup year.
df_datetime = spark.read.parquet(dim_datetime).select("pick_year", "datetime_id")

# Drop-off location dimension: loaded with all of its columns.
df_dropoff_location = spark.read.parquet(dim_dropoff_location)

                                                                                

In [5]:
# Joining
# Join predicates are written against the DataFrame aliases assigned in the chain below.
same_datetime = F.col("fact_data.datetimestamp_id") == F.col("dim_datetime.datetime_id")
same_dropoff = F.col("fact_data.do_location_id") == F.col("dim_dropoff_location.DOLocationID")

df_joined = (
    df_fact.alias("fact_data")
    .join(df_datetime.alias("dim_datetime"), same_datetime, "inner")
    # The drop-off dimension is marked with a broadcast hint for this join.
    .join(
        F.broadcast(df_dropoff_location.alias("dim_dropoff_location")),
        same_dropoff,
        "inner",
    )
    .select(
        F.col("fact_data.trip_id").alias("trip_id"),
        F.col("dim_datetime.pick_year").alias("year"),
        F.col("dim_dropoff_location.DOLocationID").alias("LocationID"),
        F.col("dim_dropoff_location.X").alias("dropoff_x"),
        F.col("dim_dropoff_location.Y").alias("dropoff_y"),
        F.col("dim_dropoff_location.zone").alias("zone"),
        F.col("dim_dropoff_location.borough").alias("borough"),
        F.col("dim_dropoff_location.service_zone").alias("service_zone"),
    )
)

# Aggregation
# One output row per year and drop-off location; the location attributes are
# part of the grouping key so they are carried through to the result.
group_keys = [
    "year",
    "LocationID",
    "dropoff_x",
    "dropoff_y",
    "zone",
    "borough",
    "service_zone",
]
df_result = df_joined.groupBy(*group_keys).agg(
    F.count("trip_id").alias("total_trips")
)

In [6]:
# Write the aggregated result to the BigQuery table named by `output`,
# in overwrite mode, using the GCS location below as the temporary bucket.
bq_writer = (
    df_result.write
    .format("bigquery")
    .option("table", output)
    .option("temporaryGcsBucket", "uber-pyspark-jobs/temp")
    .mode("overwrite")
)
bq_writer.save()

                                                                                

In [7]:
# Stop the Spark session; this is the notebook's last step, after the BigQuery write.
spark.stop()