In [None]:
from pyspark.sql import SparkSession, functions as F

In [45]:
# Start (or reuse) a Spark session running in local mode on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

## Green Taxi

In [46]:
# Read every green-taxi parquet file matched by the two-level glob below.
green_parquet_glob = './data/pq/green/*/*'
df_green = spark.read.parquet(green_parquet_glob)

In [47]:
# Drop the `lpep_` prefix from the green timestamp columns.
df_green = (
    df_green
    .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')
)

## Yellow Taxi

In [48]:
# Read every yellow-taxi parquet file matched by the two-level glob below.
yellow_parquet_glob = './data/pq/yellow/*/*'
df_yellow = spark.read.parquet(yellow_parquet_glob)

In [49]:
# Drop the `tpep_` prefix from the yellow timestamp columns.
df_yellow = (
    df_yellow
    .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')
)

## Merge

In [50]:
# Columns present in both frames, kept in the order they appear in df_green.
yellow_col_names = set(df_yellow.columns)
common_cols = [name for name in df_green.columns if name in yellow_col_names]

In [51]:
# Project each frame onto the shared columns and tag its rows with a
# constant `service_type` label.
df_green_sel = (
    df_green
    .select(common_cols)
    .withColumn('service_type', F.lit('green'))
)
df_yellow_sel = (
    df_yellow
    .select(common_cols)
    .withColumn('service_type', F.lit('yellow'))
)

In [52]:
# Stack green and yellow trips into one frame. The union matches columns by
# position; both inputs were selected with the same `common_cols` list.
df_trips_data = df_green_sel.union(df_yellow_sel)

In [53]:
# Row count per service type in the merged frame.
(
    df_trips_data
    .groupBy('service_type')
    .count()
    .show()
)



+------------+--------+
|service_type|   count|
+------------+--------+
|       green| 2304517|
|      yellow|39649199|
+------------+--------+



                                                                                

In [55]:
# Expose the merged trips to Spark SQL as the temp view `trips_data`.
# createOrReplaceTempView is used instead of createTempView so this cell can
# be re-run in the same session: createTempView raises when a view with that
# name is already registered.
df_trips_data.createOrReplaceTempView('trips_data')

In [56]:
# Row count per service type, expressed in Spark SQL against the
# `trips_data` temp view.
service_counts_sql = """
  SELECT
    service_type,
    count(1)
  FROM trips_data
  GROUP BY 1
"""
spark.sql(service_counts_sql).show()



+------------+--------+
|service_type|count(1)|
+------------+--------+
|       green| 2304517|
|      yellow|39649199|
+------------+--------+



                                                                                

In [57]:
# Monthly revenue report: one row per (pickup zone, pickup month, service type)
# with summed monetary columns and average passenger count / trip distance.
# The query is lazy here; nothing executes until an action is called on df_result.
monthly_revenue_sql = """
  SELECT 
      -- Revenue grouping 
      PULocationID AS revenue_zone,
      date_trunc('month', pickup_datetime) AS revenue_month, 
      service_type, 

      -- Revenue calculation 
      SUM(fare_amount) AS revenue_monthly_fare,
      SUM(extra) AS revenue_monthly_extra,
      SUM(mta_tax) AS revenue_monthly_mta_tax,
      SUM(tip_amount) AS revenue_monthly_tip_amount,
      SUM(tolls_amount) AS revenue_monthly_tolls_amount,
      SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge,
      SUM(total_amount) AS revenue_monthly_total_amount,
      SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge,

      -- Additional calculations
      AVG(passenger_count) AS avg_monthly_passenger_count,
      AVG(trip_distance) AS avg_monthly_trip_distance
  FROM
      trips_data
  GROUP BY
      1, 2, 3
  """
df_result = spark.sql(monthly_revenue_sql)

In [58]:
# Reduce the result to a single partition before writing, and overwrite any
# existing output under the report directory.
(
    df_result
    .coalesce(1)
    .write
    .mode('overwrite')
    .parquet('data/report/revenue/')
)

                                                                                

In [59]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()