In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

25/03/18 16:34:38 WARN Utils: Your hostname, Bastiens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.30 instead (on interface en0)
25/03/18 16:34:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/18 16:34:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load every parquet file matching the two-level glob under ./data/pq/green.
df_green = (
    spark.read
    .parquet('./data/pq/green/*/*')
)

                                                                                

In [6]:
# Strip the "lpep_" prefix so the pickup/dropoff timestamp columns use
# the same names as the yellow dataset.
df_green = (
    df_green
    .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')
)

In [7]:
# Load every parquet file matching the two-level glob under ./data/pq/yellow.
df_yellow = (
    spark.read
    .parquet('./data/pq/yellow/*/*')
)

In [8]:
# Strip the "tpep_" prefix so the pickup/dropoff timestamp columns use
# the same names as the green dataset.
df_yellow = (
    df_yellow
    .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')
)

In [9]:
# Columns present in both datasets, kept in the green dataset's column order.
yellow_column_names = set(df_yellow.columns)
common_columns = [name for name in df_green.columns if name in yellow_column_names]

In [10]:
# Restrict each dataset to the shared columns and tag every row with the
# service it came from, so the two frames can be stacked.
df_green_sel = (
    df_green
    .select(common_columns)
    .withColumn('service_type', F.lit('green'))
)
df_yellow_sel = (
    df_yellow
    .select(common_columns)
    .withColumn('service_type', F.lit('yellow'))
)

In [11]:
# Stack green and yellow trips. The union matches columns by position and keeps
# duplicates (SQL UNION ALL semantics); both inputs were selected with the same
# column list, so the positions line up.
df_trips_data = df_green_sel.union(df_yellow_sel)

In [12]:
# Sanity check: number of rows contributed by each service.
trips_per_service = df_trips_data.groupBy('service_type').count()
trips_per_service.show()

                                                                                

+------------+--------+
|service_type|   count|
+------------+--------+
|       green| 2304517|
|      yellow|39649199|
+------------+--------+



In [13]:
# Expose the combined trips as the temp view `trip_data` so it can be queried
# with spark.sql; an existing view with the same name is replaced.
df_trips_data.createOrReplaceTempView('trip_data')

In [14]:
# Monthly revenue report: one row per pickup zone, calendar month and service
# type, with summed charge components and average passenger count / distance.
#
# NOTE: pickup_datetime must be referenced unquoted. Spark SQL treats
# double-quoted tokens as string literals by default, so
# date_trunc("month", "pickup_datetime") truncated the constant text
# 'pickup_datetime' instead of the column, leaving revenue_month NULL for
# every row and collapsing all months into a single group.
df_result = spark.sql("""
  SELECT
    PULocationID AS revenue_zone,
    date_trunc('month', pickup_datetime) AS revenue_month,
    service_type,
    SUM(fare_amount) AS revenue_monthly_fare,
    SUM(extra) AS revenue_monthly_extra,
    SUM(mta_tax) AS revenue_monthly_mta_tax,
    SUM(tip_amount) AS revenue_monthly_tip_amount,
    SUM(tolls_amount) AS revenue_monthly_tolls_amount,
    SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge,
    SUM(total_amount) AS revenue_monthly_total_amount,
    AVG(passenger_count) AS avg_monthly_passenger_count,
    AVG(trip_distance) AS avg_monthly_trip_distance
  FROM trip_data
  GROUP BY revenue_zone, revenue_month, service_type
""")

In [15]:
# Collapse the report to a single partition and write it as parquet,
# replacing whatever a previous run left in the output directory.
(
    df_result
    .coalesce(1)
    .write
    .mode('overwrite')
    .parquet('data/report/revenue/')
)

                                                                                

In [16]:
# Shut down the Spark session now that the report has been written.
spark.stop()