In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import types, functions as F

In [3]:
# Get (or reuse) a local Spark session named "test".
# "local[*]" runs Spark in-process; the driver is bound to the loopback address.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .appName("test")
    .getOrCreate()
)

25/03/01 23:13:47 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:75)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:53)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:502)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:486)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.command

## Green Taxi

In [4]:
# Load one raw month with pandas and list the distinct vendor IDs it contains.
sample_green_path = './data/raw/green/2020/01/green_tripdata_2020_01.parquet'
df_pd = pd.read_parquet(sample_green_path)
df_pd['VendorID'].unique()

array([2, 1])

In [5]:
# Column names and Spark types declared for green-taxi trip records.
# Every field is nullable; the order below is the declared column order.
green_taxi_schema = types.StructType([
  types.StructField(column_name, column_type, True)
  for column_name, column_type in [
    ('VendorID', types.LongType()),
    ('lpep_pickup_datetime', types.TimestampType()),
    ('lpep_dropoff_datetime', types.TimestampType()),
    ('store_and_fwd_flag', types.StringType()),
    ('RatecodeID', types.IntegerType()),
    ('PULocationID', types.IntegerType()),
    ('DOLocationID', types.IntegerType()),
    ('passenger_count', types.IntegerType()),
    ('trip_distance', types.DoubleType()),
    ('fare_amount', types.DoubleType()),
    ('extra', types.DoubleType()),
    ('mta_tax', types.DoubleType()),
    ('tip_amount', types.DoubleType()),
    ('tolls_amount', types.DoubleType()),
    ('ehail_fee', types.DoubleType()),
    ('improvement_surcharge', types.DoubleType()),
    ('total_amount', types.DoubleType()),
    ('payment_type', types.IntegerType()),
    ('trip_type', types.IntegerType()),
    ('congestion_surcharge', types.DoubleType()),
  ]
])

In [6]:
# Re-partition every monthly green-taxi Parquet dataset for 2020-2021:
# read data/raw/green/<year>/<month>/ and write it back as 4 partitions
# under data/pq/green/<year>/<month>/.
# NOTE(review): green_taxi_schema is defined in this notebook but is not applied
# here; if the monthly files differ in column types, casting to that schema
# before writing may be needed -- TODO confirm.
for year in range(2020, 2022):
  for month in range(1, 13):
    input_path = f'data/raw/green/{year}/{month:02d}/'
    output_path = f'data/pq/green/{year}/{month:02d}/'

    # Parquet files embed their own schema, so no CSV-style "header" option is needed.
    df_green = spark.read.parquet(input_path)

    # Overwrite existing output so the cell can be re-run on a fresh kernel
    # without failing because the target directory already exists.
    df_green\
      .repartition(4)\
      .write\
      .mode('overwrite')\
      .parquet(output_path)

25/03/01 23:14:01 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

## Yellow Taxi

In [7]:
# Column names and Spark types declared for yellow-taxi trip records.
# Every field is nullable; the order below is the declared column order.
yellow_taxi_schema = types.StructType([
  types.StructField(column_name, column_type, True)
  for column_name, column_type in [
    ('VendorID', types.LongType()),
    ('tpep_pickup_datetime', types.TimestampType()),
    ('tpep_dropoff_datetime', types.TimestampType()),
    ('passenger_count', types.IntegerType()),
    ('trip_distance', types.DoubleType()),
    ('RatecodeID', types.IntegerType()),
    ('store_and_fwd_flag', types.StringType()),
    ('PULocationID', types.IntegerType()),
    ('DOLocationID', types.IntegerType()),
    ('payment_type', types.IntegerType()),
    ('fare_amount', types.DoubleType()),
    ('extra', types.DoubleType()),
    ('mta_tax', types.DoubleType()),
    ('tip_amount', types.DoubleType()),
    ('tolls_amount', types.DoubleType()),
    ('improvement_surcharge', types.DoubleType()),
    ('total_amount', types.DoubleType()),
    ('congestion_surcharge', types.DoubleType()),
    ('airport_fee', types.DoubleType()),
  ]
])

In [8]:
# Re-partition every monthly yellow-taxi Parquet dataset for 2020-2021:
# read data/raw/yellow/<year>/<month>/ and write it back as 4 partitions
# under data/pq/yellow/<year>/<month>/.
# NOTE(review): yellow_taxi_schema is defined in this notebook but is not applied
# here; if the monthly files differ in column types, casting to that schema
# before writing may be needed -- TODO confirm.
for year in range(2020, 2022):
  for month in range(1, 13):
    input_path = f'data/raw/yellow/{year}/{month:02d}/'
    output_path = f'data/pq/yellow/{year}/{month:02d}/'

    # Parquet files embed their own schema, so no CSV-style "header" option is needed.
    df_yellow = spark.read.parquet(input_path)

    # Overwrite existing output so the cell can be re-run on a fresh kernel
    # without failing because the target directory already exists.
    df_yellow\
      .repartition(4)\
      .write\
      .mode('overwrite')\
      .parquet(output_path)

                                                                                

In [9]:
# Shut down the Spark session now that both conversion loops have run.
spark.stop()