In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import types
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window
import os
import sys

In [2]:
year = sys.argv[1]
month = sys.argv[2]

In [10]:
minio_access_key = os.getenv('MINIO_ROOT_USER')
minio_secret_key = os.getenv('MINIO_ROOT_PASSWORD')
s3_path = 's3a://data/'

In [5]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('data_transformation') \
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key) \
    .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/17 17:47:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
new_schema = types.StructType([
    types.StructField('VendorID', types.IntegerType(), True), 
    types.StructField('tpep_pickup_datetime', types.TimestampNTZType(), True), 
    types.StructField('tpep_dropoff_datetime', types.TimestampNTZType(), True), 
    types.StructField('passenger_count', types.IntegerType(), True), 
    types.StructField('trip_distance', types.FloatType(), True), 
    types.StructField('RatecodeID', types.IntegerType(), True), 
    types.StructField('store_and_fwd_flag', types.StringType(), True), 
    types.StructField('PULocationID', types.IntegerType(), True), 
    types.StructField('DOLocationID', types.IntegerType(), True), 
    types.StructField('payment_type', types.IntegerType(), True), 
    types.StructField('fare_amount', types.FloatType(), True), 
    types.StructField('extra', types.FloatType(), True), 
    types.StructField('mta_tax', types.FloatType(), True), 
    types.StructField('tip_amount', types.FloatType(), True), 
    types.StructField('tolls_amount', types.FloatType(), True), 
    types.StructField('improvement_surcharge', types.FloatType(), True), 
    types.StructField('total_amount', types.FloatType(), True), 
    types.StructField('congestion_surcharge', types.FloatType(), True), 
    types.StructField('airport_fee', types.IntegerType(), True)])

In [11]:
df = spark.read.option('headers', True).parquet(f'{s3_path}/2024/01/*.parquet')

24/09/17 17:48:42 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [12]:
old_schema = df.schema

In [13]:
for old_field, new_field in zip(old_schema.fields, new_schema.fields):
    df = df.withColumn(new_field.name, col(old_field.name).cast(new_field.dataType))

In [14]:
df = df.withColumnRenamed('VendorID', 'vendor_id') \
    .withColumnRenamed('RatecodeID', 'ratecode_id') \
    .withColumnRenamed('payment_type', 'payment_type_id') \
    .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') \
    .withColumnRenamed('PULocationID', 'pickup_location_id') \
    .withColumnRenamed('DOLocationID', 'dropoff_location_id')

In [15]:
df = df.filter((col('fare_amount') > 0) \
               & (col('trip_distance') > 0) \
               & (col('extra') > 0))

df = df.filter((col('ratecode_id') <= 6))

df = df.withColumn('congestion_surcharge', F.when(col('congestion_surcharge').isNull(), 0).otherwise(col('congestion_surcharge')))
df = df.withColumn('airport_fee', F.when(col('airport_fee').isNull(), 0).otherwise(col('airport_fee')))

In [16]:
window_spec = Window.orderBy('passenger_count')

df_rn = df.select(['passenger_count']).withColumn('rn', F.row_number().over(window_spec))
total_rows = df.count()
                                                         
if total_rows % 2 == 0:
    lower_mid = total_rows // 2
    upper_mid = lower_mid + 1
else:
    lower_mid = total_rows // 2 + 1
    upper_mid = lower_mid

median_df = df_rn.filter((col('rn') == lower_mid) | (col('rn') == upper_mid))

median_value = median_df.agg(F.avg(col('passenger_count'))).collect()[0][0]

df = df.withColumn('passenger_count', F.when(col('passenger_count') == 0, median_value).otherwise(col('passenger_count')))

24/09/17 17:48:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/17 17:48:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/17 17:48:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/17 17:48:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [17]:
def index_id(date_column):
    year = F.year(date_column)
    month = F.lpad(F.month(date_column).cast("string"), 2, "0")
    day = F.lpad(F.dayofmonth(date_column).cast("string"), 2, "0")
    hour = F.lpad(F.hour(date_column).cast("string"), 2, "0")
    minute = F.lpad(F.minute(date_column).cast("string"), 2, "0")
    second = F.lpad(F.second(date_column).cast("string"), 2, "0")
    index = F.concat(year, month, day, hour, minute, second)
    return index

In [18]:
pickup_datetime_dim = df.select(['pickup_datetime']) \
    .distinct() \
    .withColumn('pickup_datetime_id', index_id(col('pickup_datetime'))) \
    .withColumn('pickup_hour', F.hour(col('pickup_datetime'))) \
    .withColumn('pickup_day', F.dayofmonth(col('pickup_datetime'))) \
    .withColumn('pickup_month', F.month(col('pickup_datetime'))) \
    .withColumn('pickup_year', F.year(col('pickup_datetime'))) \
    .withColumn('pickup_weekday', F.date_format(col('pickup_datetime'), 'EEEE'))

pickup_datetime_dim = pickup_datetime_dim.select(
    'pickup_datetime_id',
    'pickup_datetime',
    'pickup_hour',
    'pickup_day',
    'pickup_month',
    'pickup_year',
    'pickup_weekday'
)

In [19]:
dropoff_datetime_dim = df.select(['dropoff_datetime']) \
    .distinct() \
    .withColumn('dropoff_datetime_id', index_id(col('dropoff_datetime'))) \
    .withColumn('dropoff_hour', F.hour(col('dropoff_datetime'))) \
    .withColumn('dropoff_day', F.dayofmonth(col('dropoff_datetime'))) \
    .withColumn('dropoff_month', F.month(col('dropoff_datetime'))) \
    .withColumn('dropoff_year', F.year(col('dropoff_datetime'))) \
    .withColumn('dropoff_weekday', F.date_format(col('dropoff_datetime'), 'EEEE'))

dropoff_datetime_dim = dropoff_datetime_dim.select(
    'dropoff_datetime_id',
    'dropoff_datetime',
    'dropoff_hour',
    'dropoff_day',
    'dropoff_month',
    'dropoff_year',
    'dropoff_weekday'
)

In [20]:
df = df.withColumn('dropoff_datetime_id', index_id(col('dropoff_datetime'))) \
    .withColumn('pickup_datetime_id', index_id(col('pickup_datetime')))

In [21]:
df = df.select(['vendor_id',
 'pickup_datetime_id',
 'dropoff_datetime_id',
 'pickup_location_id',
 'dropoff_location_id',
 'ratecode_id',
 'passenger_count',
 'trip_distance',
 'payment_type_id',
 'store_and_fwd_flag',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'congestion_surcharge',
 'airport_fee',
 'total_amount'])

In [22]:
df.show()

+---------+------------------+-------------------+------------------+-------------------+-----------+---------------+-------------+---------------+------------------+-----------+-----+-------+----------+------------+---------------------+--------------------+-----------+------------+
|vendor_id|pickup_datetime_id|dropoff_datetime_id|pickup_location_id|dropoff_location_id|ratecode_id|passenger_count|trip_distance|payment_type_id|store_and_fwd_flag|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|congestion_surcharge|airport_fee|total_amount|
+---------+------------------+-------------------+------------------+-------------------+-----------+---------------+-------------+---------------+------------------+-----------+-----+-------+----------+------------+---------------------+--------------------+-----------+------------+
|        2|    20240101005755|     20240101011743|               186|                 79|          1|            1.0|         1.72|              

                                                                                