In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, unix_timestamp, dayofweek, hour
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType, FloatType, StringType

# Create a Spark session
spark = SparkSession.builder \
    .appName("NYC Taxi Trip Data Analysis") \
    .getOrCreate()
    
# Define paths
folder_path = "dbfs:/FileStore/tables/"  # Folder where files are stored
delta_path = "dbfs:/FileStore/tables/nyc_taxi_data_delta/"  # Destination Delta path

# Generate a list of parquet files for each month of 2023 with the updated filename format
months = [f"yellow_tripdata_2023_{str(i).zfill(2)}.parquet" for i in range(1, 13)]

# Load the first month's data with all necessary columns
trip_data = spark.read.parquet(folder_path + months[0]) \
                      .select("tpep_pickup_datetime", "tpep_dropoff_datetime", 
                              "PULocationID", "DOLocationID", 
                              "passenger_count", "trip_distance", 
                              "payment_type", "fare_amount", 
                              "tolls_amount", "improvement_surcharge", 
                              "total_amount") \
                      .withColumn("month", lit(months[0][21:28])) \
                      .withColumn("trip_duration", 
                                  (unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")) / 60) \
                      .withColumn("pickup_day_of_week", dayofweek("tpep_pickup_datetime")) \
                      .withColumn("pickup_hour", hour("tpep_pickup_datetime"))

# Union the rest of the months
for month in months[1:]:
    temp_df = spark.read.parquet(folder_path + month) \
                        .select("tpep_pickup_datetime", "tpep_dropoff_datetime", 
                                "PULocationID", "DOLocationID", 
                                "passenger_count", "trip_distance", 
                                "payment_type", "fare_amount", 
                                "tolls_amount", "improvement_surcharge", 
                                "total_amount") \
                        .withColumn("month", lit(month[21:28])) \
                        .withColumn("trip_duration", 
                                    (unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")) / 60) \
                        .withColumn("pickup_day_of_week", dayofweek("tpep_pickup_datetime")) \
                        .withColumn("pickup_hour", hour("tpep_pickup_datetime"))
    
    # Union the new month's data with the existing data
    trip_data = trip_data.union(temp_df)

# Check the schema and data types
trip_data.printSchema()

# Convert data types if necessary
nyc_taxi_df = trip_data \
    .withColumn("tpep_pickup_datetime", trip_data["tpep_pickup_datetime"].cast(TimestampType())) \
    .withColumn("tpep_dropoff_datetime", trip_data["tpep_dropoff_datetime"].cast(TimestampType())) \
    .withColumn("PULocationID", trip_data["PULocationID"].cast(IntegerType())) \
    .withColumn("DOLocationID", trip_data["DOLocationID"].cast(IntegerType())) \
    .withColumn("passenger_count", trip_data["passenger_count"].cast(IntegerType())) \
    .withColumn("trip_distance", trip_data["trip_distance"].cast(FloatType())) \
    .withColumn("payment_type", trip_data["payment_type"].cast(StringType())) \
    .withColumn("fare_amount", trip_data["fare_amount"].cast(FloatType())) \
    .withColumn("tolls_amount", trip_data["tolls_amount"].cast(FloatType())) \
    .withColumn("improvement_surcharge", trip_data["improvement_surcharge"].cast(FloatType())) \
    .withColumn("total_amount", trip_data["total_amount"].cast(FloatType())) \
    .withColumn("month", trip_data["month"].cast(StringType())) \
    .withColumn("trip_duration", trip_data["trip_duration"].cast(FloatType())) \
    .withColumn("pickup_day_of_week", trip_data["pickup_day_of_week"].cast(IntegerType())) \
    .withColumn("pickup_hour", trip_data["pickup_hour"].cast(IntegerType()))

# Check the first few rows to verify
display(nyc_taxi_df.limit(5))

root
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- month: string (nullable = false)
 |-- trip_duration: double (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)



tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,month,trip_duration,pickup_day_of_week,pickup_hour
2023-01-01T00:32:10Z,2023-01-01T00:40:36Z,161,141,1,0.97,2,9.3,0.0,1.0,14.3,01.parq,8.433333,1,0
2023-01-01T00:55:08Z,2023-01-01T01:01:27Z,43,237,1,1.1,1,7.9,0.0,1.0,16.9,01.parq,6.3166666,1,0
2023-01-01T00:25:04Z,2023-01-01T00:37:49Z,48,238,1,2.51,1,14.9,0.0,1.0,34.9,01.parq,12.75,1,0
2023-01-01T00:03:48Z,2023-01-01T00:13:25Z,138,7,0,1.9,1,12.1,0.0,1.0,20.85,01.parq,9.616667,1,0
2023-01-01T00:10:29Z,2023-01-01T00:21:19Z,107,79,1,1.43,1,11.4,0.0,1.0,19.68,01.parq,10.833333,1,0


In [0]:

# Write the combined DataFrame in Delta Lake format
nyc_taxi_df.write.format("delta").mode("overwrite").save(delta_path)

In [0]:
loaded_df = spark.read.format("delta").load(delta_path)
loaded_df.show()

+--------------------+---------------------+------------+------------+---------------+-------------+------------+-----------+------------+---------------------+------------+-------+-------------+------------------+-----------+
|tpep_pickup_datetime|tpep_dropoff_datetime|PULocationID|DOLocationID|passenger_count|trip_distance|payment_type|fare_amount|tolls_amount|improvement_surcharge|total_amount|  month|trip_duration|pickup_day_of_week|pickup_hour|
+--------------------+---------------------+------------+------------+---------------+-------------+------------+-----------+------------+---------------------+------------+-------+-------------+------------------+-----------+
| 2023-05-01 00:33:13|  2023-05-01 00:53:01|         138|          43|              0|          7.8|           1|       33.8|         0.0|                  1.0|       51.65|05.parq|         19.8|                 2|          0|
| 2023-05-01 00:42:49|  2023-05-01 01:11:18|         138|         262|              2|      

In [0]:
# Display count of rows
print(f"Total records: {nyc_taxi_df.count()}")

# Describe dataset to get an overview of each column
nyc_taxi_df.describe().show()


Total records: 38310226
+-------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------+-----------------+--------+------------------+------------------+-----------------+
|summary|     PULocationID|      DOLocationID|   passenger_count|     trip_distance|      payment_type|       fare_amount|      tolls_amount|improvement_surcharge|     total_amount|   month|     trip_duration|pickup_day_of_week|      pickup_hour|
+-------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------+-----------------+--------+------------------+------------------+-----------------+
|  count|         38310226|          38310226|          37000870|          38310226|          38310226|          38310226|          38310226|             38310226|         38310226|38310226|          38310226|          38310226|       