In [0]:
from pyspark.sql import SparkSession

# Create the Spark session, or reuse one that is already active.
# The application name is the label Spark shows for this job, so it is spelled
# correctly here (the previous value had a stray space inside "Prediction").
spark = SparkSession.builder.appName("NYC Taxi Fare Prediction").getOrCreate()

In [0]:
from azureml.opendatasets import PublicHolidays
from datetime import datetime

# Date window for the holiday lookup: the whole of calendar year 2023.
start_date = datetime(year=2023, month=1, day=1)
end_date = datetime(year=2023, month=12, day=31)

In [0]:
# Request the public-holiday dataset for the date window defined earlier
# and materialise it as a Spark DataFrame.
hol = PublicHolidays(
    start_date=start_date,
    end_date=end_date,
)
hol_df = hol.to_spark_dataframe()



In [0]:
# Preview the holiday rows: first 20 rows, long cell values truncated
# (these are show()'s defaults, written out explicitly).
hol_df.show(n=20, truncate=True)

+---------------+--------------------+--------------------+-------------+-----------------+-------------------+
|countryOrRegion|         holidayName|normalizeHolidayName|isPaidTimeOff|countryRegionCode|               date|
+---------------+--------------------+--------------------+-------------+-----------------+-------------------+
|      Argentina|Año Nuevo [New Ye...|Año Nuevo [New Ye...|         NULL|               AR|2023-01-01 00:00:00|
|      Australia|      New Year's Day|      New Year's Day|         NULL|               AU|2023-01-01 00:00:00|
|        Austria|             Neujahr|             Neujahr|         NULL|               AT|2023-01-01 00:00:00|
|        Belarus|           Новый год|           Новый год|         NULL|               BY|2023-01-01 00:00:00|
|        Belgium|       Nieuwjaarsdag|       Nieuwjaarsdag|         NULL|               BE|2023-01-01 00:00:00|
|         Brazil|            Ano novo|            Ano novo|         NULL|               BR|2023-01-01 00

In [0]:
# Drop isPaidTimeOff as uninformative: it is reported to contain only NULLs
# (every row visible in the preview shows NULL — TODO confirm over the full table).
hol_df = hol_df.drop("isPaidTimeOff")

In [0]:
# Render the holiday table again now that isPaidTimeOff has been dropped.
display(hol_df)

In [0]:
# Read the processed taxi trips from their Delta location.
taxi_df = spark.read.load(
    "/dbfs/FileStore/tables/data_processed_lat_long/",
    format="delta",
)

# Peek at five rows as a sanity check on the load.
display(taxi_df.limit(5))

DOLocationID,PULocationID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_day_of_week,pickup_hour,Month_Num,pickup_month,dropoff_hour,dropoff_day_of_week,dropoff_month,dropoff_week_of_year,pickup_date,pickup_time,dropoff_date,dropoff_time,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,distance_km,pickup_borough,pickup_zone,pickup_service_zone,LocationID,dropoff_borough,dropoff_zone,dropoff_service_zone
138,170,2023-10-01T09:29:24Z,2023-10-01T09:43:44Z,2,8.7,1,33.8,6.94,1.0,59.64,14.333333,1,9,10,10,9,1,10,39,2023-10-01,09:29:24,2023-10-01,09:43:44,40.7477457936439,-73.97849159965229,40.77437570593244,-73.87362864289085,9.315085974107843,Manhattan,Murray Hill,Yellow Zone,138,Queens,LaGuardia Airport,Airports
79,170,2023-10-01T10:14:39Z,2023-10-01T10:20:34Z,3,1.4,1,8.6,0.0,1.0,14.1,5.9166665,1,10,10,10,10,1,10,39,2023-10-01,10:14:39,2023-10-01,10:20:34,40.7477457936439,-73.97849159965229,40.72762019590954,-73.98593745682462,2.3241314426512654,Manhattan,Murray Hill,Yellow Zone,79,Manhattan,East Village,Yellow Zone
170,170,2023-10-01T13:14:30Z,2023-10-01T13:28:54Z,1,0.47,1,12.8,0.0,1.0,18.3,14.4,1,13,10,10,13,1,10,39,2023-10-01,13:14:30,2023-10-01,13:28:54,40.7477457936439,-73.97849159965229,40.7477457936439,-73.97849159965229,0.0,Manhattan,Murray Hill,Yellow Zone,170,Manhattan,Murray Hill,Yellow Zone
229,170,2023-10-01T13:37:30Z,2023-10-01T13:44:26Z,2,1.14,1,8.6,0.0,1.0,15.12,6.9333334,1,13,10,10,13,1,10,39,2023-10-01,13:37:30,2023-10-01,13:44:26,40.7477457936439,-73.97849159965229,40.75672894163307,-73.96514579918423,1.5038408173485176,Manhattan,Murray Hill,Yellow Zone,229,Manhattan,Sutton Place/Turtle Bay North,Yellow Zone
79,170,2023-10-01T17:30:51Z,2023-10-01T17:48:13Z,1,3.83,1,20.5,0.0,1.0,29.4,17.366667,1,17,10,10,17,1,10,39,2023-10-01,17:30:51,2023-10-01,17:48:13,40.7477457936439,-73.97849159965229,40.72762019590954,-73.98593745682462,2.3241314426512654,Manhattan,Murray Hill,Yellow Zone,79,Manhattan,East Village,Yellow Zone


In [0]:
from pyspark.sql import functions as F

# Step 1: keep only United States holidays whose date falls in 2023 (in hol_df).
hol_df = (
    hol_df
    .where(F.col("countryRegionCode") == "US")
    .where(F.year(F.to_date(F.col("date"))) == 2023)
)

In [0]:
# Step 2: derive plain DATE columns on both sides so they can be compared in the join.
hol_df = hol_df.withColumn("holiday_date", F.to_date(F.col("date")))
taxi_df = taxi_df.withColumn("pickup_date", F.to_date(F.col("pickup_datetime")))

In [0]:
# Step 3: left-join holiday information onto every taxi trip by pickup date.
#
# A left join emits one output row per matching right-hand row, so two holiday
# records sharing a date would silently duplicate every trip on that day.
# Collapsing the holiday table to one row per date keeps the trip count unchanged.
# (If dates ever do collide, which record survives is not specified.)
holidays_by_date = hol_df.dropDuplicates(["holiday_date"])

# The holiday table is tiny next to the trip table; the broadcast hint asks Spark
# to send it to every executor instead of shuffling the trips.
taxi_with_holidays_df = taxi_df.join(
    F.broadcast(holidays_by_date),
    taxi_df["pickup_date"] == holidays_by_date["holiday_date"],
    how="left",
)

In [0]:
# Step 4: flag holiday trips — 0 where the join found no holiday name, 1 otherwise.
taxi_with_holidays_df = taxi_with_holidays_df.withColumn(
    "is_holiday", F.when(F.col("holidayName").isNull(), 0).otherwise(1)
)

In [0]:
# Trips without a matching holiday have NULL names after the join;
# replace those NULLs in both name columns with an explicit label.
taxi_with_holidays_df = taxi_with_holidays_df.fillna(
    "No Holiday",
    subset=["holidayName", "normalizeHolidayName"],
)

In [0]:
# Inspect the joined trips together with the holiday columns added so far.
display(taxi_with_holidays_df)

In [0]:
# List each distinct raw holiday name in the filtered holiday table, untruncated.
hol_df.select("holidayName").dropDuplicates().show(truncate=False)

+--------------------------+
|holidayName               |
+--------------------------+
|Veterans Day (Observed)   |
|Veterans Day              |
|Independence Day          |
|Columbus Day              |
|Memorial Day              |
|Thanksgiving              |
|New Year's Day (Observed) |
|Christmas Day             |
|Martin Luther King Jr. Day|
|Labor Day                 |
|New Year's Day            |
|Washington's Birthday     |
+--------------------------+



In [0]:
# List each distinct normalized holiday name in the filtered holiday table, untruncated.
hol_df.select("normalizeHolidayName").dropDuplicates().show(truncate=False)

+--------------------------+
|normalizeHolidayName      |
+--------------------------+
|Veterans Day              |
|Independence Day          |
|Columbus Day              |
|Memorial Day              |
|Thanksgiving              |
|Christmas Day             |
|Martin Luther King Jr. Day|
|Labor Day                 |
|New Year's Day            |
|Washington's Birthday     |
+--------------------------+



In [0]:
# Remove the country and date columns that the join brought in from the holiday table.
taxi_with_holidays_df = taxi_with_holidays_df.drop(
    *["countryOrRegion", "countryRegionCode", "date", "holiday_date"]
)

In [0]:
# Show five rows of the cleaned-up, holiday-enriched trips.
display(taxi_with_holidays_df.limit(5))

DOLocationID,PULocationID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_day_of_week,pickup_hour,Month_Num,pickup_month,dropoff_hour,dropoff_day_of_week,dropoff_month,dropoff_week_of_year,pickup_date,pickup_time,dropoff_date,dropoff_time,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,distance_km,pickup_borough,pickup_zone,pickup_service_zone,LocationID,dropoff_borough,dropoff_zone,dropoff_service_zone,holidayName,normalizeHolidayName,is_holiday
138,170,2023-10-01T09:29:24Z,2023-10-01T09:43:44Z,2,8.7,1,33.8,6.94,1.0,59.64,14.333333,1,9,10,10,9,1,10,39,2023-10-01,09:29:24,2023-10-01,09:43:44,40.7477457936439,-73.97849159965229,40.77437570593244,-73.87362864289085,9.315085974107843,Manhattan,Murray Hill,Yellow Zone,138,Queens,LaGuardia Airport,Airports,No Holiday,No Holiday,0
79,170,2023-10-01T10:14:39Z,2023-10-01T10:20:34Z,3,1.4,1,8.6,0.0,1.0,14.1,5.9166665,1,10,10,10,10,1,10,39,2023-10-01,10:14:39,2023-10-01,10:20:34,40.7477457936439,-73.97849159965229,40.72762019590954,-73.98593745682462,2.3241314426512654,Manhattan,Murray Hill,Yellow Zone,79,Manhattan,East Village,Yellow Zone,No Holiday,No Holiday,0
170,170,2023-10-01T13:14:30Z,2023-10-01T13:28:54Z,1,0.47,1,12.8,0.0,1.0,18.3,14.4,1,13,10,10,13,1,10,39,2023-10-01,13:14:30,2023-10-01,13:28:54,40.7477457936439,-73.97849159965229,40.7477457936439,-73.97849159965229,0.0,Manhattan,Murray Hill,Yellow Zone,170,Manhattan,Murray Hill,Yellow Zone,No Holiday,No Holiday,0
229,170,2023-10-01T13:37:30Z,2023-10-01T13:44:26Z,2,1.14,1,8.6,0.0,1.0,15.12,6.9333334,1,13,10,10,13,1,10,39,2023-10-01,13:37:30,2023-10-01,13:44:26,40.7477457936439,-73.97849159965229,40.75672894163307,-73.96514579918423,1.5038408173485176,Manhattan,Murray Hill,Yellow Zone,229,Manhattan,Sutton Place/Turtle Bay North,Yellow Zone,No Holiday,No Holiday,0
79,170,2023-10-01T17:30:51Z,2023-10-01T17:48:13Z,1,3.83,1,20.5,0.0,1.0,29.4,17.366667,1,17,10,10,17,1,10,39,2023-10-01,17:30:51,2023-10-01,17:48:13,40.7477457936439,-73.97849159965229,40.72762019590954,-73.98593745682462,2.3241314426512654,Manhattan,Murray Hill,Yellow Zone,79,Manhattan,East Village,Yellow Zone,No Holiday,No Holiday,0


In [0]:
# Report the DataFrame's shape as (row count, column count).
# count() is a Spark action and scans the data; the column count comes from the schema.
num_cols = len(taxi_with_holidays_df.columns)
num_rows = taxi_with_holidays_df.count()
print(f"Shape: ({num_rows}, {num_cols})")


Shape: (36395607, 39)


In [0]:
# Count NULLs per column: turn every column into a 0/1 "is null" flag,
# then sum each flag column over the whole DataFrame.
missing_values = (
    taxi_with_holidays_df
    .select(*(
        taxi_with_holidays_df[name].isNull().cast("int").alias(name)
        for name in taxi_with_holidays_df.columns
    ))
    .agg(dict.fromkeys(taxi_with_holidays_df.columns, "sum"))
)

display(missing_values.limit(5))


sum(dropoff_time),sum(DOLocationID),sum(improvement_surcharge),sum(pickup_hour),sum(PULocationID),sum(trip_distance),sum(dropoff_borough),sum(pickup_service_zone),sum(pickup_date),sum(dropoff_longitude),sum(LocationID),sum(pickup_latitude),sum(dropoff_day_of_week),sum(tolls_amount),sum(dropoff_date),sum(pickup_time),sum(Month_Num),sum(pickup_zone),sum(pickup_borough),sum(payment_type),sum(fare_amount),sum(pickup_longitude),sum(passenger_count),sum(dropoff_month),sum(dropoff_hour),sum(pickup_day_of_week),sum(dropoff_zone),sum(distance_km),sum(dropoff_datetime),sum(normalizeHolidayName),sum(dropoff_week_of_year),sum(trip_duration),sum(total_amount),sum(holidayName),sum(pickup_datetime),sum(dropoff_latitude),sum(is_holiday),sum(dropoff_service_zone),sum(pickup_month)
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Value counts for every column, most frequent value first.
# groupBy().count() returns its groups in no particular order, so without the
# sort the rows shown for a high-cardinality column would be an arbitrary subset.
# Each iteration runs its own aggregation over the full DataFrame.
for column in taxi_with_holidays_df.columns:
    print(f"Value counts for {column}:")
    display(
        taxi_with_holidays_df
        .groupBy(column)
        .count()
        .orderBy("count", ascending=False)
    )

In [0]:
# Duplicate check: the gap between the total row count and the count of
# fully distinct rows is the number of exact duplicate rows.
duplicates_count = (
    taxi_with_holidays_df.count()
    - taxi_with_holidays_df.dropDuplicates().count()
)
print(f"Number of duplicate rows: {duplicates_count}")


Number of duplicate rows: 0


In [0]:
# Destination for the holiday-enriched trips.
# NOTE(review): "Pulic" in the folder name looks like a typo for "Public"; the
# path is left untouched because other readers may depend on it — confirm.
delta_path = "/dbfs/FileStore/tables/Data_Processing_Pulic_Holiday/"
# Persist in Delta format, replacing whatever is currently stored at that path.
taxi_with_holidays_df.write.save(delta_path, format="delta", mode="overwrite")