In [0]:
# Read the NYC taxi trip data stored as a Delta table
data_processing_df = (
    spark.read
    .format("delta")
    .load("dbfs:/FileStore/tables/nyc_taxi_data_delta/")
)

# Preview five rows as a quick sanity check on the load
display(data_processing_df.limit(5))

tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,month,trip_duration,pickup_day_of_week,pickup_hour
2023-05-01T00:33:13Z,2023-05-01T00:53:01Z,138,43,0,7.8,1,33.8,0.0,1.0,51.65,05.parq,19.8,2,0
2023-05-01T00:42:49Z,2023-05-01T01:11:18Z,138,262,2,8.1,1,35.9,0.0,1.0,57.15,05.parq,28.483334,2,0
2023-05-01T00:56:34Z,2023-05-01T01:13:39Z,138,141,2,9.1,1,35.2,6.55,1.0,64.2,05.parq,17.083334,2,0
2023-05-01T00:00:52Z,2023-05-01T00:20:12Z,138,140,1,8.21,1,33.1,0.0,1.0,47.09,05.parq,19.333334,2,0
2023-05-01T00:05:50Z,2023-05-01T00:19:41Z,138,263,0,7.9,1,31.0,6.55,1.0,59.15,05.parq,13.85,2,0


In [0]:
# Shape and schema check
# count() triggers a Spark job; the column list is read from the schema
num_rows, num_cols = data_processing_df.count(), len(data_processing_df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

# Print every column with its type and nullability
data_processing_df.printSchema()

Shape: (38310226, 15)
root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- month: string (nullable = true)
 |-- trip_duration: float (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)



In [0]:
# Duplicate check: rows that disappear when only distinct rows are kept
total_row_count = data_processing_df.count()
distinct_row_count = data_processing_df.distinct().count()
duplicates_count = total_row_count - distinct_row_count
print(f"Number of duplicate rows: {duplicates_count}")

Number of duplicate rows: 2


In [0]:
# Count missing (null) values in each column
from pyspark.sql import functions as F

# One aggregate expression per column, aliased back to the column name.
# Passing a dict to agg() would return the columns in arbitrary order and
# rename them to "sum(<col>)"; an explicit expression list keeps the result
# in schema order with the original names, so it lines up with printSchema().
missing_values = data_processing_df.select(
    [
        F.sum(F.col(column).isNull().cast("int")).alias(column)
        for column in data_processing_df.columns
    ]
)

# The result is a single row: the null count for every column
display(missing_values)


sum(DOLocationID),sum(improvement_surcharge),sum(pickup_hour),sum(tpep_dropoff_datetime),sum(PULocationID),sum(trip_distance),sum(tolls_amount),sum(payment_type),sum(fare_amount),sum(passenger_count),sum(pickup_day_of_week),sum(trip_duration),sum(total_amount),sum(tpep_pickup_datetime),sum(month)
0,0,0,0,0,0,0,0,0,1309356,0,0,0,0,0


In [0]:
# Show how often each distinct value occurs, one column at a time
for col_name in data_processing_df.columns:
    print(f"Value counts for {col_name}:")
    per_value_counts = data_processing_df.groupBy(col_name).count()
    display(per_value_counts)

In [0]:
# Helper that renders the value counts of one column via display()
def display_value_counts(dataframe, column):
    """Print a heading and display the frequency of each value in ``column``.

    Args:
        dataframe: Object exposing ``groupBy(...).count().orderBy(...)``
            (a Spark DataFrame in this notebook).
        column: Name of the column to group by.

    The counts are sorted by the ``count`` column in descending order and
    passed to ``display``; nothing is returned.
    """
    print(f"Value counts for {column}:")
    counts_by_value = dataframe.groupBy(column).count()
    display(counts_by_value.orderBy("count", ascending=False))


In [0]:
# Value counts for the 'month' column
display_value_counts(dataframe=data_processing_df, column="month")

Value counts for month:


month,count
10.parq,3522285
05.parq,3513649
03.parq,3403766
12.parq,3376567
11.parq,3339715
06.parq,3307234
04.parq,3288250
01.parq,3066766
02.parq,2913955
07.parq,2907108


In [0]:
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.types import IntegerType

# 'month' holds values such as '10.parq'; capture the first two-digit run
month_digits = regexp_extract("month", r"(\d{2})", 1)

# Store the digits as an integer column, then discard the raw 'month' column
data_processing_df = (
    data_processing_df
    .withColumn("Month_Num", month_digits.cast(IntegerType()))
    .drop("month")
)

# Render the updated DataFrame
display(data_processing_df)

In [0]:
from pyspark.sql.functions import hour, dayofweek, month, weekofyear

# Month number taken from the pickup timestamp
pickup_month_col = month("tpep_pickup_datetime")
data_processing_df = data_processing_df.withColumn("pickup_month", pickup_month_col)

In [0]:
# Derive hour, day-of-week, month and week-of-year from the drop-off timestamp
data_processing_df = (
    data_processing_df
    .withColumn("dropoff_hour", hour("tpep_dropoff_datetime"))
    .withColumn("dropoff_day_of_week", dayofweek("tpep_dropoff_datetime"))
    .withColumn("dropoff_month", month("tpep_dropoff_datetime"))
    .withColumn("dropoff_week_of_year", weekofyear("tpep_dropoff_datetime"))
)

In [0]:
# Preview five rows to check the columns currently on the DataFrame
display(data_processing_df.limit(5))

tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_day_of_week,pickup_hour,Month_Num,pickup_month,dropoff_hour,dropoff_day_of_week,dropoff_month,dropoff_week_of_year
2023-05-01T00:33:13Z,2023-05-01T00:53:01Z,138,43,0,7.8,1,33.8,0.0,1.0,51.65,19.8,2,0,5,5,0,2,5,18
2023-05-01T00:42:49Z,2023-05-01T01:11:18Z,138,262,2,8.1,1,35.9,0.0,1.0,57.15,28.483334,2,0,5,5,1,2,5,18
2023-05-01T00:56:34Z,2023-05-01T01:13:39Z,138,141,2,9.1,1,35.2,6.55,1.0,64.2,17.083334,2,0,5,5,1,2,5,18
2023-05-01T00:00:52Z,2023-05-01T00:20:12Z,138,140,1,8.21,1,33.1,0.0,1.0,47.09,19.333334,2,0,5,5,0,2,5,18
2023-05-01T00:05:50Z,2023-05-01T00:19:41Z,138,263,0,7.9,1,31.0,6.55,1.0,59.15,13.85,2,0,5,5,0,2,5,18


In [0]:
# Keep one copy of each fully identical row (all columns compared)
data_processing_df = data_processing_df.distinct()

In [0]:
# Value counts for the 'passenger_count' column
display_value_counts(dataframe=data_processing_df, column="passenger_count")

Value counts for passenger_count:


passenger_count,count
1.0,27823457
2.0,5609105
3.0,1394693
,1309356
4.0,789997
0.0,583005
5.0,483233
6.0,316969
8.0,261
7.0,93


In [0]:
# Select the rows whose passenger_count is null
passenger_count_is_null = data_processing_df["passenger_count"].isNull()
null_passenger_count_df = data_processing_df.filter(passenger_count_is_null)

# Render those rows for inspection
display(null_passenger_count_df)

In [0]:
from pyspark.sql import functions as F

# Compute the total row count and the passenger_count null count in ONE pass.
# Two separate count() actions would each re-execute the whole (uncached)
# lineage, including the de-duplication shuffle.
null_stats = data_processing_df.agg(
    F.count(F.lit(1)).alias("total_rows"),
    F.sum(F.col("passenger_count").isNull().cast("int")).alias("null_count"),
).first()

total_rows = null_stats["total_rows"]
# sum() over zero rows yields NULL (None in Python); treat that as 0
null_count = null_stats["null_count"] or 0

# Guard against an empty DataFrame so the percentage never divides by zero
null_percentage = (null_count / total_rows) * 100 if total_rows else 0.0

print(f"Total rows: {total_rows}")
print(f"Null count in passenger_count: {null_count}")
print(f"Percentage of null values in passenger_count: {null_percentage:.2f}%")

Total rows: 38310224
Null count in passenger_count: 1309356
Percentage of null values in passenger_count: 3.42%


In [0]:
# Discard every row that has a null in at least one column
data_processing_df = data_processing_df.dropna(how="any")

In [0]:
# Report the DataFrame dimensions as (rows, columns)
num_rows, num_cols = data_processing_df.count(), len(data_processing_df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (37000868, 20)


In [0]:
from pyspark.sql.functions import date_format, col

# Format the pickup timestamp as a 'yyyy-MM-dd' string column
pickup_date_col = date_format("tpep_pickup_datetime", "yyyy-MM-dd")
data_processing_df = data_processing_df.withColumn("pickup_date", pickup_date_col)

In [0]:
# Format the pickup timestamp as an 'HH:mm:ss' time-of-day string column
pickup_time_col = date_format("tpep_pickup_datetime", "HH:mm:ss")
data_processing_df = data_processing_df.withColumn("pickup_time", pickup_time_col)

In [0]:
# Compare the pickup timestamp with the date/time columns derived from it
pickup_columns = ["tpep_pickup_datetime", "pickup_date", "pickup_time"]
data_processing_df.select(*pickup_columns).show(truncate=False)

+--------------------+-----------+-----------+
|tpep_pickup_datetime|pickup_date|pickup_time|
+--------------------+-----------+-----------+
|2023-10-01 00:25:12 |2023-10-01 |00:25:12   |
|2023-10-01 00:58:16 |2023-10-01 |00:58:16   |
|2023-10-01 00:33:16 |2023-10-01 |00:33:16   |
|2023-10-01 00:40:43 |2023-10-01 |00:40:43   |
|2023-10-01 00:42:44 |2023-10-01 |00:42:44   |
|2023-10-01 00:09:13 |2023-10-01 |00:09:13   |
|2023-10-01 00:05:42 |2023-10-01 |00:05:42   |
|2023-10-01 00:14:55 |2023-10-01 |00:14:55   |
|2023-10-01 00:32:53 |2023-10-01 |00:32:53   |
|2023-10-01 00:34:11 |2023-10-01 |00:34:11   |
|2023-10-01 00:29:56 |2023-10-01 |00:29:56   |
|2023-10-01 00:06:17 |2023-10-01 |00:06:17   |
|2023-10-01 00:58:12 |2023-10-01 |00:58:12   |
|2023-10-01 00:51:37 |2023-10-01 |00:51:37   |
|2023-10-01 00:31:27 |2023-10-01 |00:31:27   |
|2023-10-01 00:54:04 |2023-10-01 |00:54:04   |
|2023-10-01 00:42:25 |2023-10-01 |00:42:25   |
|2023-10-01 00:28:44 |2023-10-01 |00:28:44   |
|2023-10-01 0

In [0]:
# Count trips per pickup hour, busiest hour first.
# Grouping on pickup_time ('HH:mm:ss') would bucket trips by the exact second,
# which cannot show peak *hours*; the existing integer pickup_hour column
# gives one bucket per hour instead.
peak_hours = (
    data_processing_df
    .groupBy("pickup_hour")
    .count()
    .orderBy("count", ascending=False)
)

In [0]:
# Render the peak_hours counts (sorted by count, descending, where it is built)
display(peak_hours)

In [0]:
# Format the drop-off timestamp as a 'yyyy-MM-dd' string column
dropoff_date_col = date_format("tpep_dropoff_datetime", "yyyy-MM-dd")
data_processing_df = data_processing_df.withColumn("dropoff_date", dropoff_date_col)

In [0]:
# Format the drop-off timestamp as an 'HH:mm:ss' time-of-day string column
dropoff_time_col = date_format("tpep_dropoff_datetime", "HH:mm:ss")
data_processing_df = data_processing_df.withColumn("dropoff_time", dropoff_time_col)

In [0]:
# Compare the drop-off timestamp with the date/time columns derived from it
dropoff_columns = ["tpep_dropoff_datetime", "dropoff_date", "dropoff_time"]
data_processing_df.select(*dropoff_columns).show(truncate=False)

+---------------------+------------+------------+
|tpep_dropoff_datetime|dropoff_date|dropoff_time|
+---------------------+------------+------------+
|2023-10-01 00:42:53  |2023-10-01  |00:42:53    |
|2023-10-01 01:05:30  |2023-10-01  |01:05:30    |
|2023-10-01 00:44:20  |2023-10-01  |00:44:20    |
|2023-10-01 00:46:51  |2023-10-01  |00:46:51    |
|2023-10-01 00:47:50  |2023-10-01  |00:47:50    |
|2023-10-01 00:24:59  |2023-10-01  |00:24:59    |
|2023-10-01 00:22:36  |2023-10-01  |00:22:36    |
|2023-10-01 00:42:25  |2023-10-01  |00:42:25    |
|2023-10-01 00:44:14  |2023-10-01  |00:44:14    |
|2023-10-01 00:41:01  |2023-10-01  |00:41:01    |
|2023-10-01 00:39:44  |2023-10-01  |00:39:44    |
|2023-10-01 00:14:12  |2023-10-01  |00:14:12    |
|2023-10-01 01:00:59  |2023-10-01  |01:00:59    |
|2023-10-01 00:58:13  |2023-10-01  |00:58:13    |
|2023-10-01 00:41:01  |2023-10-01  |00:41:01    |
|2023-10-01 01:07:34  |2023-10-01  |01:07:34    |
|2023-10-01 00:45:16  |2023-10-01  |00:45:16    |


In [0]:
from pyspark.sql.functions import year

# Keep only trips whose pickup date and drop-off date both fall in 2023
pickup_in_2023 = year("pickup_date") == 2023
dropoff_in_2023 = year("dropoff_date") == 2023
data_processing_df = data_processing_df.filter(pickup_in_2023).filter(dropoff_in_2023)

data_processing_df.show()

In [0]:
# Peak Pickup Analysis
# Count trips per pickup hour, busiest first. Grouping on pickup_time
# ('HH:mm:ss') would rank individual seconds rather than hours, so the
# integer pickup_hour column is used as the bucket.
peak_pickup_hours = (
    data_processing_df
    .groupBy("pickup_hour")
    .count()
    .orderBy("count", ascending=False)
)
display(peak_pickup_hours)

In [0]:
# Peak Drop-off Analysis
# Count trips per drop-off hour, busiest first. Grouping on dropoff_time
# ('HH:mm:ss') would rank individual seconds rather than hours, so the
# integer dropoff_hour column is used as the bucket.
peak_dropoff_hours = (
    data_processing_df
    .groupBy("dropoff_hour")
    .count()
    .orderBy("count", ascending=False)
)
display(peak_dropoff_hours)

In [0]:
# Daily Pickup Analysis: trips per pickup date, in date order
pickups_per_date = data_processing_df.groupBy("pickup_date").count()
daily_pickups = pickups_per_date.orderBy("pickup_date")
display(daily_pickups)

pickup_date,count
2023-01-01,73286
2023-01-02,64544
2023-01-03,83755
2023-01-04,92977
2023-01-05,98829
2023-01-06,100271
2023-01-07,102898
2023-01-08,83341
2023-01-09,83358
2023-01-10,97824


In [0]:
# Daily Drop-off Analysis: trips per drop-off date, in date order
dropoffs_per_date = data_processing_df.groupBy("dropoff_date").count()
daily_dropoffs = dropoffs_per_date.orderBy("dropoff_date")
display(daily_dropoffs)


dropoff_date,count
2023-01-01,72717
2023-01-02,64585
2023-01-03,83701
2023-01-04,92887
2023-01-05,98732
2023-01-06,99883
2023-01-07,102786
2023-01-08,84002
2023-01-09,83564
2023-01-10,97697


In [0]:
# Drop the 'tpep_' prefix from the two timestamp columns for readability
renamed_columns = {
    "tpep_pickup_datetime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
}
for old_name, new_name in renamed_columns.items():
    data_processing_df = data_processing_df.withColumnRenamed(old_name, new_name)

In [0]:
# Print the current schema (column names, types, nullability) for review
data_processing_df.printSchema()

root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: float (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- Month_Num: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- dropoff_hour: integer (nullable = true)
 |-- dropoff_day_of_week: integer (nullable = true)
 |-- dropoff_month: integer (nullable = true)
 |-- dropoff_week_of_year: integer (nullable = true)
 |-- pickup_date: string (nullable = true)
 |-- pickup_time: strin

In [0]:
# Render the DataFrame for a visual check
display(data_processing_df)

In [0]:
# Report the DataFrame dimensions as (rows, columns)
num_rows, num_cols = data_processing_df.count(), len(data_processing_df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (37000275, 24)


In [0]:
# Delta Lake output location for the processed trip data.
# Spark resolves paths against the DBFS root, so the target must use the
# `dbfs:/` URI scheme, as the input load does. `/dbfs/...` is the local FUSE
# mount intended for non-Spark file APIs; passed to Spark it resolves to
# dbfs:/dbfs/FileStore/..., which silently writes the table to the wrong place.
delta_path = "dbfs:/FileStore/tables/data_processed_trip_data/"
# Write the DataFrame in Delta format, replacing any output from a previous run
data_processing_df.write.format("delta").mode("overwrite").save(delta_path)