In [0]:
display(dbutils.fs.ls("/dbfs/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/dbfs/FileStore/tables/Data_Processing_Pulic_Holiday/,Data_Processing_Pulic_Holiday/,0,1732620173544
dbfs:/dbfs/FileStore/tables/cleaned_nyc_taxi_fare/,cleaned_nyc_taxi_fare/,0,1732620173544
dbfs:/dbfs/FileStore/tables/data_processed_lat_long/,data_processed_lat_long/,0,1732620173544
dbfs:/dbfs/FileStore/tables/data_processed_taxi_zones/,data_processed_taxi_zones/,0,1732620173544
dbfs:/dbfs/FileStore/tables/data_processed_trip_data/,data_processed_trip_data/,0,1732620173545


In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("NYC Taxi Fare Prediction") \
    .getOrCreate()

In [0]:
# Load Delta table with PySpark
taxi_df = spark.read.format("delta").load("dbfs:/dbfs/FileStore/tables/cleaned_nyc_taxi_fare/")

In [0]:
display(taxi_df.limit(5))

DOLocationID,PULocationID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_day_of_week,pickup_hour,Month_Num,pickup_month,dropoff_hour,dropoff_day_of_week,dropoff_month,dropoff_week_of_year,pickup_date,pickup_time,dropoff_date,dropoff_time,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,distance_km,pickup_borough,pickup_zone,pickup_service_zone,LocationID,dropoff_borough,dropoff_zone,dropoff_service_zone,holidayName,normalizeHolidayName,is_holiday,temp
198,132,2023-10-09T02:08:03Z,2023-10-09T02:32:05Z,3,14.2,1,54.1,0.0,1.0,63.35,24.033333,2,2,10,10,2,2,10,41,2023-10-09,02:08:03,2023-10-09,02:32:05,40.64698489239528,-73.78653298335001,40.70652679441958,-73.90170926444722,11.75471842872516,Queens,JFK Airport,Airports,198,Queens,Ridgewood,Boro Zone,Columbus Day,Columbus Day,1,26.5
216,132,2023-10-09T07:31:13Z,2023-10-09T07:41:36Z,2,3.75,1,17.0,0.0,1.0,23.95,10.383333,2,7,10,10,7,2,10,41,2023-10-09,07:31:13,2023-10-09,07:41:36,40.64698489239528,-73.78653298335001,40.676154037213855,-73.81945963682767,4.270088763464719,Queens,JFK Airport,Airports,216,Queens,South Ozone Park,Boro Zone,Columbus Day,Columbus Day,1,26.5
132,132,2023-10-09T07:04:21Z,2023-10-09T07:33:49Z,1,14.88,1,70.0,6.94,1.0,98.88,29.466667,2,7,10,10,7,2,10,41,2023-10-09,07:04:21,2023-10-09,07:33:49,40.64698489239528,-73.78653298335001,40.64698489239528,-73.78653298335001,0.0,Queens,JFK Airport,Airports,132,Queens,JFK Airport,Airports,Columbus Day,Columbus Day,1,26.5
232,132,2023-10-09T07:15:55Z,2023-10-09T08:09:17Z,1,17.12,1,70.0,0.0,1.0,94.25,53.366665,2,7,10,10,8,2,10,41,2023-10-09,07:15:55,2023-10-09,08:09:17,40.64698489239528,-73.78653298335001,40.71473250693941,-73.9830245583349,18.201235453317874,Queens,JFK Airport,Airports,232,Manhattan,Two Bridges/Seward Park,Yellow Zone,Columbus Day,Columbus Day,1,26.5
229,132,2023-10-09T09:12:56Z,2023-10-09T09:49:21Z,1,20.6,1,70.0,6.94,1.0,99.19,36.416668,2,9,10,10,9,2,10,41,2023-10-09,09:12:56,2023-10-09,09:49:21,40.64698489239528,-73.78653298335001,40.75672894163307,-73.96514579918423,19.380882456819247,Queens,JFK Airport,Airports,229,Manhattan,Sutton Place/Turtle Bay North,Yellow Zone,Columbus Day,Columbus Day,1,26.5


In [0]:
taxi_df.printSchema()

root
 |-- DOLocationID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- trip_duration: float (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- Month_Num: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- dropoff_hour: integer (nullable = true)
 |-- dropoff_day_of_week: integer (nullable = true)
 |-- dropoff_month: integer (nullable = true)
 |-- dropoff_week_of_year: integer (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_time: string 

In [0]:
from pyspark.sql import functions as F

# Define the columns and the rounding precision
columns = {
    'trip_distance': 2,
    'fare_amount': 2,
    'tolls_amount': 2,
    'improvement_surcharge': 2,
    'total_amount': 2,
    'trip_duration': 2,
    'distance_km': 2,
    'pickup_latitude': 5,
    'pickup_longitude': 5,
    'dropoff_latitude': 5,
    'dropoff_longitude': 5,
    'temp': 1
}

# Apply rounding
for column, precision in columns.items():
    taxi_df = taxi_df.withColumn(column, F.format_number(F.col(column), precision).cast("double"))

# Show to verify
taxi_df.show()

In [0]:
display(taxi_df.limit(5))

DOLocationID,PULocationID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_day_of_week,pickup_hour,Month_Num,pickup_month,dropoff_hour,dropoff_day_of_week,dropoff_month,dropoff_week_of_year,pickup_date,pickup_time,dropoff_date,dropoff_time,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,distance_km,pickup_borough,pickup_zone,pickup_service_zone,LocationID,dropoff_borough,dropoff_zone,dropoff_service_zone,holidayName,normalizeHolidayName,is_holiday,temp
198,132,2023-10-09T02:08:03Z,2023-10-09T02:32:05Z,3,14.2,1,54.1,0.0,1.0,63.35,24.03,2,2,10,10,2,2,10,41,2023-10-09,02:08:03,2023-10-09,02:32:05,40.64698,-73.78653,40.70653,-73.90171,11.75,Queens,JFK Airport,Airports,198,Queens,Ridgewood,Boro Zone,Columbus Day,Columbus Day,1,26.5
216,132,2023-10-09T07:31:13Z,2023-10-09T07:41:36Z,2,3.75,1,17.0,0.0,1.0,23.95,10.38,2,7,10,10,7,2,10,41,2023-10-09,07:31:13,2023-10-09,07:41:36,40.64698,-73.78653,40.67615,-73.81946,4.27,Queens,JFK Airport,Airports,216,Queens,South Ozone Park,Boro Zone,Columbus Day,Columbus Day,1,26.5
132,132,2023-10-09T07:04:21Z,2023-10-09T07:33:49Z,1,14.88,1,70.0,6.94,1.0,98.88,29.47,2,7,10,10,7,2,10,41,2023-10-09,07:04:21,2023-10-09,07:33:49,40.64698,-73.78653,40.64698,-73.78653,0.0,Queens,JFK Airport,Airports,132,Queens,JFK Airport,Airports,Columbus Day,Columbus Day,1,26.5
232,132,2023-10-09T07:15:55Z,2023-10-09T08:09:17Z,1,17.12,1,70.0,0.0,1.0,94.25,53.37,2,7,10,10,8,2,10,41,2023-10-09,07:15:55,2023-10-09,08:09:17,40.64698,-73.78653,40.71473,-73.98302,18.2,Queens,JFK Airport,Airports,232,Manhattan,Two Bridges/Seward Park,Yellow Zone,Columbus Day,Columbus Day,1,26.5
229,132,2023-10-09T09:12:56Z,2023-10-09T09:49:21Z,1,20.6,1,70.0,6.94,1.0,99.19,36.42,2,9,10,10,9,2,10,41,2023-10-09,09:12:56,2023-10-09,09:49:21,40.64698,-73.78653,40.75673,-73.96515,19.38,Queens,JFK Airport,Airports,229,Manhattan,Sutton Place/Turtle Bay North,Yellow Zone,Columbus Day,Columbus Day,1,26.5


# Fare Amount

# 1.	Fare Amount Distribution

In [0]:
# Basic descriptive stats for fare_amount
taxi_df.select("fare_amount").describe().show()

+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|          36395600|
|   mean| 19.01973833485358|
| stddev|17.479247950729835|
|    min|            -900.0|
|    max|             999.0|
+-------+------------------+



In [0]:
from pyspark.sql.functions import col, hour, dayofweek, month, year, mean
import matplotlib.pyplot as plt
import seaborn as sns

# sns.histplot(fare_data, kde=True)

# Convert fare amount to Pandas for plotting
fare_df = taxi_df.select("fare_amount").toPandas()

plt.figure(figsize=(10, 6))
plt.hist(fare_df['fare_amount'], bins=50, color='skyblue')
plt.xlabel("Fare Amount")
plt.ylabel("Frequency")
plt.title("Distribution of Fare Amounts")
plt.show()

In [0]:
# Get row and column count
row_count = taxi_df.count()
column_count = len(taxi_df.columns)
print(f"Rows: {row_count}, Columns: {column_count}")

Rows: 36395607, Columns: 40


In [0]:
# Check for Duplicates
# Count the number of duplicate rows
duplicates_count = taxi_df.count() - taxi_df.distinct().count()
print(f"Number of duplicate rows: {duplicates_count}")

Number of duplicate rows: 0


In [0]:
taxi_df = taxi_df.dropDuplicates()

In [0]:
# Summary Statistics for 'fare_amount'
taxi_df.describe("fare_amount").show()

+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|          36395600|
|   mean|19.019738334853354|
| stddev|17.479247950729853|
|    min|            -900.0|
|    max|             999.0|
+-------+------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'fare_amount' column to a Pandas DataFrame or Series
fare_amount = taxi_df.select("fare_amount").toPandas()["fare_amount"]

# Assuming 'fare_amount' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(fare_amount, bins=50, kde=True)
plt.title("Distribution of Fare Amounts")

plt.subplot(1, 2, 2)
sns.boxplot(x=fare_amount)
plt.title("Box Plot of Fare Amounts")
plt.show()

In [0]:
from pyspark.sql.functions import col

# Filter out rows where fare_amount is less than or equal to zero
taxi_df = taxi_df.filter(col("fare_amount") > 0)

In [0]:
from pyspark.sql import functions as F

# Calculate Q1 and Q3
q1, q3 = taxi_df.approxQuantile("fare_amount", [0.25, 0.75], 0.05)
iqr = q3 - q1

# Define lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [0]:
# Filter for outliers based on updated bounds
outliers = taxi_df.filter((col("fare_amount") < lower_bound) | (col("fare_amount") > upper_bound))
outliers.select("fare_amount").describe().show()

+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|           3462029|
|   mean|63.296190361189225|
| stddev|15.380235804776735|
|    min|             42.59|
|    max|             999.0|
+-------+------------------+



In [0]:
from pyspark.sql.functions import when

# Cap fare_amount values at the upper bound
taxi_df = taxi_df.withColumn("fare_amount", when(col("fare_amount") > upper_bound, upper_bound).otherwise(col("fare_amount")))

In [0]:
# Summary Statistics for 'fare_amount'
taxi_df.describe("fare_amount").show()

+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|          36019170|
|   mean|17.427675134114097|
| stddev|11.268637530283332|
|    min|              0.01|
|    max|42.550000000000004|
+-------+------------------+



In [0]:
from pyspark.sql import functions as F

# Calculate skewness and kurtosis for the fare_amount column
fare_stats = taxi_df.select(
    F.skewness("fare_amount").alias("skewness"),
    F.kurtosis("fare_amount").alias("kurtosis")
)

# Display the results
fare_stats.show()

+------------------+-------------------+
|          skewness|           kurtosis|
+------------------+-------------------+
|1.1446969329269765|0.16730835347353823|
+------------------+-------------------+



A positive skewness (like 0.8633) suggests that the tail on the right side of the distribution (higher fare amounts) is longer or fatter than the left side. This means that there are more low fares and a few extremely high fares pulling the mean to the right.

A negative kurtosis (like -0.4961) indicates that the distribution has lighter tails and a flatter peak than a normal distribution. In this case, it suggests that there are fewer extreme outliers compared to a normal distribution.

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'fare_amount' column to a Pandas DataFrame or Series
fare_amount = taxi_df.select("fare_amount").toPandas()["fare_amount"]

# Assuming 'fare_amount' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(fare_amount, bins=50, kde=True)
plt.title("Distribution of Fare Amounts")

plt.subplot(1, 2, 2)
sns.boxplot(x=fare_amount)
plt.title("Box Plot of Fare Amounts")
plt.show()

# Total Amount

In [0]:
# Basic descriptive stats for total_amount
taxi_df.select("total_amount").describe().show()

+-------+------------------+
|summary|      total_amount|
+-------+------------------+
|  count|          36019167|
|   mean| 28.51376624647264|
| stddev|21.557119489078847|
|    min|              0.01|
|    max|             951.0|
+-------+------------------+



In [0]:
from pyspark.sql.functions import col, hour, dayofweek, month, year, mean
import matplotlib.pyplot as plt
import seaborn as sns

# sns.histplot(fare_data, kde=True)

# Convert fare amount to Pandas for plotting
amount_df = taxi_df.select("total_amount").toPandas()

plt.figure(figsize=(10, 6))
plt.hist(amount_df['total_amount'], bins=50, color='skyblue')
plt.xlabel("Total Amount")
plt.ylabel("Frequency")
plt.title("Distribution of Total Amount")
plt.show()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'Total Amount' column to a Pandas DataFrame or Series
amount_df = taxi_df.select("total_amount").toPandas()["total_amount"]

# Assuming Total Amount column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(amount_df, bins=50, kde=True)
plt.title("Distribution of total_amount")

plt.subplot(1, 2, 2)
sns.boxplot(x=amount_df)
plt.title("Box Plot of Total Amount")
plt.show()


In [0]:
from pyspark.sql.functions import col

# Filter out rows where total_amount is less than or equal to zero
taxi_df = taxi_df.filter(col("total_amount") > 0)

In [0]:
from pyspark.sql import functions as F

# Calculate Q1 and Q3
q1, q3 = taxi_df.approxQuantile("total_amount", [0.25, 0.75], 0.05)
iqr = q3 - q1

# Define lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [0]:
from pyspark.sql.functions import when

# Cap total amount values at the upper bound
taxi_df = taxi_df.withColumn("total_amount", when(col("total_amount") > upper_bound, upper_bound).otherwise(col("total_amount")))

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'total_amount' column to a Pandas DataFrame or Series
total_amount = taxi_df.select("total_amount").toPandas()["total_amount"]

# Assuming 'total_amount' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(total_amount, bins=50, kde=True)
plt.title("Distribution of total_amount")

plt.subplot(1, 2, 2)
sns.boxplot(x=fare_amount)
plt.title("Box Plot of total_amount")
plt.show()

In [0]:
# Summary Statistics for 'total_amount'
taxi_df.describe("total_amount").show()

+-------+------------------+
|summary|      total_amount|
+-------+------------------+
|  count|          36019167|
|   mean| 25.17980459990595|
| stddev|12.620968537033155|
|    min|              0.01|
|    max|            51.675|
+-------+------------------+



In [0]:
from pyspark.sql import functions as F

# Calculate skewness and kurtosis for the total_amount column
total_amount_stats = taxi_df.select(
    F.skewness("total_amount").alias("skewness"),
    F.kurtosis("total_amount").alias("kurtosis")
)

# Display the results
total_amount_stats.show()

+------------------+-------------------+
|          skewness|           kurtosis|
+------------------+-------------------+
|1.0196290571307127|-0.1069624073534845|
+------------------+-------------------+



# Passenger Count

In [0]:
# Distribution of passenger_count
passenger_dist = taxi_df.groupBy("passenger_count").count().orderBy("count", ascending=False)
passenger_dist.show()

+---------------+--------+
|passenger_count|   count|
+---------------+--------+
|              1|27096950|
|              2| 5451149|
|              3| 1353777|
|              4|  758553|
|              0|  569086|
|              5|  476294|
|              6|  313095|
|              8|     167|
|              7|      59|
|              9|      37|
+---------------+--------+



In [0]:
# Bar plot for passenger count
passenger_count_data = taxi_df.groupBy("passenger_count").count().orderBy("passenger_count").collect()
x = [row['passenger_count'] for row in passenger_count_data]
y = [row['count'] for row in passenger_count_data]

plt.bar(x, y, color="purple")
plt.title("Passenger Count Distribution")
plt.xlabel("Passenger Count")
plt.ylabel("Frequency")
plt.show()

Most Common Values: The vast majority of records have 1-2 passengers, which seems reasonable for taxi rides in a city. Rows with values from 1 to 6 appear commonly and represent realistic group sizes for taxi usage.

Zero Passengers: There are 569,086 records with a passenger_count of 0, which seems unrealistic for taxi rides. These could be errors or placeholder values. It might be best to remove these rows, as they likely don’t contribute meaningful information for fare prediction

Outliers (Values of 7 to 9): Rows with passenger counts from 7 to 9 are very rare (a total of 263 occurrences in 36 million rows). These could be data entry errors, or they could represent unusual cases (e.g., shared rides). Given the low frequency, removing them should have a minimal impact, but you could keep them if you want a model that handles unusual cases.

In [0]:
# Filter out rows with passenger_count of 0 or greater than 6
taxi_df = taxi_df.filter((taxi_df["passenger_count"] > 0) & (taxi_df["passenger_count"] <= 6))

# Verify the distribution after filtering
taxi_df.groupBy("passenger_count").count().orderBy("passenger_count").show()

+---------------+--------+
|passenger_count|   count|
+---------------+--------+
|              1|27096950|
|              2| 5451149|
|              3| 1353777|
|              4|  758553|
|              5|  476294|
|              6|  313095|
+---------------+--------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_distance' column to a Pandas DataFrame or Series
passenger_count_df = taxi_df.select("passenger_count").toPandas()["passenger_count"]

# Assuming Trip Distance column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
sns.boxplot(x=passenger_count_df)
plt.title("Box Plot of Passenger Count")
plt.show()


# Payment Type

In [0]:
payment_mapping = {
    1: "Credit Card",
    2: "Cash",
    3: "No Charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided Trip"
}

In [0]:
# Flatten the dictionary items into a list of alternating keys and values
payment_type_expr = F.create_map([F.lit(x) for kv in payment_mapping.items() for x in kv])

# Use the map to create the new column
taxi_df = taxi_df.withColumn("payment_type_label", payment_type_expr[F.col("payment_type")])

In [0]:
payment_counts = taxi_df.groupBy("payment_type_label").count()

In [0]:
total_count = taxi_df.count()
payment_percentage = payment_counts.withColumn("percentage", (F.col("count") / total_count) * 100)

In [0]:
# Show the frequency and percentage distribution
payment_percentage.show()

+------------------+--------+-------------------+
|payment_type_label|   count|         percentage|
+------------------+--------+-------------------+
|       Credit Card|28946703|  81.65543473312049|
|         No Charge|  158006|0.44571737998767724|
|              Cash| 6082246|  17.15734055390637|
|           Dispute|  262863| 0.7415073329854612|
+------------------+--------+-------------------+



In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert to Pandas DataFrame for visualization
payment_percentage_pd = payment_percentage.toPandas()

# Bar Plot
plt.figure(figsize=(10, 5))
plt.bar(payment_percentage_pd['payment_type_label'], payment_percentage_pd['count'], color='blue')
plt.title('Frequency Distribution of Payment Types')
plt.xlabel('Payment Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [0]:
# Pie Chart
plt.figure(figsize=(8, 8))
plt.pie(payment_percentage_pd['percentage'], labels=payment_percentage_pd['payment_type_label'], autopct='%1.1f%%', startangle=140)
plt.title('Percentage Distribution of Payment Types')
plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
plt.show()

# Distribution of Trip Distance

In [0]:
# Summary Statistics for 'fare_amount'
taxi_df.describe("trip_distance").show()

+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|          35449519|
|   mean|3.4072736318368664|
| stddev| 4.397237518729402|
|    min|               0.0|
|    max|             837.5|
+-------+------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_distance' column to a Pandas DataFrame or Series
trip_distance_df = taxi_df.select("trip_distance").toPandas()["trip_distance"]

# Assuming Trip Distance column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_distance_df, bins=50, kde=True)
plt.title("Distribution of Trip Distance")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_distance_df)
plt.title("Box Plot of Trip Distance")
plt.show()


Zero values for trip_distance might indicate canceled trips, errors in data entry, or other anomalies. It’s essential to investigate these zero distances to determine if they are valid entries or should be excluded from analysis.

In [0]:
taxi_df = taxi_df.filter(taxi_df["trip_distance"] > 0)

In [0]:
from pyspark.sql import functions as F

# Calculate Q1 and Q3
q1, q3 = taxi_df.approxQuantile("trip_distance", [0.25, 0.75], 0.05)
iqr = q3 - q1

# Define lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [0]:
# Filter for outliers based on updated bounds
outliers = taxi_df.filter((col("fare_amount") < lower_bound) | (col("trip_distance") > upper_bound))
outliers.select("trip_distance").describe().show()

+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|           4549323|
|   mean|13.369781097098638|
| stddev| 4.919646726149456|
|    min|              7.01|
|    max|             837.5|
+-------+------------------+



In [0]:
from pyspark.sql.functions import when

# Cap trip_distance values at the upper bound
taxi_df = taxi_df.withColumn("trip_distance", when(col("trip_distance") > upper_bound, upper_bound).otherwise(col("trip_distance")))


In [0]:
from pyspark.sql import functions as F

# Calculate skewness and kurtosis for the trip_distance column
trip_distance_stats = taxi_df.select(
    F.skewness("trip_distance").alias("skewness"),
    F.kurtosis("trip_distance").alias("kurtosis")
)

# Display the results
trip_distance_stats.show()

+------------------+--------------------+
|          skewness|            kurtosis|
+------------------+--------------------+
|1.1270119683794957|-0.06933936469401525|
+------------------+--------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_distance' column to a Pandas DataFrame or Series
trip_distance_df = taxi_df.select("trip_distance").toPandas()["trip_distance"]

# Assuming Trip Distance column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_distance_df, bins=50, kde=True)
plt.title("Distribution of Trip Distance")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_distance_df)
plt.title("Box Plot of Trip Distance")
plt.show()

In [0]:
# Summary Statistics for 'trip_distance'
taxi_df.describe("trip_distance").show()

+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|          35089527|
|   mean|2.6170420112761126|
| stddev|2.1125514372669647|
|    min|              0.01|
|    max| 7.004999999999999|
+-------+------------------+



In [0]:
# Frequency distribution for trip distance bins
frequency_bins = taxi_df.select(
    when(col("trip_distance") > 20, "Very High (>20 km)").when((col("trip_distance") >= 10) & (col("trip_distance") <= 20), "High (10-20 km)").when((col("trip_distance") >= 5) & (col("trip_distance") < 10), "Average (5-10 km)").when((col("trip_distance") > 1) & (col("trip_distance") < 5), "Low (1-5 km)").otherwise("Very Low (<1 km)").alias("distance_category")
).groupBy("distance_category").count()

frequency_bins.show()

+-----------------+--------+
|distance_category|   count|
+-----------------+--------+
| Very Low (<1 km)| 7923228|
|     Low (1-5 km)|21141051|
|Average (5-10 km)| 6025248|
+-----------------+--------+



In [0]:
from pyspark.sql import functions as F

# Define bin boundaries for trip_distance and fare_amount
trip_distance_bins = [2, 5, 10, 20]  # Example: 0-2 km, 2-5 km, 5-10 km, 10-20 km, 20+ km
fare_amount_bins = [10, 20, 50, 100]  # Example: $0-10, $10-20, $20-50, $50-100, $100+

# Function to create bins and calculate frequency for a given column
def create_frequency_bins(df, column_name, bins, bin_labels):
    # Start bin column using the first condition
    bin_col = F.when(F.col(column_name) < bins[0], bin_labels[0])
    
    # Iterate through bins to set conditions for each range
    for i in range(1, len(bins)):
        bin_col = bin_col.when((F.col(column_name) >= bins[i-1]) & (F.col(column_name) < bins[i]), bin_labels[i])
    
    # Add final bin for the last range (values greater than the last bin value)
    bin_col = bin_col.when(F.col(column_name) >= bins[-1], bin_labels[-1])

    # Add bin column to DataFrame
    df = df.withColumn(f"{column_name}_bin", bin_col)

    # Calculate frequency for each bin
    frequency_df = df.groupBy(f"{column_name}_bin").count().orderBy(F.col("count").desc())

    return frequency_df

# Define bin labels (must match the length of bins + 1)
trip_distance_labels = ["0-2 km", "2-5 km", "5-10 km", "10-20 km", "20+ km"]
fare_amount_labels = ["$0-10", "$10-20", "$20-50", "$50-100", "$100+"]

# Create frequency distributions
trip_distance_freq = create_frequency_bins(taxi_df, "trip_distance", trip_distance_bins, trip_distance_labels)
fare_amount_freq = create_frequency_bins(taxi_df, "fare_amount", fare_amount_bins, fare_amount_labels)

# Show the frequency tables
trip_distance_freq.show()
fare_amount_freq.show()

+-----------------+--------+
|trip_distance_bin|   count|
+-----------------+--------+
|           0-2 km|19252240|
|           2-5 km| 9812039|
|          5-10 km| 6025248|
+-----------------+--------+

+---------------+--------+
|fare_amount_bin|   count|
+---------------+--------+
|         $10-20|15566278|
|         $20-50| 9819975|
|          $0-10| 9703274|
+---------------+--------+



In [0]:
import zipfile
import os

# Define paths
dbfs_zip_path = "/FileStore/tables/taxi_zones/taxi_zones.zip"
local_zip_path = "/tmp/taxi_zones.zip"
local_extract_path = "/tmp/taxi_zones/"

# Copy the zip file from DBFS to a local path
dbutils.fs.cp(dbfs_zip_path, "file:" + local_zip_path)

# Unzip the file locally and list the contents
with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
    zip_ref.extractall(local_extract_path)

# Verify extracted files
extracted_files = os.listdir(local_extract_path)
print("Extracted files:", extracted_files)

Extracted files: ['taxi_zones.dbf', 'taxi_zones.shp.xml', 'taxi_zones.shp', 'taxi_zones.prj', 'taxi_zones.shx', 'taxi_zones.sbx', 'taxi_zones.sbn']


In [0]:
# Check for the .shp file in the extracted directory
if "taxi_zones.shp" in extracted_files:
    import geopandas as gpd
    # Load the shapefile
    zones = gpd.read_file(os.path.join(local_extract_path, "taxi_zones.shp"))
    
    # Convert to WGS 84 coordinate system if necessary
    zone_gdf = zones.to_crs(epsg=4326)
else:
    print("Error: 'taxi_zones.shp' not found in extracted files.")



In [0]:
from pyspark.sql import functions as F

# Count pickups by borough
pickup_borough_agg = (
    taxi_df.groupBy("pickup_borough")
    .agg(F.count("PULocationID").alias("pickup_count"))
)

pickup_borough_agg.show()

+--------------+------------+
|pickup_borough|pickup_count|
+--------------+------------+
|        Queens|     3412817|
|           EWR|         857|
|      Brooklyn|      181687|
| Staten Island|        1713|
|     Manhattan|    31444933|
|         Bronx|       47520|
+--------------+------------+



In [0]:
# Count dropoffs by borough
dropoff_borough_agg = (
    taxi_df.groupBy("dropoff_borough")
    .agg(F.count("DOLocationID").alias("dropoff_count"))
)

dropoff_borough_agg.show()


+---------------+-------------+
|dropoff_borough|dropoff_count|
+---------------+-------------+
|         Queens|      1816689|
|            EWR|       102619|
|       Brooklyn|      1360259|
|  Staten Island|         9796|
|      Manhattan|     31597604|
|          Bronx|       202560|
+---------------+-------------+



In [0]:
# Convert to Pandas DataFrames
pickup_borough_agg_pd = pickup_borough_agg.toPandas()
dropoff_borough_agg_pd = dropoff_borough_agg.toPandas()

# If you have a GeoDataFrame called zone_gdf, ensure it is in Pandas
zone_gdf_pd = zone_gdf.copy()  # Assuming zone_gdf is a GeoDataFrame

In [0]:
# Convert to Pandas DataFrame and rename columns for consistency
pickup_borough_agg_pd = pickup_borough_agg.toPandas().rename(columns={"pickup_borough": "borough"})
dropoff_borough_agg_pd = dropoff_borough_agg.toPandas().rename(columns={"dropoff_borough": "borough"})

In [0]:
# Merge pickup and dropoff counts with the original zone data for plotting
pickup_borough_merged = zone_gdf.merge(pickup_borough_agg_pd, on='borough', how='left')
dropoff_borough_merged = zone_gdf.merge(dropoff_borough_agg_pd, on='borough', how='left')

In [0]:
# Plot pickup and dropoff counts
fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# Plotting Pickup Counts
pickup_borough_merged.plot(column="pickup_count", cmap="Oranges", legend=True, ax=ax[0])
ax[0].set_title("NYC Pickup Counts by Borough")

# Plotting Dropoff Counts
dropoff_borough_merged.plot(column="dropoff_count", cmap="Blues", legend=True, ax=ax[1])
ax[1].set_title("NYC Dropoff Counts by Borough")

plt.show()


In [0]:
# Check the Shape and Schema
# Get the number of rows and columns
num_rows = taxi_df.count()
num_cols = len(taxi_df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (35089527, 41)


In [0]:
# Define a more descriptive Delta Lake storage path
delta_path = "/dbfs/FileStore/tables/Eda_univariate_taxi_data/" 
# Write the DataFrame to Delta format
taxi_df.write.format("delta").mode("overwrite").save(delta_path)
