In [0]:
display(dbutils.fs.ls("/dbfs/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/dbfs/FileStore/tables/Data_Processing_Pulic_Holiday/,Data_Processing_Pulic_Holiday/,0,1732625555348
dbfs:/dbfs/FileStore/tables/Eda_univariate_taxi_data/,Eda_univariate_taxi_data/,0,1732625555349
dbfs:/dbfs/FileStore/tables/cleaned_nyc_taxi_fare/,cleaned_nyc_taxi_fare/,0,1732625555349
dbfs:/dbfs/FileStore/tables/data_processed_lat_long/,data_processed_lat_long/,0,1732625555349
dbfs:/dbfs/FileStore/tables/data_processed_taxi_zones/,data_processed_taxi_zones/,0,1732625555349
dbfs:/dbfs/FileStore/tables/data_processed_trip_data/,data_processed_trip_data/,0,1732625555349


In [0]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("NYC Taxi Fare Analysis") \
    .getOrCreate()

In [0]:
# Load the Delta table
taxi_df = spark.read.format("delta").load("/dbfs/FileStore/tables/Eda_univariate_taxi_data/")

# Show the first few rows to verify
display(taxi_df.limit(5))

DOLocationID,PULocationID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_day_of_week,pickup_hour,Month_Num,pickup_month,dropoff_hour,dropoff_day_of_week,dropoff_month,dropoff_week_of_year,pickup_date,pickup_time,dropoff_date,dropoff_time,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,distance_km,pickup_borough,pickup_zone,pickup_service_zone,LocationID,dropoff_borough,dropoff_zone,dropoff_service_zone,holidayName,normalizeHolidayName,is_holiday,temp,payment_type_label
163,50,2023-10-09T11:04:08Z,2023-10-09T11:15:45Z,2,1.25,2,12.1,0.0,1.0,16.1,11.62,2,11,10,10,11,2,10,41,2023-10-09,11:04:08,2023-10-09,11:15:45,40.76624,-73.99514,40.76442,-73.97757,1.49,Manhattan,Clinton West,Yellow Zone,163,Manhattan,Midtown North,Yellow Zone,Columbus Day,Columbus Day,1,26.5,Cash
10,10,2023-07-17T15:09:40Z,2023-07-17T15:12:37Z,1,0.67,4,42.55,0.0,1.0,51.675,2.95,2,15,7,7,15,2,7,29,2023-07-17,15:09:40,2023-07-17,15:12:37,40.67895,-73.79099,40.67895,-73.79099,0.0,Queens,Baisley Park,Boro Zone,10,Queens,Baisley Park,Boro Zone,No Holiday,No Holiday,0,25.0,Dispute
238,10,2023-01-12T06:23:33Z,2023-01-12T07:07:51Z,1,7.004999999999999,1,42.55,6.55,1.0,51.675,44.3,5,6,1,1,7,5,1,2,2023-01-12,06:23:33,2023-01-12,07:07:51,40.67895,-73.79099,40.7917,-73.97305,19.81,Queens,Baisley Park,Boro Zone,238,Manhattan,Upper West Side North,Yellow Zone,No Holiday,No Holiday,0,18.0,Credit Card
68,10,2023-03-29T16:03:33Z,2023-03-29T16:56:57Z,1,7.004999999999999,1,42.55,6.55,1.0,51.675,53.4,4,16,3,3,16,4,3,13,2023-03-29,16:03:33,2023-03-29,16:56:57,40.67895,-73.79099,40.74843,-73.99992,19.23,Queens,Baisley Park,Boro Zone,68,Manhattan,East Chelsea,Yellow Zone,No Holiday,No Holiday,0,27.3,Credit Card
162,10,2023-07-16T12:59:19Z,2023-07-16T13:34:36Z,4,7.004999999999999,1,42.55,6.55,1.0,51.675,35.28,1,12,7,7,13,1,7,28,2023-07-16,12:59:19,2023-07-16,13:34:36,40.67895,-73.79099,40.75669,-73.97236,17.56,Queens,Baisley Park,Boro Zone,162,Manhattan,Midtown East,Yellow Zone,No Holiday,No Holiday,0,24.4,Credit Card


In [0]:
from pyspark.sql.functions import col
# Summary Statistics for 'trip_duration'
taxi_df.describe("trip_duration").show()

+-------+------------------+
|summary|     trip_duration|
+-------+------------------+
|  count|          35062796|
|   mean|16.378365347703202|
| stddev|14.850211599459241|
|    min|           -476.25|
|    max|            999.87|
+-------+------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_duration' column to a Pandas DataFrame or Series
trip_duration_df = taxi_df.select("trip_duration").toPandas()["trip_duration"]

# Assuming 'trip_duration' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_duration_df, bins=50, kde=True)
plt.title("Distribution of Trip Duration")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_duration_df)
plt.title("Box Plot of Trip Duration")
plt.show()

In [0]:
from pyspark.sql.functions import col

# Filter out rows where trip_duration is less than or equal to zero
taxi_df = taxi_df.filter(col("trip_duration") > 0)

from pyspark.sql import functions as F

# Calculate Q1 and Q3
q1, q3 = taxi_df.approxQuantile("trip_duration", [0.25, 0.75], 0.05)
iqr = q3 - q1

# Define lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [0]:
# Filter for outliers based on updated bounds
outliers = taxi_df.filter((col("trip_duration") < lower_bound) | (col("trip_duration") > upper_bound))
outliers.select("trip_duration").describe().show()

+-------+------------------+
|summary|     trip_duration|
+-------+------------------+
|  count|           1900880|
|   mean|57.221228457346214|
| stddev| 31.67904550480439|
|    min|             41.32|
|    max|            999.87|
+-------+------------------+



In [0]:
from pyspark.sql.functions import when

# Cap trip_duration values at the upper bound
taxi_df = taxi_df.withColumn("trip_duration", when(col("trip_duration") > upper_bound, upper_bound).otherwise(col("trip_duration")))


In [0]:
# Summary Statistics for 'trip_duration'
taxi_df.describe("trip_duration").show()

+-------+------------------+
|summary|     trip_duration|
+-------+------------------+
|  count|          35061515|
|   mean|15.516971539578186|
| stddev|10.418547501679267|
|    min|              0.02|
|    max|            41.305|
+-------+------------------+



In [0]:
from pyspark.sql import functions as F

# Calculate skewness and kurtosis for the trip_duration column
fare_stats = taxi_df.select(
    F.skewness("trip_duration").alias("skewness"),
    F.kurtosis("trip_duration").alias("kurtosis")
)

# Display the results
fare_stats.show()


+------------------+------------------+
|          skewness|          kurtosis|
+------------------+------------------+
|1.0610866267929013|0.3311170716022307|
+------------------+------------------+



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the 'trip_duration' column to a Pandas DataFrame or Series
trip_duration_df = taxi_df.select("trip_duration").toPandas()["trip_duration"]

# Assuming 'trip_duration' column has been converted to Pandas Series for plotting
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(trip_duration_df, bins=50, kde=True)
plt.title("Distribution of Trip Duration")

plt.subplot(1, 2, 2)
sns.boxplot(x=trip_duration_df )
plt.title("Box Plot of Trip Duration")
plt.show()

### Geographic Data Analysis (Latitude and Longitude)

In [0]:
from pyspark.sql.functions import col
import matplotlib.pyplot as plt

# Define city border coordinates
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

In [0]:
# Filter data for valid latitude and longitude values
taxi_df_filtered = taxi_df.filter(
    (col("pickup_longitude") >= city_long_border[0]) & 
    (col("pickup_longitude") <= city_long_border[1]) &
    (col("pickup_latitude") >= city_lat_border[0]) & 
    (col("pickup_latitude") <= city_lat_border[1]) &
    (col("dropoff_longitude") >= city_long_border[0]) & 
    (col("dropoff_longitude") <= city_long_border[1]) &
    (col("dropoff_latitude") >= city_lat_border[0]) & 
    (col("dropoff_latitude") <= city_lat_border[1])
)

In [0]:
# Convert filtered DataFrame to Pandas for plotting
taxi_df_filtered_pd = taxi_df_filtered.select(
    "pickup_longitude", "pickup_latitude", 
    "dropoff_longitude", "dropoff_latitude"
).toPandas()

In [0]:
# Plot dropoffs
plt.figure(figsize=(10, 6))
plt.scatter(taxi_df_filtered_pd['dropoff_longitude'], taxi_df_filtered_pd['dropoff_latitude'],
            color='green', s=0.02, alpha=0.6)
plt.title("Dropoffs")
plt.xlim(city_long_border)
plt.ylim(city_lat_border)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

#### Summary Statistics

In [0]:
# Display summary statistics for numerical columns
numeric_cols = ["fare_amount", "trip_distance", "tolls_amount", "total_amount", "trip_duration", "distance_km", "temp"]
display(taxi_df.select(numeric_cols).describe())

summary,fare_amount,trip_distance,tolls_amount,total_amount,trip_duration,distance_km,temp
count,35061515.0,35061515.0,35061515.0,35061515.0,35061515.0,35061515.0,35061515.0
mean,17.432896390693585,2.616797826348777,0.575403015242253,25.206987892038093,15.516971539578186,3.961897404605748,24.68344940306801
stddev,11.215482668541451,2.1125163679377077,2.0819540206218856,12.562150494545998,10.418547501679267,4.647141762422746,4.244249209397398
min,0.01,0.01,0.0,0.01,0.02,0.0,12.7
max,42.55,7.004999999999999,665.56,51.675,41.305,50.45,34.2


In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Select numerical columns and assemble them into a feature vector
numeric_cols = ["fare_amount", "trip_distance", "tolls_amount", "total_amount", "trip_duration", "distance_km", "temp"]
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")
taxi_df_vector = assembler.transform(taxi_df).select("features")

# Calculate correlation matrix
correlation_matrix = Correlation.corr(taxi_df_vector, "features").head()[0]
correlation_values = correlation_matrix.toArray().tolist()

# Convert to a DataFrame for seaborn heatmap
corr_df = pd.DataFrame(correlation_values, index=numeric_cols, columns=numeric_cols)

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_df, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix of Numerical Features")
plt.show()


# Numerical Features

## Fare Amount

In [0]:
from pyspark.sql.functions import hour

# Extract hour from timestamp and compute average fare
hourly_fare_df = taxi_df.withColumn("hour", hour("pickup_datetime")) \
                        .groupBy("hour") \
                        .avg("fare_amount") \
                        .orderBy("hour")

# Convert to Pandas for visualization
hourly_fare_pd = hourly_fare_df.toPandas()

plt.figure(figsize=(10, 6))
plt.plot(hourly_fare_pd["hour"], hourly_fare_pd["avg(fare_amount)"], marker='o')
plt.xlabel("Hour of Day")
plt.ylabel("Average Fare Amount")
plt.title("Average Fare Amount by Hour of Day")
plt.grid(True)
plt.show()

#### Average trip distance by day of the week

In [0]:
avg_distance_by_day = taxi_df.groupBy("pickup_day_of_week").agg(F.mean("trip_distance").alias("avg_distance")).orderBy("pickup_day_of_week")

# Convert to pandas for plotting
daily_distance_df = avg_distance_by_day.toPandas()

# Plot
plt.figure(figsize=(10, 6))
plt.bar(daily_distance_df["pickup_day_of_week"], daily_distance_df["avg_distance"], color="forestgreen")
plt.title("Average Trip Distance by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Average Trip Distance (miles)")
plt.show()


#### Average trips by weekdays

In [0]:
avg_trip_by_weekday = taxi_df.groupBy("pickup_day_of_week").count().orderBy("pickup_day_of_week")

# Convert to Pandas for visualization
avg_trip_by_weekday_pd = avg_trip_by_weekday.toPandas()

# Plot
plt.bar(avg_trip_by_weekday_pd["pickup_day_of_week"], avg_trip_by_weekday_pd["count"])
plt.xlabel("Weekday")
plt.ylabel("Average Trip Count")
plt.title("Average Trip by Weekdays")
plt.show()

In [0]:
# Trip proportion by weekday
trip_by_weekday = taxi_df.groupBy("pickup_day_of_week").count()
trip_by_weekday_pd = trip_by_weekday.toPandas()

# Plot as donut chart
plt.pie(trip_by_weekday_pd["count"], labels=trip_by_weekday_pd["pickup_day_of_week"], autopct="%1.1f%%", startangle=90)
plt.gca().add_artist(plt.Circle((0,0),0.7, color="white"))  # For donut chart
plt.title("Trip Proportion by Weekdays")
plt.show()

#### Proportion of trips for each weekday.

#### Fare by Payment Type

In [0]:
# Group by payment type and calculate average fare
payment_fare_df = taxi_df.groupBy("payment_type").avg("fare_amount")

# Convert to Pandas for plotting
payment_fare_pd = payment_fare_df.toPandas()

plt.figure(figsize=(10, 6))
sns.barplot(x="payment_type", y="avg(fare_amount)", data=payment_fare_pd, palette="viridis")
plt.xlabel("Payment Type")
plt.ylabel("Average Fare Amount")
plt.title("Average Fare Amount by Payment Type")
plt.show()

#### Total Amount vs. Trip Duration

In [0]:
# Sample and plot data for Total Amount vs Trip Duration
sample_df = taxi_df.select("trip_duration", "total_amount").sample(fraction=0.01).toPandas()

plt.figure(figsize=(8, 6))
plt.scatter(sample_df["trip_duration"], sample_df["total_amount"], alpha=0.5, color="orange")
plt.title("Trip Duration vs. Total Amount")
plt.xlabel("Trip Duration (seconds)")
plt.ylabel("Total Amount")
plt.show()

### Trip Distance vs. Fare Amount

In [0]:
# Sample data
sample_df = taxi_df.select("trip_distance", "fare_amount").sample(fraction=0.01).toPandas()

In [0]:
# Scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(sample_df["trip_distance"], sample_df["fare_amount"], alpha=0.5, color="teal")
plt.title("Trip Distance vs. Fare Amount")
plt.xlabel("Trip Distance (miles)")
plt.ylabel("Fare Amount")
plt.show()

#### Monthly trip distribution

In [0]:
monthly_trip_distribution = taxi_df.groupBy("Month_Num").count().orderBy("Month_Num")

# Convert to Pandas for visualization
monthly_trip_distribution_pd = monthly_trip_distribution.toPandas()

# Plot in matplotlib or seaborn
import matplotlib.pyplot as plt

plt.plot(monthly_trip_distribution_pd["Month_Num"], monthly_trip_distribution_pd["count"])
plt.xlabel("Month")
plt.ylabel("Total Trip Count")
plt.title("Total Trip Distribution by Month")
plt.show()


#### Total Trips by Day

In [0]:
# Daily trip distribution
daily_trip_distribution = taxi_df.groupBy("pickup_date").count().orderBy("pickup_date")

# Convert to Pandas for visualization
daily_trip_distribution_pd = daily_trip_distribution.toPandas()

# Plot
plt.plot(daily_trip_distribution_pd["pickup_date"], daily_trip_distribution_pd["count"])
plt.xlabel("Date")
plt.ylabel("Total Trip Count")
plt.title("Total Trip by Day")
plt.xticks(rotation=45)
plt.show()

#### Total Trip by Hour

In [0]:
import seaborn as sns

# Total trip by hour and weekday
hourly_trip = taxi_df.groupBy("pickup_hour", "pickup_day_of_week").count().orderBy("pickup_hour", "pickup_day_of_week")
hourly_trip_pd = hourly_trip.toPandas()

# Pivot the data
hourly_trip_pivot = hourly_trip_pd.pivot("pickup_hour", "pickup_day_of_week", "count")

# Plot heatmap
sns.heatmap(hourly_trip_pivot, cmap="YlGnBu")
plt.xlabel("Day of Week")
plt.ylabel("Hour of Day")
plt.title("Total Trip by Hour")
plt.show()

#### Scatter Plot of Fare vs. Trip Distance

In [0]:
import seaborn as sns

# Sample data to reduce memory load
sample_df = taxi_df.select("fare_amount", "trip_distance").sample(fraction=0.01).toPandas()

plt.figure(figsize=(10, 6))
sns.scatterplot(x="trip_distance", y="fare_amount", data=sample_df, alpha=0.5)
plt.xlabel("Trip Distance (miles)")
plt.ylabel("Fare Amount")
plt.title("Fare Amount vs. Trip Distance")
plt.show()

#### Duration in minutes

In [0]:
from pyspark.sql.functions import unix_timestamp

taxi_df = taxi_df.withColumn("trip_duration", 
                             (unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")) / 60)

# Convert to Pandas for histogram
duration_pd = taxi_df.select("trip_duration").sample(fraction=0.1).toPandas()

plt.figure(figsize=(10, 6))
plt.hist(duration_pd['trip_duration'], bins=50, color='purple')
plt.xlabel("Trip Duration (minutes)")
plt.ylabel("Frequency")
plt.title("Distribution of Trip Durations")
plt.show()

### Pickup and Dropoff Hour

#### Analyze hourly pickup distribution

In [0]:
pickup_hour_data = taxi_df.groupBy("pickup_hour").count().orderBy("pickup_hour").collect()
x = [row['pickup_hour'] for row in pickup_hour_data]
y = [row['count'] for row in pickup_hour_data]

plt.plot(x, y, marker="o", color="blue")
plt.title("Pickup Hour Distribution")
plt.xlabel("Hour of Day")
plt.ylabel("Frequency")
plt.show()


# Analyze hourly dropoff distribution

In [0]:
pickup_hour_data = taxi_df.groupBy("dropoff_hour").count().orderBy("dropoff_hour").collect()
x = [row['dropoff_hour'] for row in pickup_hour_data]
y = [row['count'] for row in pickup_hour_data]

plt.plot(x, y, marker="o", color="blue")
plt.title("Dropoff Hour Distribution")
plt.xlabel("Hour of Day")
plt.ylabel("Frequency")
plt.show()


## Zone-Based Insights

#### Distribution of Total and Median Fare Amount based on Borough

In [0]:
# Calculate median fare amount by borough
borough_fare = taxi_df.groupBy("pickup_borough").agg(F.avg("fare_amount").alias("avg_fare"), F.expr("percentile_approx(fare_amount, 0.5)").alias("median_fare"))

# Convert to Pandas for visualization
borough_fare_pd = borough_fare.toPandas()

import squarify

# Plot as a tree map
squarify.plot(sizes=borough_fare_pd["avg_fare"], label=borough_fare_pd["pickup_borough"], alpha=0.7)
plt.title("Distribution of Average Fare Amount by Borough")
plt.axis("off")
plt.show()


#### Median Fare Amount based on Time and Borough

In [0]:
# Calculate median fare amount by hour and borough
time_borough_fare = taxi_df.groupBy("pickup_hour", "pickup_borough").agg(F.expr("percentile_approx(fare_amount, 0.5)").alias("median_fare"))

# Convert to Pandas for visualization
time_borough_fare_pd = time_borough_fare.toPandas()

# Plot
sns.barplot(x="pickup_hour", y="median_fare", hue="pickup_borough", data=time_borough_fare_pd)
plt.title("Median Fare Amount by Hour and Borough")
plt.show()


#### Total Trip by Pickup Borough and Pickup Zone

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, desc

top_trips_df = (
    taxi_df.groupBy("pickup_zone", "dropoff_zone")
    .count()
    .orderBy(desc("count"))
    .limit(10)
)

top_trips_pd = top_trips_df.toPandas()

top_trips_pivot = top_trips_pd.pivot(index="pickup_zone", columns="dropoff_zone", values="count")

plt.figure(figsize=(12, 8))
sns.heatmap(
    top_trips_pivot,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    linewidths=0.5,
    linecolor="gray",
    cbar_kws={"label": "Trip Count"},
)
plt.title("Top 10 Trips by Pickup and Dropoff Zone")
plt.xlabel("Dropoff Zone")
plt.ylabel("Pickup Zone")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-3885246210596891>, line 18[0m
[1;32m     15[0m top_trips_pivot [38;5;241m=[39m top_trips_pd[38;5;241m.[39mpivot(index[38;5;241m=[39m[38;5;124m"[39m[38;5;124mpickup_zone[39m[38;5;124m"[39m, columns[38;5;241m=[39m[38;5;124m"[39m[38;5;124mdropoff_zone[39m[38;5;124m"[39m, values[38;5;241m=[39m[38;5;124m"[39m[38;5;124mcount[39m[38;5;124m"[39m)
[1;32m     17[0m plt[38;5;241m.[39mfigure(figsize[38;5;241m=[39m([38;5;241m12[39m, [38;5;241m8[39m))
[0;32m---> 18[0m sns[38;5;241m.[39mheatmap(
[1;32m     19[0m     top_trips_pivot,
[1;32m     20[0m     annot[38;5;241m=[39m[38;5;28;01mTrue[39;00m,
[1;32m     21[0m     fmt[38;5;241m=[39m[38;5;124m"[39m[38;5;124md[39m[38;5;124m"[39m,
[1;32m     22[0m     cmap[38;5;241m=[39m[38;5;124m"[39m[38;

#### Top Pickup and Drop-off Zones

In [0]:

# Count the number of trips per pickup and dropoff zones
pickup_zones_df = taxi_df.groupBy("pickup_zone").count().orderBy("count", ascending=False).limit(10)
dropoff_zones_df = taxi_df.groupBy("dropoff_zone").count().orderBy("count", ascending=False).limit(10)

# Convert to Pandas for plotting
pickup_zones_pd = pickup_zones_df.toPandas()
dropoff_zones_pd = dropoff_zones_df.toPandas()

plt.figure(figsize=(10, 6))
sns.barplot(x="pickup_zone", y="count", data=pickup_zones_pd, color="skyblue")
plt.xlabel("Pickup Zone")
plt.ylabel("Number of Trips")
plt.title("Top 10 Pickup Zones")
plt.xticks(rotation=45)
plt.show()

In [0]:

plt.figure(figsize=(10, 6))
sns.barplot(x="dropoff_zone", y="count", data=dropoff_zones_pd, color="salmon")
plt.xlabel("Dropoff Zone")
plt.ylabel("Number of Trips")
plt.title("Top 10 Dropoff Zones")
plt.xticks(rotation=45)
plt.show()

### Passenger Count Analysis

#### Effect of Passenger Number on Fare

In [0]:
# Convert the necessary columns from PySpark DataFrame to Pandas DataFrame
taxi_pd = taxi_df.select("passenger_count", "fare_amount").toPandas()

fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=taxi_pd, x='passenger_count', y='fare_amount', ax=ax)
ax.set_title("Effect of Passenger Count on Fare Amount")
ax.set_xlabel("Passenger Count")
ax.set_ylabel("Fare Amount")
plt.show()

#### Passenger Count

In [0]:
# Count unique values in 'passenger_count'
taxi_df.groupBy("passenger_count").count().orderBy("count", ascending=False).show()

# Bar plot for passenger count
passenger_count_data = taxi_df.groupBy("passenger_count").count().orderBy("passenger_count").collect()
x = [row['passenger_count'] for row in passenger_count_data]
y = [row['count'] for row in passenger_count_data]

plt.bar(x, y, color="purple")
plt.title("Passenger Count Distribution")
plt.xlabel("Passenger Count")
plt.ylabel("Frequency")
plt.show()


#### Payment Type vs Passenger Count

In [0]:
# Group by payment type and passenger count to see their interaction
payment_passenger_df = taxi_df.groupBy("payment_type", "passenger_count").count().orderBy("payment_type", "passenger_count").toPandas()

# Pivot for a heatmap
pivot_df = payment_passenger_df.pivot(index="passenger_count", columns="payment_type", values="count").fillna(0)

# Heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_df, annot=True, cmap="YlGnBu", fmt=".0f")
plt.title("Payment Type vs. Passenger Count")
plt.xlabel("Payment Type")
plt.ylabel("Passenger Count")
plt.show()


## Payment Type Analysis

**1 = Credit Card**

**2 = Cash**

**3 = No Charge**

**4 = Dispute**

**5 = Unknown**

**6 = Voided Trip**


4 = Dispute
5 = Unknown
6 = Voided Trip


**4 = Dispute**: 
    This indicates that the fare was disputed, possibly due to a disagreement over the amount charged or a service issue. In these cases, the passenger or driver raised a concern about the payment.

**5 = Unknown**: 
    This is used when the payment status is unclear or unrecorded.
    It might mean that the payment method was not identified or there was an error in recording it.

**6 = Voided Trip**: 
    This indicates that the trip was voided, canceled, or invalid for some reason,
    so the fare amount associated with this trip isn’t charged or is set to zero.

#### Payment Type

In [0]:
# Count unique values in 'payment_type'
taxi_df.groupBy("payment_type").count().orderBy("count", ascending=False).show()

# Bar plot for payment type
payment_type_data = taxi_df.groupBy("payment_type").count().orderBy("payment_type").collect()
x = [row['payment_type'] for row in payment_type_data]
y = [row['count'] for row in payment_type_data]

plt.bar(x, y, color="green")
plt.title("Payment Type Distribution")
plt.xlabel("Payment Type")
plt.ylabel("Frequency")
plt.show()

#### Count and Percentage of Trips by Payment Method

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Define a window to calculate the total trip count across all rows
window_spec = Window.partitionBy()

# Count of trips by payment method
payment_counts = taxi_df.groupBy("payment_type").agg(
    F.count("*").alias("trip_count")
).withColumn(
    "percentage", (F.col("trip_count") / F.sum("trip_count").over(window_spec)) * 100
).orderBy("trip_count", ascending=False)

payment_counts.show()

+------------+----------+------------------+
|payment_type|trip_count|        percentage|
+------------+----------+------------------+
|           1|  28715277| 81.89970399168432|
|           2|   5985718|17.072046088139658|
|           4|    238382|0.6798964619754736|
|           3|    122138| 0.348353458200537|
+------------+----------+------------------+



#### Average Fare Amount by Payment Method

In [0]:
# Average fare amount by payment method
average_fare = taxi_df.groupBy("payment_type").agg(
    F.avg("fare_amount").alias("average_fare"),
    F.median("fare_amount").alias("median_fare"),
    F.sum("fare_amount").alias("total_fare")
)

average_fare.show()

+------------+------------------+-----------+--------------------+
|payment_type|      average_fare|median_fare|          total_fare|
+------------+------------------+-----------+--------------------+
|           3| 15.16511740817736|       10.7|  1852237.1099999666|
|           1|17.487610434951442|       13.5| 5.021615777077212E8|
|           4|16.833006686745755|       12.1|   4012685.799999826|
|           2|17.240581276980983|       12.8|1.0319725768008806E8|
+------------+------------------+-----------+--------------------+



#### Effect of Date and Time on Fare

In [0]:
# Group by hour and calculate the average fare
avg_fare_by_hour = (
    taxi_df.groupBy("pickup_hour")
    .agg(F.avg("fare_amount").alias("average_fare"))
    .orderBy("pickup_hour")
)

# Convert to Pandas DataFrame for visualization
avg_fare_by_hour_pd = avg_fare_by_hour.toPandas()

In [0]:
# Group by day of the week and calculate the average fare
avg_fare_by_day = (
    taxi_df.groupBy("pickup_day_of_week")
    .agg(F.avg("fare_amount").alias("average_fare"))
    .orderBy("pickup_day_of_week")
)

# Convert to Pandas DataFrame for visualization
avg_fare_by_day_pd = avg_fare_by_day.toPandas()


In [0]:
# Group by month and calculate the average fare
avg_fare_by_month = (
    taxi_df.groupBy("pickup_month")
    .agg(F.avg("fare_amount").alias("average_fare"))
    .orderBy("pickup_month")
)

# Convert to Pandas DataFrame for visualization
avg_fare_by_month_pd = avg_fare_by_month.toPandas()

In [0]:
# Set up subplots
fig, ax = plt.subplots(3, 1, figsize=(12, 18))

# Plot Average Fare by Hour
sns.lineplot(data=avg_fare_by_hour_pd, x="pickup_hour", y="average_fare", marker="o", ax=ax[0], color="b")
ax[0].set_title("Average Fare by Hour of Day")
ax[0].set_xlabel("Hour of Day")
ax[0].set_ylabel("Average Fare")

# Plot Average Fare by Day of Week
sns.lineplot(data=avg_fare_by_day_pd, x="pickup_day_of_week", y="average_fare", marker="o", ax=ax[1], color="g")
ax[1].set_title("Average Fare by Day of the Week")
ax[1].set_xlabel("Day of the Week")
ax[1].set_ylabel("Average Fare")
ax[1].set_xticks(range(1, 8))
ax[1].set_xticklabels(["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

# Plot Average Fare by Month
sns.lineplot(data=avg_fare_by_month_pd, x="pickup_month", y="average_fare", marker="o", ax=ax[2], color="r")
ax[2].set_title("Average Fare by Month")
ax[2].set_xlabel("Month")
ax[2].set_ylabel("Average Fare")

plt.tight_layout()
plt.show()


#### Pickup and Drop-off Density Across NYC Boroughs

In [0]:
pickup_counts = (taxi_df
                 .groupBy('pickup_borough')
                 .agg(F.count('*').alias('pickup_count'))
                 .orderBy('pickup_borough'))

dropoff_counts = (taxi_df
                  .groupBy('dropoff_borough')
                  .agg(F.count('*').alias('dropoff_count'))
                  .orderBy('dropoff_borough'))

pickup_counts_pd = pickup_counts.toPandas()
dropoff_counts_pd = dropoff_counts.toPandas()

density_df = (pickup_counts_pd
              .merge(dropoff_counts_pd, left_on='pickup_borough', right_on='dropoff_borough', how='outer')
              .fillna(0))  # Fill NaN values with 0

# Set the figure size
plt.figure(figsize=(14, 6))

# Bar plot for pickups
sns.barplot(data=density_df, x='pickup_borough', y='pickup_count', color='blue', alpha=0.6, label='Pickups')

# Bar plot for drop-offs (using the same x-axis)
sns.barplot(data=density_df, x='dropoff_borough', y='dropoff_count', color='orange', alpha=0.6, label='Drop-offs')

plt.title('Pickup and Drop-off Density Across NYC Boroughs')
plt.xlabel('Borough')
plt.ylabel('Count')
plt.legend()

plt.show()


In [0]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Sample data to reduce load
sampled_df = taxi_df.sample(fraction=0.1, seed=42)  # Use 10% of the data

# Select relevant columns and drop rows with null values
filtered_df = sampled_df.select("pickup_latitude", "pickup_longitude", "pickup_borough").dropna()

# Ensure numeric columns are properly cast
filtered_df = filtered_df.withColumn("pickup_latitude", filtered_df["pickup_latitude"].cast("double"))
filtered_df = filtered_df.withColumn("pickup_longitude", filtered_df["pickup_longitude"].cast("double"))

# Assemble the features into a vector
vector_assembler = VectorAssembler(inputCols=["pickup_latitude", "pickup_longitude"], outputCol="features")
vector_df = vector_assembler.transform(filtered_df)

# Persist data to optimize performance
vector_df.cache()

# Apply KMeans clustering with fewer clusters
kmeans = KMeans(k=10, seed=42)  # Use 10 clusters for simplicity
model = kmeans.fit(vector_df)

# Make predictions
predictions = model.transform(vector_df)

# Limit rows for Pandas conversion
predicted_pd = predictions.select("pickup_latitude", "pickup_longitude", "pickup_borough", "prediction").limit(5000).toPandas()

# Plotting
plt.figure(figsize=(12, 8))
sns.scatterplot(data=predicted_pd, 
                x='pickup_longitude', 
                y='pickup_latitude', 
                hue='prediction', 
                palette='tab10', 
                s=10, 
                alpha=0.7)

# Overlay borough information
for borough in predicted_pd['pickup_borough'].unique():
    subset = predicted_pd[predicted_pd['pickup_borough'] == borough]
    plt.scatter(subset['pickup_longitude'], subset['pickup_latitude'], label=borough, alpha=0.3)

plt.title("Clustering NYC Taxi Pickups into Major Zones")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend(title='Boroughs')
plt.show()


## Weather Impact Analysis

#### Average Fare by Temperature Range

In [0]:
# Create temperature ranges and calculate average fare amount
temp_ranges = taxi_df.withColumn("temp_range", F.floor(F.col("temp") / 5) * 5)
avg_fare_by_temp = temp_ranges.groupBy("temp_range").agg(F.mean("fare_amount").alias("avg_fare")).orderBy("temp_range")

# Plot average fare amount by temperature range
temp_fare_df = avg_fare_by_temp.toPandas()

plt.figure(figsize=(10, 6))
plt.plot(temp_fare_df["temp_range"], temp_fare_df["avg_fare"], marker="o", color="firebrick")
plt.title("Average Fare Amount by Temperature Range")
plt.xlabel("Temperature Range")
plt.ylabel("Average Fare Amount")
plt.show()


#### Holiday Analysis

In [0]:
# Analyze trips based on holidays
holiday_data = taxi_df.groupBy("holidayName").count().orderBy("count", ascending=False).show()

# Bar plot for holidays
holiday_counts = taxi_df.groupBy("holidayName").count().orderBy("holidayName").collect()
x = [row['holidayName'] for row in holiday_counts]
y = [row['count'] for row in holiday_counts]

plt.barh(x, y, color="darkcyan")
plt.title("Holiday Trip Count")
plt.xlabel("Frequency")
plt.ylabel("Holiday Name")
plt.show()

#### Temperature Analysis

In [0]:
import matplotlib.pyplot as plt

temp_summary = taxi_df.select("temp").summary("count", "mean", "stddev", "min", "max").show()

temp_data = taxi_df.select("temp").toPandas()

plt.hist(temp_data['temp'], bins=30, color="plum", edgecolor="black")
plt.title("Temperature Distribution")
plt.xlabel("Temperature")
plt.ylabel("Frequency")
plt.show()