In [3]:
import pyspark
from pyspark.sql import SparkSession

!spark-shell --version

# Create SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('test-spark') \
                    .getOrCreate()

print(f'The PySpark {spark.version} version is running...')

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.5
      /_/
                        
Using Scala version 2.12.18, Java HotSpot(TM) 64-Bit Server VM, 1.8.0_441
Branch HEAD
Compiled by user ubuntu on 2025-02-23T20:30:46Z
Revision 7c29c664cdc9321205a98a14858aaf8daaa19db2
Url https://github.com/apache/spark
Type --help for more information.


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/15 12:53:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


The PySpark 3.5.5 version is running...


In [4]:
import pyspark

In [6]:
from pyspark.sql import SparkSession
import os

# Obtain the Spark session for this analysis.
# NOTE(review): getOrCreate() hands back an already-running session when one exists,
# in which case this appName presumably does not take effect - confirm if the name matters.
spark = SparkSession.builder.appName("YellowTaxiData").getOrCreate()

# Load the trip data and redistribute it across exactly 4 partitions.
input_path = "yellow_tripdata_2024-10.parquet"  # Update with your actual file path
df = spark.read.parquet(input_path).repartition(4)

# Persist the repartitioned data, replacing any previous output directory.
output_path = "yellow_tripdata_2024-10_repartitioned"
df.write.mode("overwrite").parquet(output_path)

# Walk the output directory once, collecting the Parquet file names and their
# combined size. Only names ending in ".parquet" are counted; every other entry
# in the directory is ignored.
parquet_files = []
total_size = 0  # Total size in bytes
for entry in os.listdir(output_path):
    if entry.endswith(".parquet"):
        parquet_files.append(entry)
        total_size += os.path.getsize(os.path.join(output_path, entry))

# Mean bytes per file, converted to MB (1 MB = 1024 * 1024 bytes).
average_size_mb = total_size / len(parquet_files) / (1024 * 1024)

print(f"Average Parquet File Size: {average_size_mb:.2f} MB")




Average Parquet File Size: 23.78 MB


                                                                                

In [7]:
# Mean Parquet file size in raw bytes (the same figure as above, before the MB conversion).
total_size / len(parquet_files)

24939051.5

In [40]:
from pyspark.sql.functions import col, to_date

# Derive calendar-date columns from the pickup and drop-off timestamps.
df = (
    df
    .withColumn("pickup_date", to_date(col("tpep_pickup_datetime")))
    .withColumn("dropoff_date", to_date(col("tpep_dropoff_datetime")))
)

# Keep trips that started on October 15th, 2024. Only the pickup date is tested;
# dropoff_date is derived above but plays no part in this filter.
filtered_df = df.filter(col("pickup_date") == "2024-10-15")

# Count the matching trips (this triggers the Spark job).
trip_count = filtered_df.count()

print(f"Number of taxi trips on October 15th, 2024: {trip_count}")


Number of taxi trips on October 15th, 2024: 128893


In [41]:
from pyspark.sql.functions import col, unix_timestamp

# Trip length in hours: the gap between the drop-off and pickup Unix timestamps
# (in seconds), divided by 3600.
df = df.withColumn(
    "trip_duration_hours",
    (
        unix_timestamp(col("tpep_dropoff_datetime"))
        - unix_timestamp(col("tpep_pickup_datetime"))
    ) / 3600,
)

# Aggregate down to a single row holding the largest duration, then pull the value out.
longest_trip_row = df.selectExpr("MAX(trip_duration_hours) AS longest_trip_hours").collect()[0]
max_trip_duration = longest_trip_row["longest_trip_hours"]

print(f"Longest trip duration: {max_trip_duration:.2f} hours")


Longest trip duration: 162.62 hours


In [42]:

# Read the taxi zone lookup CSV: the first row supplies the column names and
# the column types are inferred from the data.
zone_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("taxi_zone_lookup.csv")
)

# Print the resulting schema so the column names used in later joins can be checked.
zone_df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [44]:
# Attach zone names to every trip by matching the pickup location ID against
# the lookup table. The left join keeps trips whose PULocationID has no lookup
# row; those trips end up in a group whose Zone is null.
joined_df = df.join(zone_df, df.PULocationID == zone_df.LocationID, "left")

# Count trips per pickup zone. Counts come from the trips themselves, so a zone
# with zero pickups never appears here and cannot be reported below.
zone_trip_counts = joined_df.groupBy("Zone").count()

# Pick the zone with the fewest pickups. "Zone" is a secondary sort key so that
# zones tied on the minimum count always resolve the same way; sorting on
# "count" alone leaves the order of tied rows unspecified, which made the
# reported zone depend on how Spark happened to order the partitions.
least_frequent_zone = zone_trip_counts.orderBy("count", "Zone").first()["Zone"]

print(f"Least frequent pickup location zone: {least_frequent_zone}")


Least frequent pickup location zone: Governor's Island/Ellis Island/Liberty Island


                                                                                