In [1]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql import types

from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('homework') \
    .getOrCreate()

spark.version

'3.3.2'

In [3]:
df = spark.read.parquet('yellow_tripdata_2024-10.parquet')

In [4]:
df.schema

StructType([StructField('VendorID', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampType(), True), StructField('tpep_dropoff_datetime', TimestampType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('Airport_fee', DoubleType(), True)])

In [5]:
df.take(5)

[Row(VendorID=2, tpep_pickup_datetime=datetime.datetime(2024, 10, 1, 2, 30, 44), tpep_dropoff_datetime=datetime.datetime(2024, 10, 1, 2, 48, 26), passenger_count=1, trip_distance=3.0, RatecodeID=1, store_and_fwd_flag='N', PULocationID=162, DOLocationID=246, payment_type=1, fare_amount=18.4, extra=1.0, mta_tax=0.5, tip_amount=1.5, tolls_amount=0.0, improvement_surcharge=1.0, total_amount=24.9, congestion_surcharge=2.5, Airport_fee=0.0),
 Row(VendorID=1, tpep_pickup_datetime=datetime.datetime(2024, 10, 1, 2, 12, 20), tpep_dropoff_datetime=datetime.datetime(2024, 10, 1, 2, 25, 25), passenger_count=1, trip_distance=2.2, RatecodeID=1, store_and_fwd_flag='N', PULocationID=48, DOLocationID=236, payment_type=1, fare_amount=14.2, extra=3.5, mta_tax=0.5, tip_amount=3.8, tolls_amount=0.0, improvement_surcharge=1.0, total_amount=23.0, congestion_surcharge=2.5, Airport_fee=0.0),
 Row(VendorID=1, tpep_pickup_datetime=datetime.datetime(2024, 10, 1, 2, 4, 46), tpep_dropoff_datetime=datetime.datetime(2

In [16]:
df = df.repartition(4)
df.write.parquet('data-out')

In [7]:
df.createOrReplaceTempView('hw5')

In [17]:
!ls -lh data-out

total 97M
-rw-r--r-- 1 alexey 197121   0 Feb 22 15:17 _SUCCESS
-rw-r--r-- 1 alexey 197121 25M Feb 22 15:17 part-00000-0bd1f6a4-de39-45d9-aa4d-657ac8650605-c000.snappy.parquet
-rw-r--r-- 1 alexey 197121 25M Feb 22 15:17 part-00001-0bd1f6a4-de39-45d9-aa4d-657ac8650605-c000.snappy.parquet
-rw-r--r-- 1 alexey 197121 25M Feb 22 15:17 part-00002-0bd1f6a4-de39-45d9-aa4d-657ac8650605-c000.snappy.parquet
-rw-r--r-- 1 alexey 197121 25M Feb 22 15:17 part-00003-0bd1f6a4-de39-45d9-aa4d-657ac8650605-c000.snappy.parquet


In [6]:
df.withColumn('pickup_date', F.to_date(df.tpep_pickup_datetime)) \
    .filter("pickup_date = '2024-10-15'") \
    .count()

125567

In [8]:
spark.sql(""" 
SELECT count(*)
FROM hw5
WHERE cast(tpep_pickup_datetime as date) = '2024-10-15'
and tpep_pickup_datetime is not null
""").show()

+--------+
|count(1)|
+--------+
|  125567|
+--------+



In [24]:
df.withColumn('duration', (df.tpep_dropoff_datetime.cast('long') - df.tpep_pickup_datetime.cast('long'))/60/60) \
    .orderBy('duration', ascending=False) \
    .limit(5) \
    .select("duration") \
    .show()

+------------------+
|          duration|
+------------------+
| 162.6177777777778|
|           143.325|
|137.76055555555556|
|114.83472222222221|
| 89.89833333333333|
+------------------+



In [31]:

taxi_zone_schema = types.StructType([
    types.StructField('LocationID', types.IntegerType(), True),
    types.StructField('Borough', types.StringType(), True),
    types.StructField('Zone', types.StringType(), True),
    types.StructField('service_zone', types.StringType(), True)
])

In [33]:
df_zones = spark.read \
    .option("header", "true") \
    .schema(taxi_zone_schema) \
    .csv('taxi_zone_lookup.csv')

In [34]:
df_zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [38]:
df_zones.createOrReplaceTempView("zones")
df.createOrReplaceTempView("yellow")

In [41]:
spark.sql("""
SELECT
    zones.Zone,
    COUNT(1) as cnt
FROM
    yellow y LEFT JOIN zones ON 
        y.PULocationID = zones.LocationID
GROUP BY 1
ORDER BY 2 ASC;
""").take(5)

[Row(Zone="Governor's Island/Ellis Island/Liberty Island", cnt=1),
 Row(Zone='Arden Heights', cnt=2),
 Row(Zone='Rikers Island', cnt=2),
 Row(Zone='Jamaica Bay', cnt=3),
 Row(Zone='Green-Wood Cemetery', cnt=3)]

In [10]:
import pandas as pd
df = pd.read_parquet('yellow_tripdata_2024-10.parquet')
df['dt'] = df['tpep_pickup_datetime'].dt.date
(df['dt']==pd.to_datetime('2024-10-15').date()).sum()

128893