In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session named "homework" with a local[*] master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("homework")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/26 08:22:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Question 1: What is the Spark version?

In [11]:
# Show the version string of the running Spark session.
spark.version

'3.4.2'

## Question 2: FHV October 2019 partition size 

In [3]:
# Download the gzipped FHV trip data for October 2019
# (fhv_tripdata_2019-10.csv.gz) from the DataTalksClub release assets.

! wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz

--2024-02-26 08:24:20--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-44d1-a138-4e8ea3c3a3b6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240226%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240226T082420Z&X-Amz-Expires=300&X-Amz-Signature=5f0aa5cf20486f68d977f257bf98c0e285a512dbd0a3d5e8200300e527a6165d&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhv_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-02-26 08:24:20--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6

In [5]:
# Decompress the downloaded archive; gunzip replaces the .gz file with
# fhv_tripdata_2019-10.csv.
!gunzip fhv_tripdata_2019-10.csv.gz

In [7]:
# List the entries in the working directory whose listing line contains "fhv",
# to confirm the extracted CSV is present and to see its size in bytes.
! ls -la | grep fhv


-rw-rw-r--  1 4oceanknowledges 4oceanknowledges 119796110 Dec  2  2022 fhv_tripdata_2019-10.csv
drwxr-xr-x  3 4oceanknowledges 4oceanknowledges      4096 Feb 22 08:57 fhvhv
-rw-rw-r--  1 4oceanknowledges 4oceanknowledges 752335705 Jul 14  2022 fhvhv_tripdata_2021-01.csv
-rw-rw-r--  1 4oceanknowledges 4oceanknowledges 308924937 Jun 30  2022 fhvhv_tripdata_2021-01.parquet


In [18]:
# Read the FHV CSV with an explicitly defined schema rather than inferring one.
from pyspark.sql import types

# (column name, Spark type) pairs; every field is declared nullable.
fhv_columns = [
    ('dispatching_base_num', types.StringType()),
    ('pickup_datetime', types.TimestampType()),
    ('dropoff_datetime', types.TimestampType()),
    ('PULocationID', types.IntegerType()),
    ('DOLocationID', types.IntegerType()),
    ('SR_Flag', types.StringType()),
    ('Affiliated_base_number', types.StringType()),
]

schema = types.StructType(
    [types.StructField(name, dtype, True) for name, dtype in fhv_columns]
)

# The first line of the file is a header row, not data.
csv_reader = spark.read.option("header", "true").schema(schema)
df = csv_reader.csv('fhv_tripdata_2019-10.csv')

In [19]:
# Display the schema Spark attached to the DataFrame to confirm it matches
# the one defined above.
df.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropoff_datetime', TimestampType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [20]:
# Repartition the DataFrame into 6 partitions, as the question requests, so
# the Parquet output is written as 6 part files.
df = df.repartition(numPartitions=6)

# Overwrite any previous output: with the default save mode the write raises
# an error when data/homework/ already exists, which breaks re-running this cell.
df.write.mode('overwrite').parquet('data/homework/')


                                                                                

In [21]:
# List the written Parquet part files with their sizes in bytes, to estimate
# the average file size.
! ls -l data/homework/

total 37640
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges       0 Feb 26 08:46 _SUCCESS
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges 6424988 Feb 26 08:46 part-00000-5ba816a9-4a7c-4b2d-b58e-733dcbd3fa70-c000.snappy.parquet
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges 6417333 Feb 26 08:46 part-00001-5ba816a9-4a7c-4b2d-b58e-733dcbd3fa70-c000.snappy.parquet
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges 6418178 Feb 26 08:46 part-00002-5ba816a9-4a7c-4b2d-b58e-733dcbd3fa70-c000.snappy.parquet
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges 6417268 Feb 26 08:46 part-00003-5ba816a9-4a7c-4b2d-b58e-733dcbd3fa70-c000.snappy.parquet
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges 6421083 Feb 26 08:46 part-00004-5ba816a9-4a7c-4b2d-b58e-733dcbd3fa70-c000.snappy.parquet
-rw-r--r-- 1 4oceanknowledges 4oceanknowledges 6438857 Feb 26 08:46 part-00005-5ba816a9-4a7c-4b2d-b58e-733dcbd3fa70-c000.snappy.parquet


The six Parquet part files are each about 6.4 MB, so the average file size is about 6.4 MB.

## Question 3: Count records on 15th of October 

In [32]:
from pyspark.sql import functions as F

In [22]:
# Register the DataFrame as the temporary view "fhv_data" so it can be
# queried through spark.sql.

df.createOrReplaceTempView("fhv_data")

In [34]:
# Derive a calendar-date column from the pickup timestamp.
pickup_day = F.to_date(df["pickup_datetime"])
df = df.withColumn("pickup_date", pickup_day)

In [41]:
# Count the trips whose pickup date is 15 October 2019.
on_oct_15 = df["pickup_date"] == '2019-10-15'
df.where(on_oct_15).select("pickup_date").count()

                                                                                

62610

## Question 4: The longest trip

In [42]:
from pyspark.sql.functions import col

In [49]:
# Calculate the duration of each trip in seconds. Subtracting two timestamp
# columns directly yields an interval value rather than a number, so cast each
# timestamp to epoch seconds (long) first and subtract those.
df = df.withColumn(
    "trip_duration_seconds",
    df.dropoff_datetime.cast("long") - df.pickup_datetime.cast("long"),
)

# Convert the duration from seconds to hours
df = df.withColumn("trip_duration_hours", df.trip_duration_seconds / 3600)

In [54]:
# Longest trip, in hours, computed in SQL against the "fhv_data" temp view:
# the inner query turns each pickup/dropoff pair into a duration in hours
# (seconds / 3600.0) and the outer query takes the maximum.
# NOTE(review): a very large maximum most likely reflects an outlier
# dropoff_datetime in the data rather than a real trip — confirm before
# relying on it.
df_result=spark.sql(
    '''

        SELECT MAX(trip_duration_hours) AS longest_trip_hours
        FROM (
            SELECT TIMESTAMPDIFF(SECOND, pickup_datetime, dropoff_datetime) / 3600.0 AS trip_duration_hours
            FROM fhv_data
        ) AS durations;
        
    '''
)

In [55]:
# Run the query and print the single-row result (longest trip in hours).
df_result.show()



+------------------+
|longest_trip_hours|
+------------------+
|     631152.500000|
+------------------+



                                                                                

## Question 6: Least frequent pickup location zone

In [56]:
# Load the zone lookup table from the local 'zones/' Parquet directory.
# NOTE(review): this directory is not created anywhere in this notebook; it
# presumably comes from an earlier exercise and is expected to contain at
# least the LocationID and Zone columns used below — confirm.
df_zones = spark.read.parquet('zones/')

In [75]:
# Inner-join each trip to the zone lookup row matching its pickup location ID.
pickup_zone_match = df["PULocationID"] == df_zones["LocationID"]
df_join = df.join(df_zones, on=pickup_zone_match, how="inner")

In [78]:
# Register the joined trips/zones DataFrame as the temporary view
# "joined_table" so it can be queried through spark.sql.
df_join.createOrReplaceTempView('joined_table')

In [80]:
# Find the pickup zone with the fewest trips: count the rows of "joined_table"
# per Zone, sort ascending by that count and keep the first row.
# NOTE(review): if several zones share the minimum count, which one LIMIT 1
# returns is not determined by this ORDER BY — consider adding Zone as a
# tie-breaker. Zones with no pickups at all do not appear, because
# joined_table is built with an inner join.
least_pickup_query = spark.sql(
    '''
    SELECT 
        Zone,
        Count(1) AS pickup_count
    FROM
        joined_table
    GROUP BY joined_table.Zone
    ORDER BY pickup_count
    LIMIT 1;
    '''
)

In [81]:
# Run the query and print the least frequent pickup zone with its trip count.
least_pickup_query.show()



+-----------+------------+
|       Zone|pickup_count|
+-----------+------------+
|Jamaica Bay|           1|
+-----------+------------+



                                                                                

In [79]:

# Sanity check on the joined view: print up to 10 distinct zone names.
spark.sql("""
SELECT DISTINCT(Zone) FROM joined_table LIMIT 10;


""").show()



+--------------------+
|                Zone|
+--------------------+
|           Homecrest|
|              Corona|
|    Bensonhurst West|
|         Westerleigh|
|          Douglaston|
|Charleston/Totten...|
|      Newark Airport|
|      Pelham Parkway|
|          Mount Hope|
|East Concourse/Co...|
+--------------------+



                                                                                