In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime

In [2]:
!curl -Lf https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz -o 2024.csv.gz
!gzip -d 2024.csv.gz
spark = SparkSession.builder.master('local[*]').appName('hw_2024').getOrCreate()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 18.4M  100 18.4M    0     0  21.8M      0 --:--:-- --:--:-- --:--:--  104M


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


26/01/13 08:13:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/13 08:13:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/13 08:13:12 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Question 1: Install Spark and PySpark
# Execute `spark.version` and report its output.
# (`spark` is the SparkSession created in the previous cell.)

spark.version

#### Answer: 3.3.2

'3.3.2'

In [4]:
# Question 2: FHV October 2019
# Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons.
# Repartition the Dataframe to 6 partitions and save it to parquet.
# What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? 
# Select the answer which most closely matches.

df = spark.read.option('header', 'true').option('inferSchema', 'true').csv('2024.csv')
partitioned = df.repartition(6).write.parquet('hw_2024/')
pdf = spark.read.parquet('hw_2024/*')
!ls -lash hw_2024/

#### Answer: 6.2

                                                                                

total 38M
4.0K drwxr-xr-x 2 ajay ajay 4.0K Jan 13 08:13 .
4.0K drwxrwxr-x 4 ajay ajay 4.0K Jan 13 08:13 ..
4.0K -rw-r--r-- 1 ajay ajay    8 Jan 13 08:13 ._SUCCESS.crc
 52K -rw-r--r-- 1 ajay ajay  50K Jan 13 08:13 .part-00000-3c3de914-3fa5-4c80-b3e7-567fa58540a2-c000.snappy.parquet.crc
 52K -rw-r--r-- 1 ajay ajay  49K Jan 13 08:13 .part-00001-3c3de914-3fa5-4c80-b3e7-567fa58540a2-c000.snappy.parquet.crc
 52K -rw-r--r-- 1 ajay ajay  49K Jan 13 08:13 .part-00002-3c3de914-3fa5-4c80-b3e7-567fa58540a2-c000.snappy.parquet.crc
 52K -rw-r--r-- 1 ajay ajay  49K Jan 13 08:13 .part-00003-3c3de914-3fa5-4c80-b3e7-567fa58540a2-c000.snappy.parquet.crc
 52K -rw-r--r-- 1 ajay ajay  49K Jan 13 08:13 .part-00004-3c3de914-3fa5-4c80-b3e7-567fa58540a2-c000.snappy.parquet.crc
 52K -rw-r--r-- 1 ajay ajay  50K Jan 13 08:13 .part-00005-3c3de914-3fa5-4c80-b3e7-567fa58540a2-c000.snappy.parquet.crc
   0 -rw-r--r-- 1 ajay ajay    0 Jan 13 08:13 _SUCCESS
6.2M -rw-r--r-- 1 ajay ajay 6.2M Jan 13 08:13 part-00000-3c3de91

In [5]:
# Question 3: Count records
# How many taxi trips were there on the 15th of October?
# Consider only trips that started on the 15th of October.

# Adds a `duration` column (drop-off minus pickup). No rows are removed here,
# despite the variable name.
filtered = pdf.withColumn('duration', pdf.dropOff_datetime - pdf.pickup_datetime)

# createOrReplaceTempView keeps the cell re-runnable: createTempView raises
# once a view named `hw` already exists in the session.
filtered.createOrReplaceTempView('hw')

spark.sql("""
    select
        count(*) as num_trips
    from hw
    where date(pickup_datetime) = '2019-10-15'
""").show()

#### Answer: 62610



+---------+
|num_trips|
+---------+
|    62610|
+---------+



                                                                                

In [6]:
# Question 4: Longest trip for each day
# What is the length of the longest trip in the dataset in hours?

# Per pickup date: the longest trip in hours (seconds between pickup and
# drop-off divided by 3600, rounded to 2 decimals); the five largest are shown.
longest_per_day = spark.sql("""
           
            select 
                date(pickup_datetime) as date, 
                round(max(
                    (unix_timestamp(dropOff_datetime) - 
                    unix_timestamp(pickup_datetime)) 
                    / 3600.0), 2) as longest
            from hw
            group by 1
            order by 2 desc
            limit 5
            
            """)
longest_per_day.show()

#### Answer: 631152.5 hours



+----------+---------+
|      date|  longest|
+----------+---------+
|2019-10-28|631152.50|
|2019-10-11|631152.50|
|2019-10-31| 87672.44|
|2019-10-01| 70128.03|
|2019-10-17|  8794.00|
+----------+---------+



                                                                                

In [7]:
# Question 5: User Interface
# Spark's User Interface, which shows the application's dashboard, runs on which local port?

# Ask the running context for its UI address through the public API instead of
# reaching into the DataFrame's private `_session` attribute.
spark.sparkContext.uiWebUrl

#### Answer: Usually 4040 but mine is running on 4042 since 4040 is occupied.

In [8]:
# Question 6: Least frequent pickup location zone
# Load the zone lookup data into a temp view in Spark
# Zone Data
# Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?

!curl -Lf https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv -o z.csv
zone = spark.read.option('header', 'true').option('inferSchema', 'true').csv('z.csv')
joined = pdf.join(zone, zone.LocationID == pdf.PUlocationID, how = 'left')
joined.createOrReplaceTempView('hw')

spark.sql(("""
            
            select Zone
            from hw
            group by 1
            order by count(*)
            
            """)).head(1)


#### Answer: Jamaica Bay

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 12322  100 12322    0     0  49046      0 --:--:-- --:--:-- --:--:-- 49046


                                                                                

[Row(Zone='Jamaica Bay')]

In [9]:
spark.stop()
!rm -rf *.csv *.parquet hw_*