In [1]:
import numpy as np
import pandas as pd
import os
import pyspark
from pyspark.sql import SparkSession, types

In [2]:
# downlod the CSV assigned for the homework
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz

--2024-05-05 08:57:46--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-44d1-a138-4e8ea3c3a3b6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240505%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240505T125746Z&X-Amz-Expires=300&X-Amz-Signature=3575be07e961f0fe7ab4c208b8263be510088834b7757d07c2b7a76007e46b9c&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhv_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-05-05 08:57:46--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-

In [3]:
# launch a Spark Session
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

24/05/05 08:57:50 WARN Utils: Your hostname, thinkpad resolves to a loopback address: 127.0.1.1; using 192.168.1.162 instead (on interface wlp3s0)
24/05/05 08:57:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 08:57:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Question 1: what version of Spark are you using?

In [4]:
spark.version

'3.5.1'

### Question 2: 
Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons. Repartition the Dataframe to 6 partitions and save it to parquet. What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches.

In [5]:
# read a preview of the dataset to create a schema
preview = pd.read_csv('fhv_tripdata_2019-10.csv.gz', compression='gzip', nrows=1000)

In [6]:
# create a spark df from pandas df and copy the schema for Spark CSV read
for col in spark.createDataFrame(preview).schema:
    print(col)

StructField('dispatching_base_num', StringType(), True)
StructField('pickup_datetime', StringType(), True)
StructField('dropOff_datetime', StringType(), True)
StructField('PUlocationID', DoubleType(), True)
StructField('DOlocationID', DoubleType(), True)
StructField('SR_Flag', DoubleType(), True)
StructField('Affiliated_base_number', StringType(), True)


In [7]:
# schema with python types
schema = types.StructType([
types.StructField('dispatching_base_num', types.StringType(), True), 
types.StructField('pickup_datetime', types.TimestampType(), True), 
types.StructField('dropOff_datetime', types.TimestampType(), True), 
types.StructField('PUlocationID', types.DoubleType(), True), 
types.StructField('DOlocationID', types.DoubleType(), True), 
types.StructField('SR_Flag', types.DoubleType(), True), 
types.StructField('Affiliated_base_number', types.StringType(), True)])

In [8]:
# read the CSV with spark using above schema
df = spark.read.csv('fhv_tripdata_2019-10.csv.gz', header=True, schema=schema)

In [9]:
# write the partitioned dataset to parquet files 
os.mkdir('sparktest')
df.repartition(6).write.parquet('sparktest/test.parquet')

                                                                                

In [10]:
# check file size
!ls -lh sparktest/test.parquet/

total 39M
-rw-r--r-- 1 student student 6.4M May  5 08:58 part-00000-e824925b-5f36-4c49-a996-ea9550294121-c000.snappy.parquet
-rw-r--r-- 1 student student 6.4M May  5 08:58 part-00001-e824925b-5f36-4c49-a996-ea9550294121-c000.snappy.parquet
-rw-r--r-- 1 student student 6.4M May  5 08:58 part-00002-e824925b-5f36-4c49-a996-ea9550294121-c000.snappy.parquet
-rw-r--r-- 1 student student 6.4M May  5 08:58 part-00003-e824925b-5f36-4c49-a996-ea9550294121-c000.snappy.parquet
-rw-r--r-- 1 student student 6.4M May  5 08:58 part-00004-e824925b-5f36-4c49-a996-ea9550294121-c000.snappy.parquet
-rw-r--r-- 1 student student 6.4M May  5 08:58 part-00005-e824925b-5f36-4c49-a996-ea9550294121-c000.snappy.parquet
-rw-r--r-- 1 student student    0 May  5 08:58 _SUCCESS


### Question 3:

**Count records:** How many taxi trips were there on the 15th of October? Consider only trips that started on the 15th of October.

In [11]:
# method 1 - use SQL

# establish view
df.createOrReplaceTempView('FHV_trips_2019_10')

In [12]:
# run query on view
spark.sql('''
SELECT COUNT(*) AS total_trips
FROM FHV_trips_2019_10
WHERE CAST(pickup_datetime AS DATE) = '2019-10-15'
''').show()

                                                                                

+-----------+
|total_trips|
+-----------+
|      62610|
+-----------+



In [13]:
# method 2: python one liner
df.filter(df.pickup_datetime.cast('date') == '2019-10-15').count()

                                                                                

62610

### Question 4:
**Longest trip for each day:** What is the length of the longest trip in the dataset in hours?

In [14]:
# calculate the total hours for each trip and show top 1 record
spark.sql('''
SELECT
TIMESTAMPDIFF(HOUR, pickup_datetime, dropoff_datetime) AS longest_trip_hours
FROM FHV_trips_2019_10
ORDER BY longest_trip_hours DESC
''').show(1)

[Stage 9:>                                                          (0 + 1) / 1]

+------------------+
|longest_trip_hours|
+------------------+
|            631152|
+------------------+
only showing top 1 row



                                                                                

### Question 5:

**User Interface:**
Spark’s User Interface which shows the application's dashboard runs on which local port? **4040**

### Question 6:

**Least frequent pickup location zone:**
Load the zone lookup data into a temp view in Spark. Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?

- East Chelsea
- **Jamaica Bay**
- Union Sq
- Crown Heights North


In [15]:
# load the zone lookup data
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

--2024-05-05 08:58:30--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240505%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240505T125830Z&X-Amz-Expires=300&X-Amz-Signature=8093acb066386650e6a963d04427bd621e100258a24b30cd198f1e0cd68bbc49&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2024-05-05 08:58:30--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6e

In [16]:
# read the CSV as a Spark df
zones = spark.read.csv('taxi_zone_lookup.csv', header=True)

In [17]:
# create temp view for SQL join
zones.createOrReplaceTempView('zones')

In [18]:
# SQL query to show pickups by zone ascending. Top result is the answer to the question.
spark.sql('''
    SELECT
    z.borough,
    z.zone,
    COUNT(*) AS total_trips
    FROM FHV_trips_2019_10 t
    INNER JOIN zones z ON t.PUlocationID = z.LocationID
    GROUP BY z.zone, z.borough
    ORDER BY total_trips
    ''').show()

[Stage 12:>                                                         (0 + 1) / 1]

+-------------+--------------------+-----------+
|      borough|                zone|total_trips|
+-------------+--------------------+-----------+
|       Queens|         Jamaica Bay|          1|
|    Manhattan|Governor's Island...|          2|
|     Brooklyn| Green-Wood Cemetery|          5|
|       Queens|       Broad Channel|          8|
|    Manhattan|     Highbridge Park|         14|
|    Manhattan|        Battery Park|         15|
|       Queens|Saint Michaels Ce...|         23|
|       Queens|Breezy Point/Fort...|         25|
|     Brooklyn|Marine Park/Floyd...|         26|
|       Queens|        Astoria Park|         29|
|    Manhattan|    Inwood Hill Park|         39|
|       Queens|       Willets Point|         47|
|       Queens|Forest Park/Highl...|         53|
|     Brooklyn|  Brooklyn Navy Yard|         57|
|        Bronx|        Crotona Park|         62|
|        Bronx|        Country Club|         77|
|Staten Island|     Freshkills Park|         89|
|     Brooklyn|     

                                                                                