In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

In [2]:
# Create (or reuse) a Spark session named 'test' that runs locally;
# "local[*]" asks for as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [3]:
# Show which pyspark installation the kernel actually imported.
pyspark.__file__

'/usr/local/spark/python/pyspark/__init__.py'

In [4]:
# Report the Spark version of the running session.
spark.version

'3.5.0'

In [5]:
# Download the gzipped FHV trip-data CSV for 2019-10 from the
# DataTalksClub nyc-tlc-data GitHub release.
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz

--2024-03-08 17:09:54--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-44d1-a138-4e8ea3c3a3b6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240308%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240308T170956Z&X-Amz-Expires=300&X-Amz-Signature=822caa9160b9b2b74f26668cab62d171c016ceeabd1fc10331eb5294d6590b01&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhv_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-03-08 17:09:55--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-

In [6]:
# Разархивировать файл
!gunzip fhv_tripdata_2019-10.csv.gz /data/fhv_tripdata_2019-10.csv

gzip: /data/fhv_tripdata_2019-10.csv.gz: No such file or directory


In [7]:
!ls -lh /home/jovyan/work/fhv_tripdata_2019-10.csv

-rw-r--r-- 1 jovyan users 115M Dec  2  2022 /home/jovyan/work/fhv_tripdata_2019-10.csv


In [8]:
# Explicit schema for the FHV trips file; every column is nullable.
# The CSV header spells three of these names with different letter case
# (dropOff_datetime, PUlocationID, DOlocationID).
schema = (
    types.StructType()
    .add('dispatching_base_num', types.StringType(), True)
    .add('pickup_datetime', types.TimestampType(), True)
    .add('dropoff_datetime', types.TimestampType(), True)
    .add('PULocationID', types.IntegerType(), True)
    .add('DOLocationID', types.IntegerType(), True)
    .add('SR_Flag', types.StringType(), True)
    .add('Affiliated_base_number', types.StringType(), True)
)

In [9]:
# Read the trips CSV (first line is the header) with the explicit schema,
# then spread the rows over 6 partitions.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('fhv_tripdata_2019-10.csv')
    .repartition(6)
)

In [10]:
# Confirm that the DataFrame carries the explicitly supplied schema.
df.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropoff_datetime', TimestampType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [11]:
# !sudo chmod -R 775 /home/jovyan/work/data/pq/fhv/2019/10/

In [12]:
# Persist the repartitioned trips as Snappy-compressed Parquet.
# mode='overwrite' keeps the cell re-runnable: with the default save mode
# the write raises an error once the target directory already exists.
df.write.parquet('data/pq/fhv/2019/10/', mode='overwrite', compression='snappy')

In [13]:
# List the Parquet part files written to the output directory.
!ls -lh data/pq/fhv/2019/10/

total 37M
-rw-r--r-- 1 jovyan users 6.1M Mar  8 17:10 part-00000-8782222b-b98e-49f3-80c2-cdd18b2d29fd-c000.snappy.parquet
-rw-r--r-- 1 jovyan users 6.1M Mar  8 17:10 part-00001-8782222b-b98e-49f3-80c2-cdd18b2d29fd-c000.snappy.parquet
-rw-r--r-- 1 jovyan users 6.1M Mar  8 17:10 part-00002-8782222b-b98e-49f3-80c2-cdd18b2d29fd-c000.snappy.parquet
-rw-r--r-- 1 jovyan users 6.1M Mar  8 17:10 part-00003-8782222b-b98e-49f3-80c2-cdd18b2d29fd-c000.snappy.parquet
-rw-r--r-- 1 jovyan users 6.1M Mar  8 17:10 part-00004-8782222b-b98e-49f3-80c2-cdd18b2d29fd-c000.snappy.parquet
-rw-r--r-- 1 jovyan users 6.1M Mar  8 17:10 part-00005-8782222b-b98e-49f3-80c2-cdd18b2d29fd-c000.snappy.parquet
-rw-r--r-- 1 jovyan users    0 Mar  8 17:10 _SUCCESS


In [36]:
# Re-bind df to the Parquet copy of the trips, so the queries below
# scan the Parquet files rather than the original CSV.
df = spark.read.parquet('data/pq/fhv/2019/10/')

**Q3**: How many taxi trips were there on October 15?

In [15]:
from pyspark.sql import functions as F

In [16]:
# Q3: count the trips whose pickup date is 2019-10-15.
(
    df
    .withColumn('pickup_date', F.to_date('pickup_datetime'))
    .where("pickup_date = '2019-10-15'")
    .count()
)

62610

In [17]:
# Print the physical plan Spark will use to read df.
df.explain()

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [dispatching_base_num#28,pickup_datetime#29,dropoff_datetime#30,PULocationID#31,DOLocationID#32,SR_Flag#33,Affiliated_base_number#34] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/work/data/pq/fhv/2019/10], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<dispatching_base_num:string,pickup_datetime:timestamp,dropoff_datetime:timestamp,PULocatio...




In [18]:
# Register df as the temporary view fhv_2019_10 so it can be queried with spark.sql.
df.createOrReplaceTempView('fhv_2019_10')

In [19]:
# Preview a few rows of the trips data.
df.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|     B01711         |2019-10-03 17:11:49|2019-10-03 17:29:24|         171|          92|   NULL|       B01711         |
|              B01485|2019-10-01 01:39:49|2019-10-01 01:48:40|         264|          72|   NULL|                B01485|
|              B02975|2019-10-01 13:33:35|2019-10-01 13:43:46|         264|          73|   NULL|                B02975|
|              B00254|2019-10-03 20:33:11|2019-10-03 21:52:16|         246|         265|   NULL|                B02356|
|              B02715|2019-10-05 19:08:20|2019-10-05 20:28:01|         132|         265|   NULL|                B02682|
|              B00381|2019-10-01 07:51:0

In [20]:
# Q3 again, this time in SQL against the fhv_2019_10 view:
# count the trips whose pickup date is 2019-10-15.
spark.sql("""
SELECT
    COUNT(1)
FROM 
    fhv_2019_10
WHERE
    to_date(pickup_datetime) = '2019-10-15';
""").show()

+--------+
|count(1)|
+--------+
|   62610|
+--------+



**Q4**: Longest trip for each day

In [21]:
# List the column names available for the next queries.
df.columns

['dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag',
 'Affiliated_base_number']

In [22]:
# Q4: longest trip (in hours, rounded to one decimal) per pickup date,
# showing the five dates with the largest maximum.
(
    df
    .withColumn(
        'duration',
        # Difference of the two timestamps in seconds, divided by 3600 to get hours.
        (F.col('dropoff_datetime').cast('long') - F.col('pickup_datetime').cast('long')) / 3600,
    )
    .withColumn('pickup_date', F.to_date('pickup_datetime'))
    .groupBy('pickup_date')
    .agg(F.round(F.max('duration'), 1).alias('max(duration)'))
    .sort('max(duration)', ascending=False)
    .limit(5)
    .show()
)

+-----------+-------------+
|pickup_date|max(duration)|
+-----------+-------------+
| 2019-10-28|     631152.5|
| 2019-10-11|     631152.5|
| 2019-10-31|      87672.4|
| 2019-10-01|      70128.0|
| 2019-10-17|       8794.0|
+-----------+-------------+



In [23]:
# Q4 in SQL: per pickup date, the longest trip in hours (rounded to one
# decimal), ordered from longest to shortest and limited to five dates.
spark.sql("""
SELECT
    to_date(pickup_datetime) AS pickup_date,
    round(MAX((CAST(dropoff_datetime AS LONG) - CAST(pickup_datetime AS LONG)) / 3600), 1) AS duration
FROM 
    fhv_2019_10
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 5;
""").show()

+-----------+--------+
|pickup_date|duration|
+-----------+--------+
| 2019-10-28|631152.5|
| 2019-10-11|631152.5|
| 2019-10-31| 87672.4|
| 2019-10-01| 70128.0|
| 2019-10-17|  8794.0|
+-----------+--------+



**Q5**: Least frequent pickup location zone

In [24]:
# Download the taxi zone lookup table.
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

# Show the first lines of the file.
!head taxi+_zone_lookup.csv

--2024-03-08 17:10:15--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 16.182.40.216, 52.216.76.246, 54.231.229.24, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|16.182.40.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: ‘taxi+_zone_lookup.csv’


2024-03-08 17:10:15 (7.26 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]

"LocationID","Borough","Zone","service_zone"
1,"EWR","Newark Airport","EWR"
2,"Queens","Jamaica Bay","Boro Zone"
3,"Bronx","Allerton/Pelham Gardens","Boro Zone"
4,"Manhattan","Alphabet City","Yellow Zone"
5,"Staten Island","Arden Heights","Boro Zone"
6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone"
7,"Queens","Astoria","Boro Zone"
8,"Queens","Astoria Park","Boro Zone"
9,"Queens","Auburndale","Boro Zone"


In [None]:
# Load the zone lookup CSV into a Spark DataFrame; the first line holds the column names.
df1 = spark.read.csv('taxi+_zone_lookup.csv', header=True)

# Show the distinct values of the 'Zone' column.
df1.select('Zone').distinct().show()

# Show the contents of the DataFrame.
df1.show()

# Count the rows in the DataFrame.
df1.count()

In [27]:
# Write the zone lookup DataFrame to the 'zones' directory in Parquet format.
# mode='overwrite' keeps the cell re-runnable: with the default save mode
# the write raises an error once 'zones' already exists.
df1.write.parquet('zones', mode='overwrite')

In [28]:
# List the working directory to confirm the 'zones' output was created.
!ls -lh

total 115M
drwxr-sr-x 3 jovyan users 4.0K Mar  8 17:10 data
-rw-r--r-- 1 jovyan users 115M Dec  2  2022 fhv_tripdata_2019-10.csv
-rw-r--r-- 1 jovyan users  31K Mar  8 17:10 homework.ipynb
-rw-r--r-- 1 jovyan users  13K Aug 17  2016 taxi+_zone_lookup.csv
drwxr-sr-x 2 jovyan users 4.0K Mar  8 17:12 zones


In [29]:
# Read the zone lookup back from its Parquet copy.
df_zones = spark.read.parquet('zones')

In [30]:
# Columns of the zone lookup as read from Parquet.
df_zones.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [31]:
# Columns of the CSV-based frame, for comparison with the Parquet copy.
df1.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [32]:
# Register the zone lookup as the temporary view 'zones' for use in spark.sql.
df_zones.createOrReplaceTempView('zones')

In [33]:
# Q5: the five pickup zones with the fewest trips.
# The trips are matched to zone names with an explicit inner JOIN ... ON,
# so trips whose pickup location has no row in 'zones' are left out, and
# zones without any trip do not appear in the result.
spark.sql("""
SELECT
    zones.Zone AS zone,
    COUNT(1)
FROM
    fhv_2019_10
    JOIN zones ON fhv_2019_10.PULocationID = zones.LocationID
GROUP BY 1
ORDER BY 2
LIMIT 5;
""").show()

+--------------------+--------+
|                zone|count(1)|
+--------------------+--------+
|         Jamaica Bay|       1|
|Governor's Island...|       2|
| Green-Wood Cemetery|       5|
|       Broad Channel|       8|
|     Highbridge Park|      14|
+--------------------+--------+



**Q6**: Most frequent `dispatching_base_num`

How many stages does this Spark job have?



In [34]:
# Q6 in SQL: number of trips per dispatching_base_num,
# ordered from most to least frequent and limited to the top five.
spark.sql("""
SELECT
    dispatching_base_num,
    COUNT(1)
FROM 
    fhv_2019_10
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 5;
""").show()

+--------------------+--------+
|dispatching_base_num|count(1)|
+--------------------+--------+
|              B01145|   47548|
|              B00256|   43234|
|              B00856|   36778|
|              B03016|   34985|
|              B01239|   33586|
+--------------------+--------+



In [37]:
# Q6 with the DataFrame API: trips per dispatching base, top five by count.
(
    df
    .groupBy('dispatching_base_num')
    .count()
    .sort('count', ascending=False)
    .limit(5)
    .show()
)

+--------------------+-----+
|dispatching_base_num|count|
+--------------------+-----+
|              B01145|47548|
|              B00256|43234|
|              B00856|36778|
|              B03016|34985|
|              B01239|33586|
+--------------------+-----+



In [40]:
# Five most frequent pickup / drop-off zone pairs.
# 'zones' is joined twice (aliases pul and dol) to translate both
# location IDs into zone names; LEFT JOIN keeps every trip row.
spark.sql("""
SELECT
    CONCAT(pul.Zone, ' / ', dol.Zone) AS pu_do_pair,
    COUNT(1)
FROM 
    fhv_2019_10 fhv LEFT JOIN zones pul ON fhv.PULocationID = pul.LocationID
                      LEFT JOIN zones dol ON fhv.DOLocationID = dol.LocationID
GROUP BY 
    1
ORDER BY
    2 DESC
LIMIT 5;
""").show()

+--------------------+--------+
|          pu_do_pair|count(1)|
+--------------------+--------+
|             NV / NV|  341803|
|             NV / NA|   38677|
|  NV / East New York|   30540|
|NV / Williamsbrid...|   23154|
|NV / Washington H...|   21667|
+--------------------+--------+



In [41]:
!head -n 1001 /home/jovyan/work/fhv_tripdata_2019-10.csv > head.csv

In [42]:
import pandas as pd

In [43]:
# Load the small CSV sample with pandas, letting pandas infer the column types.
df_pandas = pd.read_csv('head.csv')

In [44]:
# Inspect the dtypes pandas inferred for each column.
df_pandas.dtypes

dispatching_base_num       object
pickup_datetime            object
dropOff_datetime           object
PUlocationID              float64
DOlocationID              float64
SR_Flag                   float64
Affiliated_base_number     object
dtype: object

In [45]:
# Show the Spark schema derived from the pandas sample; compare it with
# the hand-written schema used when reading the full CSV.
spark.createDataFrame(df_pandas).schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropOff_datetime', StringType(), True), StructField('PUlocationID', DoubleType(), True), StructField('DOlocationID', DoubleType(), True), StructField('SR_Flag', DoubleType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [46]:
# Look at the raw header and first rows of the sample file.
!head head.csv

dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
B00009,2019-10-01 00:23:00,2019-10-01 00:35:00,264,264,,B00009
B00013,2019-10-01 00:11:29,2019-10-01 00:13:22,264,264,,B00013
B00014,2019-10-01 00:11:43,2019-10-01 00:37:20,264,264,,B00014
B00014,2019-10-01 00:56:29,2019-10-01 00:57:47,264,264,,B00014
B00014,2019-10-01 00:23:09,2019-10-01 00:28:27,264,264,,B00014
B00021         ,2019-10-01 00:00:48,2019-10-01 00:07:12,129,129,,B00021         
B00021         ,2019-10-01 00:47:23,2019-10-01 00:53:25,57,57,,B00021         
B00021         ,2019-10-01 00:10:06,2019-10-01 00:19:50,173,173,,B00021         
B00021         ,2019-10-01 00:51:37,2019-10-01 01:06:14,226,226,,B00021         


In [47]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()