In [None]:
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session named 'test'; "local[*]" runs Spark
# in-process with one worker thread per available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [None]:
# Read the raw June 2021 FHVHV CSV files, using the first line of each file
# as column names. No schema is supplied at this point.
df_fhvhv = (
    spark.read
    .option("header", "true")
    .csv('../code/data/raw/fhvhv/2021/06')
)

In [None]:
# Preview the raw data (show() prints a limited number of rows, not the whole frame).
df_fhvhv.show()

In [None]:
# Print the column names and types Spark assigned when reading the CSV without a schema.
df_fhvhv.printSchema()

In [None]:
import pandas as pd

In [None]:
# Load only the first 1000 rows of the gzipped CSV with pandas to get a
# small sample for inspecting column types.
df_fhvhv_pd = pd.read_csv(
    '../code/data/raw/fhvhv/2021/06/fhvhv_tripdata_2021_06.csv.gz',
    nrows=1000,
)

In [None]:
# Replace missing values in Affiliated_base_number with empty strings, then
# summarise the sample's columns, dtypes and non-null counts.
base_col = 'Affiliated_base_number'
df_fhvhv_pd[base_col] = df_fhvhv_pd[base_col].fillna('')
df_fhvhv_pd.info()

In [None]:
# Show the first rows of the pandas sample.
df_fhvhv_pd.head()

In [None]:
# Convert the pandas sample to a Spark DataFrame and display the schema Spark
# derives from it (presumably the starting point for the hand-written schema below).
spark.createDataFrame(df_fhvhv_pd).schema

In [None]:
from pyspark.sql import types

In [None]:
# Explicit schema for the FHVHV trip files: (column name, Spark type) pairs in
# file order. Every column is nullable.
_fhvhv_columns = [
    ('dispatching_base_num', types.StringType()),
    ('pickup_datetime', types.TimestampType()),
    ('dropoff_datetime', types.TimestampType()),
    ('PULocationID', types.IntegerType()),
    ('DOLocationID', types.IntegerType()),
    ('SR_Flag', types.StringType()),
    ('Affiliated_base_number', types.StringType()),
]

fhvhv_schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in _fhvhv_columns
])


In [None]:
year = 2021

# Convert each month's raw CSV files to parquet using the explicit schema.
# range(6, 7) covers June only; widen the range to process more months.
for month in range(6, 7):
    print(f'processing data for {year}/{month}')

    input_path = f'../code/data/raw/fhvhv/{year}/{month:02d}/'
    output_path = f'../code/data/pq/fhvhv/{year}/{month:02d}/'

    df_fhvhv = (
        spark.read
        .option("header", "true")
        .schema(fhvhv_schema)
        .csv(input_path)
    )

    # mode('overwrite') keeps this cell re-runnable: the default save mode
    # raises an error when output_path already exists from a previous run.
    (
        df_fhvhv
        .repartition(12)
        .write
        .mode('overwrite')
        .parquet(output_path)
    )

## Question 3:
How many taxi trips were there on June 15?  
Consider only trips that started on June 15.  

In [None]:
# Read the parquet output back. The trailing * is a glob pattern; here it is
# expected to match the 2021/06 directory written by the conversion loop.
df_fhvhv_pq = spark.read.parquet('../code/data/pq/fhvhv/2021/06*')

In [None]:
# Preview five rows of the parquet-backed frame.
df_fhvhv_pq.show(5)

In [None]:
from pyspark.sql.functions import count, date_format, col

In [None]:
# Question 3: format every pickup timestamp as dd.MM.yyyy, then count the
# rows whose pickup date is 15 June 2021.
pickup_dates = df_fhvhv_pq.select(
    date_format(col("pickup_datetime"), "dd.MM.yyyy").alias("pickup_date")
)
pickup_dates.filter('pickup_date == "15.06.2021"').count()

In [None]:
# Expose the parquet-backed frame to Spark SQL under the name `fhvhv_data`.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and can be re-run without error.
df_fhvhv_pq.createOrReplaceTempView('fhvhv_data')

In [None]:
# Longest trip: cast both timestamps to bigint, take the difference, divide
# by 3600, and keep only the row with the largest difference.
longest_trip_query = """


SELECT
(bigint(dropoff_datetime) - bigint(pickup_datetime)) /3600 as test
FROM fhvhv_data
ORDER BY (bigint(dropoff_datetime) - bigint(pickup_datetime)) DESC
LIMIT 1
"""

spark.sql(longest_trip_query).show()

In [None]:
# Sanity check of the duration expression on a single row. The parentheses
# must wrap the whole (dropoff - pickup) difference before dividing by 3600;
# an extra "(" before bigint(pickup_datetime) leaves the SQL unbalanced and
# makes the query fail to parse.
spark.sql(""" SELECT (bigint(dropoff_datetime) - bigint(pickup_datetime)) /3600 FROM fhvhv_data LIMIT 1""").show()

## Question 6: 

**Most frequent pickup location zone**

Using the zone lookup data and the fhvhv June 2021 data,  
what is the name of the most frequent pickup location zone?  

In [None]:
# Download the taxi zone lookup CSV into the working directory; -nc (no-clobber)
# skips the download when the file is already present.
!wget -nc https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

In [50]:
# Load the zone lookup CSV, using its first line as column names.
taxi_zones = (
    spark.read
    .option("Header", "True")
    .csv('taxi_zone_lookup.csv')
)

In [51]:
# Expose the zone lookup to Spark SQL as `taxi_zones_view`.
# createOrReplaceTempView is used instead of createTempView, which raises an
# error when the view already exists (e.g. when this cell is run a second time).
taxi_zones.createOrReplaceTempView('taxi_zones_view')

In [60]:
# Look at one row of the registered trip table to recall its column names.
sample_row_query = """


SELECT 
*
FROM fhvhv_data
LIMIT 1
"""

spark.sql(sample_row_query).show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02889|2021-06-04 20:51:44|2021-06-04 21:10:12|         239|         158|      N|                B02889|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+



In [65]:
# Question 6: join trips to the zone lookup on the pickup location id, count
# trips per zone name, and list the ten zones with the most pickups.
# LEFT JOIN keeps trips whose PULocationID has no match in the lookup.
top_pickup_zones_query = """
SELECT 
taxi_zones_view.Zone,
COUNT(*)
FROM fhvhv_data 
LEFT JOIN taxi_zones_view
ON fhvhv_data.PULocationID = taxi_zones_view.LocationID
GROUP BY taxi_zones_view.Zone
ORDER BY COUNT(*) DESC
LIMIT 10

"""

spark.sql(top_pickup_zones_query).show()

[Stage 62:>                                                         (0 + 4) / 4]

+--------------------+--------+
|                Zone|count(1)|
+--------------------+--------+
| Crown Heights North|  231279|
|        East Village|  221244|
|         JFK Airport|  188867|
|      Bushwick South|  187929|
|       East New York|  186780|
|TriBeCa/Civic Center|  164344|
|   LaGuardia Airport|  161596|
|            Union Sq|  158937|
|        West Village|  154698|
|             Astoria|  152493|
+--------------------+--------+



                                                                                