In [155]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql.functions import col, when, dayofmonth, from_unixtime, expr
from pyspark.sql.functions import hour, month


In [156]:
# Start (or reuse) a Spark session running in local mode with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [157]:
# Load the source parquet file (named for FHV trip data, 2019-10) into a pandas DataFrame.
df_pandas = pd.read_parquet("fhv_tripdata_2019-10.parquet")

In [158]:
# Preview the first rows of the pandas frame.
df_pandas.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2019-10-01 00:23:00,2019-10-01 00:35:00,264.0,264.0,,B00009
1,B00013,2019-10-01 00:11:29,2019-10-01 00:13:22,264.0,264.0,,B00013
2,B00014,2019-10-01 00:11:43,2019-10-01 00:37:20,264.0,264.0,,B00014
3,B00014,2019-10-01 00:56:29,2019-10-01 00:57:47,264.0,264.0,,B00014
4,B00014,2019-10-01 00:23:09,2019-10-01 00:28:27,264.0,264.0,,B00014


In [159]:
# Re-save the pandas frame under a new file name; the Spark read further down
# loads this copy rather than the original file.
output_path = "fhv_tripdata_2019-10_modified.parquet"
df_pandas.to_parquet(output_path)

In [160]:
# Explicit schema for the Spark read. Both datetime columns are declared as
# longs, so they are loaded as raw integers and converted in a later cell.
# Every field is nullable.
schema = (
    types.StructType()
    .add('dispatching_base_num', types.StringType(), True)
    .add('pickup_datetime', types.LongType(), True)
    .add('dropOff_datetime', types.LongType(), True)
    .add('PUlocationID', types.DoubleType(), True)
    .add('DOlocationID', types.DoubleType(), True)
    .add('SR_Flag', types.IntegerType(), True)
    .add('Affiliated_base_number', types.StringType(), True)
)

In [161]:
#spark_df = spark.createDataFrame(df_pandas, schema)

In [162]:
#spark_df.head()

In [163]:
#spark.conf.set("spark.sql.parquet.int96AsTimestamp", "true")
# Set the Spark SQL session time zone to GMT.
# NOTE(review): presumably so the later epoch-to-string conversion and the
# month/day/hour extraction are evaluated in GMT — confirm.
spark.conf.set("spark.sql.session.timeZone", "GMT")


In [164]:
# Read the parquet copy written by the pandas cell, applying the explicit schema.
# - The path reuses `output_path` so the writer and reader cannot drift apart.
# - The former "header" option was dropped: it is a CSV reader option and has no
#   effect on a parquet read.
# NOTE(review): the schema declares no timestamp columns, so the "timeZone"
# option likely has nothing to act on here — confirm before removing it.
df = (
    spark.read
    .option("timeZone", "GMT")
    .schema(schema)
    .parquet(output_path)
)

In [165]:
# Print the schema of the Spark DataFrame to check the column types that were applied.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: long (nullable = true)
 |-- dropOff_datetime: long (nullable = true)
 |-- PUlocationID: double (nullable = true)
 |-- DOlocationID: double (nullable = true)
 |-- SR_Flag: integer (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [166]:
# Peek at the first 10 rows; the datetime columns are still raw long values here.
df.head(10)

[Row(dispatching_base_num='B00009', pickup_datetime=1569889380000000000, dropOff_datetime=1569890100000000000, PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00009'),
 Row(dispatching_base_num='B00013', pickup_datetime=1569888689000000000, dropOff_datetime=1569888802000000000, PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00013'),
 Row(dispatching_base_num='B00014', pickup_datetime=1569888703000000000, dropOff_datetime=1569890240000000000, PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00014'),
 Row(dispatching_base_num='B00014', pickup_datetime=1569891389000000000, dropOff_datetime=1569891467000000000, PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00014'),
 Row(dispatching_base_num='B00014', pickup_datetime=1569889389000000000, dropOff_datetime=1569889707000000000, PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00014'),
 Row(

In [167]:
# The raw datetime columns hold epoch values in nanoseconds, while
# from_unixtime works in seconds, so every raw value is divided by this factor.
NANOS_PER_SECOND = 1_000_000_000

# NOTE: this cell rebinds `df`, so it is not idempotent. After one run the
# datetime columns are strings; re-run the read cell before running this again.
df = (
    df
    # Trip duration in seconds, computed first while both columns are still longs.
    .withColumn(
        "time_difference_seconds",
        (col("dropOff_datetime") - col("pickup_datetime")) / NANOS_PER_SECOND,
    )
    # Replace the raw epoch values with formatted datetime strings.
    .withColumn("pickup_datetime", from_unixtime(col("pickup_datetime") / NANOS_PER_SECOND))
    .withColumn("dropOff_datetime", from_unixtime(col("dropOff_datetime") / NANOS_PER_SECOND))
    # Date parts of the pickup time, used for filtering in later cells.
    .withColumn("pickup_month", month("pickup_datetime"))
    .withColumn("pickup_day", dayofmonth("pickup_datetime"))
    .withColumn("pickup_hour", hour("pickup_datetime"))
)

In [168]:
# Inspect the first 10 rows again, now with converted datetimes and the derived columns.
df.head(10)

[Row(dispatching_base_num='B00009', pickup_datetime='2019-10-01 00:23:00', dropOff_datetime='2019-10-01 00:35:00', PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00009', time_difference_seconds=720.0, pickup_month=10, pickup_day=1, pickup_hour=0),
 Row(dispatching_base_num='B00013', pickup_datetime='2019-10-01 00:11:29', dropOff_datetime='2019-10-01 00:13:22', PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00013', time_difference_seconds=113.0, pickup_month=10, pickup_day=1, pickup_hour=0),
 Row(dispatching_base_num='B00014', pickup_datetime='2019-10-01 00:11:43', dropOff_datetime='2019-10-01 00:37:20', PUlocationID=264.0, DOlocationID=264.0, SR_Flag=None, Affiliated_base_number='B00014', time_difference_seconds=1537.0, pickup_month=10, pickup_day=1, pickup_hour=0),
 Row(dispatching_base_num='B00014', pickup_datetime='2019-10-01 00:56:29', dropOff_datetime='2019-10-01 00:57:47', PUlocationID=264.0, DOlocationID=264.0, SR_F

In [169]:
# Keep only the rows whose pickup day-of-month is 15, retaining just the pickup time.
df_filtered = (
    df
    .where(col("pickup_day") == 15)
    .select("pickup_datetime")
)

In [170]:
# Count the rows with a pickup on day 15.
df_filtered.count()

62629