In [1]:
from pyspark.sql import SparkSession

# Configure and start (or reuse) the Spark session that runs every job in this notebook.
# Settings are kept in one table so they are easy to scan and tune.
_session_settings = {
    "spark.sql.repl.eagerEval.enabled": True,    # render DataFrames as tables in the notebook
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
    "spark.sql.session.timeZone": "Etc/UTC",     # interpret timestamps in UTC
}

_builder = SparkSession.builder.appName("MAST30034_Yellow_Taxi_Preprocessing_Feature_Engineering")
for _key, _value in _session_settings.items():
    _builder = _builder.config(_key, _value)

spark = _builder.getOrCreate()

22/08/23 04:44:50 WARN Utils: Your hostname, LAPTOP-VAB0S7AL resolves to a loopback address: 127.0.1.1; using 172.27.236.91 instead (on interface eth0)
22/08/23 04:44:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/08/23 04:44:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Load every 2019 yellow-taxi parquet file into one DataFrame,
# then print a single record vertically to confirm the read worked.
_raw_2019_glob = '../data/raw/tlc_data/tlc_data_yellow/2019*'
sdf_full = spark.read.parquet(_raw_2019_glob)
sdf_full.show(n=1, truncate=100, vertical=True)

[Stage 1:>                                                          (0 + 1) / 1]

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2019-03-01 00:24:41 
 tpep_dropoff_datetime | 2019-03-01 00:25:31 
 passenger_count       | 1.0                 
 trip_distance         | 0.0                 
 RatecodeID            | 1.0                 
 store_and_fwd_flag    | N                   
 PULocationID          | 145                 
 DOLocationID          | 145                 
 payment_type          | 2                   
 fare_amount           | 2.5                 
 extra                 | 0.5                 
 mta_tax               | 0.5                 
 tip_amount            | 0.0                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 3.8                 
 congestion_surcharge  | 0.0                 
 airport_fee           | null                
only showing top 1 row



                                                                                

In [3]:
# Total number of raw 2019 trip records (count() is an action, so this scans the data)
sdf_full.count()

                                                                                

84598444

In [4]:
# Show the column names and types Spark read from the parquet files
sdf_full.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)



In [5]:
from pyspark.sql import functions as F
from _global_vars import *
from collate import drop_cast_and_create_taxi

In [6]:
# Run the raw taxi frame through the project helper imported from collate.
# NOTE(review): judging by its name, the helper drops columns, casts types and creates
# new features; it presumably also adds the `hour_of_day_of_year` column that the
# weather join relies on later — confirm against collate.py.
sdf = drop_cast_and_create_taxi(sdf_full)

In [7]:
# verify (optional sanity check, disabled by default: uncomment the lines below
# to inspect the schema and the first few rows of the processed taxi frame)
#sdf.printSchema()
#sdf.limit(5)

In [8]:
# Descriptive statistics (count / mean / stddev / min / max) of the
# non-categorical features, taken before any cleaning is applied
_pre_clean_summary = sdf.describe(*non_categorical_features)
_pre_clean_summary.show()



+-------+------------------+------------------+------------------+--------------------+------------------+
|summary|       fare_amount|        tip_amount|   passenger_count|congestion_surcharge|     trip_distance|
+-------+------------------+------------------+------------------+--------------------+------------------+
|  count|          84598444|          84598444|          84154061|            79297843|          84598444|
|   mean|13.412639732835764| 2.190078737505638|1.5626654190817957|  2.1949917301029234|3.0183506184817515|
| stddev|174.17668755385404|15.638996154306168|1.2079081585219809|  0.8296498809008713| 8.093902044464816|
|    min|           -1856.0|            -221.0|               0.0|                -2.5|         -37264.53|
|    max|          943274.8|         141492.02|               9.0|                 4.5|          45977.22|
+-------+------------------+------------------+------------------+--------------------+------------------+



                                                                                

### Weather data aggregation + joining

In [9]:
# Load the raw NYC weather observations: the CSV has a header row,
# and column types are inferred by Spark rather than declared.
weather_date_path = "../data/raw/nyc_weather_date/NYC.csv"
weather_sdf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(weather_date_path)
)

In [10]:
# Inspect the inferred schema of the raw weather CSV
weather_sdf.printSchema()

root
 |-- station: string (nullable = true)
 |-- valid: string (nullable = true)
 |-- lon: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- tmpf: string (nullable = true)
 |-- dwpf: string (nullable = true)
 |-- relh: string (nullable = true)
 |-- drct: string (nullable = true)
 |-- sknt: string (nullable = true)
 |-- p01i: string (nullable = true)
 |-- alti: string (nullable = true)
 |-- mslp: string (nullable = true)
 |-- vsby: string (nullable = true)
 |-- gust: string (nullable = true)
 |-- skyc1: string (nullable = true)
 |-- skyc2: string (nullable = true)
 |-- skyc3: string (nullable = true)
 |-- skyc4: string (nullable = true)
 |-- skyl1: string (nullable = true)
 |-- skyl2: string (nullable = true)
 |-- skyl3: string (nullable = true)
 |-- skyl4: string (nullable = true)
 |-- wxcodes: string (nullable = true)
 |-- ice_accretion_1hr: string (nullable = true)
 |-- ice_accretion_3hr: string (nullable = true)
 |-- ice_accretion_6hr: string (nullable = true)
 |-- pe

In [11]:
from collate import weather_process

In [12]:
# Replace the raw weather frame with its processed version (helper imported from collate).
# NOTE(review): this rebinds `weather_sdf` to the helper's output, so re-running this cell
# on its own feeds already-processed data back into weather_process — re-run from the
# CSV read cell instead.
weather_sdf = weather_process(weather_sdf)

In [13]:
# Sanity-check the processed weather data: row count, then the resulting schema.
# The count is printed explicitly because a notebook only auto-displays a cell's LAST
# expression; a bare count() on the first line would run a Spark job and throw the
# result away without showing it.
print(f"processed weather rows: {weather_sdf.count()}")
weather_sdf.printSchema()
# few missing values -> we don't have to worry -> time to join datasets



root
 |-- hour_of_day_of_year: timestamp (nullable = true)
 |-- tmpf: double (nullable = true)
 |-- dwpf: double (nullable = true)
 |-- relh: double (nullable = true)



                                                                                

NameError: name 'sdf' is not defined

In [14]:
# Output directory (relative to this notebook) for the curated exports written below;
# the trailing slash matters because file names are appended by string concatenation
export_relative_dir = '../data/curated/'

In [15]:
# Persist the processed weather data as CSV, replacing any previous export.
# NOTE(review): no `header` option is set, so the writer's default applies — confirm
# that downstream readers do not expect column names in the output.
_weather_export_path = export_relative_dir + "weather_data_clean.csv"
weather_sdf.write.csv(_weather_export_path, mode='overwrite')

                                                                                

In [16]:
# Attach the weather readings to each taxi trip via the shared hourly timestamp key.
# An inner join keeps only trips whose hour has a matching weather row.
combined_sdf = sdf.join(
    other=weather_sdf,
    on=['hour_of_day_of_year'],
    how='inner',
)

In [17]:
# Export the joined taxi + weather data as parquet, replacing any previous export,
# and print a marker once the write has finished
_feature_export_path = export_relative_dir + "yt2019_feature_eng.parquet"
combined_sdf.write.parquet(_feature_export_path, mode='overwrite')
print("export completed")

22/08/23 04:45:37 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

export completed


In [18]:
# Drop everything cached in this session, then shut the Spark session down
spark.catalog.clearCache()
spark.stop()