In [1]:
from pyspark.sql import SparkSession
from _global_vars import *

# Build (or reuse) the Spark session that runs every job in this notebook.
# Options are applied in the order listed.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("MAST30034_Yellow_Taxi_Preprocessing_Cleaning")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

22/08/22 16:29:59 WARN Utils: Your hostname, LAPTOP-VAB0S7AL resolves to a loopback address: 127.0.1.1; using 172.27.239.27 instead (on interface eth0)
22/08/22 16:29:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/08/22 16:30:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/22 16:30:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the feature-engineered, collated 2019 taxi data from the curated folder
curated_input_path = '../data/curated/yt2019_feature_eng.parquet'
sdf = spark.read.parquet(curated_input_path)

                                                                                

In [3]:
# Total number of records before any cleaning; shown as the cell output
sdf.count()

                                                                                

84180903

# Remove bad records

### Filters (collated from univariate analysis)
* Restrict range of passenger_count (1:6) [drop other records]
* Keep only trips with 0 < trip distance < 312 miles (312 miles = 3 hours at the maximum speed of 104 mph)
* Fares capped at $500 (0:500)
* Remove trips with an average speed above 104 mph or below 1 mph
* Limit trip times to under 5 hours (300 minutes), which is more than it takes to drive the entire perimeter of NYC
* Exclude trips shorter than 2 minutes: these are not realistic, as the passenger may have cancelled
* Congestion surcharge capped at 10 USD; any records above this are removed
* Remove records with Ratecode == 1 and fare < 2.5 USD

In [4]:
import pandas as pd
from clean import get_outliers_df, remove_outliers
from pyspark.sql import functions as F

In [5]:
# Summarise the records flagged by each cleaning filter.
# NOTE(review): get_outliers_df comes from the project-local `clean` module; its
# result is later sorted with sort_values('prop_of_total'), so it is presumably
# a pandas DataFrame — confirm against clean.py.
outlier_df = get_outliers_df(sdf)

                                                                                

In [6]:
# Rank the filters by the proportion of all records they flag, largest first
outliers_ranked = outlier_df.sort_values('prop_of_total', ascending=False)
print(outliers_ranked)

                        0  prop_of_total
congestion        5415432       0.064331
ratecode_min_fee  3581882       0.042550
t_time            2093024       0.024863
passenger_count   1960201       0.023286
t_speed           1093001       0.012984
pay_type          1071550       0.012729
DOLocation         984803       0.011699
PULocationID       830761       0.009869
trip_distance      748600       0.008893
rcode_id           444867       0.005285
vendorid           267050       0.003172
fare_amount        204131       0.002425
tip_amount           1827       0.000022


In [8]:
# Apply the cleaning filters. Judging by the validity counts shown afterwards,
# this tags rows through an `is_valid_record` column rather than dropping them
# (the true + false counts add up to the original total) — confirm in clean.py.
# NOTE(review): `sdf` is rebound to the helper's own output, so re-running this
# cell feeds already-processed data back into remove_outliers.
sdf = remove_outliers(sdf)

In [12]:
# Tally how many records are flagged valid versus invalid
validity_counts = sdf.groupBy("is_valid_record").count()
validity_counts.show()

                                                                                

+---------------+--------+
|is_valid_record|   count|
+---------------+--------+
|           true|71742914|
|          false|12437989|
+---------------+--------+



In [13]:
# Keep only the records flagged as valid, then report how many remain
valid_record_mask = sdf.is_valid_record == True
sdf_filtered = sdf.where(valid_record_mask)
sdf_filtered.count()

                                                                                

71742914

In [16]:
# Summary statistics for the continuous features of the valid records.
# `non_categorical_features` is not defined in this notebook; presumably it is
# provided by the `_global_vars` star import — confirm.
summary_stats_continuous = sdf_filtered.describe(*non_categorical_features)
summary_stats_continuous.show()



+-------+------------------+------------------+------------------+-----------------+--------------------+
|summary|     trip_distance|        tip_amount|   passenger_count|      fare_amount|congestion_surcharge|
+-------+------------------+------------------+------------------+-----------------+--------------------+
|  count|          71742914|          71742914|          71742914|         71742914|            71742914|
|   mean| 2.589011225415214|2.0483753208021587|1.5949647654401102|11.92635378819433|  2.2415673274715324|
| stddev|2.8274863324423087|2.1991967369536964| 1.203682438758781|8.348539350425064|  0.7611033913076103|
|    min|              0.04|               0.0|               1.0|              2.5|                 0.0|
|    max|             149.5|             500.0|               6.0|            394.5|                2.75|
+-------+------------------+------------------+------------------+-----------------+--------------------+



                                                                                

In [21]:
# Columns dropped before export; the validity flag is among them because every
# remaining row has already passed the is_valid_record filter.
depr_features = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'tip_amount',
    'PU_datetime',
    'DO_datetime',
    'hour_of_day_of_year',
    'is_valid_record',
]
sdf_filtered = sdf_filtered.drop(*depr_features)

# Show the columns that remain
sdf_filtered.columns

['VendorID',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'congestion_surcharge',
 'PU_hourofday',
 'DO_hourofday',
 'PU_dayofweek',
 'DO_dayofweek',
 'PU_dayofmonth',
 'DO_dayofmonth',
 'PU_month',
 'DO_month',
 'trip_time_minutes',
 'trip_speed_mph',
 'fare_per_minute',
 'tmpf',
 'dwpf',
 'relh']

In [20]:
# Write the cleaned dataset to the curated folder, replacing any previous export.
# NOTE(review): this cell's execution count (20) is lower than the column-drop
# cell's (21) — re-run the notebook top to bottom to confirm the export was
# produced after the deprecated columns were dropped.
export_relative_dir = '../data/curated/'
cleaned_output_path = export_relative_dir + "yt2019_cleaned.parquet"
sdf_filtered.write.mode('overwrite').parquet(cleaned_output_path)

                                                                                

In [None]:
# Final sanity check: summary statistics of the continuous features on the
# frame that was exported.
# Some continuous features (e.g. 'tip_amount') have been dropped from
# sdf_filtered by this point, and describe() fails on a column it cannot
# resolve, so restrict the summary to the features that are still present.
remaining_continuous_features = [
    feature for feature in non_categorical_features
    if feature in sdf_filtered.columns
]
sdf_filtered.describe(*remaining_continuous_features).show()

In [None]:
# Drop everything cached in this session, then shut the Spark session down
spark.sql("CLEAR CACHE")
spark.stop()