In [1]:
from pyspark.sql import SparkSession
from urllib.request import urlretrieve
# Build (or reuse) the Spark session that runs every Spark job in this notebook.
_spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.executor.memory": "2G",
    "spark.driver.memory": "4G",
    "spark.sql.parquet.cacheMetadata": "true",
    # session time zone is pinned to UTC
    "spark.sql.session.timeZone": "Etc/UTC",
}

# Apply the options one by one, in the order listed above.
_session_builder = SparkSession.builder.appName("ADS project 1")
for _option, _setting in _spark_options.items():
    _session_builder = _session_builder.config(_option, _setting)

spark = _session_builder.getOrCreate()

22/08/22 01:29:30 WARN Utils: Your hostname, Luo resolves to a loopback address: 127.0.1.1; using 172.17.1.121 instead (on interface eth0)
22/08/22 01:29:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/22 01:29:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Set the session's Parquet compression codec to gzip.
spark.conf.set(
    "spark.sql.parquet.compression.codec",
    "gzip",
)

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [4]:
# Years and calendar months of data covered by this notebook.
YEARS = ["2019", "2021"]
MONTHS = range(1, 13)  # 1..12 inclusive

# Relative directory prefixes for the raw input data and the curated output data.
path_raw = "../data/raw/tlc_data/"
path_curated = "../data/curated/tlc_data/"

In [5]:
# Load the raw Parquet data stored under each year's directory.
sdf_2019 = spark.read.parquet(path_raw + "2019/")
sdf_2021 = spark.read.parquet(path_raw + "2021/")

                                                                                

In [6]:
# Summary statistics of the response variable (tip_amount) in the raw 2021 data.
(
    sdf_2021
    .select("tip_amount")
    .describe()
)

                                                                                

summary,tip_amount
count,30904308.0
mean,2.341164260659189
stddev,2.829029227137787
min,-333.32
max,1140.44


In [7]:
# Derive the extra columns that the analysis needs from the original trip attributes.
def add_feature(sdf):
    """Return ``sdf`` with the derived feature columns appended.

    Columns added:
      * ``Month,Date``    - pickup timestamp formatted as ``"MM,dd"``.
      * ``Month``         - pickup month (``MM``) cast to int.
      * ``Pickup_time``   - pickup hour (``HH``) cast to int.
      * ``time_duration`` - dropoff minus pickup unix timestamp, in seconds.
      * ``is_weekend``    - 1 when the pickup day-of-week is 1 or 7, else 0.
      * ``average_speed`` - ``trip_distance`` divided by ``time_duration`` in hours.
      * ``is_airport``    - boolean; true when the dropoff or pickup location ID
                            is 1, 132 or 138.
    """
    pickup = "tpep_pickup_datetime"
    dropoff = "tpep_dropoff_datetime"
    # NOTE(review): presumably the TLC zone IDs of the airports - confirm against the zone lookup table
    airport_zone_ids = [1, 132, 138]

    trip_seconds = F.unix_timestamp(dropoff) - F.unix_timestamp(pickup)
    touches_airport = (
        F.col("DOLocationID").isin(airport_zone_ids)
        | F.col("PULocationID").isin(airport_zone_ids)
    )

    return (
        sdf
        .withColumn("Month,Date", F.date_format(pickup, "MM,dd"))
        .withColumn("Month", F.date_format(pickup, "MM").cast("int"))
        .withColumn("Pickup_time", F.date_format(pickup, "HH").cast("int"))
        .withColumn("time_duration", trip_seconds)
        # Spark's dayofweek numbers Sunday as 1 and Saturday as 7
        .withColumn("is_weekend", F.dayofweek(pickup).isin([1, 7]).cast("int"))
        # time_duration is in seconds, so divide by 3600 to get hours
        .withColumn("average_speed", F.col("trip_distance") / (F.col("time_duration") / 3600))
        .withColumn("is_airport", touches_airport)
    )

In [8]:
# Drop the trip records that violate common sense or a business rule.
def clean_feature(sdf):
    """Return only the rows of ``sdf`` that pass every validity check.

    A row is kept when all of the following hold:
      * ``time_duration`` >= 60
      * ``tip_amount`` > 0
      * ``trip_distance`` > 0
      * ``payment_type`` == 1
      * ``fare_amount`` >= 2.5

    NOTE(review): payment_type 1 is presumably "credit card" and 2.5 presumably
    the minimum fare - confirm against the TLC data dictionary.
    """
    is_valid = (
        (F.col("time_duration") >= 60)
        & (F.col("tip_amount") > 0)
        & (F.col("trip_distance") > 0)
        & (F.col("payment_type") == 1)
        & (F.col("fare_amount") >= 2.5)
    )
    return sdf.filter(is_valid)

In [9]:
# For every month of every year: add the features, apply the first cleaning pass,
# and save the result under that year's 'firstclean' folder.
for year in YEARS:
    for month in MONTHS:
        # file names use zero-padded months ('1' -> '01')
        month = f"{month:02d}"
        raw_file = f'{path_raw}{year}/{year}-{month}.parquet'
        # NOTE(review): the output name has a double dash ('--') between year and
        # month, unlike the input name - confirm this is intended
        firstclean_file = f'{path_curated}{year}/firstclean/{year}--{month}-firstclean.parquet'

        # read -> add features -> filter out the invalid records
        sdf = clean_feature(add_feature(spark.read.parquet(raw_file)))
        # replace any output previously written for this month
        sdf.write.mode("overwrite").parquet(firstclean_file)
        print(f"finished {year} {month}")

22/08/22 01:29:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

finished 2019 01


                                                                                

finished 2019 02


                                                                                

finished 2019 03


                                                                                

finished 2019 04


                                                                                

finished 2019 05


                                                                                

finished 2019 06


                                                                                

finished 2019 07


                                                                                

finished 2019 08


                                                                                

finished 2019 09


                                                                                

finished 2019 10


                                                                                

finished 2019 11


                                                                                

finished 2019 12


                                                                                

finished 2021 01


                                                                                

finished 2021 02


                                                                                

finished 2021 03


                                                                                

finished 2021 04


                                                                                

finished 2021 05


                                                                                

finished 2021 06


                                                                                

finished 2021 07


                                                                                

finished 2021 08


                                                                                

finished 2021 09


                                                                                

finished 2021 10


                                                                                

finished 2021 11




finished 2021 12


                                                                                