# Testing Data

In [3]:
# Retrieves the yellow-taxi trip files used for the test set (months 1-2 of 2020).

from urllib.request import urlretrieve
import os

YEAR = '2020'
MONTHS = range(1, 3)
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"

tlc_output_dir = '../data/raw/tlc_2020'
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(tlc_output_dir, exist_ok=True)

for month_number in MONTHS:

    # zero-pad so the value matches the "YYYY-MM" pattern used in the file names
    month_label = str(month_number).zfill(2)

    # generate url
    url = f'{URL_TEMPLATE}{YEAR}-{month_label}.parquet'
    # generate output location and filename
    output_path = f"{tlc_output_dir}/{YEAR}-{month_label}.parquet"

    # Skip months that are already on disk so a full re-run of the notebook
    # does not download the same large files again.
    if os.path.exists(output_path):
        print(f"Skipping month {month_label}: already downloaded")
        continue

    print(f"Begin month {month_label}")

    # Download to a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated file that the existence
    # check above would later accept as complete. The name is dot-prefixed so
    # a stray leftover is not mistaken for a monthly data file.
    partial_path = f"{tlc_output_dir}/.{YEAR}-{month_label}.parquet.part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, output_path)
    finally:
        # After a successful rename the temporary file no longer exists,
        # so this only cleans up after a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Completed month {month_label}")

Begin month 01
Completed month 01
Begin month 02
Completed month 02


In [4]:
from pyspark.sql import SparkSession

# Session options, handed to the builder unchanged and in the order listed.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

# Create a spark session
session_builder = SparkSession.builder.appName("testing_data")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# getOrCreate() returns an already-running session if there is one.
spark = session_builder.getOrCreate()

22/08/29 06:55:18 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.18.205.204 instead (on interface eth0)
22/08/29 06:55:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/29 06:55:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# reads in weather test dataset

# Tab-separated file whose first line holds the column names.
weather_reader = spark.read.option("header", "true").option("sep", "\t")
tempdf = weather_reader.csv("../data/raw/JRB-test.tsv")
tempdf.limit(5)

22/08/29 06:55:38 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


station,valid,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,skyc1,skyc2,skyc3,skyc4,skyl1,skyl2,skyl3,skyl4,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,snowdepth
JRB,2020-01-01 00:56,44.1,37.9,78.66,260.0,7.0,0.0,29.64,1003.7,,,OVC,,,,5000.0,,,,,,,,,,,39.53,KJRB 010056Z AUTO...,
JRB,2020-01-01 01:56,43.0,36.0,76.12,260.0,9.0,0.0,29.66,1004.2,,19.0,FEW,SCT,OVC,,2100.0,3900.0,4500.0,,,,,,,,,37.19,KJRB 010156Z AUTO...,
JRB,2020-01-01 02:56,42.1,35.1,76.04,260.0,10.0,0.0,29.65,1003.9,,17.0,OVC,,,,5000.0,,,,,,,,,,,35.63,KJRB 010256Z AUTO...,
JRB,2020-01-01 03:56,42.1,34.0,72.78,260.0,8.0,0.0,29.65,1004.1,,14.0,FEW,BKN,OVC,,4600.0,6000.0,7500.0,,,,,,,,,36.55,KJRB 010356Z AUTO...,
JRB,2020-01-01 04:56,41.0,30.9,67.03,260.0,12.0,0.0,29.66,1004.4,,18.0,OVC,,,,6000.0,,,,,,,,,,,33.46,KJRB 010456Z AUTO...,


In [6]:
# feature selection and dropping of null

# Keep the observation timestamp plus four readings, then discard every row
# that has a null in any of the kept columns.
weather_columns = ["valid", "tmpf", "dwpf", "relh", "sknt"]
tempdf = tempdf.select(*weather_columns).dropna(how="any")
tempdf.limit(5)

valid,tmpf,dwpf,relh,sknt
2020-01-01 00:56,44.1,37.9,78.66,7.0
2020-01-01 01:56,43.0,36.0,76.12,9.0
2020-01-01 02:56,42.1,35.1,76.04,10.0
2020-01-01 03:56,42.1,34.0,72.78,8.0
2020-01-01 04:56,41.0,30.9,67.03,12.0


In [7]:
# same transformations as train set

# Explicit imports instead of `import *`: the star import replaces Python
# builtins such as round/sum/max/min with their Spark column versions for the
# rest of the notebook. `col` stays in the namespace for later cells.
from pyspark.sql.functions import col, date_format

# NOTE(review): the sample rows pair `valid` 00:56 with a METAR stamp ending in
# "Z", which suggests these timestamps are UTC. The taxi pickup times are
# presumably local New York time - TODO confirm the two sides of the hourly
# join use the same clock (and that the train set was built the same way).
observed_at = col("valid")

# Derive the hourly join key and the calendar features, then drop the raw
# timestamp column.
tempdf = (
    tempdf
    .withColumn("month-day-hr", date_format(observed_at, "MMMM-dd-HH"))
    .withColumn("month", date_format(observed_at, "MM").cast("long"))
    .withColumn("day", date_format(observed_at, "dd").cast("long"))
    .withColumn("pickup_hour", date_format(observed_at, "HH").cast("long"))
    .drop("valid")
)

# Raw column name -> readable name. The CSV reader produced strings, so each
# measurement is also cast to double after it is renamed.
renamed_measurements = {
    "tmpf": "temperature(f)",
    "dwpf": "dew_point_temp(f)",
    "relh": "relative_humidity",
    "sknt": "wind_speed",
}
for raw_name, readable_name in renamed_measurements.items():
    tempdf = (
        tempdf
        .withColumnRenamed(raw_name, readable_name)
        .withColumn(readable_name, col(readable_name).cast("double"))
    )

tempdf.limit(5)

temperature(f),dew_point_temp(f),relative_humidity,wind_speed,month-day-hr,month,day,pickup_hour
44.1,37.9,78.66,7.0,January-01-00,1,1,0
43.0,36.0,76.12,9.0,January-01-01,1,1,1
42.1,35.1,76.04,10.0,January-01-02,1,1,2
42.1,34.0,72.78,8.0,January-01-03,1,1,3
41.0,30.9,67.03,12.0,January-01-04,1,1,4


In [10]:
# Show the schema of the transformed weather frame to check the column types.
tempdf.schema

StructType([StructField('temperature(f)', DoubleType(), True), StructField('dew_point_temp(f)', DoubleType(), True), StructField('relative_humidity', DoubleType(), True), StructField('wind_speed', DoubleType(), True), StructField('month-day-hr', StringType(), True), StructField('month', LongType(), True), StructField('day', LongType(), True), StructField('pickup_hour', LongType(), True)])

In [8]:
# Load the downloaded trip data from the raw TLC directory as one DataFrame.
taxidf = spark.read.format("parquet").load('../data/raw/tlc_2020/')
taxidf.limit(5)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
1,2020-02-01 00:17:35,2020-02-01 00:30:32,1.0,2.6,1.0,N,145,7,1,11.0,0.5,0.5,2.45,0.0,0.3,14.75,0.0,
1,2020-02-01 00:32:47,2020-02-01 01:05:36,1.0,4.8,1.0,N,45,61,1,21.5,3.0,0.5,6.3,0.0,0.3,31.6,2.5,
1,2020-02-01 00:31:44,2020-02-01 00:43:28,1.0,3.2,1.0,N,186,140,1,11.0,3.0,0.5,1.0,0.0,0.3,15.8,2.5,
2,2020-02-01 00:07:35,2020-02-01 00:31:39,1.0,4.38,1.0,N,144,140,1,18.0,0.5,0.5,3.0,0.0,0.3,24.8,2.5,
2,2020-02-01 00:51:43,2020-02-01 01:01:29,1.0,2.28,1.0,N,238,152,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,0.0,


In [9]:
# same renaming and selecting as train set

# Explicit imports instead of `import *`; Spark's `round` is aliased so it no
# longer silently replaces the Python builtin of the same name. `col` stays in
# the namespace for later cells.
from pyspark.sql.functions import col, date_format, to_timestamp
from pyspark.sql.functions import round as spark_round

pickup = col("tpep_pickup_datetime")
dropoff = col("tpep_dropoff_datetime")

# Hourly key used to join with the weather data.
# NOTE(review): the "MMMM-dd-HH" key has no year component, so any trip stamped
# with a different year would share keys with 2020 rows - TODO confirm whether
# the raw files contain such rows and whether they should be filtered out.
taxidf = taxidf.withColumn("month-day-hr", date_format(pickup, "MMMM-dd-HH"))

# journey_time is (dropoff - pickup) in seconds divided by 60, i.e. minutes,
# rounded to 2 decimals. The long/double casts are kept exactly as they were so
# the values match what the existing pipeline produces.
taxidf = (
    taxidf
    .withColumn("tpep_pickup_datetime", to_timestamp(pickup))
    .withColumn("tpep_dropoff_datetime", to_timestamp(dropoff))
    .withColumn("journey_time", (dropoff.cast("long") - pickup.cast("double")) / 60)
    .withColumn("journey_time", spark_round(col("journey_time"), 2))
    .select("month-day-hr", "PULocationID", "journey_time", "trip_distance")
)

taxidf.limit(10)

month-day-hr,PULocationID,journey_time,trip_distance
February-01-00,145,12.95,2.6
February-01-00,45,32.82,4.8
February-01-00,186,11.73,3.2
February-01-00,144,24.07,4.38
February-01-00,238,9.77,2.28
February-01-00,249,4.98,1.0
February-01-00,79,24.85,3.4
February-01-00,224,13.23,2.1
February-01-00,116,4.0,0.8
February-01-00,161,44.22,7.22


In [10]:
# filtering

# Row count before any filter, then after each filter in turn.
# NOTE(review): journey_time is in minutes, so the first predicate keeps only
# trips longer than 10 minutes - confirm that cutoff is intended.
print(taxidf.count())
for keep_condition in (col('journey_time') > 10, col('trip_distance') > 0):
    taxidf = taxidf.filter(keep_condition)
    print(taxidf.count())

# Finally discard rows with a null in any remaining column (count not printed).
taxidf = taxidf.dropna("any")

12704375


                                                                                

6781135




6748318


                                                                                

In [11]:
# Number of trips per (hourly key, pickup location) pair.
grouping_keys = ["month-day-hr", "PULocationID"]
countdf = taxidf.groupBy(*grouping_keys).count()

In [11]:
# Number of distinct (hourly key, pickup location) groups.
countdf.count()

                                                                                

167713

In [12]:
# inner join

# Attach the weather readings to each (hour, pickup location) count. Groups
# whose hourly key has no weather row are dropped by the inner join.
# NOTE(review): if tempdf holds more than one row for the same key, the matching
# count rows are repeated - TODO confirm the key is unique on the weather side.
mergedf = countdf.join(tempdf, on="month-day-hr", how="inner")
mergedf.count()

                                                                                

164257

In [13]:
# The hourly key was only needed for the join; month, day and pickup_hour
# remain as separate columns.
join_key = "month-day-hr"
mergedf = mergedf.drop(join_key)
mergedf.limit(10)

                                                                                

PULocationID,count,temperature(f),dew_point_temp(f),relative_humidity,wind_speed,month,day,pickup_hour
125,8,41.0,37.0,85.54,4.0,2,1,4
225,1,41.0,37.0,85.54,4.0,2,1,4
262,81,39.0,36.0,88.87,0.0,2,1,11
82,5,39.0,36.0,88.87,0.0,2,1,11
181,4,39.9,36.0,85.81,3.0,2,1,13
173,1,39.9,36.0,85.81,3.0,2,1,13
262,47,39.9,36.0,85.81,4.0,2,1,14
87,30,41.0,37.0,85.54,5.0,2,1,21
13,32,37.9,33.1,82.66,4.0,2,2,10
193,2,43.0,30.0,59.83,4.0,2,2,17


In [42]:
# Show the schema of the merged frame to check the column types before writing.
mergedf.schema

StructType([StructField('PULocationID', LongType(), True), StructField('count', LongType(), False), StructField('temperature(f)', DoubleType(), True), StructField('dew_point_temp(f)', DoubleType(), True), StructField('relative_humidity', DoubleType(), True), StructField('wind_speed', DoubleType(), True), StructField('month', LongType(), True), StructField('day', LongType(), True), StructField('pickup_hour', LongType(), True)])

In [14]:
# Persist the curated test set. mode="overwrite" lets the notebook be re-run
# from a fresh kernel; with the default save mode the write raises as soon as
# the output path already exists.
# NOTE(review): the path is spelled ".paraquet" (sic). It is left unchanged here
# because other code may read this exact path - rename both sides together.
mergedf.write.parquet('../data/curated/testdf.paraquet', mode="overwrite")

                                                                                