Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Tutorial: Load TAXI data and enrich it with Weather data in Spark DataFrame

Begin by creating a dataframe to hold the taxi data. To download 6 months of taxi data, fetch one month at a time and randomly sample 0.1% of that month's records before adding it to the list of monthly dataframes, so the result does not become bloated. The monthly samples are then combined into green_taxi_df.

In [4]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from azureml.opendatasets import NycTlcGreen
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame


# Bounds of the first month in the window; later months are obtained by
# shifting both bounds with relativedelta.
# NOTE(review): `end` is midnight at the start of the 31st -- confirm whether
# NycTlcGreen treats the end date as inclusive of that whole day.
start = datetime(2016, 1, 1)
end = datetime(2016, 1, 31)

# Fetch one month at a time and keep only a 0.1% sample of it (without
# replacement, fixed seed 3) so the combined dataframe stays small.
dfs = []
for sample_month in range(6):
    month_offset = relativedelta(months=sample_month)
    temp_df_green = NycTlcGreen(
        start + month_offset,
        end + month_offset,
    ).to_spark_dataframe()
    dfs.append(
        temp_df_green.sample(withReplacement=False, fraction=0.001, seed=3))

# Stack the six monthly samples into a single Spark dataframe.
green_taxi_df = reduce(
    lambda combined, monthly: combined.unionAll(monthly), dfs)

Save a copy of the raw column names (raw_columns) so the helper columns added later can be dropped in the last step.

In [6]:
# Snapshot the column names now, before any extra columns are added to
# green_taxi_df, so the final select can restrict itself to these columns.
raw_columns = list(green_taxi_df.columns)

NYC Latitude & Longitude: (40.71455, -74.00712) found by Bing search.

Add to taxi dataframe

Make all Latitude and Longitude be the location of New York City.

In [9]:
from pyspark.sql.functions import lit

# Stamp every trip with the same fixed coordinate (New York City) in two new
# literal columns, 'lat' and 'long'.
nyc_lat = 40.71455
nyc_long = -74.00712
green_taxi_df = (
    green_taxi_df
    .withColumn('lat', lit(nyc_lat))
    .withColumn('long', lit(nyc_long))
)
display(green_taxi_df.limit(5))

vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,puLocationId,doLocationId,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,rateCodeID,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,puYear,puMonth,lat,long
2,2016-01-18T17:12:47.000+0000,2016-01-18T17:19:52.000+0000,1,0.9,,,-73.8447265625,40.721492767333984,-73.85014343261719,40.71034622192383,1,N,1,6.5,0.0,0.5,0.3,1.0,0.0,,8.3,1,2016,1,40.71455,-74.00712
2,2016-01-18T18:06:18.000+0000,2016-01-18T18:10:32.000+0000,1,0.63,,,-73.91937255859375,40.758548736572266,-73.91181945800781,40.76240921020508,1,N,2,4.5,0.0,0.5,0.3,0.0,0.0,,5.3,1,2016,1,40.71455,-74.00712
2,2016-01-18T18:28:16.000+0000,2016-01-18T18:31:35.000+0000,1,0.44,,,-73.88418579101562,40.74907302856445,-73.87939453125,40.74943542480469,1,N,2,4.0,0.0,0.5,0.3,0.0,0.0,,4.8,1,2016,1,40.71455,-74.00712
2,2016-01-18T18:07:07.000+0000,2016-01-18T18:10:34.000+0000,1,1.06,,,-73.95337677001953,40.68108749389648,-73.95209503173828,40.69267654418945,1,N,2,5.0,0.0,0.5,0.3,0.0,0.0,,5.8,1,2016,1,40.71455,-74.00712
2,2016-01-18T20:01:52.000+0000,2016-01-18T20:05:42.000+0000,1,0.6,,,-73.96238708496094,40.80573654174805,-73.95942687988281,40.7994384765625,1,N,1,4.5,0.5,0.5,0.3,1.16,0.0,,6.96,1,2016,1,40.71455,-74.00712


Initialize LocationTimeCustomerData using the Spark dataframe green_taxi_df.

In [11]:
from azureml.opendatasets.accessories.location_data import LatLongColumn
from azureml.opendatasets.accessories.location_time_customer_data \
    import LocationTimeCustomerData
from azureml.opendatasets import NoaaIsdWeather


# Wrap the taxi dataframe for the enricher: the 'lat'/'long' columns carry the
# location and 'lpepPickupDatetime' is passed as the time column.
location_columns = LatLongColumn('lat', 'long')
green_taxi = LocationTimeCustomerData(
    green_taxi_df, location_columns, 'lpepPickupDatetime')

In [12]:
# Allow Spark SQL to plan cross (Cartesian) joins.
# NOTE(review): presumably required by the enricher's location matching in
# the next step -- confirm.
spark.conf.set('spark.sql.crossJoin.enabled', 'true')

Initialize NoaaIsdWeather class, get enricher from it, and enrich the taxi data without aggregation

In [14]:
# Weather fields to pull, over the same January-June 2016 window as the taxi
# sample.
weather_cols = ["temperature", "precipTime", "precipDepth", "snowDepth"]
weather = NoaaIsdWeather(
    cols=weather_cols,
    start_date=datetime(2016, 1, 1),
    end_date=datetime(2016, 6, 30, 23, 59),
)
weather_enricher = weather.get_enricher()

# Enrich without aggregating; timestamps are rounded to the day for matching.
# NOTE(review): the unit/meaning of location_match_granularity=5 is not
# visible here -- confirm against the enricher documentation.
enrichment_result = weather_enricher.enrich_customer_data_no_agg(
    customer_data_object=green_taxi,
    location_match_granularity=5,
    time_round_granularity='day',
)
new_green_taxi, processed_weather = enrichment_result

Preview the Spark dataframe new_green_taxi.data

In [16]:
# Show the first three rows of the enriched taxi data; `.data` is the
# dataframe held by the customer-data wrapper returned by the enricher.
display(new_green_taxi.data.limit(3))

lat,long,vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,puLocationId,doLocationId,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,rateCodeID,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,puYear,puMonth,row_id,customer_rankgroupp6tnp,customer_join_timeii4qk
40.71455,-74.00712,2,2016-01-18T17:12:47.000+0000,2016-01-18T17:19:52.000+0000,1,0.9,,,-73.8447265625,40.721492767333984,-73.85014343261719,40.71034622192383,1,N,1,6.5,0.0,0.5,0.3,1.0,0.0,,8.3,1,2016,1,163208757248,1,2016-01-18T00:00:00.000+0000
40.71455,-74.00712,2,2016-01-18T18:06:18.000+0000,2016-01-18T18:10:32.000+0000,1,0.63,,,-73.91937255859375,40.758548736572266,-73.91181945800781,40.76240921020508,1,N,2,4.5,0.0,0.5,0.3,0.0,0.0,,5.3,1,2016,1,163208757249,1,2016-01-18T00:00:00.000+0000
40.71455,-74.00712,2,2016-01-18T18:28:16.000+0000,2016-01-18T18:31:35.000+0000,1,0.44,,,-73.88418579101562,40.74907302856445,-73.87939453125,40.74943542480469,1,N,2,4.0,0.0,0.5,0.3,0.0,0.0,,4.8,1,2016,1,163208757250,1,2016-01-18T00:00:00.000+0000


Define a dict `aggregations` that specifies how to aggregate each weather field at a day level (matching the `time_round_granularity='day'` used above). For `snowDepth` and `temperature` we'll take the mean, and for `precipTime` and `precipDepth` we'll take the daily maximum. Use the groupby() function along with the aggregations to group the data.

In [18]:
# Per-field aggregation applied when grouping the weather readings:
# mean for snow depth and temperature, max for the precipitation fields.
aggregations = dict(
    snowDepth="mean",
    precipTime="max",
    temperature="mean",
    precipDepth="max",
)

The keys (`public_rankgroup`, `public_join_time`, `customer_rankgroup`, `customer_join_time`) used by groupby() and the later join() carry generated suffixes in their column names, so with the current design they must be looked up manually here.

In [20]:
def _first_column_with_prefix(columns, prefix):
    """Return the first string column name in *columns* starting with *prefix*.

    Non-string entries are skipped, so both lookups below are filtered the
    same way.

    Raises:
        IndexError: if no column name starts with *prefix* (same exception
            type as the original ``[...][0]`` lookup, but with a message
            that names the missing prefix).
    """
    for name in columns:
        if isinstance(name, str) and name.startswith(prefix):
            return name
    raise IndexError(
        'no column starting with {!r} found in {!r}'.format(
            prefix, list(columns)))


# The enricher's join-key columns end in a generated suffix (e.g.
# 'ds_join_timezkxf1'), so the time keys are located by prefix; the
# rank-group keys are read from the enricher objects.
public_rankgroup = processed_weather.id
public_join_time = _first_column_with_prefix(
    processed_weather.data.columns, 'ds_join_time')

customer_rankgroup = weather_enricher.location_selector.customer_rankgroup
customer_join_time = _first_column_with_prefix(
    new_green_taxi.data.columns, 'customer_join_time')

# Collapse the weather readings to one row per (rank group, rounded time).
weather_df_grouped = processed_weather.data.groupby(
    public_rankgroup, public_join_time).agg(aggregations)
display(weather_df_grouped.limit(3))

public_rankgroup87h2r,ds_join_timezkxf1,avg(snowDepth),avg(temperature),max(precipTime),max(precipDepth)
1,2016-03-14T00:00:00.000+0000,0.0,7.918023255813954,24.0,100.0
1,2016-01-13T00:00:00.000+0000,0.0,-2.266428571428573,24.0,3.0
1,2016-05-15T00:00:00.000+0000,0.0,12.797058823529422,24.0,0.0


Join the final dataframe, and preview the joined result.

In [22]:
taxi_df = new_green_taxi.data

# Match each trip to its aggregated weather row on both the rank-group key
# and the rounded-time key. A left join keeps trips with no weather match.
join_condition = [
    taxi_df[customer_rankgroup] == weather_df_grouped[public_rankgroup],
    taxi_df[customer_join_time] == weather_df_grouped[public_join_time],
]
joined_dataset = taxi_df.join(
    weather_df_grouped, on=join_condition, how='left')

# Keep only the original taxi columns plus the four aggregated weather
# fields; the helper columns added along the way are dropped.
weather_feature_columns = [
    "avg(temperature)",
    "max(precipTime)",
    "max(precipDepth)",
    "avg(snowDepth)",
]
final_df = joined_dataset.select(raw_columns + weather_feature_columns)
display(final_df.limit(5))

vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,puLocationId,doLocationId,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,rateCodeID,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,puYear,puMonth,avg(temperature),max(precipTime),max(precipDepth),avg(snowDepth)
2,2016-03-14T00:06:39.000+0000,2016-03-14T00:22:51.000+0000,1,1.76,,,-73.95321655273438,40.73318099975586,-73.95167541503906,40.71426010131836,1,N,2,12.0,0.5,0.5,0.3,0.0,0.0,,13.3,1,2016,3,7.918023255813954,24.0,100.0,0.0
2,2016-03-14T00:41:53.000+0000,2016-03-14T00:50:22.000+0000,1,1.85,,,-73.9336929321289,40.85436248779297,-73.94861602783203,40.82904815673828,1,N,2,8.5,0.5,0.5,0.3,0.0,0.0,,9.8,1,2016,3,7.918023255813954,24.0,100.0,0.0
2,2016-03-14T00:54:47.000+0000,2016-03-14T01:02:37.000+0000,5,2.57,,,-73.95768737792969,40.816192626953125,-73.97406005859375,40.79408645629883,1,N,2,9.5,0.5,0.5,0.3,0.0,0.0,,10.8,1,2016,3,7.918023255813954,24.0,100.0,0.0
1,2016-03-14T01:07:20.000+0000,2016-03-14T01:11:06.000+0000,1,0.7,,,-73.86914825439453,40.74928283691406,-73.88216400146484,40.74787902832031,1,N,2,4.5,0.5,0.5,0.3,0.0,0.0,,5.8,1,2016,3,7.918023255813954,24.0,100.0,0.0
2,2016-03-14T07:30:54.000+0000,2016-03-14T07:59:29.000+0000,1,7.3,,,-73.98673248291016,40.688568115234375,-73.97416687011719,40.7547607421875,1,N,1,24.5,0.0,0.5,0.3,2.5,0.0,,27.8,1,2016,3,7.918023255813954,24.0,100.0,0.0


Check the join success rate.

In [24]:
# Convert to pandas and print the per-column non-null counts; comparing the
# weather columns' counts with the total row count shows how many trips
# found a weather match in the left join.
final_df.toPandas().info()

In [25]:
# Register the joined result as the temp view 'joined_df' so it can be
# queried by name from SQL.
final_df.createOrReplaceTempView('joined_df')

In [26]:
%sql
-- Spot-check the joined view: first five trips picked up on 2016-01-26.
SELECT *
FROM joined_df
WHERE lpepPickupDatetime >= '2016-01-26'
  AND lpepPickupDatetime < '2016-01-27'
ORDER BY lpepPickupDatetime
LIMIT 5

vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,puLocationId,doLocationId,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,rateCodeID,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,puYear,puMonth,avg(temperature),max(precipTime),max(precipDepth),avg(snowDepth)
2,2016-01-26T01:49:47.000+0000,2016-01-26T02:02:44.000+0000,1,3.02,,,-73.84416961669922,40.72150039672852,-73.7972412109375,40.709228515625,1,N,2,12.5,0.5,0.5,0.3,0.0,0.0,,13.8,1,2016,1,4.2092857142857145,24.0,0.0,40.06896551724138
2,2016-01-26T06:11:45.000+0000,2016-01-26T06:26:49.000+0000,1,2.46,,,-73.90061950683594,40.8388900756836,-73.92524719238281,40.81021118164063,5,N,2,10.0,0.0,0.0,0.0,0.0,0.0,,10.0,2,2016,1,4.2092857142857145,24.0,0.0,40.06896551724138
2,2016-01-26T06:46:16.000+0000,2016-01-26T06:52:52.000+0000,1,0.92,,,-73.96138763427734,40.66564178466797,-73.94918823242188,40.66624069213867,1,N,1,6.0,0.0,0.5,0.3,1.36,0.0,,8.16,1,2016,1,4.2092857142857145,24.0,0.0,40.06896551724138
1,2016-01-26T06:49:11.000+0000,2016-01-26T06:58:47.000+0000,1,1.6,,,-73.93540954589844,40.85007858276367,-73.91783905029297,40.865718841552734,1,N,2,8.5,0.0,0.5,0.3,0.0,0.0,,9.3,1,2016,1,4.2092857142857145,24.0,0.0,40.06896551724138
2,2016-01-26T07:57:52.000+0000,2016-01-26T08:29:26.000+0000,4,6.28,,,-73.86204528808594,40.73024368286133,-73.96729278564453,40.760032653808594,1,N,1,25.5,0.0,0.5,0.3,6.58,0.0,,32.88,1,2016,1,4.2092857142857145,24.0,0.0,40.06896551724138
