### A little inspection and cleanup of taxi data

Jeff Hale

In [48]:
import pandas as pd

In [49]:
# Load the 2021-06 yellow-taxi trip records straight from the DataTalksClub GitHub release.
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, which avoids the mixed-dtype warning raised by the default chunked parser.
df = pd.read_csv(
    "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-06.csv.gz",
    low_memory=False,
)
df

  df= pd.read_csv("https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-06.csv.gz")


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-06-01 00:13:26,2021-06-01 00:17:14,1.0,0.90,1.0,N,186,50,1.0,5.00,3.00,0.5,2.20,0.00,0.3,11.00,2.5
1,1.0,2021-06-01 00:32:23,2021-06-01 01:07:04,1.0,23.00,1.0,N,132,18,2.0,61.50,1.75,0.5,0.00,6.55,0.3,70.60,0.0
2,1.0,2021-06-01 00:12:15,2021-06-01 00:15:28,0.0,0.90,1.0,N,138,70,2.0,5.00,1.75,0.5,0.00,0.00,0.3,7.55,0.0
3,1.0,2021-06-01 00:35:00,2021-06-01 00:53:17,0.0,11.20,1.0,N,138,189,1.0,31.50,1.75,0.5,8.50,0.00,0.3,42.55,0.0
4,2.0,2021-06-01 00:31:01,2021-06-01 00:52:27,1.0,9.49,1.0,N,138,142,1.0,28.50,0.50,0.5,7.77,6.55,0.3,47.87,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2834259,,2021-06-23 07:33:56,2021-06-23 07:42:23,,1.60,,,50,164,,8.00,0.00,0.5,1.13,0.00,0.3,12.43,2.5
2834260,,2021-06-23 07:19:00,2021-06-23 08:03:00,,15.25,,,123,162,,43.91,2.75,0.5,0.00,6.55,0.3,54.01,0.0
2834261,,2021-06-23 07:06:58,2021-06-23 08:06:42,,6.27,,,265,181,,39.98,0.00,0.5,0.00,0.00,0.3,40.78,0.0
2834262,,2021-06-23 07:06:38,2021-06-23 08:06:15,,9.13,,,265,25,,42.03,0.00,0.5,0.00,0.00,0.3,42.83,0.0


We should fix the mixed dtypes, so let's start by inspecting the dtype of each column.

In [50]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

VendorID                 float64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count          float64
trip_distance            float64
RatecodeID               float64
store_and_fwd_flag        object
PULocationID               int64
DOLocationID               int64
payment_type             float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
congestion_surcharge     float64
dtype: object

## Transforms
### Change the date columns from string to datetime format

In [51]:
# Parse both trip timestamp columns from strings into datetime values.
for timestamp_col in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

# Show the dtypes again so the converted columns are visible.
df.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                    float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

In [52]:
# Preview the first two rows after the datetime conversion.
df.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-06-01 00:13:26,2021-06-01 00:17:14,1.0,0.9,1.0,N,186,50,1.0,5.0,3.0,0.5,2.2,0.0,0.3,11.0,2.5
1,1.0,2021-06-01 00:32:23,2021-06-01 01:07:04,1.0,23.0,1.0,N,132,18,2.0,61.5,1.75,0.5,0.0,6.55,0.3,70.6,0.0


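Because columns such as `VendorID`, `passenger_count`, `RatecodeID`, and `payment_type` contain missing values, pandas reads them as floats rather than integers. We could have requested a nullable integer dtype (e.g. `Int64`) instead, but float is fine here.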

### Write out cleaned file

In [14]:
# Write the cleaned frame to a gzip-compressed Parquet file in the working directory.
# NOTE(review): the file name says 2022-07, but the data loaded above comes from
# yellow_tripdata_2021-06 — confirm the intended output name.
df.to_parquet("2022-07_yellow_cleaned.parquet", compression='gzip')

The end