In [45]:
import json
from kafka import KafkaProducer
from time import time
import pandas as pd

In [46]:
# Load the green-taxi trip records (October 2019, going by the file name).
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype.
df = pd.read_csv('data/green_tripdata_2019-10.csv', low_memory=False)
# Show the dtype pandas inferred for every column.
df.dtypes

VendorID                 float64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID               float64
PULocationID               int64
DOLocationID               int64
passenger_count          float64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type             float64
trip_type                float64
congestion_surcharge     float64
dtype: object

In [47]:
# Columns that are kept for the downstream steps.
req_cols = [
    'lpep_pickup_datetime', 'lpep_dropoff_datetime',
    'PULocationID', 'DOLocationID',
    'passenger_count', 'trip_distance', 'tip_amount',
]

# Report the position and name of every column that is not in req_cols.
for position, column in enumerate(df.columns):
    if column in req_cols:
        continue
    print(f'index: {position}, column: {column}')

index: 0, column: VendorID
index: 3, column: store_and_fwd_flag
index: 4, column: RatecodeID
index: 9, column: fare_amount
index: 10, column: extra
index: 11, column: mta_tax
index: 13, column: tolls_amount
index: 14, column: ehail_fee
index: 15, column: improvement_surcharge
index: 16, column: total_amount
index: 17, column: payment_type
index: 18, column: trip_type
index: 19, column: congestion_surcharge


In [48]:
# Keep only the required columns, selected by name rather than by hard-coded
# positions, so the result stays correct if the CSV column order changes.
# .copy() gives an independent frame, so later column assignments on df_filter
# do not raise SettingWithCopyWarning or touch df.
df_filter = df[req_cols].copy()

In [49]:
# Preview the first rows of the reduced frame.
df_filter.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount
0,2019-10-01 00:26:02,2019-10-01 00:39:58,112,196,1.0,5.88,0.0
1,2019-10-01 00:18:11,2019-10-01 00:22:38,43,263,1.0,0.8,0.0
2,2019-10-01 00:09:31,2019-10-01 00:24:47,255,228,2.0,7.5,0.0
3,2019-10-01 00:37:40,2019-10-01 00:41:49,181,181,1.0,0.9,0.0
4,2019-10-01 00:08:13,2019-10-01 00:17:56,97,188,1.0,2.52,2.26


In [50]:
# Inspect the dtypes of the remaining columns.
df_filter.dtypes

lpep_pickup_datetime      object
lpep_dropoff_datetime     object
PULocationID               int64
DOLocationID               int64
passenger_count          float64
trip_distance            float64
tip_amount               float64
dtype: object

In [51]:
# Row count of the reduced frame.
len(df_filter)

476386

In [52]:
# Replace missing passenger counts with 0 so the column can be cast from
# float to int.
filled_counts = df_filter['passenger_count'].fillna(0)
df_filter['passenger_count'] = filled_counts.astype(int)

In [53]:
# Re-check the dtypes after the passenger_count cast.
df_filter.dtypes

lpep_pickup_datetime      object
lpep_dropoff_datetime     object
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
tip_amount               float64
dtype: object

In [54]:
def json_serializer(data):
    """Serialize ``data`` to a JSON document and return it as UTF-8 bytes."""
    text = json.dumps(data)
    return text.encode('utf-8')

# Kafka connection settings.
server = 'localhost:9092'
topic_name = 'green_taxis'

# Every message value is passed through json_serializer before it is sent.
producer = KafkaProducer(
    bootstrap_servers=[server],
    value_serializer=json_serializer
)

t0 = time()

try:
    # itertuples(name=None) yields one plain tuple per row, which avoids the
    # per-row Series that iterrows builds (noticeable at ~476k rows). Zipping
    # with the column names produces the same {column: value} message.
    columns = list(df_filter.columns)
    for values in df_filter.itertuples(index=False, name=None):
        producer.send(topic_name, value=dict(zip(columns, values)))

    # send() only queues records; flush() blocks until they have been sent.
    producer.flush()
finally:
    # Always release the producer, even if sending or flushing raised.
    producer.close()

t1 = time()

# Elapsed wall-clock time covers sending, flushing and closing.
took = t1 - t0
print(f'took {took:.2f} seconds')

Time taken to send and flush data: 41.758378982543945 seconds


In [55]:
# Load the taxi zone lookup table.
df_zones = pd.read_csv('data/taxi_zone_lookup.csv')

In [56]:
# Preview the first rows of the zone lookup.
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [57]:
# Display the zone lookup frame (the notebook renders a truncated view).
df_zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,,


In [68]:
# Show the zone record whose LocationID is 22.
print(df_zones.loc[df_zones['LocationID'] == 22])

    LocationID   Borough              Zone service_zone
21          22  Brooklyn  Bensonhurst West    Boro Zone


In [67]:
# Show the zone record whose LocationID is 129.
print(df_zones.loc[df_zones['LocationID'] == 129])

     LocationID Borough             Zone service_zone
128         129  Queens  Jackson Heights    Boro Zone
