In [None]:
import numpy as np
import pandas as pd
import time
from kafka import KafkaProducer


In [None]:
# Load the raw yellow-taxi trip records for February 2019.
# Every column is read as a string; type conversions are left to later cells.
path = '../data/yellow_tripdata_2019-02.csv'
taxi_data = pd.read_csv(filepath_or_buffer=path, dtype=str)

# Show which columns the raw file provides.
taxi_data.columns


In [None]:
# Discard the columns that are not needed for the flow computation to simplify the problem.
# Some of these attributes may be used later.
# Each label is listed exactly once (the original list repeated 'total_amount').
unused_columns = [
    'VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
    'store_and_fwd_flag', 'payment_type', 'fare_amount', 'extra',
    'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
    'total_amount', 'congestion_surcharge',
]
# drop() returns a new frame, so taxi_data itself is left untouched.
simplified_taxi_data = taxi_data.drop(columns=unused_columns)

simplified_taxi_data


In [None]:
# Keep only the trips that both start and end inside a Manhattan zone.
# The location ids were read as strings, so convert them to integers before matching.
simplified_taxi_data['DOLocationID'] = simplified_taxi_data['DOLocationID'].astype('int64')
simplified_taxi_data['PULocationID'] = simplified_taxi_data['PULocationID'].astype('int64')

manhattan_zones = pd.read_csv("../data-NYCZones/zones/manhattan_zones.csv")
manhattan_zones_id = list(manhattan_zones["zone_id"])
in_manhattan = (
    simplified_taxi_data['DOLocationID'].isin(manhattan_zones_id)
    & simplified_taxi_data['PULocationID'].isin(manhattan_zones_id)
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of simplified_taxi_data (avoids SettingWithCopyWarning).
manhattan_taxi_data = simplified_taxi_data[in_manhattan].copy()

# Discard trips whose pickup or drop-off falls outside the target month.
# The window is half-open: [first day of the month, first day of the next month).
# That keeps trips stamped exactly at midnight on the 1st and works for any month length.
year = 2019
month = 2
month_start = f"{year}-{month:02d}-01 00:00:00"
next_month_start = f"{year + month // 12}-{month % 12 + 1:02d}-01 00:00:00"

# The timestamps are still strings here; the comparison is lexicographic.
# NOTE(review): this assumes fixed-width 'YYYY-MM-DD HH:MM:SS' values - confirm against the CSV.
pickup_time = manhattan_taxi_data["tpep_pickup_datetime"]
dropoff_time = manhattan_taxi_data["tpep_dropoff_datetime"]
in_month = (
    (pickup_time >= month_start) & (pickup_time < next_month_start)
    & (dropoff_time >= month_start) & (dropoff_time < next_month_start)
)
manhattan_taxi_data = manhattan_taxi_data[in_month].copy()

# Parse the timestamps into datetimes (hour alignment may be handled by Flink later).
manhattan_taxi_data['tpep_pickup_datetime'] = pd.to_datetime(manhattan_taxi_data['tpep_pickup_datetime'])
manhattan_taxi_data['tpep_dropoff_datetime'] = pd.to_datetime(manhattan_taxi_data['tpep_dropoff_datetime'])

In [None]:
# Order the trips by drop-off time to simulate the sequence of real-time events.
result = manhattan_taxi_data.sort_values(by=['tpep_dropoff_datetime'])

# Keep only drop-offs on 2019-02-01. The bound is the start of the next day
# (exclusive), so drop-offs in the last second of the day are kept as well.
result = result[result["tpep_dropoff_datetime"] < "2019-02-02 00:00:00"]
result

In [None]:
# Select the batch that is pushed to Kafka up front: every trip whose drop-off
# is strictly before the cutoff below (roughly the first hour of the day).
# NOTE(review): the cutoff is 01:00:06 rather than 01:00:00, so it overlaps the
# window selected for remaining_data - confirm this is intentional.
preload_cutoff = "2019-02-01 01:00:06"
is_preloaded = result["tpep_dropoff_datetime"] < preload_cutoff
pre_loaded_data = result[is_preloaded]
pre_loaded_data

In [None]:
# Select the trips replayed for testing: drop-offs strictly between 01:00:00
# and 03:00:00 (both bounds exclusive).
# Intended time scale per the original note: 1 min of data -> 1 s of replay,
# i.e. about 2 minutes in total.
# NOTE(review): the sending cell sleeps 0.1 s per row, so the real duration
# depends on the row count - confirm the pacing.
window_start = "2019-02-01 01:00:00"
window_end = "2019-02-01 03:00:00"
dropoff_time = result["tpep_dropoff_datetime"]
after_start = dropoff_time > window_start
before_end = dropoff_time < window_end
remaining_data = result[after_start & before_end]
remaining_data


In [None]:
# Split every trip into two event streams:
#   inflow  - keeps the drop-off time and drop-off location,
#   outflow - keeps the pickup time and pickup location.
pickup_columns = ['tpep_pickup_datetime', 'PULocationID']
dropoff_columns = ['tpep_dropoff_datetime', 'DOLocationID']

# Removing the pickup columns leaves the inflow view; removing the drop-off
# columns leaves the outflow view.
preloaded_taxi_inflow = pre_loaded_data.drop(columns=pickup_columns)
preloaded_taxi_outflow = pre_loaded_data.drop(columns=dropoff_columns)

remaining_taxi_inflow = remaining_data.drop(columns=pickup_columns)
remaining_taxi_outflow = remaining_data.drop(columns=dropoff_columns)

In [None]:
# Convert the event timestamp of each stream to its string form.
dropoff_as_text = {'tpep_dropoff_datetime': str}
pickup_as_text = {'tpep_pickup_datetime': str}

preloaded_taxi_inflow = preloaded_taxi_inflow.astype(dropoff_as_text)
preloaded_taxi_outflow = preloaded_taxi_outflow.astype(pickup_as_text)

remaining_taxi_inflow = remaining_taxi_inflow.astype(dropoff_as_text)
remaining_taxi_outflow = remaining_taxi_outflow.astype(pickup_as_text)

In [None]:
# Give both streams the same column names: 'time' for the event timestamp and
# 'region_id' for the zone the event belongs to.
inflow_names = {'tpep_dropoff_datetime': 'time', 'DOLocationID': 'region_id'}
outflow_names = {'tpep_pickup_datetime': 'time', 'PULocationID': 'region_id'}

preloaded_taxi_inflow = preloaded_taxi_inflow.rename(columns=inflow_names)
preloaded_taxi_outflow = preloaded_taxi_outflow.rename(columns=outflow_names)

remaining_taxi_inflow = remaining_taxi_inflow.rename(columns=inflow_names)
remaining_taxi_outflow = remaining_taxi_outflow.rename(columns=outflow_names)

In [None]:
# Order the preloaded outflow events by their (pickup) time.
preloaded_taxi_outflow = preloaded_taxi_outflow.sort_values('time')

# Add some data out of range to ensure the tumble window finishes: the first
# 10 rows of the remaining outflow are appended to the preloaded batch.
# Only the outflow stream is padded; the inflow stream is left as is.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Note: re-running this cell appends the same 10 rows again.
preloaded_taxi_outflow = pd.concat([preloaded_taxi_outflow, remaining_taxi_outflow.iloc[0:10]])
preloaded_taxi_outflow

In [None]:
# Import data to kafka: preload the first batch into the 'taxi_inflow' and
# 'taxi_outflow' topics, one JSON-encoded message per row.
producer = KafkaProducer(bootstrap_servers='localhost:9092')

try:
    # The two frames can differ in length (the outflow frame may carry extra
    # padding rows), so iterate up to the longer one and guard each side.
    # Bounding the loop by the inflow length alone would silently skip the
    # trailing outflow rows.
    n_inflow = preloaded_taxi_inflow.shape[0]
    n_outflow = preloaded_taxi_outflow.shape[0]
    for i in range(max(n_inflow, n_outflow)):
        if i < n_inflow:
            inflow_string = preloaded_taxi_inflow.iloc[i].to_json()
            producer.send('taxi_inflow', inflow_string.encode('utf-8'))
        if i < n_outflow:
            outflow_string = preloaded_taxi_outflow.iloc[i].to_json()
            producer.send('taxi_outflow', outflow_string.encode('utf-8'))

    # send() is asynchronous; wait until every buffered message is transmitted.
    producer.flush()
finally:
    # Release the connection even if sending fails or the cell is interrupted.
    producer.close()

In [None]:
# Replay the remaining trips into Kafka: one inflow message and one outflow
# message per row, with a short pause after each row.
producer = KafkaProducer(bootstrap_servers='localhost:9092')

# Pause after each row, in seconds.
SEND_INTERVAL_SECONDS = 0.1

try:
    # Both frames are derived from remaining_data, so they have the same row count.
    N = remaining_taxi_inflow.shape[0]
    for i in range(N):
        inflow_string = remaining_taxi_inflow.iloc[i].to_json()
        outflow_string = remaining_taxi_outflow.iloc[i].to_json()

        # To send a message
        producer.send('taxi_inflow', inflow_string.encode('utf-8'))
        producer.send('taxi_outflow', outflow_string.encode('utf-8'))
        time.sleep(SEND_INTERVAL_SECONDS)

    producer.flush()  # Wait for any outstanding messages to be transmitted and delivery acknowledgments received
finally:
    # Close the producer even if the loop raises or the cell is interrupted,
    # so the connection is not leaked.
    producer.close()