In [None]:
import numpy as np
import pandas as pd
import time
import geopandas as gpd
from shapely import geometry
from pyproj import CRS
from pyproj import Transformer


In [None]:
# Load the raw Citi Bike trip records for February 2019.
# Every column is read as a string (dtype=str), so later cells must convert
# numeric / datetime columns explicitly.
path = '../data/201902-citibike-tripdata.csv'
bike_data = pd.read_csv(path, dtype=str)

# Columns that are not needed for the flow analysis in this notebook.
# (Some of them may become useful later.)
unused_columns = [
    'tripduration',
    'start station id',
    'start station name',
    'end station id',
    'end station name',
    'bikeid',
    'usertype',
    'birth year',
    'gender',
]
simplified_bike_data = bike_data.drop(columns=unused_columns)

simplified_bike_data

In [None]:
# Keep only trips that both start and stop strictly inside the window
# [first day of the month 00:00:00, first day of the month 03:59:59].
# The timestamp columns are still strings here, so the comparison is lexicographic.
year = 2019
month = 2
window_open = f"{year}-%02d-01 00:00:00" % month
window_close = f"{year}-%02d-01 03:59:59" % month

trip_start = simplified_bike_data["starttime"]
trip_stop = simplified_bike_data["stoptime"]
inside_window = (
    (trip_start > window_open)
    & (trip_stop > window_open)
    & (trip_start < window_close)
    & (trip_stop < window_close)
)
simplified_bike_data = simplified_bike_data[inside_window]

In [None]:
# Display the current contents of simplified_bike_data
# (in notebook order this follows the time-range filter cell).
simplified_bike_data

In [None]:
import csv

# Coordinate systems: station coordinates are WGS84 (EPSG:4326); the zone
# polygons are matched in EPSG:2263, so points are transformed into that CRS
# before the point-in-polygon test.
crs_WGS84 = CRS.from_epsg(4326)
crs_taxi_zones = CRS.from_epsg(2263)
transformer = Transformer.from_crs(crs_WGS84, crs_taxi_zones)

taxi_zones = gpd.read_file("../data-NYCZones/zones/taxi_zones.shp")

# Read the first column of the Manhattan zone list. The file is opened in a
# `with` block so the handle is closed even if parsing fails (the previous
# version never closed it).
with open('../data-NYCZones/zones/manhattan_zones.csv', encoding='utf-8', newline='') as manhattan_zones_file:
    manhattan_zones_reader = csv.reader(manhattan_zones_file)
    next(manhattan_zones_reader, None)  # skip the header row (no-op on an empty file)
    manhattan_zones_numbers = [record[0] for record in manhattan_zones_reader]

# NOTE(review): the first data row is dropped in addition to the header row.
# This is preserved from the original code; confirm that the CSV really has a
# second non-data line, otherwise one Manhattan zone is silently excluded.
manhattan_zones_numbers = manhattan_zones_numbers[1:]
manhattan_zones_numbers = [int(zone_number) for zone_number in manhattan_zones_numbers]

# Restrict the zone polygons to the LocationIDs listed in the CSV.
manhattan_zones = taxi_zones[taxi_zones['LocationID'].isin(manhattan_zones_numbers)]
manhattan_zones

In [None]:
def _find_zone_location_id(latitude, longitude):
    """Return the LocationID of the Manhattan zone containing a WGS84 point.

    Parameters
    ----------
    latitude, longitude
        Station coordinates. The trip CSV is read with ``dtype=str``, so the
        values are converted with ``float`` before being transformed.

    Returns
    -------
    int or None
        The ``LocationID`` of the first zone in ``manhattan_zones`` whose
        geometry contains the point, or ``None`` when no zone contains it.
    """
    x, y = transformer.transform(float(latitude), float(longitude))
    point = geometry.Point(x, y)  # built once instead of once per zone
    for _, zone in manhattan_zones.iterrows():
        if point.within(zone['geometry']):
            # Use the zone's own LocationID. The previous `index + 1` is only
            # correct if the shapefile's row order happens to line up with
            # LocationID, which nothing in this notebook guarantees.
            return int(zone['LocationID'])
    return None


def get_PULocationID(row):
    """Return the zone LocationID for the trip's start station (None if outside Manhattan zones)."""
    return _find_zone_location_id(row['start station latitude'], row['start station longitude'])


def get_DOLocationID(row):
    """Return the zone LocationID for the trip's end station (None if outside Manhattan zones)."""
    return _find_zone_location_id(row['end station latitude'], row['end station longitude'])
        
# Attach pick-up / drop-off zone ids to every trip (row-wise lookup).
# Trips whose station lies outside every Manhattan zone get a missing value.
pickup_ids = simplified_bike_data.apply(get_PULocationID, axis=1)
simplified_bike_data = simplified_bike_data.assign(PULocationID=pickup_ids)
dropoff_ids = simplified_bike_data.apply(get_DOLocationID, axis=1)
simplified_bike_data = simplified_bike_data.assign(DOLocationID=dropoff_ids)
simplified_bike_data

In [None]:
# Remove rows with any missing value (e.g. trips without a matched zone),
# then drop the raw station coordinates, which are no longer needed.
coordinate_columns = [
    'start station latitude',
    'start station longitude',
    'end station latitude',
    'end station longitude',
]
simplified_bike_data = (
    simplified_bike_data
    .dropna(how='any')
    .drop(columns=coordinate_columns)
)
simplified_bike_data

In [None]:
# Cast the zone ids to 64-bit integers and parse the trip timestamps.
simplified_bike_data = simplified_bike_data.astype(
    {'DOLocationID': 'int64', 'PULocationID': 'int64'}
)
for timestamp_column in ('starttime', 'stoptime'):
    simplified_bike_data[timestamp_column] = pd.to_datetime(simplified_bike_data[timestamp_column])
simplified_bike_data

In [None]:
# Turn the parsed timestamps back into strings so they serialise as text in JSON.
simplified_bike_data = simplified_bike_data.assign(
    starttime=simplified_bike_data['starttime'].astype(str),
    stoptime=simplified_bike_data['stoptime'].astype(str),
)

# Inflow keeps the arrival side of each trip (start time and pick-up zone removed);
# outflow keeps the departure side (stop time and drop-off zone removed).
bike_inflow = simplified_bike_data.drop(columns=['starttime', 'PULocationID'])
bike_outflow = simplified_bike_data.drop(columns=['stoptime', 'DOLocationID'])

In [None]:
# Give both flow tables the same column names: `time` and `region_id`.
# NOTE(review): the Flink DDL cells in this notebook declare the Kafka tables
# with the un-renamed names (stoptime/DOLocationID, starttime/PULocationID);
# confirm which set of JSON keys the consumers actually expect.
inflow_column_names = {'stoptime': 'time', 'DOLocationID': 'region_id'}
outflow_column_names = {'starttime': 'time', 'PULocationID': 'region_id'}

bike_inflow = bike_inflow.rename(columns=inflow_column_names)
bike_outflow = bike_outflow.rename(columns=outflow_column_names)

In [None]:
# Split each flow into a "preloaded" part (strictly before the cut-off) and a
# "remaining" part (at or after the cut-off). The `time` column holds strings,
# so the comparison is lexicographic.
#
# The remaining part uses >= so that the two parts form a complete partition:
# with the previous strict `>` a row whose time equalled the cut-off exactly
# belonged to neither part and was silently lost.
PRELOAD_CUTOFF = "2019-02-01 01:00:06"

preloaded_bike_inflow = bike_inflow[bike_inflow["time"] < PRELOAD_CUTOFF]
preloaded_bike_outflow = bike_outflow[bike_outflow["time"] < PRELOAD_CUTOFF]

remaining_bike_inflow = bike_inflow[bike_inflow["time"] >= PRELOAD_CUTOFF]
remaining_bike_outflow = bike_outflow[bike_outflow["time"] >= PRELOAD_CUTOFF]

In [None]:
# Display the outflow records selected for the paced replay (the "remaining" split).
remaining_bike_outflow

In [None]:
from kafka import KafkaProducer

# Publish the preloaded records to Kafka in one burst (no pacing).
# NOTE(review): nothing is published to the 'bike_origin' topic, although the
# manhattan_bike Flink table in this notebook reads from it — confirm whether
# that topic is still needed.
producer = KafkaProducer(bootstrap_servers='localhost:9092')

try:
    # The inflow and outflow frames are filtered on different time columns, so
    # they generally have different lengths. Each frame is bounded by its own
    # length; the previous loop used the inflow length for both, which skipped
    # outflow rows or raised IndexError depending on which frame was longer.
    inflow_count = preloaded_bike_inflow.shape[0]
    outflow_count = preloaded_bike_outflow.shape[0]
    N = max(inflow_count, outflow_count)
    for i in range(N):
        if i < inflow_count:
            inflow_string = preloaded_bike_inflow.iloc[i].to_json()
            producer.send('bike_inflow', inflow_string.encode('utf-8'))
        if i < outflow_count:
            outflow_string = preloaded_bike_outflow.iloc[i].to_json()
            producer.send('bike_outflow', outflow_string.encode('utf-8'))

    # send() is asynchronous: wait until every buffered message is delivered.
    producer.flush()
finally:
    # Release the connection; later cells create their own producer.
    producer.close()

In [None]:
# Replay the remaining records to Kafka, one inflow/outflow pair per second.
producer = KafkaProducer(bootstrap_servers='localhost:9092')

try:
    # The two frames are filtered on different time columns and can differ in
    # length. Each is bounded by its own length; indexing the outflow frame
    # with the inflow length raised IndexError when inflow was the longer one.
    inflow_count = remaining_bike_inflow.shape[0]
    outflow_count = remaining_bike_outflow.shape[0]
    N = max(inflow_count, outflow_count)
    for i in range(N):
        if i < inflow_count:
            inflow_string = remaining_bike_inflow.iloc[i].to_json()
            producer.send('bike_inflow', inflow_string.encode('utf-8'))
        if i < outflow_count:
            outflow_string = remaining_bike_outflow.iloc[i].to_json()
            producer.send('bike_outflow', outflow_string.encode('utf-8'))
        time.sleep(1)  # pace the replay at one step per second

    producer.flush()  # Wait for any outstanding messages to be transmitted and delivery acknowledgments received
finally:
    # Always release the connection, even if the replay is interrupted.
    producer.close()

In [None]:
-- Flink SQL source table over the Kafka topic 'bike_origin', holding full trip
-- records: both timestamps, both station coordinates and both zone ids.
-- NOTE(review): the producer cells in this notebook only publish to
-- 'bike_inflow' and 'bike_outflow' (the 'bike_origin' send is commented out),
-- so this topic may be empty — confirm.
CREATE TABLE manhattan_bike (
	starttime TIMESTAMP, 
	stoptime TIMESTAMP, 
	`start station latitude` FLOAT, 
	`start station longitude` FLOAT, 
	`end station latitude` FLOAT, 
	`end station longitude` FLOAT, 
	`PULocationID` BIGINT, 
	`DOLocationID` BIGINT
) WITH (
    'connector' = 'kafka',  -- using kafka connector
    'topic' = 'bike_origin',  -- kafka topic
    'scan.startup.mode' = 'earliest-offset',  -- reading from the beginning
    'properties.bootstrap.servers' = 'localhost:9092',  -- kafka broker address
    'format' = 'json'  -- the data format is json
);

-- Flink SQL source table over the Kafka topic 'bike_inflow' (trip arrivals).
-- stoptime is the event-time attribute; its watermark trails it by 5 seconds.
-- NOTE(review): the producer cells rename the columns to `time` / `region_id`
-- before publishing to this topic, so the JSON keys may not match the column
-- names declared here — confirm.
CREATE TABLE bike_inflow (
	stoptime TIMESTAMP(3), 
	`DOLocationID` BIGINT, 
    WATERMARK FOR stoptime AS stoptime - INTERVAL '5' SECOND
)WITH (
    'connector' = 'kafka',  -- using kafka connector
    'topic' = 'bike_inflow',  -- kafka topic
    'scan.startup.mode' = 'earliest-offset',  -- reading from the beginning
    'properties.bootstrap.servers' = 'localhost:9092',  -- kafka broker address
    'format' = 'json'  -- the data format is json
);

-- Flink SQL source table over the Kafka topic 'bike_outflow' (trip departures).
-- starttime is the event-time attribute; its watermark trails it by 5 seconds.
-- NOTE(review): the producer cells rename the columns to `time` / `region_id`
-- before publishing to this topic, so the JSON keys may not match the column
-- names declared here — confirm.
CREATE TABLE bike_outflow (
	starttime TIMESTAMP(3), 
	`PULocationID` BIGINT, 
    WATERMARK FOR starttime AS starttime - INTERVAL '5' SECOND
)WITH (
    'connector' = 'kafka',  -- using kafka connector
    'topic' = 'bike_outflow',  -- kafka topic
    'scan.startup.mode' = 'earliest-offset',  -- reading from the beginning
    'properties.bootstrap.servers' = 'localhost:9092',  -- kafka broker address
    'format' = 'json'  -- the data format is json
);


In [None]:
-- Elasticsearch sink table with the same columns as manhattan_bike, followed
-- by the statement that copies every manhattan_bike row into it.
--
-- SECURITY: the password must not be stored in the notebook. Replace the
-- placeholder below at submission time (e.g. from a secret store) and rotate
-- the credential that was previously committed here in plain text.
CREATE TABLE bike_origin_es (
    starttime TIMESTAMP,
    stoptime TIMESTAMP,
    `start station latitude` FLOAT,
    `start station longitude` FLOAT,
    `end station latitude` FLOAT,
    `end station longitude` FLOAT,
    `PULocationID` BIGINT,
    `DOLocationID` BIGINT
) WITH (
    'connector' = 'elasticsearch-7', -- using elasticsearch connector
    'hosts' = 'https://demo0.es.asia-southeast1.gcp.elastic-cloud.com:9243',  -- elasticsearch address
    'username' = 'elastic',
    'password' = '<ELASTICSEARCH_PASSWORD>',  -- placeholder: inject the real secret at deploy time
    'index' = 'bike_origin_es'  -- elasticsearch index name, similar to database table name
);

INSERT INTO bike_origin_es (SELECT * FROM manhattan_bike);