In [1]:
import pandas as pd
import numpy as np
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSource
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf
import os
import time
from datetime import datetime

In [2]:
# Environment and runtime configuration.
# Creates the streaming execution environment and a table environment on top
# of it, then applies the job-level configuration overrides listed below.
# Parallelism is left at the environment default (env.set_parallelism(1) is
# intentionally not applied).
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)

# (configuration key, value) pairs, applied in this order.
flink_options = (
    ("taskmanager.memory.task.off-heap.size", '80m'),
    ("python.fn-execution.arrow.batch.size", '300000'),
)
for option_key, option_value in flink_options:
    t_env.get_config().get_configuration().set_string(option_key, option_value)

In [3]:
# Input table definition.
# Registers the CSV data under ./data/map_matching as the temporary table
# 'mySource'. The CSV format and the table schema use the same columns in the
# same order, so the column list is declared once and applied to both.
source_fields = [
    ('pickup_datetime', DataTypes.STRING()),
    ('dropoff_datetime', DataTypes.STRING()),
    ('pickup_longitude', DataTypes.FLOAT()),
    ('pickup_latitude', DataTypes.FLOAT()),
    ('dropoff_longitude', DataTypes.FLOAT()),
    ('dropoff_latitude', DataTypes.FLOAT()),
    ('O', DataTypes.BIGINT()),
    ('D', DataTypes.BIGINT()),
    ('same_od', DataTypes.BIGINT()),
    ('duration', DataTypes.BIGINT()),
    ('weekday', DataTypes.BIGINT()),
    ('day', DataTypes.BIGINT()),
    ('hour', DataTypes.BIGINT()),
]

source_format = OldCsv()
source_schema = Schema()
for field_name, field_type in source_fields:
    # .field() returns the descriptor, so re-binding keeps the chained semantics.
    source_format = source_format.field(field_name, field_type)
    source_schema = source_schema.field(field_name, field_type)

# Kept as the cell's final expression so the cell output is unchanged.
t_env.connect(FileSystem().path('./data/map_matching')) \
    .with_format(source_format) \
    .with_schema(source_schema) \
    .create_temporary_table('mySource')

<pyflink.table.descriptors.StreamTableDescriptor at 0x15eb853bdc8>

In [4]:
# Output table definition.
# Registers ./data/pure_data as the temporary table 'mySink' in CSV format.
# The CSV format and the table schema use the same columns in the same order,
# so the column list is declared once and applied to both.
sink_fields = [
    ('pickup_datetime', DataTypes.STRING()),
    ('dropoff_datetime', DataTypes.STRING()),
    ('pickup_longitude', DataTypes.FLOAT()),
    ('pickup_latitude', DataTypes.FLOAT()),
    ('dropoff_longitude', DataTypes.FLOAT()),
    ('dropoff_latitude', DataTypes.FLOAT()),
    ('O', DataTypes.BIGINT()),
    ('D', DataTypes.BIGINT()),
    ('duration', DataTypes.BIGINT()),
    ('weekday', DataTypes.BIGINT()),
    ('day', DataTypes.BIGINT()),
    ('hour', DataTypes.BIGINT()),
]

sink_format = OldCsv()
sink_schema = Schema()
for field_name, field_type in sink_fields:
    # .field() returns the descriptor, so re-binding keeps the chained semantics.
    sink_format = sink_format.field(field_name, field_type)
    sink_schema = sink_schema.field(field_name, field_type)

# Kept as the cell's final expression so the cell output is unchanged.
t_env.connect(FileSystem().path('./data/pure_data')) \
    .with_format(sink_format) \
    .with_schema(sink_schema) \
    .create_temporary_table('mySink')

<pyflink.table.descriptors.StreamTableDescriptor at 0x15eb8552188>

In [5]:
# Processing pipeline.
# Reads 'mySource', keeps only rows that pass every predicate below, projects
# the output columns (everything except same_od) and writes them to 'mySink'.
# NOTE(review): -1 looks like a "missing / unmatched" sentinel and duration is
# presumably in seconds (2 min .. 1 h) - confirm against the upstream job.
row_filters = (
    "O != -1",
    "D != -1",
    "duration >= 120",
    "duration <= 3600",
    "same_od == 0 ",
    "weekday != -1",
    "day != -1",
    "hour != -1",
)
output_columns = "pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, O, D, duration, weekday, day, hour"

filtered_trips = t_env.from_path('mySource')
for predicate in row_filters:
    # Predicates are applied one at a time, in the listed order.
    filtered_trips = filtered_trips.where(predicate)

filtered_trips.select(output_columns).insert_into('mySink')

In [6]:
# Execute the job and time it.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock, so the measured interval cannot be distorted by
# system clock adjustments while the job runs. start_time is therefore only
# meaningful as a reference point for the subtraction, not as an epoch timestamp.
start_time = time.perf_counter()
t_env.execute("job2")
compute_time = time.perf_counter() - start_time
# Prints the elapsed time in seconds, followed by the same value in minutes.
print(compute_time, compute_time / 60)

24.961636304855347 0.41602727174758913
