In [1]:
import pandas as pd
import numpy as np
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSource
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf
import os
import time

In [2]:
# Split the trip records stored under ./data/pure_data into one CSV output per
# hour of day (./data/hour_data/hour_0 ... hour_23) and print how long each
# Flink job takes, followed by the total wall-clock time.

# Column layout shared by the source table and every per-hour sink table.
# Each entry is (column name, DataTypes factory); the factory is called once per
# descriptor so every format/schema receives its own freshly built type object.
TRIP_COLUMNS = [
    ('pickup_datetime', DataTypes.STRING),
    ('dropoff_datetime', DataTypes.STRING),
    ('pickup_longitude', DataTypes.FLOAT),
    ('pickup_latitude', DataTypes.FLOAT),
    ('dropoff_longitude', DataTypes.FLOAT),
    ('dropoff_latitude', DataTypes.FLOAT),
    ('O', DataTypes.BIGINT),
    ('D', DataTypes.BIGINT),
    ('duration', DataTypes.BIGINT),
    ('weekday', DataTypes.BIGINT),
    ('day', DataTypes.BIGINT),
    ('hour', DataTypes.BIGINT),
]


def _with_trip_columns(descriptor):
    """Declare every column of TRIP_COLUMNS on ``descriptor`` and return it.

    ``descriptor`` is any object exposing ``field(name, data_type)``; in this
    cell that is either an ``OldCsv`` format or a ``Schema``. Columns are added
    in TRIP_COLUMNS order.
    """
    for column_name, make_type in TRIP_COLUMNS:
        descriptor = descriptor.field(column_name, make_type())
    return descriptor


def _register_trip_csv_table(table_env, path, table_name):
    """Register a temporary CSV file-system table that uses the trip column layout.

    Args:
        table_env: the StreamTableEnvironment to register the table in.
        path: file-system path the table reads from / writes to.
        table_name: name under which the temporary table is registered.
    """
    table_env.connect(FileSystem().path(path)) \
        .with_format(_with_trip_columns(OldCsv())) \
        .with_schema(_with_trip_columns(Schema())) \
        .create_temporary_table(table_name)


st = time.time()
for hour in range(24):
    # Environment and settings: a fresh environment is built for every hour, so
    # the temporary tables 'mySource' / 'mySink' are registered anew each time.
    env = StreamExecutionEnvironment.get_execution_environment()
    # NOTE(review): parallelism 1 presumably keeps each sink path a single file — confirm.
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(env)
    t_env.get_config().get_configuration().set_string("taskmanager.memory.task.off-heap.size", '80m')
    # NOTE(review): no Python UDF is used in this cell, so this Arrow batch-size
    # setting may be unnecessary here — confirm before removing.
    t_env.get_config().get_configuration().set_string("python.fn-execution.arrow.batch.size", '300000')

    # Create the input table (the same source path is read in every iteration).
    _register_trip_csv_table(t_env, './data/pure_data', 'mySource')

    # Create the output table for this hour.
    _register_trip_csv_table(t_env, './data/hour_data/hour_' + str(hour), 'mySink')

    # Processing pipeline: keep only the rows whose `hour` column equals the loop value.
    t_env.from_path('mySource') \
        .where(f"hour = {hour}") \
        .insert_into('mySink')

    # Execute the job and time it (seconds, then minutes).
    start_time = time.time()
    t_env.execute("job2")
    compute_time = time.time() - start_time
    print(hour, compute_time, compute_time / 60)

# Total elapsed time for all 24 jobs, including environment/table setup.
sum_time = time.time() - st
print(sum_time, sum_time/60)

0 21.798309803009033 0.3633051633834839
1 19.11193013191223 0.31853216886520386
2 18.83568525314331 0.3139280875523885
3 18.503390789031982 0.30838984648386636
4 18.044984817504883 0.3007497469584147
5 18.051990747451782 0.3008665124575297
6 18.888731956481934 0.3148121992746989
7 20.039751768112183 0.3339958628018697
8 20.539194107055664 0.3423199017842611
9 20.69132924079895 0.34485548734664917
10 20.67331290245056 0.3445552150408427
11 20.71635127067566 0.34527252117792767
12 21.019620180130005 0.35032700300216674
13 21.013179779052734 0.35021966298421225
14 21.628498554229736 0.36047497590382893
15 21.129441022872925 0.35215735038121543
16 20.381054401397705 0.33968424002329506
17 20.911524295806885 0.34852540493011475
18 21.615147590637207 0.36025245984395343
19 21.74626326560974 0.3624377210934957
20 21.41997480392456 0.3569995800654093
21 21.086679220199585 0.35144465366999306
22 20.953561305999756 0.3492260217666626
23 20.502161502838135 0.3417026917139689
491.6881830692291 8.1