In [1]:
import pandas as pd
import numpy as np
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSource
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf
import os
import time

In [2]:
# Column layout shared by the source and the sink CSV tables, as
# (column name, type factory) pairs. Both the format and the schema of both
# tables are derived from this single list, so the four declarations cannot
# drift apart.
TRIP_FIELDS = [
    ('pickup_datetime', DataTypes.STRING),
    ('dropoff_datetime', DataTypes.STRING),
    ('pickup_longitude', DataTypes.FLOAT),
    ('pickup_latitude', DataTypes.FLOAT),
    ('dropoff_longitude', DataTypes.FLOAT),
    ('dropoff_latitude', DataTypes.FLOAT),
    ('O', DataTypes.BIGINT),
    ('D', DataTypes.BIGINT),
    ('duration', DataTypes.BIGINT),
    ('weekday', DataTypes.BIGINT),
    ('day', DataTypes.BIGINT),
    ('hour', DataTypes.BIGINT),
]


def _csv_format():
    """Return an OldCsv format descriptor declaring every column in TRIP_FIELDS."""
    fmt = OldCsv()
    for column_name, make_type in TRIP_FIELDS:
        # Re-bind the result so the chaining semantics of .field() are kept.
        fmt = fmt.field(column_name, make_type())
    return fmt


def _table_schema():
    """Return a Schema descriptor declaring every column in TRIP_FIELDS."""
    schema = Schema()
    for column_name, make_type in TRIP_FIELDS:
        schema = schema.field(column_name, make_type())
    return schema


def _register_csv_table(table_env, path, table_name):
    """Register a temporary filesystem CSV table using the TRIP_FIELDS layout.

    Args:
        table_env: the StreamTableEnvironment to register the table in.
        path: filesystem path handed to the FileSystem connector.
        table_name: name under which the temporary table is created.
    """
    table_env.connect(FileSystem().path(path)) \
        .with_format(_csv_format()) \
        .with_schema(_table_schema()) \
        .create_temporary_table(table_name)


# Wall-clock start of the whole run (includes per-day setup, not just execution).
st = time.time()
for day in range(1, 32):
    # Environment setup: a new execution/table environment is built on every
    # iteration, with parallelism 1 and the two configuration overrides below.
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(env)
    t_env.get_config().get_configuration().set_string("taskmanager.memory.task.off-heap.size", '80m')
    t_env.get_config().get_configuration().set_string("python.fn-execution.arrow.batch.size", '300000')

    # Input table: the full data set.
    _register_csv_table(t_env, './data/pure_data', 'mySource')

    # Output table: one path per day.
    # NOTE(review): re-running the cell may fail if this path already exists,
    # depending on the sink's write mode -- confirm before Restart & Run All.
    _register_csv_table(t_env, './data/day_data/day_' + str(day), 'mySink')

    # Processing: keep only the rows of the current day and write them to the sink.
    t_env.from_path('mySource') \
        .where(f"day = {day}") \
        .insert_into('mySink')

    # Execute and time the job; only the execute() call is measured here.
    start_time = time.time()
    t_env.execute("job2")
    compute_time = time.time() - start_time
    print(day, compute_time, compute_time / 60)  # day, seconds, minutes

# Total elapsed time for all days, in seconds and minutes.
sum_time = time.time() - st
print(sum_time, sum_time/60)

1 22.79132390022278 0.3798553983370463
2 20.14496421813965 0.3357494036356608
3 19.688557624816895 0.3281426270802816
4 20.235044479370117 0.3372507413228353
5 20.13295316696167 0.33554921944936117
6 20.391184329986572 0.3398530721664429
7 20.594365119934082 0.343239418665568
8 20.731487035751343 0.34552478392918906
9 20.503283977508545 0.3417213996251424
10 20.800549268722534 0.3466758211453756
11 20.86460566520691 0.34774342775344846
12 20.210022687911987 0.3368337114651998
13 20.41720676422119 0.3402867794036865
14 20.504284858703613 0.34173808097839353
15 20.466250896453857 0.3411041816075643
16 20.7745258808136 0.34624209801356
17 20.616384506225586 0.3436064084370931
18 20.788537979125977 0.34647563298543294
19 20.530308485031128 0.3421718080838521
20 20.13795828819275 0.3356326381365458
21 19.51940608024597 0.3253234346707662
22 20.278082609176636 0.33796804348627724
23 20.12494707107544 0.335415784517924
24 20.73248863220215 0.34554147720336914
25 20.85759997367859 0.3476266662