In [39]:
# !pip3 install plotly --quiet
# !pip3 install nbformat --quiet

In [40]:
import duckdb 
import pandas as pd
import plotly.express as px
import plotly
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

In [41]:
con = duckdb.connect()
log_db_name = 'benchmark_log_python_3_repeat_no_OOM_test.db'
attach_results = con.sql(f"ATTACH '{log_db_name}' as sqlite_db (TYPE SQLITE)")
ctas_results = con.sql("""
                       create or replace table duckdb_results as 
                       select * from sqlite_db.results
                       where 
                            NOT (benchmark = '003 Convert to Enums' and try_cast(time as float) < 0.01)""")
results = con.sql("""
    with version_ranks as (
        from duckdb_results
        select distinct
            (scenario::json ->> 'duckdb_version').replace('v','') as version,
            version.string_split('.') as split_version,
            try_cast(split_version[1] as int) * 100 * 100 as first_digit,
            try_cast(split_version[2] as int) * 100 as second_digit,
            try_cast(split_version[3] as int) as third_digit,
            first_digit + second_digit + third_digit as version_rank,
    ), max_version as (
        from version_ranks
        select 
            max(version_rank),
            arg_max(version, version_rank) as max_version
    ), median_results as (
        from duckdb_results
        select 
            run_id,
            min(repeat_id) as repeat_id,
            benchmark,
            scenario,
            median(time) as time,
        group by all            
    )
    from median_results
    join max_version on 1=1
    select 
        * exclude time,
        time as "Time (seconds)",
        (scenario::json ->> 'duckdb_version').replace('v','') as "DuckDB Version",
        case when benchmark ilike '%queries%' or benchmark ilike '%enums%' then 'Analysis'
             when benchmark ilike '%csv%' then 'CSV Import' 
             when benchmark ilike '%scan%' or benchmark ilike '%query pandas%' then 'Scan other formats' 
             when benchmark ilike '%export%' then 'Export'
        end as benchmark_type,
        case when benchmark_type = 'CSV Import' then 1
             when benchmark_type = 'Scan other formats' then 2
             when benchmark_type = 'Analysis' then 3
             when benchmark_type = 'Export' then 4
             end as benchmark_type_order,
        benchmark_type as "Benchmark Type",
        sum(time) over (partition by "DuckDB Version") as total_time_by_version,
        sum(case when (scenario::json ->> 'duckdb_version').replace('v','') = max_version.max_version then time else 0 end) over () as total_time_max_version,
        time / total_time_max_version as "Time Relative to Latest Version"
    order by benchmark, run_id, scenario, repeat_id""").df()
results.head(5)

Unnamed: 0,run_id,repeat_id,benchmark,scenario,max(version_rank),max_version,Time (seconds),DuckDB Version,benchmark_type,benchmark_type_order,Benchmark Type,total_time_by_version,total_time_max_version,Time Relative to Latest Version
0,1,0,001 Query pandas,"{""duckdb_version"": ""v0.10.2""}",1002,0.10.2,0.002314,0.10.2,Scan other formats,2.0,Scan other formats,36.54905,36.54905,6.3e-05
1,2,0,001 Query pandas,"{""duckdb_version"": ""v0.10.1""}",1002,0.10.2,0.001823,0.10.1,Scan other formats,2.0,Scan other formats,37.606047,36.54905,5e-05
2,3,0,001 Query pandas,"{""duckdb_version"": ""v0.10.0""}",1002,0.10.2,0.002377,0.10.0,Scan other formats,2.0,Scan other formats,38.160447,36.54905,6.5e-05
3,4,0,001 Query pandas,"{""duckdb_version"": ""v0.9.2""}",1002,0.10.2,0.001576,0.9.2,Scan other formats,2.0,Scan other formats,40.665796,36.54905,4.3e-05
4,5,0,001 Query pandas,"{""duckdb_version"": ""v0.9.1""}",1002,0.10.2,0.001964,0.9.1,Scan other formats,2.0,Scan other formats,41.170681,36.54905,5.4e-05


In [42]:
test_results = con.sql("""
    with version_ranks as (
        from duckdb_results
        select distinct
            (scenario::json ->> 'duckdb_version').replace('v','') as version,
            version.string_split('.') as split_version,
            try_cast(split_version[1] as int) * 100 * 100 as first_digit,
            try_cast(split_version[2] as int) * 100 as second_digit,
            try_cast(split_version[3] as int) as third_digit,
            first_digit + second_digit + third_digit as version_rank,
    )
    from version_ranks
    select 
        max(version_rank),
        arg_max(version, version_rank) as max_version
""").df()
test_results

Unnamed: 0,max(version_rank),max_version
0,1002,0.10.2


In [43]:
fig = px.area(
    results,
    x="DuckDB Version",
    y="Time (seconds)",
    color="Benchmark Type",
    line_group="benchmark",
    category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis","Export"]},
    template='plotly_white'
)
print(plotly.io.to_json(fig, pretty=True))
fig.show()

{
  "data": [
    {
      "fillpattern": {
        "shape": ""
      },
      "hovertemplate": "Benchmark Type=CSV Import\u003cbr\u003ebenchmark=002 Create table from csv\u003cbr\u003eDuckDB Version=%{x}\u003cbr\u003eTime (seconds)=%{y}\u003cextra\u003e\u003c\u002fextra\u003e",
      "legendgroup": "CSV Import",
      "line": {
        "color": "#636efa"
      },
      "marker": {
        "symbol": "circle"
      },
      "mode": "lines",
      "name": "CSV Import",
      "orientation": "v",
      "showlegend": true,
      "stackgroup": "1",
      "x": [
        "0.10.2",
        "0.10.1",
        "0.10.0",
        "0.9.2",
        "0.9.1",
        "0.9.0",
        "0.8.1",
        "0.7.1",
        "0.6.1",
        "0.5.1",
        "0.4.0",
        "0.3.4",
        "0.3.2",
        "0.3.1",
        "0.3.0",
        "0.2.9",
        "0.2.8",
        "0.2.7"
      ],
      "xaxis": "x",
      "y": [
        1.4280079589999986,
        1.4027828340000035,
        1.4081058749999968,
     

In [44]:
fig = px.area(
    results,
    x="DuckDB Version",
    y="Time Relative to Latest Version",
    color="Benchmark Type",
    line_group="benchmark",
    category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis","Export"]},
    template='plotly_white'
)
fig.show()

In [45]:
# Pandas or Arrow
# Pick either Arrow or Pandas for import/export depending on which is faster for each version
pandas_or_arrow = con.sql("""
    with version_ranks as (
        from duckdb_results
        select distinct
            (scenario::json ->> 'duckdb_version').replace('v','') as version,
            version.string_split('.') as split_version,
            try_cast(split_version[1] as int) * 100 * 100 as first_digit,
            try_cast(split_version[2] as int) * 100 as second_digit,
            try_cast(split_version[3] as int) as third_digit,
            first_digit + second_digit + third_digit as version_rank,
    ), max_version as (
        from version_ranks
        select 
            max(version_rank),
            arg_max(version, version_rank) as max_version
    ), median_results as (
        from duckdb_results
        select 
            run_id,
            min(repeat_id) as repeat_id,
            benchmark,
            scenario,
            (scenario::json ->> 'duckdb_version').replace('v','') as version,
            median(time) as time,
        where 
            benchmark != '001 Query pandas'
        group by all            
    )
    from median_results
    join max_version on 1=1
    join version_ranks on median_results.version = version_ranks.version 
    select 
        median_results.* exclude (time, version, benchmark),
        benchmark.replace('Windowing performance test: ','') as benchmark,
        version_ranks.version_rank,
        time as "Time (seconds)",
        version_ranks.version as "DuckDB Version",
        case when (benchmark ilike '%queries%' or benchmark ilike '%enums%') and benchmark ilike '%join%' then 'Analysis - Join'
             when (benchmark ilike '%queries%' or benchmark ilike '%enums%') and benchmark not ilike '%join%' then 'Analysis - Group By'
             when benchmark ilike '%csv%' then 'CSV Import' 
             when benchmark ilike '%scan%' or benchmark ilike '%query pandas%' then 'Scan other formats' 
             when benchmark ilike '%export%' then 'Export'
             when benchmark ilike '%window%' then 'Window Functions' 
        end as benchmark_type,
        case when benchmark_type = 'CSV Import' then 1
             when benchmark_type = 'Scan other formats' then 2
             when benchmark_type = 'Analysis - Group By' then 3
             when benchmark_type = 'Analysis - Join' then 4
             when benchmark_type = 'Window Functions' then 5
             when benchmark_type = 'Export' then 6
             end as benchmark_type_order,
        benchmark_type as "Benchmark Type",
        sum(time) over (partition by "DuckDB Version") as total_time_by_version,
        sum(case when (scenario::json ->> 'duckdb_version').replace('v','') = max_version.max_version then time else 0 end) over () as total_time_max_version,
        time / total_time_max_version as "Time Relative to Latest Version",
        sum(case when (scenario::json ->> 'duckdb_version').replace('v','') = max_version.max_version then time else 0 end) over (partition by benchmark_type) as benchmark_type_time_max_version,
        time / benchmark_type_time_max_version as "Time Relative to Latest Version by Type",
        sum(case when (scenario::json ->> 'duckdb_version').replace('v','') = max_version.max_version then time else 0 end) over (partition by benchmark_type, benchmark) as benchmark_time_max_version,
        time / benchmark_time_max_version as "Time Relative to Latest Version by Benchmark",
        case when benchmark ilike '%pandas%' then 'Pandas' 
             when benchmark ilike '%arrow%' then 'Arrow'
             end as pandas_or_arrow
    where 
        (version_rank < 501 and (pandas_or_arrow is null OR pandas_or_arrow = 'Pandas'))
        OR (version_rank >= 501 and (pandas_or_arrow is null OR pandas_or_arrow = 'Arrow'))
    order by benchmark, run_id, version_rank, repeat_id""").df()
pandas_or_arrow.head()

Unnamed: 0,run_id,repeat_id,scenario,benchmark,version_rank,Time (seconds),DuckDB Version,benchmark_type,benchmark_type_order,Benchmark Type,total_time_by_version,total_time_max_version,Time Relative to Latest Version,benchmark_type_time_max_version,Time Relative to Latest Version by Type,benchmark_time_max_version,Time Relative to Latest Version by Benchmark,pandas_or_arrow
0,1,0,"{""duckdb_version"": ""v0.10.2""}",002 Create table from csv,1002,1.428008,0.10.2,CSV Import,1,CSV Import,29.487007,29.487007,0.048428,5.166436,0.276401,1.428008,1.0,
1,2,0,"{""duckdb_version"": ""v0.10.1""}",002 Create table from csv,1001,1.402783,0.10.1,CSV Import,1,CSV Import,30.099737,29.487007,0.047573,5.166436,0.271518,1.428008,0.982335,
2,3,0,"{""duckdb_version"": ""v0.10.0""}",002 Create table from csv,1000,1.408106,0.10.0,CSV Import,1,CSV Import,30.631541,29.487007,0.047753,5.166436,0.272549,1.428008,0.986063,
3,4,0,"{""duckdb_version"": ""v0.9.2""}",002 Create table from csv,902,1.881047,0.9.2,CSV Import,1,CSV Import,33.690451,29.487007,0.063792,5.166436,0.36409,1.428008,1.317253,
4,5,0,"{""duckdb_version"": ""v0.9.1""}",002 Create table from csv,901,1.888766,0.9.1,CSV Import,1,CSV Import,34.045396,29.487007,0.064054,5.166436,0.365584,1.428008,1.322658,


In [46]:
# We need to fix plotly's ordering issue when we filter down to just analysis
pandas_or_arrow_with_placeholders = con.sql("""
    
    with version_ranks as (
        from duckdb_results
        select distinct
            (scenario::json ->> 'duckdb_version').replace('v','') as version,
            version.string_split('.') as split_version,
            try_cast(split_version[1] as int) * 100 * 100 as first_digit,
            try_cast(split_version[2] as int) * 100 as second_digit,
            try_cast(split_version[3] as int) as third_digit,
            first_digit + second_digit + third_digit as version_rank,
    ), placeholder as (
        from duckdb_results
        join version_ranks on version_ranks.version = (duckdb_results.scenario::json ->> 'duckdb_version').replace('v','')
        join (select distinct "Benchmark Type" from pandas_or_arrow) benchmark_types on 1=1
        select 
            0 as run_id,
            min(repeat_id) as repeat_id,
            '000 Placeholder for charting' as benchmark,
            scenario,
            (scenario::json ->> 'duckdb_version').replace('v','') as "DuckDB Version",
            0 as "Time (seconds)",
            0 as "Time Relative to Latest Version",   
            version_ranks.version_rank,
            "Benchmark Type" 
        group by all
    ), unioned_with_placeholder as (
        from pandas_or_arrow
        union all by name
        from placeholder   
    )
    from unioned_with_placeholder
    order by benchmark, run_id, version_rank, repeat_id
              
""").df()
# pandas_or_arrow_with_placeholders

In [47]:
fig = px.area(
    pandas_or_arrow_with_placeholders,
    x="DuckDB Version",
    y="Time (seconds)",
    color="Benchmark Type",
    line_group="benchmark",
    category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
    template='plotly_white',
    color_discrete_sequence=px.colors.qualitative.T10,
)
# print(plotly.io.to_json(fig, pretty=True))
fig.show()

fig = px.area(
    pandas_or_arrow_with_placeholders,
    x="DuckDB Version",
    y="Time Relative to Latest Version",
    color="Benchmark Type",
    line_group="benchmark",
    category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
    template='plotly_white',
    color_discrete_sequence=px.colors.qualitative.T10,
)
fig.show()



In [48]:
benchmark_types = ["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]
for benchmark_type in benchmark_types:
    fig = px.line(
        con.sql(f"""from pandas_or_arrow_with_placeholders where "Benchmark Type" = '{benchmark_type}'""").df(),
        x="DuckDB Version",
        y="Time Relative to Latest Version by Benchmark",
        color="benchmark",
        # line_group="benchmark",
        # category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
        template='plotly_white',
        color_discrete_sequence=px.colors.qualitative.T10,
        title=benchmark_type,
    )
    fig.show()

In [53]:
benchmark_types = ["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]
for benchmark_type in benchmark_types:
    fig = px.area(
        con.sql(f"""from pandas_or_arrow_with_placeholders where "Benchmark Type" = '{benchmark_type}'""").df(),
        x="DuckDB Version",
        y="Time Relative to Latest Version by Type",
        color="benchmark",
        # line_group="benchmark",
        # category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
        template='plotly_white',
        color_discrete_sequence=px.colors.qualitative.T10,
        title=benchmark_type,
    )
    fig.show()

- Group by
    - 0.3.1: 1.28 - .48 improvement
        - Default multithreading (https://github.com/duckdb/duckdb/pull/2393)
        - Push based execution (https://github.com/duckdb/duckdb/pull/2393)
    - 0.6.1:  0.32 to 0.1 improvement
        - Parallel data loading (https://github.com/duckdb/duckdb/pull/5082)
        - Patas floating point compression (https://github.com/duckdb/duckdb/pull/5044)
    - 0.8.1: from .1 to .8 improvement
        - Avoid unnecessary resizing of hash tables (https://github.com/duckdb/duckdb/pull/6877)
    - 0.10.0: from .8 to .75
- Join
    - 0.6.1: 2.2 to 0.35
        - Improvements to out of core hash join (https://github.com/duckdb/duckdb/pull/4970)
        - Parallel data loading (https://github.com/duckdb/duckdb/pull/5082)
        - Patas floating point compression (https://github.com/duckdb/duckdb/pull/5044)
    - 0.7.1: 0.35 to 0.2
        - *TODO NEED TO ADD THINGS HERE*
    - (then gave back ground to 0.27 up through 0.9.2)
    - 0.10.0: 0.27 to 0.2
- 008 Scan and Aggregate over Parquet file
    - 0.3.1: .005 to .001 
        - Default multithreading (https://github.com/duckdb/duckdb/pull/2393)
        - Push based execution (https://github.com/duckdb/duckdb/pull/2393)
- Create table from csv
    - 0.8.1: 1.2 to 0.4
        - Parallel CSV Reader as default (https://github.com/duckdb/duckdb/pull/6977)
    - 0.10.0: 0.36 to 0.28
- Scan and aggregate over Pandas
    - 668 - 271
- Should tell the Pandas vs. Arrow story (0.5.1)
- Should tell the Enums story (0.6.1)
- I need to do this for window functions
- Benchmark the connection time?
    - Lazily load row groups from tables: 100x benefit with 770 million rows: https://github.com/duckdb/duckdb/pull/6715

In [49]:
# Also plot by date
from datetime import datetime
versions = {
    # 0.2.7 is the first with MacOS ARM
    '0.2.7': {'date':datetime.strptime('2021-06-14','%Y-%m-%d')},
    '0.2.8': {'date':datetime.strptime('2021-08-02','%Y-%m-%d')},
    '0.2.9': {'date':datetime.strptime('2021-09-06','%Y-%m-%d')},
    '0.3.0': {'date':datetime.strptime('2021-10-06','%Y-%m-%d')},
    '0.3.1': {'date':datetime.strptime('2021-11-16','%Y-%m-%d')},
    '0.3.2': {'date':datetime.strptime('2022-02-07','%Y-%m-%d')},
    # 0.3.3 did not upload to pip correctly so it should be skipped
    # '0.3.3': {'date':datetime.strptime('2022-04-11','%Y-%m-%d')},
    '0.3.4': {'date':datetime.strptime('2022-04-25','%Y-%m-%d')},
    '0.4.0': {'date':datetime.strptime('2022-06-20','%Y-%m-%d')},
    '0.5.1': {'date':datetime.strptime('2022-09-19','%Y-%m-%d')},
    '0.6.1': {'date':datetime.strptime('2022-12-06','%Y-%m-%d')},
    '0.7.1': {'date':datetime.strptime('2023-02-27','%Y-%m-%d')},
    '0.8.1': {'date':datetime.strptime('2023-06-13','%Y-%m-%d')},
    '0.9.0': {'date':datetime.strptime('2023-09-26','%Y-%m-%d')},
    '0.9.1': {'date':datetime.strptime('2023-10-11','%Y-%m-%d')},
    '0.9.2': {'date':datetime.strptime('2023-11-14','%Y-%m-%d')},
    '0.10.0': {'date':datetime.strptime('2024-02-13','%Y-%m-%d')},
    '0.10.1': {'date':datetime.strptime('2024-03-18','%Y-%m-%d')},
    '0.10.2': {'date':datetime.strptime('2024-04-17','%Y-%m-%d')},
}
versions_df = pd.DataFrame([versions])
# versions_df
pandas_or_arrow_with_dates = con.sql("""
    with unpivoted as (
        unpivot versions_df
        on columns(*)
        into name version
             value date_struct
    ), version_to_date_map as (
        from unpivoted 
        select 
            version,
            date_struct.date as "Release Date"
    )
    from pandas_or_arrow_with_placeholders data
    left join version_to_date_map on data."DuckDB Version" = version_to_date_map.version
    select 
        data.*,
        version_to_date_map.* exclude version
        
""").df()
# pandas_or_arrow_with_dates

In [50]:
fig = px.area(
    pandas_or_arrow_with_dates,
    x="Release Date",
    y="Time (seconds)",
    color="Benchmark Type",
    line_group="benchmark",
    category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
    template='plotly_white',
    color_discrete_sequence=px.colors.qualitative.T10,
    hover_data=["DuckDB Version"]
)
# print(plotly.io.to_json(fig, pretty=True))
# fig.update_layout(xaxis2= {'anchor': 'y', 'overlaying': 'x', 'side': 'top'})
# print(fig.data[1])
# fig.data[1].update(xaxis='x2')
fig.show()

fig = px.area(
    pandas_or_arrow_with_dates,
    x="Release Date",
    y="Time Relative to Latest Version",
    color="Benchmark Type",
    line_group="benchmark",
    category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
    template='plotly_white',
    color_discrete_sequence=px.colors.qualitative.T10,
    hover_data=["DuckDB Version"]
)
fig.show()

In [51]:
benchmark_types = ["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]
for benchmark_type in benchmark_types:
    fig = px.line(
        con.sql(f"""from pandas_or_arrow_with_dates where "Benchmark Type" = '{benchmark_type}'""").df(),
        x="Release Date",
        y="Time Relative to Latest Version by Benchmark",
        color="benchmark",
        # line_group="benchmark",
        # category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
        template='plotly_white',
        color_discrete_sequence=px.colors.qualitative.T10,
        title=benchmark_type,
    )
    fig.show()

In [54]:
benchmark_types = ["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]
for benchmark_type in benchmark_types:
    fig = px.area(
        con.sql(f"""from pandas_or_arrow_with_dates where "Benchmark Type" = '{benchmark_type}'""").df(),
        x="Release Date",
        y="Time Relative to Latest Version by Type",
        color="benchmark",
        # line_group="benchmark",
        # category_orders={"Benchmark Type":["CSV Import","Scan other formats","Analysis - Group By", 'Analysis - Join', 'Window Functions', "Export"]},
        template='plotly_white',
        color_discrete_sequence=px.colors.qualitative.T10,
        title=benchmark_type,
    )
    fig.show()

In [52]:
# Want to show a table of the various configurations (Pandas vs. Arrow, Enums or not, etc.)
# Use the DuckDB CLI to generate Markdown
import subprocess 
import os
duckdb_location = './duckdb_0_9_0'
sql_location = './benchmark_configs.sql'
versions_with_enums = ['0.6.1', '0.7.1', '0.8.1', '0.9.0', '0.9.1', '0.9.2', '0.10.0', '0.10.1', '0.10.2']
versions_using_pandas  = ['0.2.7', '0.2.8', '0.2.9', '0.3.0', '0.3.1', '0.3.2', '0.3.4', '0.4.0']
versions_using_pyarrow = ['0.5.1', '0.6.1', '0.7.1', '0.8.1', '0.9.0', '0.9.1', '0.9.2', '0.10.0', '0.10.1', '0.10.2']

# versions_without_enums_pre_pandas = []
# for version in versions_without_enums:
#     versions_without_enums_pre_pandas.append({'duckdb_version_without_enums':version})
# versions_without_enums_df = pd.DataFrame(versions_without_enums_pre_pandas)
# print(versions_without_enums_df)

# versions_without_pyarrow_pre_pandas = []
# for version in versions_without_pyarrow:
#     versions_without_pyarrow_pre_pandas.append({'duckdb_version_without_pyarrow':version})
# versions_without_pyarrow_df = pd.DataFrame(versions_without_pyarrow_pre_pandas)
# print(versions_without_pyarrow_df)

if os.path.exists(sql_location):
    os.remove(sql_location)
with open(sql_location, mode='w') as f:
    f.write(f""".mode markdown
            with pandas as (
                SELECT UNNEST({versions_using_pandas}) as versions_using_pandas
            ), enums as (
                SELECT UNNEST({versions_with_enums}) as versions_with_enums
            ), pyarrow as (
                SELECT UNNEST({versions_using_pyarrow}) as versions_using_pyarrow
            ), all_versions as (
                from pandas select distinct versions_using_pandas as duckdb_version
                union
                from enums select distinct versions_with_enums as duckdb_version
                union 
                from pyarrow select distinct *
            )
            from all_versions a
            left join pandas on a.duckdb_version = pandas.versions_using_pandas 
            left join pyarrow on a.duckdb_version = pyarrow.versions_using_pyarrow
            left join enums on a.duckdb_version = enums.versions_with_enums
            select 
                a.*,
                a.duckdb_version.string_split('.') as split_version,
                try_cast(split_version[1] as int) * 100 * 100 as first_digit,
                try_cast(split_version[2] as int) * 100 as second_digit,
                try_cast(split_version[3] as int) as third_digit,
                first_digit + second_digit + third_digit as version_rank,
                pandas.*,
                pyarrow.*,
                enums.*,
            order by 
                version_rank
""")
result = subprocess.run([duckdb_location, ':memory:', "-c",f""".read {sql_location}"""], capture_output=True, text=True)
# print(result)
print('result.stdout:\n'+result.stdout)
print('result.stderr:\n'+result.stderr)

result.stdout:
| duckdb_version | split_version | first_digit | second_digit | third_digit | version_rank | versions_using_pandas | versions_using_pyarrow | versions_with_enums |
|----------------|---------------|-------------|--------------|-------------|--------------|-----------------------|------------------------|---------------------|
| 0.2.7          | [0, 2, 7]     | 0           | 200          | 7           | 207          | 0.2.7                 |                        |                     |
| 0.2.8          | [0, 2, 8]     | 0           | 200          | 8           | 208          | 0.2.8                 |                        |                     |
| 0.2.9          | [0, 2, 9]     | 0           | 200          | 9           | 209          | 0.2.9                 |                        |                     |
| 0.3.0          | [0, 3, 0]     | 0           | 300          | 0           | 300          | 0.3.0                 |                        |                     |
|

## Brainstorming
- Total time area plot
- Relative time area plot (with current as 1, scaling the rest off of that)
    - Grouping related operations together
        - Maybe grouping by type (export to pandas)
        - Maybe grouping by task (group by tests)
        - Group by "importing, querying, exporting" to show the whole burger idea
    - Likely need to filter things down
    - Choosing between Pandas and Arrow whenever Arrow was faster
- Annotating the charts to show what improved with each version

- Max data scale plot
    - Likely need more than just 2 datasets (10 million, 100 million, 1 billion, 10 billion?) Likely unable to generate 10 billion locally with this script.
    - Probably want a column chart with group by and join as the 2 types of bars (alternating group by / join for each version showing the max it can handle? Or maybe a line chart)
