4 changes: 4 additions & 0 deletions starrocks/ddl_default.sql
@@ -0,0 +1,4 @@
+CREATE TABLE bluesky (
+    `id` BIGINT AUTO_INCREMENT,
+    `data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON"
+);
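
For context, the DDL's comment references FlatJSON, StarRocks' optimization for accessing fields inside a JSON column. A minimal sketch of how the column is read, using the same get_json_* functions this PR adopts in queries.sql (paths taken from those queries):

-- Minimal sketch: field access on the JSON column declared above.
SELECT
    get_json_string(data, 'commit.collection') AS event,  -- string field
    get_json_int(data, 'time_us') AS time_us              -- integer field
FROM bluesky
LIMIT 5;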
2 changes: 1 addition & 1 deletion starrocks/ddl_lz4.sql → starrocks/ddl_materialized.sql
@@ -8,5 +8,5 @@ CREATE TABLE bluesky (
     `collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
     `did` VARCHAR(255) AS get_json_string(data, '$.did'),
     `time_us` BIGINT AS get_json_int(data, '$.time_us')
-) ENGINE=OLAP
+)
 ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`);
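
The visible tail of the renamed file shows the idea behind the "materialized" variant: JSON fields are precomputed into generated columns and used as the sort key. A hedged, self-contained sketch of the same pattern; the collapsed head of the file is not reproduced here, so the table name and exact column set below are illustrative only:

-- Hedged sketch of the generated-column pattern in ddl_materialized.sql.
-- Only the generated-column syntax is taken from the visible hunk; the rest is assumed.
CREATE TABLE bluesky_sketch (
    `data` JSON NOT NULL,
    `kind` VARCHAR(255) AS get_json_string(data, '$.kind'),
    `collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
    `time_us` BIGINT AS get_json_int(data, '$.time_us')
)
ORDER BY (`kind`, `collection`, `time_us`);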
15 changes: 0 additions & 15 deletions starrocks/ddl_zstd.sql

This file was deleted.

32 changes: 16 additions & 16 deletions starrocks/main.sh
@@ -53,30 +53,30 @@ benchmark() {

 case $CHOICE in
     2)
-        benchmark 10 lz4
-        benchmark 10 zstd
+        benchmark 10 default
+        benchmark 10 materialized
         ;;
     3)
-        benchmark 100 lz4
-        benchmark 100 zstd
+        benchmark 100 default
+        benchmark 100 materialized
         ;;
     4)
-        benchmark 1000 lz4
-        benchmark 1000 zstd
+        benchmark 1000 default
+        benchmark 1000 materialized
         ;;
     5)
-        benchmark 1 lz4
-        benchmark 1 zstd
-        benchmark 10 lz4
-        benchmark 10 zstd
-        benchmark 100 lz4
-        benchmark 100 zstd
-        benchmark 1000 lz4
-        benchmark 1000 zstd
+        benchmark 1 materialized
+        benchmark 1 default
+        benchmark 10 materialized
+        benchmark 10 default
+        benchmark 100 materialized
+        benchmark 100 default
+        benchmark 1000 materialized
+        benchmark 1000 default
         ;;
     *)
-        benchmark 1 lz4
-        benchmark 1 zstd
+        benchmark 1 materialized
+        benchmark 1 default
         ;;
 esac

10 changes: 5 additions & 5 deletions starrocks/queries.sql
@@ -1,5 +1,5 @@
-SELECT cast(data->'commit.collection' AS VARCHAR) AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
-SELECT cast(data->'commit.collection' AS VARCHAR) AS event, count() AS count, count(DISTINCT cast(data->'did' AS VARCHAR)) AS users FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') GROUP BY event ORDER BY count DESC;
-SELECT cast(data->'commit.collection' AS VARCHAR) AS event, hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
-SELECT cast(data->'$.did' as VARCHAR) as user_id, min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
-SELECT cast(data->'$.did' as VARCHAR) as user_id, date_diff('millisecond', min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))),max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
+SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
Member

We ideally update the queries and the runtime measurements in the same PR; otherwise the measurements become stale. If you would like me to run the measurements on my local machine, please let me know.

(but let's first clarify the questions in my other comment)

Contributor Author

Yes, please help me run those measurements. I'd appreciate it.
I attempted to run this benchmark in a 32-core Docker container but obtained results that differed from yours. I'll work on reproducing your results; for now, I believe it's best to rely solely on your data.

Contributor Author

BTW, I'm running on the latest version, StarRocks-3.4.1; your results seem to be from StarRocks-3.4.0. If possible, please use that version when running the measurements, thanks.

Member

Running the benchmarks right now.

Member

> BTW, I'm running on the latest version, StarRocks-3.4.1; your results seem to be from StarRocks-3.4.0. If possible, please use that version when running the measurements, thanks.

I did not choose the version intentionally; I am just testing whichever version install.sh loads.

Member

My local measurements for scale factors 1, 10, and 100 succeeded; for scale factor 1000, only a single file was processed. I think the reason was that I was experimenting with #43 in parallel ... redoing the measurements now.

Contributor Author

By default, SR ensures atomic loading of a file, meaning the load will fail if any records are unqualified. However, SR offers a max_filter_ratio parameter in Stream Load to control this behavior; consider whether you need to adjust it.
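
For illustration, a hedged sketch of passing max_filter_ratio on a Stream Load request; the FE endpoint, credentials, database/table path, and file name below are placeholders, not values from this PR's scripts:

# Hedged sketch: tolerate up to 10% filtered rows instead of failing the
# whole file. Host, port, credentials, db/table, and file name are
# hypothetical; keep whatever format headers main.sh already sends.
curl --location-trusted -u root: \
    -H "Expect:100-continue" \
    -H "format: json" \
    -H "max_filter_ratio: 0.1" \
    -T file_0001.json \
    http://localhost:8030/api/bluesky/bluesky/_stream_load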

Member

@murphyatwork I am not very familiar with StarRocks, but feel free to change the scripts so they use the setting. I could then re-benchmark.

+SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC;
+SELECT get_json_string(data, 'commit.collection') AS event, hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
+SELECT get_json_string(data, '$.did') as user_id, min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
+SELECT get_json_string(data, '$.did') as user_id, date_diff('millisecond', min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))), max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
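
For readers comparing the before and after: the old queries compared raw JSON values (e.g. data->'kind' = 'commit'), while the rewritten ones extract a VARCHAR first. A minimal side-by-side sketch, with expressions taken from this diff; both predicates should match the same rows for these string fields, though that equivalence is my reading rather than something stated in the PR:

-- Old style: the arrow operator yields a JSON value, optionally cast to VARCHAR.
SELECT count() FROM bluesky WHERE cast(data->'kind' AS VARCHAR) = 'commit';
-- New style: get_json_string extracts the field directly as a string.
SELECT count() FROM bluesky WHERE get_json_string(data, 'kind') = 'commit';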
44 changes: 22 additions & 22 deletions starrocks/queries_formatted.sql
@@ -2,7 +2,7 @@
 -- Q1 - Top event types
 ------------------------------------------------------------------------------------------------------------------------

-SELECT cast(data->'commit.collection' AS VARCHAR) AS event,
+SELECT get_json_string(data, 'commit.collection') AS event,
        count() AS count
 FROM bluesky
 GROUP BY event
@@ -12,55 +12,55 @@ ORDER BY count DESC;
 -- Q2 - Top event types together with unique users per event type
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    cast(data->'commit.collection' AS VARCHAR) AS event,
+    get_json_string(data, 'commit.collection') AS event,
     count() AS count,
-    count(DISTINCT cast(data->'did' AS VARCHAR)) AS users
+    count(DISTINCT get_json_string(data, 'did')) AS users
 FROM bluesky
-WHERE (data->'kind' = 'commit')
-  AND (data->'commit.operation' = 'create')
+WHERE (get_json_string(data, 'kind') = 'commit')
+  AND (get_json_string(data, 'commit.operation') = 'create')
 GROUP BY event
 ORDER BY count DESC;

 ------------------------------------------------------------------------------------------------------------------------
 -- Q3 - When do people use BlueSky
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    cast(data->'commit.collection' AS VARCHAR) AS event,
-    hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day,
+    get_json_string(data, 'commit.collection') AS event,
+    hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day,
     count() AS count
 FROM bluesky
-WHERE (data->'kind' = 'commit')
-  AND (data->'commit.operation' = 'create')
-  AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR)))
+WHERE (get_json_string(data, 'kind') = 'commit')
+  AND (get_json_string(data, 'commit.operation') = 'create')
+  AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
 GROUP BY event, hour_of_day
 ORDER BY hour_of_day, event;

 ------------------------------------------------------------------------------------------------------------------------
 -- Q4 - top 3 post veterans
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    cast(data->'$.did' as VARCHAR) as user_id,
-    min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date
+    get_json_string(data, '$.did') as user_id,
+    min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date
 FROM bluesky
-WHERE (data->'kind' = 'commit')
-  AND (data->'commit.operation' = 'create')
-  AND (data->'commit.collection' = 'app.bsky.feed.post')
+WHERE (get_json_string(data, 'kind') = 'commit')
+  AND (get_json_string(data, 'commit.operation') = 'create')
+  AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
 GROUP BY user_id
-ORDER BY first_post_ts ASC
+ORDER BY first_post_date ASC
 LIMIT 3;

 ------------------------------------------------------------------------------------------------------------------------
 -- Q5 - top 3 users with longest activity
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    cast(data->'$.did' as VARCHAR) as user_id,
+    get_json_string(data, '$.did') as user_id,
     date_diff('millisecond',
-        min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))),
-        max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span
+        min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))),
+        max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span
 FROM bluesky
-WHERE (data->'kind' = 'commit')
-  AND (data->'commit.operation' = 'create')
-  AND (data->'commit.collection' = 'app.bsky.feed.post')
+WHERE (get_json_string(data, 'kind') = 'commit')
+  AND (get_json_string(data, 'commit.operation') = 'create')
+  AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
 GROUP BY user_id
 ORDER BY activity_span DESC
 LIMIT 3;
@@ -1,23 +1,23 @@
 {
-  "system": "Starrocks (lz4)",
+  "system": "Starrocks (default)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
   "retains_structure": "yes",
   "tags": [
   ],
   "dataset_size": 1000000000,
-  "num_loaded_documents": null,
+  "num_loaded_documents": 804000000,
   "data_compression": "lz4",
   "total_size": null,
   "result": [
-    [null, null, null],
-    [null, null, null],
-    [null, null, null],
-    [null, null, null],
+    [2.27,1.24,1.21],
+    [17.81,10.67,10.20],
+    [7.38,6.78,7.62],
+    [7.24, null, null],
     [null, null, null]
   ]
 }
@@ -1,23 +1,23 @@
 {
-  "system": "Starrocks (zstd)",
+  "system": "Starrocks (materialized)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
   "retains_structure": "yes",
   "tags": [
   ],
   "dataset_size": 1000000000,
-  "num_loaded_documents": null,
+  "num_loaded_documents": 997000000,
   "data_compression": "zstd",
-  "total_size": null,
+  "total_size": 191541000000,
   "result": [
-    [null, null, null],
-    [null, null, null],
-    [null, null, null],
-    [null, null, null],
+    [1.75,1.56,1.54],
+    [49.75,41.61,31.38],
+    [12.90,12.58,5.76],
+    [5.64,6.21,6.03],
     [null, null, null]
   ]
 }
@@ -1,23 +1,23 @@
 {
-  "system": "Starrocks (lz4)",
+  "system": "Starrocks (default)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
   "retains_structure": "yes",
   "tags": [
   ],
   "dataset_size": 100000000,
-  "num_loaded_documents": 100000000,
+  "num_loaded_documents": 91000000,
   "data_compression": "lz4",
-  "total_size": 19182000000,
+  "total_size": 17109000000,
   "result": [
-    [0.25,0.17,0.17],
-    [8.13,4.33,3.82],
-    [3.18,3.08,3.05],
-    [4.06,4.07,4.12],
-    [4.04,4.20,3.97]
+    [0.61,0.16,0.16],
+    [19.26,7.12,7.18],
+    [1.12,1.08,1.08],
+    [0.55,0.55,0.54],
+    [0.60,0.60,0.60]
   ]
 }
@@ -1,8 +1,8 @@
 {
-  "system": "Starrocks (zstd)",
+  "system": "Starrocks (materialized)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
@@ -12,12 +12,12 @@
   "dataset_size": 100000000,
   "num_loaded_documents": 100000000,
   "data_compression": "zstd",
-  "total_size": 31200000000,
+  "total_size": 16190000000,
   "result": [
-    [0.22,0.17,0.18],
-    [28.09,3.94,3.89],
-    [3.04,3.05,3.11],
-    [3.99,4.04,3.94],
-    [4.13,4.12,4.11]
+    [0.21,0.17,0.18],
+    [8.38,2.19,2.17],
+    [2.16,1.10,1.06],
+    [6.62,0.43,0.45],
+    [0.48,0.48,0.49]
   ]
 }
@@ -1,23 +1,23 @@
 {
-  "system": "Starrocks (lz4)",
+  "system": "Starrocks (default)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
   "retains_structure": "yes",
   "tags": [
   ],
   "dataset_size": 10000000,
-  "num_loaded_documents": 9999994,
+  "num_loaded_documents": 7000000,
   "data_compression": "lz4",
-  "total_size": 1967000000,
+  "total_size": 824028000,
   "result": [
-    [0.11,0.10,0.10],
-    [0.45,0.40,0.42],
-    [0.58,0.45,0.50],
-    [0.57,0.62,0.61],
-    [0.69,0.60,0.55]
+    [0.03,0.02,0.03],
+    [0.52,0.50,0.48],
+    [0.25,0.25,0.18],
+    [0.11,0.11,0.10],
+    [0.11,0.12,0.12]
   ]
 }
@@ -1,8 +1,8 @@
 {
-  "system": "Starrocks (zstd)",
+  "system": "Starrocks (materialized)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
@@ -12,12 +12,12 @@
   "dataset_size": 10000000,
   "num_loaded_documents": 10000000,
   "data_compression": "zstd",
-  "total_size": 3193000000,
+  "total_size": 616175000,
   "result": [
-    [0.10,0.12,0.09],
-    [1.79,0.43,0.37],
-    [0.48,0.45,0.47],
-    [0.57,0.70,0.62],
-    [0.59,0.72,0.73]
+    [0.09,0.13,0.12],
+    [0.34,0.33,0.33],
+    [0.22,0.20,0.26],
+    [0.11,0.10,0.10],
+    [0.11,0.10,0.11]
   ]
 }
@@ -1,8 +1,8 @@
 {
-  "system": "Starrocks (lz4)",
+  "system": "Starrocks (default)",
   "version": "3.4.0-e94580b",
   "os": "Ubuntu 24.04",
-  "date": "2025-01-13",
+  "date": "2025-03-24",
   "machine": "m6i.8xlarge, 10000gib gp3",
   "cluster_size": 1,
   "comment": "",
@@ -12,12 +12,12 @@
   "dataset_size": 1000000,
   "num_loaded_documents": 1000000,
   "data_compression": "lz4",
-  "total_size": 201845000,
+  "total_size": 1,
   "result": [
-    [0.65,0.05,0.05],
-    [0.36,0.28,0.29],
-    [0.31,0.28,0.28],
-    [0.52,0.52,0.51],
-    [0.51,0.51,0.52]
+    [0.05,0.05,0.06],
+    [0.19,0.08,0.07],
+    [0.13,0.13,0.13],
+    [0.07,0.07,0.07],
+    [0.07,0.07,0.07]
   ]
 }