diff --git a/starrocks/count.sh b/starrocks/count.sh
index bb27457..de1338e 100755
--- a/starrocks/count.sh
+++ b/starrocks/count.sh
@@ -12,4 +12,4 @@ fi
 DB_NAME="$1"
 TABLE_NAME="$2"
 
-mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;"
+mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"
diff --git a/starrocks/create_and_load.sh b/starrocks/create_and_load.sh
index 2db7d1d..a0bde1f 100755
--- a/starrocks/create_and_load.sh
+++ b/starrocks/create_and_load.sh
@@ -15,6 +15,7 @@ DATA_DIRECTORY="$3"
 NUM_FILES="$4"
 SUCCESS_LOG="$5"
 ERROR_LOG="$6"
+DDL_FILE="ddl.sql"
 
 # Validate arguments
 [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
@@ -22,10 +23,10 @@ ERROR_LOG="$6"
 
 echo "Create database"
-mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
+mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
 
 echo "Execute DDL"
-mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "ddl.sql"
+mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" < "$DDL_FILE"
 
 echo "Load data"
 ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"
 
diff --git a/starrocks/ddl.sql b/starrocks/ddl.sql
index c27de83..b48e15e 100644
--- a/starrocks/ddl.sql
+++ b/starrocks/ddl.sql
@@ -1,4 +1,12 @@
 CREATE TABLE bluesky (
     `id` BIGINT AUTO_INCREMENT,
-    `data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON"
-);
\ No newline at end of file
+    `data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON",
+
+    sort_key VARBINARY AS encode_sort_key(
+        get_json_string(data, 'kind'),
+        get_json_string(data, 'commit.operation'),
+        get_json_string(data, 'commit.collection'),
+        get_json_string(data, 'did')
+    )
+)
+ORDER BY (sort_key);
diff --git a/starrocks/drop_table.sh b/starrocks/drop_table.sh
index 46d041f..466b9f2 100755
--- a/starrocks/drop_table.sh
+++ b/starrocks/drop_table.sh
@@ -12,4 +12,4 @@ DB_NAME="$1"
 TABLE_NAME="$2"
 
 echo "Dropping table: $DB_NAME.$TABLE_NAME"
-mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "DROP TABLE IF EXISTS $TABLE_NAME"
+mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"
diff --git a/starrocks/install.sh b/starrocks/install.sh
index 21151d7..4512c2c 100755
--- a/starrocks/install.sh
+++ b/starrocks/install.sh
@@ -4,4 +4,21 @@ sudo snap install docker
 sudo apt-get update
 sudo apt-get install -y mysql-client
 
-docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu
+docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu:4.0.0-rc01
+
+echo "Starting StarRocks container..."
+sleep 5
+
+# Monitor logs until "Enjoy" appears
+echo "Monitoring container logs for 'Enjoy' message..."
+timeout 300 docker logs -f starrocks | while read line; do
+    echo "$line"
+    if echo "$line" | grep -q "Enjoy"; then
+        echo "Found 'Enjoy' message! Container is ready."
+        # Kill the docker logs process
+        pkill -f "docker logs -f starrocks"
+        break
+    fi
+done
+
+echo "StarRocks started successfully."
diff --git a/starrocks/load_data.sh b/starrocks/load_data.sh
index 13b10f3..6f3b1da 100755
--- a/starrocks/load_data.sh
+++ b/starrocks/load_data.sh
@@ -36,14 +36,25 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
         echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
         continue
     fi
-    MAX_ATTEMPT=10
+
+    MAX_ATTEMPT=1
     attempt=0
     while [ $attempt -lt $MAX_ATTEMPT ]
     do
-        # Attempt the import
-        http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
-        response_body="$(cat /tmp/curl_body)"
-        response_status="$(cat /tmp/curl_body | jq -r '.Status')"
+        http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body_$$) \
+            --location-trusted -u root: \
+            -H "max_filter_ratio: 0.00001" \
+            -H "strict_mode: true" \
+            -H "Expect:100-continue" \
+            -T "$uncompressed_file" \
+            -XPUT http://${DB_HOST}:${DB_HTTP_PORT}/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
+        response_body="$(cat /tmp/curl_body_$$)"
+        if jq -e . >/dev/null 2>&1 < /tmp/curl_body_$$; then
+            response_status="$(jq -r '.Status' < /tmp/curl_body_$$)"
+        else
+            response_status=""
+            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Invalid JSON response for $file: $(cat /tmp/curl_body_$$)" >> "$ERROR_LOG"
+        fi
         echo $response_status
         if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then
             if [ "$response_status" = "Success" ]
diff --git a/starrocks/main.sh b/starrocks/main.sh
index 5e24dfe..154d1b9 100755
--- a/starrocks/main.sh
+++ b/starrocks/main.sh
@@ -18,6 +18,11 @@ ERROR_LOG="${4:-error.log}"
 # Define prefix for output files
 OUTPUT_PREFIX="${5:-_m6i.8xlarge}"
 
+export DB_HOST="127.0.0.1"
+export DB_USER="root"
+export DB_MYSQL_PORT="9030"
+export DB_HTTP_PORT="8030" # HTTP endpoint for stream load
+
 # Check if the directory exists
 if [[ ! -d "$DATA_DIRECTORY" ]]; then
     echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
diff --git a/starrocks/physical_query_plans.sh b/starrocks/physical_query_plans.sh
index 993fcae..7cd6f74 100755
--- a/starrocks/physical_query_plans.sh
+++ b/starrocks/physical_query_plans.sh
@@ -17,7 +17,7 @@ cat queries.sql | while read -r query; do
     echo "------------------------------------------------------------------------------------------------------------------------"
     echo "Physical query plan for query Q$QUERY_NUM:"
    echo
-    mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "EXPLAIN $query"
+    mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "EXPLAIN $query"
 
     # Increment the query number
     QUERY_NUM=$((QUERY_NUM + 1))
diff --git a/starrocks/queries.sql b/starrocks/queries.sql
index f126823..72ddbce 100644
--- a/starrocks/queries.sql
+++ b/starrocks/queries.sql
@@ -1,5 +1,5 @@
 SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
 SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC;
-SELECT get_json_string(data, 'commit.collection') AS event, hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
-SELECT get_json_string(data, '$.did') as user_id, min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
-SELECT get_json_string(data, '$.did') as user_id, date_diff('millisecond', min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))), max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
+SELECT get_json_string(data, 'commit.collection') AS event, hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
+SELECT get_json_string(data, 'did') as user_id, to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
+SELECT get_json_string(data, 'did') as user_id, date_diff('millisecond', to_datetime(min(get_json_int(data, 'time_us')), 6), to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
diff --git a/starrocks/queries_formatted.sql b/starrocks/queries_formatted.sql
index 16973b5..2e0c896 100644
--- a/starrocks/queries_formatted.sql
+++ b/starrocks/queries_formatted.sql
@@ -3,9 +3,9 @@
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
     get_json_string(data, 'commit.collection') AS event,
-    count() AS count 
-FROM bluesky 
-GROUP BY event 
+    count() AS count
+FROM bluesky
+GROUP BY event
 ORDER BY count DESC;
 
 ------------------------------------------------------------------------------------------------------------------------
@@ -17,7 +17,7 @@ SELECT
     count(DISTINCT get_json_string(data, 'did')) AS users
 FROM bluesky
 WHERE (get_json_string(data, 'kind') = 'commit')
-  AND (get_json_string(data, 'commit.operation') = 'create') 
+  AND (get_json_string(data, 'commit.operation') = 'create')
 GROUP BY event
 ORDER BY count DESC;
 
@@ -25,13 +25,13 @@ ORDER BY count DESC;
 -- Q3 - When do people use BlueSky
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    get_json_string(data, 'commit.collection') AS event, 
-    hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day,
+    get_json_string(data, 'commit.collection') AS event,
+    hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day,
     count() AS count
 FROM bluesky
-WHERE (get_json_string(data, 'kind') = 'commit') 
-AND (get_json_string(data, 'commit.operation') = 'create') 
-AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) 
+WHERE (get_json_string(data, 'kind') = 'commit')
+AND (get_json_string(data, 'commit.operation') = 'create')
+AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
 GROUP BY event, hour_of_day
 ORDER BY hour_of_day, event;
 
@@ -39,11 +39,11 @@ ORDER BY hour_of_day, event;
 -- Q4 - top 3 post veterans
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    get_json_string(data, '$.did') as user_id,
-    min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date
+    get_json_string(data, 'did') as user_id,
+    to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date
 FROM bluesky
-WHERE (get_json_string(data, 'kind') = 'commit') 
-  AND (get_json_string(data, 'commit.operation') = 'create') 
+WHERE (get_json_string(data, 'kind') = 'commit')
+  AND (get_json_string(data, 'commit.operation') = 'create')
   AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
 GROUP BY user_id
 ORDER BY first_post_date ASC
@@ -53,13 +53,13 @@ LIMIT 3;
 -- Q5 - top 3 users with longest activity
 ------------------------------------------------------------------------------------------------------------------------
 SELECT
-    get_json_string(data, '$.did') as user_id,
-    date_diff('millisecond', 
-              min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))),
-              max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span
+    get_json_string(data, 'did') as user_id,
+    date_diff('millisecond',
+              to_datetime(min(get_json_int(data, 'time_us')), 6),
+              to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span
 FROM bluesky
-WHERE (get_json_string(data, 'kind') = 'commit') 
-  AND (get_json_string(data, 'commit.operation') = 'create') 
+WHERE (get_json_string(data, 'kind') = 'commit')
+  AND (get_json_string(data, 'commit.operation') = 'create')
   AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
 GROUP BY user_id
 ORDER BY activity_span DESC
diff --git a/starrocks/results/m6i.8xlarge_bluesky_1000m.json b/starrocks/results/m6i.8xlarge_bluesky_1000m.json
index de425df..ae418ea 100644
--- a/starrocks/results/m6i.8xlarge_bluesky_1000m.json
+++ b/starrocks/results/m6i.8xlarge_bluesky_1000m.json
@@ -1,20 +1,20 @@
 {
     "system": "Starrocks",
-    "version": "3.4.0-e94580b",
+    "version": "4.0.0-rc01",
     "os": "Ubuntu 24.04",
-    "date": "2025-03-24",
+    "date": "2025-09-09",
     "machine": "m6i.8xlarge, 10000gib gp3",
     "retains_structure": "yes",
     "tags": [
     ],
     "dataset_size": 1000000000,
-    "num_loaded_documents": 804000000,
-    "total_size": 158929000000,
+    "num_loaded_documents": 996999662,
+    "total_size": 192981470543,
     "result": [
-        [2.27,1.24,1.21],
-        [17.81,10.67,10.20],
-        [7.38,6.78,7.62],
-        [7.24, null, null],
-        [null, null, null]
+        [1.02,0.81,0.85],
+        [8.93,8.70,20.44],
+        [2.05,2.03,29.45],
+        [3.25,1.76,21.13],
+        [7.16,4.21, null]
     ]
 }
diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m.json b/starrocks/results/m6i.8xlarge_bluesky_100m.json
index c40d531..65b20d7 100644
--- a/starrocks/results/m6i.8xlarge_bluesky_100m.json
+++ b/starrocks/results/m6i.8xlarge_bluesky_100m.json
@@ -1,20 +1,20 @@
 {
     "system": "Starrocks",
-    "version": "3.4.0-e94580b",
+    "version": "4.0.0-rc01",
     "os": "Ubuntu 24.04",
-    "date": "2025-03-24",
+    "date": "2025-09-09",
     "machine": "m6i.8xlarge, 10000gib gp3",
     "retains_structure": "yes",
     "tags": [
     ],
     "dataset_size": 100000000,
-    "num_loaded_documents": 91000000,
-    "total_size": 17109000000,
+    "num_loaded_documents": 99999984,
+    "total_size": 18193481465,
     "result": [
-        [0.61,0.16,0.16],
-        [19.26,7.12,7.18],
-        [1.12,1.08,1.08],
-        [0.55,0.55,0.54],
-        [0.60,0.60,0.60]
+        [0.57,0.42,0.43],
+        [8.04,1.28,1.19],
+        [0.97,0.77,0.78],
+        [0.76,0.73,0.78],
+        [0.78,0.75,0.77]
     ]
 }
diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m.json b/starrocks/results/m6i.8xlarge_bluesky_10m.json
index 8bb6970..e3706cc 100644
--- a/starrocks/results/m6i.8xlarge_bluesky_10m.json
+++ b/starrocks/results/m6i.8xlarge_bluesky_10m.json
@@ -1,20 +1,20 @@
 {
     "system": "Starrocks",
-    "version": "3.4.0-e94580b",
+    "version": "4.0.0-rc01",
     "os": "Ubuntu 24.04",
-    "date": "2025-03-24",
+    "date": "2025-09-09",
     "machine": "m6i.8xlarge, 10000gib gp3",
     "retains_structure": "yes",
     "tags": [
     ],
     "dataset_size": 10000000,
-    "num_loaded_documents": 7000000,
-    "total_size": 824028000,
+    "num_loaded_documents": 9999997,
+    "total_size": 2043330691,
     "result": [
-        [0.03,0.02,0.03],
-        [0.52,0.50,0.48],
-        [0.25,0.25,0.18],
-        [0.11,0.11,0.10],
-        [0.11,0.12,0.12]
+        [0.07,0.05,0.05],
+        [0.33,0.32,0.32],
+        [0.11,0.11,0.09],
+        [0.03,0.03,0.02],
+        [0.03,0.03,0.03]
     ]
 }
diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m.json b/starrocks/results/m6i.8xlarge_bluesky_1m.json
index 5635826..cbec0c7 100644
--- a/starrocks/results/m6i.8xlarge_bluesky_1m.json
+++ b/starrocks/results/m6i.8xlarge_bluesky_1m.json
@@ -1,20 +1,20 @@
 {
     "system": "Starrocks",
-    "version": "3.4.0-e94580b",
+    "version": "4.0.0-rc01",
     "os": "Ubuntu 24.04",
-    "date": "2025-03-24",
+    "date": "2025-09-09",
     "machine": "m6i.8xlarge, 10000gib gp3",
     "retains_structure": "yes",
     "tags": [
     ],
     "dataset_size": 1000000,
     "num_loaded_documents": 1000000,
-    "total_size": 1,
+    "total_size": 207987146,
     "result": [
-        [0.05,0.05,0.06],
-        [0.19,0.08,0.07],
-        [0.13,0.13,0.13],
-        [0.07,0.07,0.07],
-        [0.07,0.07,0.07]
+        [0.03,0.03,0.03],
+        [0.11,0.05,0.04],
+        [0.04,0.04,0.03],
+        [0.03,0.02,0.02],
+        [0.02,0.02,0.02]
     ]
 }
diff --git a/starrocks/run_queries.sh b/starrocks/run_queries.sh
index 990a1c7..5bbb52e 100755
--- a/starrocks/run_queries.sh
+++ b/starrocks/run_queries.sh
@@ -26,7 +26,7 @@ cat queries.sql | while read -r query; do
 
     # Execute the query multiple times
     for i in $(seq 1 $TRIES); do
-        RESP=$(mysql -vvv -h127.1 -P9030 -uroot "$DB_NAME" -e "$query" | perl -nle 'print $1 if /\((\d+\.\d+)+ sec\)/' ||:)
+        RESP=$(mysql -vvv -h "$DB_HOST" -P "$DB_MYSQL_PORT" -u"$DB_USER" "$DB_NAME" -e "$query" | perl -nle 'print $1 if /\((\d+\.\d+)+ sec\)/' ||:)
         echo "Response time: ${RESP} s"
     done;
 done;
diff --git a/starrocks/total_size.sh b/starrocks/total_size.sh
index 6d8fc05..f628b40 100755
--- a/starrocks/total_size.sh
+++ b/starrocks/total_size.sh
@@ -12,4 +12,4 @@ fi
 DB_NAME="$1"
 TABLE_NAME="$2"
 
-mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SHOW DATA FROM $TABLE_NAME"
+mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SHOW DATA FROM $TABLE_NAME"