Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion starrocks/count.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ fi
DB_NAME="$1"
TABLE_NAME="$2"

mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;"
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"
5 changes: 3 additions & 2 deletions starrocks/create_and_load.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@ DATA_DIRECTORY="$3"
NUM_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"
DDL_FILE="ddl.sql"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }


echo "Create database"
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"

echo "Execute DDL"
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "ddl.sql"
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" < "$DDL_FILE"

echo "Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"
12 changes: 10 additions & 2 deletions starrocks/ddl.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
CREATE TABLE bluesky (
`id` BIGINT AUTO_INCREMENT,
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON"
);
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON",

sort_key VARBINARY AS encode_sort_key(
get_json_string(data, 'kind'),
get_json_string(data, 'commit.operation'),
get_json_string(data, 'commit.collection'),
get_json_string(data, 'did')
)
)
ORDER BY (sort_key);
2 changes: 1 addition & 1 deletion starrocks/drop_table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ DB_NAME="$1"
TABLE_NAME="$2"

echo "Dropping table: $DB_NAME.$TABLE_NAME"
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "DROP TABLE IF EXISTS $TABLE_NAME"
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"
19 changes: 18 additions & 1 deletion starrocks/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,21 @@ sudo snap install docker
sudo apt-get update
sudo apt-get install -y mysql-client

docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu
docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu:4.0.0-rc01

echo "Starting StarRocks container..."
sleep 5

# Wait for the container to report readiness: follow its logs for at most
# 300 seconds and stop at the first line containing "Enjoy".
echo "Monitoring container logs for 'Enjoy' message..."
starrocks_ready=0
# The loop reads from a process substitution (bash-only) instead of a pipe so
# that it runs in the current shell and the flag set inside it is still
# visible afterwards. stderr is merged in so the marker is found regardless of
# which stream the container writes it to.
while IFS= read -r line; do
  printf '%s\n' "$line"
  if [[ "$line" == *Enjoy* ]]; then
    echo "Found 'Enjoy' message! Container is ready."
    starrocks_ready=1
    break
  fi
done < <(timeout 300 docker logs -f starrocks 2>&1)

# Stop the log follower, which would otherwise keep running until its timeout.
# pkill exits non-zero when nothing matches (follower already gone); ignore it.
pkill -f "docker logs -f starrocks" || true

# Only claim success when the marker was actually seen; the loop also ends
# when the timeout expires or the log stream closes early.
if [[ "$starrocks_ready" -ne 1 ]]; then
  echo "Error: 'Enjoy' message not seen (timed out after 300s or log stream ended)." >&2
  exit 1
fi

echo "StarRocks started successfully."
21 changes: 16 additions & 5 deletions starrocks/load_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,25 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
continue
fi
MAX_ATTEMPT=10

MAX_ATTEMPT=1
attempt=0
while [ $attempt -lt $MAX_ATTEMPT ]
do
# Attempt the import
http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
response_body="$(cat /tmp/curl_body)"
response_status="$(cat /tmp/curl_body | jq -r '.Status')"
http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body_$$) \
--location-trusted -u root: \
-H "max_filter_ratio: 0.00001" \
-H "strict_mode: true" \
-H "Expect:100-continue" \
-T "$uncompressed_file" \
-XPUT http://${DB_HOST}:${DB_HTTP_PORT}/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
response_body="$(cat /tmp/curl_body_$$)"
if jq -e . >/dev/null 2>&1 < /tmp/curl_body_$$; then
response_status="$(jq -r '.Status' < /tmp/curl_body_$$)"
else
response_status=""
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Invalid JSON response for $file: $(cat /tmp/curl_body_$$)" >> "$ERROR_LOG"
fi
echo $response_status
if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then
if [ "$response_status" = "Success" ]
Expand Down
5 changes: 5 additions & 0 deletions starrocks/main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ ERROR_LOG="${4:-error.log}"
# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Connection settings, exported so that child processes see them; the mysql
# and stream-load helper scripts in this directory read these names.
# Each keeps its previous value as the default but can be overridden by
# setting the variable in the environment before running this script.
export DB_HOST="${DB_HOST:-127.0.0.1}"
export DB_USER="${DB_USER:-root}"
export DB_MYSQL_PORT="${DB_MYSQL_PORT:-9030}"
export DB_HTTP_PORT="${DB_HTTP_PORT:-8030}" # HTTP endpoint for stream load

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
Expand Down
2 changes: 1 addition & 1 deletion starrocks/physical_query_plans.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cat queries.sql | while read -r query; do
echo "------------------------------------------------------------------------------------------------------------------------"
echo "Physical query plan for query Q$QUERY_NUM:"
echo
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "EXPLAIN $query"
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "EXPLAIN $query"

# Increment the query number
QUERY_NUM=$((QUERY_NUM + 1))
Expand Down
6 changes: 3 additions & 3 deletions starrocks/queries.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC;
SELECT get_json_string(data, 'commit.collection') AS event, hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT get_json_string(data, '$.did') as user_id, min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
SELECT get_json_string(data, '$.did') as user_id, date_diff('millisecond', min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))), max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
SELECT get_json_string(data, 'commit.collection') AS event, hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT get_json_string(data, 'did') as user_id, to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
SELECT get_json_string(data, 'did') as user_id, date_diff('millisecond', to_datetime(min(get_json_int(data, 'time_us')), 6), to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
38 changes: 19 additions & 19 deletions starrocks/queries_formatted.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
------------------------------------------------------------------------------------------------------------------------

SELECT get_json_string(data, 'commit.collection') AS event,
count() AS count
FROM bluesky
GROUP BY event
count() AS count
FROM bluesky
GROUP BY event
ORDER BY count DESC;

------------------------------------------------------------------------------------------------------------------------
Expand All @@ -17,33 +17,33 @@ SELECT
count(DISTINCT get_json_string(data, 'did')) AS users
FROM bluesky
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
AND (get_json_string(data, 'commit.operation') = 'create')
GROUP BY event
ORDER BY count DESC;

------------------------------------------------------------------------------------------------------------------------
-- Q3 - When do people use BlueSky
------------------------------------------------------------------------------------------------------------------------
SELECT
get_json_string(data, 'commit.collection') AS event,
hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day,
get_json_string(data, 'commit.collection') AS event,
hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day,
count() AS count
FROM bluesky
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
GROUP BY event, hour_of_day
ORDER BY hour_of_day, event;

------------------------------------------------------------------------------------------------------------------------
-- Q4 - top 3 post veterans
------------------------------------------------------------------------------------------------------------------------
SELECT
get_json_string(data, '$.did') as user_id,
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date
get_json_string(data, 'did') as user_id,
to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date
FROM bluesky
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
GROUP BY user_id
ORDER BY first_post_date ASC
Expand All @@ -53,13 +53,13 @@ LIMIT 3;
-- Q5 - top 3 users with longest activity
------------------------------------------------------------------------------------------------------------------------
SELECT
get_json_string(data, '$.did') as user_id,
date_diff('millisecond',
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))),
max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span
get_json_string(data, 'did') as user_id,
date_diff('millisecond',
to_datetime(min(get_json_int(data, 'time_us')), 6),
to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span
FROM bluesky
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
WHERE (get_json_string(data, 'kind') = 'commit')
AND (get_json_string(data, 'commit.operation') = 'create')
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
GROUP BY user_id
ORDER BY activity_span DESC
Expand Down
18 changes: 9 additions & 9 deletions starrocks/results/m6i.8xlarge_bluesky_1000m.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"system": "Starrocks",
"version": "3.4.0-e94580b",
"version": "4.0.0-rc01",
"os": "Ubuntu 24.04",
"date": "2025-03-24",
"date": "2025-09-09",
"machine": "m6i.8xlarge, 10000gib gp3",
"retains_structure": "yes",
"tags": [
],
"dataset_size": 1000000000,
"num_loaded_documents": 804000000,
"total_size": 158929000000,
"num_loaded_documents": 996999662,
"total_size": 192981470543,
"result": [
[2.27,1.24,1.21],
[17.81,10.67,10.20],
[7.38,6.78,7.62],
[7.24, null, null],
[null, null, null]
[1.02,0.81,0.85],
[8.93,8.70,20.44],
[2.05,2.03,29.45],
[3.25,1.76,21.13],
[7.16,4.21, null]
]
}
18 changes: 9 additions & 9 deletions starrocks/results/m6i.8xlarge_bluesky_100m.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"system": "Starrocks",
"version": "3.4.0-e94580b",
"version": "4.0.0-rc01",
"os": "Ubuntu 24.04",
"date": "2025-03-24",
"date": "2025-09-09",
"machine": "m6i.8xlarge, 10000gib gp3",
"retains_structure": "yes",
"tags": [
],
"dataset_size": 100000000,
"num_loaded_documents": 91000000,
"total_size": 17109000000,
"num_loaded_documents": 99999984,
"total_size": 18193481465,
"result": [
[0.61,0.16,0.16],
[19.26,7.12,7.18],
[1.12,1.08,1.08],
[0.55,0.55,0.54],
[0.60,0.60,0.60]
[0.57,0.42,0.43],
[8.04,1.28,1.19],
[0.97,0.77,0.78],
[0.76,0.73,0.78],
[0.78,0.75,0.77]
]
}
18 changes: 9 additions & 9 deletions starrocks/results/m6i.8xlarge_bluesky_10m.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"system": "Starrocks",
"version": "3.4.0-e94580b",
"version": "4.0.0-rc01",
"os": "Ubuntu 24.04",
"date": "2025-03-24",
"date": "2025-09-09",
"machine": "m6i.8xlarge, 10000gib gp3",
"retains_structure": "yes",
"tags": [
],
"dataset_size": 10000000,
"num_loaded_documents": 7000000,
"total_size": 824028000,
"num_loaded_documents": 9999997,
"total_size": 2043330691,
"result": [
[0.03,0.02,0.03],
[0.52,0.50,0.48],
[0.25,0.25,0.18],
[0.11,0.11,0.10],
[0.11,0.12,0.12]
[0.07,0.05,0.05],
[0.33,0.32,0.32],
[0.11,0.11,0.09],
[0.03,0.03,0.02],
[0.03,0.03,0.03]
]
}
16 changes: 8 additions & 8 deletions starrocks/results/m6i.8xlarge_bluesky_1m.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"system": "Starrocks",
"version": "3.4.0-e94580b",
"version": "4.0.0-rc01",
"os": "Ubuntu 24.04",
"date": "2025-03-24",
"date": "2025-09-09",
"machine": "m6i.8xlarge, 10000gib gp3",
"retains_structure": "yes",
"tags": [
],
"dataset_size": 1000000,
"num_loaded_documents": 1000000,
"total_size": 1,
"total_size": 207987146,
"result": [
[0.05,0.05,0.06],
[0.19,0.08,0.07],
[0.13,0.13,0.13],
[0.07,0.07,0.07],
[0.07,0.07,0.07]
[0.03,0.03,0.03],
[0.11,0.05,0.04],
[0.04,0.04,0.03],
[0.03,0.02,0.02],
[0.02,0.02,0.02]
]
}
2 changes: 1 addition & 1 deletion starrocks/run_queries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ cat queries.sql | while read -r query; do

# Execute the query multiple times
for i in $(seq 1 $TRIES); do
RESP=$(mysql -vvv -h127.1 -P9030 -uroot "$DB_NAME" -e "$query" | perl -nle 'print $1 if /\((\d+\.\d+)+ sec\)/' ||:)
RESP=$(mysql -vvv -h "$DB_HOST" -P "$DB_MYSQL_PORT" -u"$DB_USER" "$DB_NAME" -e "$query" | perl -nle 'print $1 if /\((\d+\.\d+)+ sec\)/' ||:)
echo "Response time: ${RESP} s"
done;
done;
2 changes: 1 addition & 1 deletion starrocks/total_size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ fi
DB_NAME="$1"
TABLE_NAME="$2"

mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SHOW DATA FROM $TABLE_NAME"
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SHOW DATA FROM $TABLE_NAME"