From 043c987f82d4280f669a9647e2d54e1785df84c8 Mon Sep 17 00:00:00 2001 From: Lionel Palacin Date: Wed, 12 Mar 2025 17:15:16 +0000 Subject: [PATCH 1/2] Add starrocks candidate --- starrocks/benchmark.sh | 33 ++++++++ starrocks/count.sh | 13 ++++ starrocks/create_and_load.sh | 33 ++++++++ starrocks/ddl_lz4.sql | 12 +++ starrocks/ddl_zstd.sql | 15 ++++ starrocks/install.sh | 7 ++ starrocks/load_data.sh | 72 ++++++++++++++++++ starrocks/main.sh | 75 +++++++++++++++++++ starrocks/physical_query_plans.sh | 24 ++++++ starrocks/queries.sql | 5 ++ starrocks/queries_formatted.sql | 66 ++++++++++++++++ .../results/m6i.8xlarge_bluesky_100m_lz4.json | 24 ++++++ .../m6i.8xlarge_bluesky_100m_zstd.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_10m_lz4.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_10m_zstd.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_1m_lz4.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_1m_zstd.json | 24 ++++++ starrocks/run_queries.sh | 30 ++++++++ starrocks/total_size.sh | 13 ++++ 19 files changed, 542 insertions(+) create mode 100755 starrocks/benchmark.sh create mode 100755 starrocks/count.sh create mode 100755 starrocks/create_and_load.sh create mode 100644 starrocks/ddl_lz4.sql create mode 100644 starrocks/ddl_zstd.sql create mode 100755 starrocks/install.sh create mode 100755 starrocks/load_data.sh create mode 100755 starrocks/main.sh create mode 100755 starrocks/physical_query_plans.sh create mode 100644 starrocks/queries.sql create mode 100644 starrocks/queries_formatted.sql create mode 100644 starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json create mode 100755 starrocks/run_queries.sh create mode 
100755 starrocks/total_size.sh diff --git a/starrocks/benchmark.sh b/starrocks/benchmark.sh new file mode 100755 index 0000000..dd8834f --- /dev/null +++ b/starrocks/benchmark.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +RESULT_FILE_RUNTIMES="$2" +RESULT_FILE_MEMORY_USAGE="$3" + +# Query log file +QUERY_LOG_FILE="query_log.txt" + +# Print the database name +echo "Running queries on database: $DB_NAME" + +# Run queries and log the output +./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" + +# Process the query log and prepare the result +RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ +awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') + +# Output the result +if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then + echo "$RESULT" > "$RESULT_FILE_RUNTIMES" + echo "Result written to $RESULT_FILE_RUNTIMES" +else + echo "$RESULT" +fi diff --git a/starrocks/count.sh b/starrocks/count.sh new file mode 100755 index 0000000..f15c21c --- /dev/null +++ b/starrocks/count.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +TABLE_NAME="$2" + +mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;" diff --git a/starrocks/create_and_load.sh b/starrocks/create_and_load.sh new file mode 100755 index 0000000..45863ca --- /dev/null +++ b/starrocks/create_and_load.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 7 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +TABLE_NAME="$2" +DDL_FILE="$3" +DATA_DIRECTORY="$4" +NUM_FILES="$5" +SUCCESS_LOG="$6" +ERROR_LOG="$7" + +# Validate arguments +[[ ! 
-f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } +[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } +[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } + + +# Create database +mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" + +# Execute DDL +mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "$DDL_FILE" + +# Load data +./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" + +echo "Script completed successfully." diff --git a/starrocks/ddl_lz4.sql b/starrocks/ddl_lz4.sql new file mode 100644 index 0000000..95916c8 --- /dev/null +++ b/starrocks/ddl_lz4.sql @@ -0,0 +1,12 @@ +CREATE TABLE bluesky ( + `id` BIGINT AUTO_INCREMENT, + -- Main JSON column (comes after key columns) + `data` JSON NULL COMMENT "Main JSON object", + -- Key columns (must come first in the schema and in the same order as DUPLICATE KEY) + `kind` VARCHAR(255) AS get_json_string(data, '$.kind'), + `operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'), + `collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'), + `did` VARCHAR(255) AS get_json_string(data, '$.did'), + `time_us` BIGINT AS get_json_int(data, '$.time_us') +) ENGINE=OLAP +ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`); diff --git a/starrocks/ddl_zstd.sql b/starrocks/ddl_zstd.sql new file mode 100644 index 0000000..e96786e --- /dev/null +++ b/starrocks/ddl_zstd.sql @@ -0,0 +1,15 @@ +CREATE TABLE bluesky ( + `id` BIGINT AUTO_INCREMENT, + -- Main JSON column (comes after key columns) + `data` JSON NULL COMMENT "Main JSON object", + -- Key columns (must come first in the schema and in the same order as ORDER BY) + `kind` VARCHAR(255) AS get_json_string(data, '$.kind'), + `operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'), + `collection` VARCHAR(255) AS 
get_json_string(data, '$.commit.collection'), + `did` VARCHAR(255) AS get_json_string(data, '$.did'), + `time_us` BIGINT AS get_json_int(data, '$.time_us') +) ENGINE=OLAP +ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`) +PROPERTIES ( +"compression" = "ZSTD" +); diff --git a/starrocks/install.sh b/starrocks/install.sh new file mode 100755 index 0000000..bc26085 --- /dev/null +++ b/starrocks/install.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +sudo snap install docker +sudo apt-get update +sudo apt-get install -y mysql-client +sudo docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name quickstart starrocks/allin1-ubuntu + diff --git a/starrocks/load_data.sh b/starrocks/load_data.sh new file mode 100755 index 0000000..125bb75 --- /dev/null +++ b/starrocks/load_data.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 6 ]]; then + echo "Usage: $0 " + exit 1 +fi + + +# Arguments +DATA_DIRECTORY="$1" +DB_NAME="$2" +TABLE_NAME="$3" +MAX_FILES="$4" +SUCCESS_LOG="$5" +ERROR_LOG="$6" + +# Validate arguments +[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } +[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } + +# Create a temporary directory for uncompressed files +TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) +trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit + +# Load data +counter=0 +for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do + echo "Processing file: $file" + + # Uncompress the file into the TEMP_DIR + uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" + gunzip -c "$file" > "$uncompressed_file" + + if [[ $? 
-ne 0 ]]; then + echo "Error: Failed to uncompress $file" >> "$ERROR_LOG" + continue + fi + MAX_ATTEMPT=10 + attempt=0 + while [ $attempt -lt $MAX_ATTEMPT ] + do + # Attempt the import + http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load) + response_body="$(cat /tmp/curl_body)" + response_status="$(cat /tmp/curl_body | jq -r '.Status')" + echo $response_status + if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then + if [ "$response_status" = "Success" ] + then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG" + rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing + attempt=$((MAX_ATTEMPT)) + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG" + attempt=$((attempt + 1)) + sleep 2 + fi + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG" + attempt=$((attempt + 1)) + sleep 2 + fi + done + + counter=$((counter + 1)) + if [[ $counter -ge $MAX_FILES ]]; then + break + fi +done + +echo "Script completed successfully." diff --git a/starrocks/main.sh b/starrocks/main.sh new file mode 100755 index 0000000..fc90bac --- /dev/null +++ b/starrocks/main.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Default data directory +DEFAULT_DATA_DIRECTORY=~/data/bluesky + +# Allow the user to optionally provide the data directory as an argument +DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" + +# Define success and error log files +SUCCESS_LOG="${2:-success.log}" +ERROR_LOG="${3:-error.log}" + +# Define prefix for output files +OUTPUT_PREFIX="${4:-_m6i.8xlarge}" + +# Check if the directory exists +if [[ ! 
-d "$DATA_DIRECTORY" ]]; then + echo "Error: Data directory '$DATA_DIRECTORY' does not exist." + exit 1 +fi + +echo "Select the dataset size to benchmark:" +echo "1) 1m (default)" +echo "2) 10m" +echo "3) 100m" +echo "4) 1000m" +echo "5) all" +read -p "Enter the number corresponding to your choice: " choice + +./install.sh + +benchmark() { + local size=$1 + local suffix=$2 + # Check DATA_DIRECTORY contains the required number of files to run the benchmark + file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) + if (( file_count < size )); then + echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." + exit 1 + fi + ./create_and_load.sh "bluesky_${size}m_${suffix}" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" + ./total_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.total_size" + ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count" + ./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans" + ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage" +} + +case $choice in + 2) + benchmark 10 lz4 + benchmark 10 zstd + ;; + 3) + benchmark 100 lz4 + benchmark 100 zstd + ;; + 4) + benchmark 1000 lz4 + benchmark 1000 zstd + ;; + 5) + benchmark 1 lz4 + benchmark 1 zstd + benchmark 10 lz4 + benchmark 10 zstd + benchmark 100 lz4 + benchmark 100 zstd + benchmark 1000 lz4 + benchmark 1000 zstd + ;; + *) + benchmark 1 lz4 + benchmark 1 zstd + ;; +esac diff --git a/starrocks/physical_query_plans.sh b/starrocks/physical_query_plans.sh new file mode 100755 index 0000000..993fcae --- /dev/null +++ b/starrocks/physical_query_plans.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 1 ]]; 
then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" + +QUERY_NUM=1 + +cat queries.sql | while read -r query; do + + # Print the query number + echo "------------------------------------------------------------------------------------------------------------------------" + echo "Physical query plan for query Q$QUERY_NUM:" + echo + mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "EXPLAIN $query" + + # Increment the query number + QUERY_NUM=$((QUERY_NUM + 1)) +done; diff --git a/starrocks/queries.sql b/starrocks/queries.sql new file mode 100644 index 0000000..9a3e6f6 --- /dev/null +++ b/starrocks/queries.sql @@ -0,0 +1,5 @@ +SELECT cast(data->'commit.collection' AS VARCHAR) AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; +SELECT cast(data->'commit.collection' AS VARCHAR) AS event, count() AS count, count(DISTINCT cast(data->'did' AS VARCHAR)) AS users FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') GROUP BY event ORDER BY count DESC; +SELECT cast(data->'commit.collection' AS VARCHAR) AS event, hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; +SELECT cast(data->'$.did' as VARCHAR) as user_id, min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; +SELECT cast(data->'$.did' as VARCHAR) as user_id, date_diff('millisecond', min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 
1000000)))),max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; diff --git a/starrocks/queries_formatted.sql b/starrocks/queries_formatted.sql new file mode 100644 index 0000000..b549847 --- /dev/null +++ b/starrocks/queries_formatted.sql @@ -0,0 +1,66 @@ +------------------------------------------------------------------------------------------------------------------------ +-- Q1 - Top event types +------------------------------------------------------------------------------------------------------------------------ + +SELECT cast(data->'commit.collection' AS VARCHAR) AS event, + count() AS count +FROM bluesky +GROUP BY event +ORDER BY count DESC; + +------------------------------------------------------------------------------------------------------------------------ +-- Q2 - Top event types together with unique users per event type +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'commit.collection' AS VARCHAR) AS event, + count() AS count, + count(DISTINCT cast(data->'did' AS VARCHAR)) AS users +FROM bluesky +WHERE (data->'kind' = 'commit') + AND (data->'commit.operation' = 'create') +GROUP BY event +ORDER BY count DESC; + +------------------------------------------------------------------------------------------------------------------------ +-- Q3 - When do people use BlueSky +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'commit.collection' AS VARCHAR) AS event, + hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, + count() AS count +FROM bluesky +WHERE (data->'kind' = 'commit') +AND 
(data->'commit.operation' = 'create') +AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) +GROUP BY event, hour_of_day +ORDER BY hour_of_day, event; + +------------------------------------------------------------------------------------------------------------------------ +-- Q4 - top 3 post veterans +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'$.did' as VARCHAR) as user_id, + min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date +FROM bluesky +WHERE (data->'kind' = 'commit') + AND (data->'commit.operation' = 'create') + AND (data->'commit.collection' = 'app.bsky.feed.post') +GROUP BY user_id +ORDER BY first_post_date ASC +LIMIT 3; + +------------------------------------------------------------------------------------------------------------------------ +-- Q5 - top 3 users with longest activity +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'$.did' as VARCHAR) as user_id, + date_diff('millisecond', + min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))), + max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span +FROM bluesky +WHERE (data->'kind' = 'commit') + AND (data->'commit.operation' = 'create') + AND (data->'commit.collection' = 'app.bsky.feed.post') +GROUP BY user_id +ORDER BY activity_span DESC +LIMIT 3; diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json new file mode 100644 index 0000000..506051d --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 
10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 100000000, + "dataset_size_readable": "100m", + "num_loaded_documents": 100000000, + "data_compression": "lz4", + "total_size": 19182000000, + "total_size_readable": "19.182 GB", + "result": [ + [0.25,0.17,0.17], + [8.13,4.33,3.82], + [3.18,3.08,3.05], + [4.06,4.07,4.12], + [4.04,4.20,3.97] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json new file mode 100644 index 0000000..239d8da --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 100000000, + "dataset_size_readable": "100m", + "num_loaded_documents": 100000000, + "data_compression": "zstd", + "total_size": 31200000000, + "total_size_readable": "31.200 GB", + "result": [ + [0.22,0.17,0.18], + [28.09,3.94,3.89], + [3.04,3.05,3.11], + [3.99,4.04,3.94], + [4.13,4.12,4.11] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json new file mode 100644 index 0000000..3d0e830 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 10000000, + "dataset_size_readable": "10m", + "num_loaded_documents": 9999994, + "data_compression": "lz4", + "total_size": 1967000000, + "total_size_readable": "1.967 GB", + "result": [ + [0.11,0.10,0.10], + [0.45,0.40,0.42], + [0.58,0.45,0.50], + [0.57,0.62,0.61], + [0.69,0.60,0.55] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json 
b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json new file mode 100644 index 0000000..4c9a4e7 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 10000000, + "dataset_size_readable": "10m", + "num_loaded_documents": 10000000, + "data_compression": "zstd", + "total_size": 3193000000, + "total_size_readable": "3.193 GB", + "result": [ + [0.10,0.12,0.09], + [1.79,0.43,0.37], + [0.48,0.45,0.47], + [0.57,0.70,0.62], + [0.59,0.72,0.73] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json new file mode 100644 index 0000000..9ba51b6 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 1000000, + "dataset_size_readable": "1m", + "num_loaded_documents": 1000000, + "data_compression": "lz4", + "total_size": 201845000, + "total_size_readable": "201.845 MB", + "result": [ + [0.65,0.05,0.05], + [0.36,0.28,0.29], + [0.31,0.28,0.28], + [0.52,0.52,0.51], + [0.51,0.51,0.52] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json new file mode 100644 index 0000000..7373bb6 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 1000000, + "dataset_size_readable": "1m", + "num_loaded_documents": 
1000000, + "data_compression": "zstd", + "total_size": 322945000, + "total_size_readable": "322.945 MB", + "result": [ + [0.05,0.05,0.04], + [0.37,0.28,0.29], + [0.28,0.27,0.27], + [0.51,0.51,0.51], + [0.50,0.50,0.51] + ] +} diff --git a/starrocks/run_queries.sh b/starrocks/run_queries.sh new file mode 100755 index 0000000..019abe9 --- /dev/null +++ b/starrocks/run_queries.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" + +TRIES=3 + +cat queries.sql | while read -r query; do + + # Clear the Linux file system cache + echo "Clearing file system cache..." + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + echo "File system cache cleared." + + # Print the query + echo "Running query: $query" + + # Execute the query multiple times + for i in $(seq 1 $TRIES); do + RESP=$(mysql -vvv -h127.1 -P9030 -uroot "$DB_NAME" -e "$query" | perl -nle 'print $1 if /\((\d+\.\d+)+ sec\)/' ||:) + echo "Response time: ${RESP} s" + done; +done; diff --git a/starrocks/total_size.sh b/starrocks/total_size.sh new file mode 100755 index 0000000..704b6f8 --- /dev/null +++ b/starrocks/total_size.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +TABLE_NAME="$2" + +mysql -P 9030 -h 127.0.0.1 -u root -e "SHOW DATA FROM $DB_NAME.$TABLE_NAME" From f085fb8285df3b6aff5598f712efb40b12c1116d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Mar 2025 19:24:37 +0000 Subject: [PATCH 2/2] More fixups --- starrocks/create_and_load.sh | 8 +++--- starrocks/drop_table.sh | 14 +++++++++++ starrocks/load_data.sh | 4 +-- starrocks/main.sh | 1 + .../m6i.8xlarge_bluesky_1000m_lz4.json | 25 +++++++++++++++++++ .../m6i.8xlarge_bluesky_1000m_zstd.json | 25 +++++++++++++++++++ .../results/m6i.8xlarge_bluesky_100m_lz4.json | 1 + 
.../m6i.8xlarge_bluesky_100m_zstd.json | 1 + .../results/m6i.8xlarge_bluesky_10m_lz4.json | 1 + .../results/m6i.8xlarge_bluesky_10m_zstd.json | 1 + .../results/m6i.8xlarge_bluesky_1m_lz4.json | 1 + .../results/m6i.8xlarge_bluesky_1m_zstd.json | 1 + 12 files changed, 75 insertions(+), 8 deletions(-) create mode 100755 starrocks/drop_table.sh create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json diff --git a/starrocks/create_and_load.sh b/starrocks/create_and_load.sh index 45863ca..e8d84bd 100755 --- a/starrocks/create_and_load.sh +++ b/starrocks/create_and_load.sh @@ -21,13 +21,11 @@ ERROR_LOG="$7" [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } -# Create database +echo "Create database" mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" -# Execute DDL +echo "Execute DDL" mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "$DDL_FILE" -# Load data +echo "Load data" ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" - -echo "Script completed successfully." 
diff --git a/starrocks/drop_table.sh b/starrocks/drop_table.sh new file mode 100755 index 0000000..5aec315 --- /dev/null +++ b/starrocks/drop_table.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +DB_NAME="$1" +TABLE_NAME="$2" + +echo "Dropping table: $DB_NAME.$TABLE_NAME" + +mysql -P 9030 -h 127.0.0.1 -u root -e "DROP TABLE IF EXISTS $DB_NAME.$TABLE_NAME" diff --git a/starrocks/load_data.sh b/starrocks/load_data.sh index 125bb75..13b10f3 100755 --- a/starrocks/load_data.sh +++ b/starrocks/load_data.sh @@ -38,7 +38,7 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do fi MAX_ATTEMPT=10 attempt=0 - while [ $attempt -lt $MAX_ATTEMPT ] + while [ $attempt -lt $MAX_ATTEMPT ] do # Attempt the import http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load) @@ -68,5 +68,3 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do break fi done - -echo "Script completed successfully." 
diff --git a/starrocks/main.sh b/starrocks/main.sh index fc90bac..d825140 100755 --- a/starrocks/main.sh +++ b/starrocks/main.sh @@ -43,6 +43,7 @@ benchmark() { ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count" ./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans" ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage" + ./drop_table.sh "bluesky_${size}m_${suffix}" bluesky } case $choice in diff --git a/starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json new file mode 100644 index 0000000..b5cf4c4 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json @@ -0,0 +1,25 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "retains_structure": "yes", + "tags": [ + ], + "dataset_size": 1000000000, + "dataset_size_readable": "1000m", + "num_loaded_documents": 1000000000, + "data_compression": "lz4", + "total_size": 19182000000, + "total_size_readable": "19.182 GB", + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json new file mode 100644 index 0000000..19acf94 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json @@ -0,0 +1,25 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "retains_structure": "yes", + "tags": [ + ], + "dataset_size": 1000000000, 
"dataset_size_readable": "1000m", + "num_loaded_documents": 1000000000, + "data_compression": "zstd", + "total_size": 31200000000, + "total_size_readable": "31.200 GB", + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json index 506051d..07a36cc 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 100000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json index 239d8da..d147ee1 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 100000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json index 3d0e830..6c9443e 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 10000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json index 4c9a4e7..266a147 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 10000000, diff 
--git a/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json index 9ba51b6..40f2283 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 1000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json index 7373bb6..f5de5d0 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 1000000,