From 043c987f82d4280f669a9647e2d54e1785df84c8 Mon Sep 17 00:00:00 2001 From: Lionel Palacin Date: Wed, 12 Mar 2025 17:15:16 +0000 Subject: [PATCH 1/2] Add starrocks candidate --- starrocks/benchmark.sh | 33 ++++++++ starrocks/count.sh | 13 ++++ starrocks/create_and_load.sh | 33 ++++++++ starrocks/ddl_lz4.sql | 12 +++ starrocks/ddl_zstd.sql | 15 ++++ starrocks/install.sh | 7 ++ starrocks/load_data.sh | 72 ++++++++++++++++++ starrocks/main.sh | 75 +++++++++++++++++++ starrocks/physical_query_plans.sh | 24 ++++++ starrocks/queries.sql | 5 ++ starrocks/queries_formatted.sql | 66 ++++++++++++++++ .../results/m6i.8xlarge_bluesky_100m_lz4.json | 24 ++++++ .../m6i.8xlarge_bluesky_100m_zstd.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_10m_lz4.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_10m_zstd.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_1m_lz4.json | 24 ++++++ .../results/m6i.8xlarge_bluesky_1m_zstd.json | 24 ++++++ starrocks/run_queries.sh | 30 ++++++++ starrocks/total_size.sh | 13 ++++ 19 files changed, 542 insertions(+) create mode 100755 starrocks/benchmark.sh create mode 100755 starrocks/count.sh create mode 100755 starrocks/create_and_load.sh create mode 100644 starrocks/ddl_lz4.sql create mode 100644 starrocks/ddl_zstd.sql create mode 100755 starrocks/install.sh create mode 100755 starrocks/load_data.sh create mode 100755 starrocks/main.sh create mode 100755 starrocks/physical_query_plans.sh create mode 100644 starrocks/queries.sql create mode 100644 starrocks/queries_formatted.sql create mode 100644 starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json create mode 100755 starrocks/run_queries.sh create mode 
100755 starrocks/total_size.sh diff --git a/starrocks/benchmark.sh b/starrocks/benchmark.sh new file mode 100755 index 0000000..dd8834f --- /dev/null +++ b/starrocks/benchmark.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +RESULT_FILE_RUNTIMES="$2" +RESULT_FILE_MEMORY_USAGE="$3" + +# Query log file +QUERY_LOG_FILE="query_log.txt" + +# Print the database name +echo "Running queries on database: $DB_NAME" + +# Run queries and log the output +./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" + +# Process the query log and prepare the result +RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ +awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') + +# Output the result +if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then + echo "$RESULT" > "$RESULT_FILE_RUNTIMES" + echo "Result written to $RESULT_FILE_RUNTIMES" +else + echo "$RESULT" +fi diff --git a/starrocks/count.sh b/starrocks/count.sh new file mode 100755 index 0000000..f15c21c --- /dev/null +++ b/starrocks/count.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +TABLE_NAME="$2" + +mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;" diff --git a/starrocks/create_and_load.sh b/starrocks/create_and_load.sh new file mode 100755 index 0000000..45863ca --- /dev/null +++ b/starrocks/create_and_load.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 7 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +TABLE_NAME="$2" +DDL_FILE="$3" +DATA_DIRECTORY="$4" +NUM_FILES="$5" +SUCCESS_LOG="$6" +ERROR_LOG="$7" + +# Validate arguments +[[ ! 
-f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } +[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } +[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } + + +# Create database +mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" + +# Execute DDL +mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "$DDL_FILE" + +# Load data +./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" + +echo "Script completed successfully." diff --git a/starrocks/ddl_lz4.sql b/starrocks/ddl_lz4.sql new file mode 100644 index 0000000..95916c8 --- /dev/null +++ b/starrocks/ddl_lz4.sql @@ -0,0 +1,12 @@ +CREATE TABLE bluesky ( + `id` BIGINT AUTO_INCREMENT, + -- Main JSON column (comes after key columns) + `data` JSON NULL COMMENT "Main JSON object", + -- Key columns (must come first in the schema and in the same order as DUPLICATE KEY) + `kind` VARCHAR(255) AS get_json_string(data, '$.kind'), + `operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'), + `collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'), + `did` VARCHAR(255) AS get_json_string(data, '$.did'), + `time_us` BIGINT AS get_json_int(data, '$.time_us') +) ENGINE=OLAP +ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`); diff --git a/starrocks/ddl_zstd.sql b/starrocks/ddl_zstd.sql new file mode 100644 index 0000000..e96786e --- /dev/null +++ b/starrocks/ddl_zstd.sql @@ -0,0 +1,15 @@ +CREATE TABLE bluesky ( + `id` BIGINT AUTO_INCREMENT, + -- Main JSON column (comes after key columns) + `data` JSON NULL COMMENT "Main JSON object", + -- Key columns (must come first in the schema and in the same order as ORDER BY) + `kind` VARCHAR(255) AS get_json_string(data, '$.kind'), + `operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'), + `collection` VARCHAR(255) AS 
get_json_string(data, '$.commit.collection'), + `did` VARCHAR(255) AS get_json_string(data, '$.did'), + `time_us` BIGINT AS get_json_int(data, '$.time_us') +) ENGINE=OLAP +ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`) +PROPERTIES ( +"compression" = "ZSTD" +); diff --git a/starrocks/install.sh b/starrocks/install.sh new file mode 100755 index 0000000..bc26085 --- /dev/null +++ b/starrocks/install.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +sudo snap install docker +sudo apt-get update +sudo apt-get install -y mysql-client +sudo docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name quickstart starrocks/allin1-ubuntu + diff --git a/starrocks/load_data.sh b/starrocks/load_data.sh new file mode 100755 index 0000000..125bb75 --- /dev/null +++ b/starrocks/load_data.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 6 ]]; then + echo "Usage: $0 " + exit 1 +fi + + +# Arguments +DATA_DIRECTORY="$1" +DB_NAME="$2" +TABLE_NAME="$3" +MAX_FILES="$4" +SUCCESS_LOG="$5" +ERROR_LOG="$6" + +# Validate arguments +[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } +[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } + +# Create a temporary directory for uncompressed files +TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) +trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit + +# Load data +counter=0 +for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do + echo "Processing file: $file" + + # Uncompress the file into the TEMP_DIR + uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" + gunzip -c "$file" > "$uncompressed_file" + + if [[ $? 
-ne 0 ]]; then + echo "Error: Failed to uncompress $file" >> "$ERROR_LOG" + continue + fi + MAX_ATTEMPT=10 + attempt=0 + while [ $attempt -lt $MAX_ATTEMPT ] + do + # Attempt the import + http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load) + response_body="$(cat /tmp/curl_body)" + response_status="$(cat /tmp/curl_body | jq -r '.Status')" + echo $response_status + if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then + if [ "$response_status" = "Success" ] + then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG" + rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing + attempt=$((MAX_ATTEMPT)) + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG" + attempt=$((attempt + 1)) + sleep 2 + fi + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG" + attempt=$((attempt + 1)) + sleep 2 + fi + done + + counter=$((counter + 1)) + if [[ $counter -ge $MAX_FILES ]]; then + break + fi +done + +echo "Script completed successfully." diff --git a/starrocks/main.sh b/starrocks/main.sh new file mode 100755 index 0000000..fc90bac --- /dev/null +++ b/starrocks/main.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Default data directory +DEFAULT_DATA_DIRECTORY=~/data/bluesky + +# Allow the user to optionally provide the data directory as an argument +DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" + +# Define success and error log files +SUCCESS_LOG="${2:-success.log}" +ERROR_LOG="${3:-error.log}" + +# Define prefix for output files +OUTPUT_PREFIX="${4:-_m6i.8xlarge}" + +# Check if the directory exists +if [[ ! 
-d "$DATA_DIRECTORY" ]]; then + echo "Error: Data directory '$DATA_DIRECTORY' does not exist." + exit 1 +fi + +echo "Select the dataset size to benchmark:" +echo "1) 1m (default)" +echo "2) 10m" +echo "3) 100m" +echo "4) 1000m" +echo "5) all" +read -p "Enter the number corresponding to your choice: " choice + +./install.sh + +benchmark() { + local size=$1 + local suffix=$2 + # Check DATA_DIRECTORY contains the required number of files to run the benchmark + file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) + if (( file_count < size )); then + echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." + exit 1 + fi + ./create_and_load.sh "bluesky_${size}m_${suffix}" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" + ./total_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.total_size" + ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count" + ./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans" + ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage" +} + +case $choice in + 2) + benchmark 10 lz4 + benchmark 10 zstd + ;; + 3) + benchmark 100 lz4 + benchmark 100 zstd + ;; + 4) + benchmark 1000 lz4 + benchmark 1000 zstd + ;; + 5) + benchmark 1 lz4 + benchmark 1 zstd + benchmark 10 lz4 + benchmark 10 zstd + benchmark 100 lz4 + benchmark 100 zstd + benchmark 1000 lz4 + benchmark 1000 zstd + ;; + *) + benchmark 1 lz4 + benchmark 1 zstd + ;; +esac diff --git a/starrocks/physical_query_plans.sh b/starrocks/physical_query_plans.sh new file mode 100755 index 0000000..993fcae --- /dev/null +++ b/starrocks/physical_query_plans.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 1 ]]; 
then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" + +QUERY_NUM=1 + +cat queries.sql | while read -r query; do + + # Print the query number + echo "------------------------------------------------------------------------------------------------------------------------" + echo "Physical query plan for query Q$QUERY_NUM:" + echo + mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "EXPLAIN $query" + + # Increment the query number + QUERY_NUM=$((QUERY_NUM + 1)) +done; diff --git a/starrocks/queries.sql b/starrocks/queries.sql new file mode 100644 index 0000000..9a3e6f6 --- /dev/null +++ b/starrocks/queries.sql @@ -0,0 +1,5 @@ +SELECT cast(data->'commit.collection' AS VARCHAR) AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; +SELECT cast(data->'commit.collection' AS VARCHAR) AS event, count() AS count, count(DISTINCT cast(data->'did' AS VARCHAR)) AS users FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') GROUP BY event ORDER BY count DESC; +SELECT cast(data->'commit.collection' AS VARCHAR) AS event, hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; +SELECT cast(data->'$.did' as VARCHAR) as user_id, min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; +SELECT cast(data->'$.did' as VARCHAR) as user_id, date_diff('millisecond', min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 
1000000)))),max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; diff --git a/starrocks/queries_formatted.sql b/starrocks/queries_formatted.sql new file mode 100644 index 0000000..b549847 --- /dev/null +++ b/starrocks/queries_formatted.sql @@ -0,0 +1,66 @@ +------------------------------------------------------------------------------------------------------------------------ +-- Q1 - Top event types +------------------------------------------------------------------------------------------------------------------------ + +SELECT cast(data->'commit.collection' AS VARCHAR) AS event, + count() AS count +FROM bluesky +GROUP BY event +ORDER BY count DESC; + +------------------------------------------------------------------------------------------------------------------------ +-- Q2 - Top event types together with unique users per event type +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'commit.collection' AS VARCHAR) AS event, + count() AS count, + count(DISTINCT cast(data->'did' AS VARCHAR)) AS users +FROM bluesky +WHERE (data->'kind' = 'commit') + AND (data->'commit.operation' = 'create') +GROUP BY event +ORDER BY count DESC; + +------------------------------------------------------------------------------------------------------------------------ +-- Q3 - When do people use BlueSky +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'commit.collection' AS VARCHAR) AS event, + hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, + count() AS count +FROM bluesky +WHERE (data->'kind' = 'commit') +AND 
(data->'commit.operation' = 'create') +AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) +GROUP BY event, hour_of_day +ORDER BY hour_of_day, event; + +------------------------------------------------------------------------------------------------------------------------ +-- Q4 - top 3 post veterans +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'$.did' as VARCHAR) as user_id, + min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date +FROM bluesky +WHERE (data->'kind' = 'commit') + AND (data->'commit.operation' = 'create') + AND (data->'commit.collection' = 'app.bsky.feed.post') +GROUP BY user_id +ORDER BY first_post_date ASC +LIMIT 3; + +------------------------------------------------------------------------------------------------------------------------ +-- Q5 - top 3 users with longest activity +------------------------------------------------------------------------------------------------------------------------ +SELECT + cast(data->'$.did' as VARCHAR) as user_id, + date_diff('millisecond', + min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))), + max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span +FROM bluesky +WHERE (data->'kind' = 'commit') + AND (data->'commit.operation' = 'create') + AND (data->'commit.collection' = 'app.bsky.feed.post') +GROUP BY user_id +ORDER BY activity_span DESC +LIMIT 3; diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json new file mode 100644 index 0000000..506051d --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 
10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 100000000, + "dataset_size_readable": "100m", + "num_loaded_documents": 100000000, + "data_compression": "lz4", + "total_size": 19182000000, + "total_size_readable": "19.182 GB", + "result": [ + [0.25,0.17,0.17], + [8.13,4.33,3.82], + [3.18,3.08,3.05], + [4.06,4.07,4.12], + [4.04,4.20,3.97] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json new file mode 100644 index 0000000..239d8da --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 100000000, + "dataset_size_readable": "100m", + "num_loaded_documents": 100000000, + "data_compression": "zstd", + "total_size": 31200000000, + "total_size_readable": "31.200 GB", + "result": [ + [0.22,0.17,0.18], + [28.09,3.94,3.89], + [3.04,3.05,3.11], + [3.99,4.04,3.94], + [4.13,4.12,4.11] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json new file mode 100644 index 0000000..3d0e830 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 10000000, + "dataset_size_readable": "10m", + "num_loaded_documents": 9999994, + "data_compression": "lz4", + "total_size": 1967000000, + "total_size_readable": "1.967 GB", + "result": [ + [0.11,0.10,0.10], + [0.45,0.40,0.42], + [0.58,0.45,0.50], + [0.57,0.62,0.61], + [0.69,0.60,0.55] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json 
b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json new file mode 100644 index 0000000..4c9a4e7 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 10000000, + "dataset_size_readable": "10m", + "num_loaded_documents": 10000000, + "data_compression": "zstd", + "total_size": 3193000000, + "total_size_readable": "3.193 GB", + "result": [ + [0.10,0.12,0.09], + [1.79,0.43,0.37], + [0.48,0.45,0.47], + [0.57,0.70,0.62], + [0.59,0.72,0.73] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json new file mode 100644 index 0000000..9ba51b6 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 1000000, + "dataset_size_readable": "1m", + "num_loaded_documents": 1000000, + "data_compression": "lz4", + "total_size": 201845000, + "total_size_readable": "201.845 MB", + "result": [ + [0.65,0.05,0.05], + [0.36,0.28,0.29], + [0.31,0.28,0.28], + [0.52,0.52,0.51], + [0.51,0.51,0.52] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json new file mode 100644 index 0000000..7373bb6 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json @@ -0,0 +1,24 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "tags": [ + ], + "dataset_size": 1000000, + "dataset_size_readable": "1m", + "num_loaded_documents": 
1000000, + "data_compression": "zstd", + "total_size": 322945000, + "total_size_readable": "322.945 MB", + "result": [ + [0.05,0.05,0.04], + [0.37,0.28,0.29], + [0.28,0.27,0.27], + [0.51,0.51,0.51], + [0.50,0.50,0.51] + ] +} diff --git a/starrocks/run_queries.sh b/starrocks/run_queries.sh new file mode 100755 index 0000000..019abe9 --- /dev/null +++ b/starrocks/run_queries.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" + +TRIES=3 + +cat queries.sql | while read -r query; do + + # Clear the Linux file system cache + echo "Clearing file system cache..." + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + echo "File system cache cleared." + + # Print the query + echo "Running query: $query" + + # Execute the query multiple times + for i in $(seq 1 $TRIES); do + RESP=$(mysql -vvv -h127.1 -P9030 -uroot "$DB_NAME" -e "$query" | perl -nle 'print $1 if /\((\d+\.\d+)+ sec\)/' ||:) + echo "Response time: ${RESP} s" + done; +done; diff --git a/starrocks/total_size.sh b/starrocks/total_size.sh new file mode 100755 index 0000000..704b6f8 --- /dev/null +++ b/starrocks/total_size.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Arguments +DB_NAME="$1" +TABLE_NAME="$2" + +mysql -P 9030 -h 127.0.0.1 -u root -e "SHOW DATA FROM $DB_NAME.$TABLE_NAME" From f085fb8285df3b6aff5598f712efb40b12c1116d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Mar 2025 19:24:37 +0000 Subject: [PATCH 2/2] More fixups --- starrocks/create_and_load.sh | 8 +++--- starrocks/drop_table.sh | 14 +++++++++++ starrocks/load_data.sh | 4 +-- starrocks/main.sh | 1 + .../m6i.8xlarge_bluesky_1000m_lz4.json | 25 +++++++++++++++++++ .../m6i.8xlarge_bluesky_1000m_zstd.json | 25 +++++++++++++++++++ .../results/m6i.8xlarge_bluesky_100m_lz4.json | 1 + 
.../m6i.8xlarge_bluesky_100m_zstd.json | 1 + .../results/m6i.8xlarge_bluesky_10m_lz4.json | 1 + .../results/m6i.8xlarge_bluesky_10m_zstd.json | 1 + .../results/m6i.8xlarge_bluesky_1m_lz4.json | 1 + .../results/m6i.8xlarge_bluesky_1m_zstd.json | 1 + 12 files changed, 75 insertions(+), 8 deletions(-) create mode 100755 starrocks/drop_table.sh create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json create mode 100644 starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json diff --git a/starrocks/create_and_load.sh b/starrocks/create_and_load.sh index 45863ca..e8d84bd 100755 --- a/starrocks/create_and_load.sh +++ b/starrocks/create_and_load.sh @@ -21,13 +21,11 @@ ERROR_LOG="$7" [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } -# Create database +echo "Create database" mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" -# Execute DDL +echo "Execute DDL" mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "$DDL_FILE" -# Load data +echo "Load data" ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" - -echo "Script completed successfully." 
diff --git a/starrocks/drop_table.sh b/starrocks/drop_table.sh new file mode 100755 index 0000000..5aec315 --- /dev/null +++ b/starrocks/drop_table.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Check if the required arguments are provided +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +DB_NAME="$1" +TABLE_NAME="$2" + +echo "Dropping table: $DB_NAME.$TABLE_NAME" + +mysql -P 9030 -h 127.0.0.1 -u root -e "DROP TABLE IF EXISTS $DB_NAME.$TABLE_NAME" diff --git a/starrocks/load_data.sh b/starrocks/load_data.sh index 125bb75..13b10f3 100755 --- a/starrocks/load_data.sh +++ b/starrocks/load_data.sh @@ -38,7 +38,7 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do fi MAX_ATTEMPT=10 attempt=0 - while [ $attempt -lt $MAX_ATTEMPT ] + while [ $attempt -lt $MAX_ATTEMPT ] do # Attempt the import http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load) @@ -68,5 +68,3 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do break fi done - -echo "Script completed successfully." 
diff --git a/starrocks/main.sh b/starrocks/main.sh index fc90bac..d825140 100755 --- a/starrocks/main.sh +++ b/starrocks/main.sh @@ -43,6 +43,7 @@ benchmark() { ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count" ./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans" ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage" + ./drop_table.sh "bluesky_${size}m_${suffix}" bluesky } case $choice in diff --git a/starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json new file mode 100644 index 0000000..b5cf4c4 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json @@ -0,0 +1,25 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "retains_structure": "yes", + "tags": [ + ], + "dataset_size": 1000000000, + "dataset_size_readable": "1000m", + "num_loaded_documents": 1000000000, + "data_compression": "lz4", + "total_size": 19182000000, + "total_size_readable": "19.182 GB", + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json new file mode 100644 index 0000000..19acf94 --- /dev/null +++ b/starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json @@ -0,0 +1,25 @@ +{ + "system": "Starrocks", + "version": "3.4.0-e94580b", + "os": "Ubuntu 24.04", + "date": "2025-01-13", + "machine": "m6i.8xlarge, 10000gib gp3", + "cluster_size": 1, + "comment": "", + "retains_structure": "yes", + "tags": [ + ], + "dataset_size": 1000000000, 
"dataset_size_readable": "1000m", + "num_loaded_documents": 1000000000, + "data_compression": "zstd", + "total_size": 31200000000, + "total_size_readable": "31.200 GB", + "result": [ + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null], + [null, null, null] + ] +} diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json index 506051d..07a36cc 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_lz4.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 100000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json index 239d8da..d147ee1 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json +++ b/starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 100000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json index 3d0e830..6c9443e 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_lz4.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 10000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json index 4c9a4e7..266a147 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json +++ b/starrocks/results/m6i.8xlarge_bluesky_10m_zstd.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 10000000, diff 
--git a/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json index 9ba51b6..40f2283 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_lz4.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 1000000, diff --git a/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json index 7373bb6..f5de5d0 100644 --- a/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json +++ b/starrocks/results/m6i.8xlarge_bluesky_1m_zstd.json @@ -6,6 +6,7 @@ "machine": "m6i.8xlarge, 10000gib gp3", "cluster_size": 1, "comment": "", + "retains_structure": "yes", "tags": [ ], "dataset_size": 1000000,