33 changes: 33 additions & 0 deletions starrocks/benchmark.sh
@@ -0,0 +1,33 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <DB_NAME> <RESULT_FILE_RUNTIMES> <RESULT_FILE_MEMORY_USAGE>"
exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE_RUNTIMES="$2"
RESULT_FILE_MEMORY_USAGE="$3"

# Construct the query log file name using $DB_NAME so runs don't overwrite each other
QUERY_LOG_FILE="query_log_${DB_NAME}.txt"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output
./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE"

# Extract the per-run response times from the query log and group them into
# one bracketed triple per query (three runs per query), e.g. [0.412,0.287,0.279],
RESULT=$(grep -oP 'Response time: \d+\.\d+ s' "$QUERY_LOG_FILE" | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \
awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')

# Output the runtimes (note: RESULT_FILE_MEMORY_USAGE is required by the argument
# check above but is not yet written to by this script)
if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then
echo "$RESULT" > "$RESULT_FILE_RUNTIMES"
echo "Result written to $RESULT_FILE_RUNTIMES"
else
echo "$RESULT"
fi
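For context, the extraction step above assumes run_queries.sh emits one "Response time: N.NNN s" line per query run, three runs per query. An illustrative excerpt of the query log (timings made up):

Response time: 0.412 s
Response time: 0.287 s
Response time: 0.279 s

The grep/sed/awk chain folds each such triple into [0.412,0.287,0.279], on a single line of the runtimes file.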
13 changes: 13 additions & 0 deletions starrocks/count.sh
@@ -0,0 +1,13 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "SELECT count() FROM \`$TABLE_NAME\`;"
31 changes: 31 additions & 0 deletions starrocks/create_and_load.sh
@@ -0,0 +1,31 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 7 ]]; then
echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DDL_FILE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DDL_FILE="$3"
DATA_DIRECTORY="$4"
NUM_FILES="$5"
SUCCESS_LOG="$6"
ERROR_LOG="$7"

# Validate arguments
[[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; }
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }


echo "Create database"
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"

echo "Execute DDL"
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" < "$DDL_FILE"

echo "Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"
12 changes: 12 additions & 0 deletions starrocks/ddl_lz4.sql
@@ -0,0 +1,12 @@
CREATE TABLE bluesky (
`id` BIGINT AUTO_INCREMENT,
-- Main JSON column (the generated key columns below are derived from it)
`data` JSON NULL COMMENT "Main JSON object",
-- Generated key columns (defined after the regular columns, in the same order as ORDER BY)
`kind` VARCHAR(255) AS get_json_string(data, '$.kind'),
`operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'),
`collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
`did` VARCHAR(255) AS get_json_string(data, '$.did'),
`time_us` BIGINT AS get_json_int(data, '$.time_us')
) ENGINE=OLAP
ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`);
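A quick sanity check after loading (illustrative; any small LIMIT works) confirms that the generated key columns are materialized from `data`:

SELECT kind, operation, collection, time_us FROM bluesky LIMIT 3;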
15 changes: 15 additions & 0 deletions starrocks/ddl_zstd.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Same schema as ddl_lz4.sql; only the ZSTD compression property differs
CREATE TABLE bluesky (
`id` BIGINT AUTO_INCREMENT,
-- Main JSON column (the generated key columns below are derived from it)
`data` JSON NULL COMMENT "Main JSON object",
-- Generated key columns (defined after the regular columns, in the same order as ORDER BY)
`kind` VARCHAR(255) AS get_json_string(data, '$.kind'),
`operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'),
`collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
`did` VARCHAR(255) AS get_json_string(data, '$.did'),
`time_us` BIGINT AS get_json_int(data, '$.time_us')
) ENGINE=OLAP
ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`)
PROPERTIES (
"compression" = "ZSTD"
);
14 changes: 14 additions & 0 deletions starrocks/drop_table.sh
@@ -0,0 +1,14 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
exit 1
fi

DB_NAME="$1"
TABLE_NAME="$2"

echo "Dropping table: $DB_NAME.$TABLE_NAME"

mysql -P 9030 -h 127.0.0.1 -u root -e "DROP TABLE IF EXISTS $DB_NAME.$TABLE_NAME"
7 changes: 7 additions & 0 deletions starrocks/install.sh
@@ -0,0 +1,7 @@
#!/bin/bash

sudo snap install docker
sudo apt-get update
sudo apt-get install -y mysql-client
sudo docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name quickstart starrocks/allin1-ubuntu

70 changes: 70 additions & 0 deletions starrocks/load_data.sh
@@ -0,0 +1,70 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
exit 1
fi


# Arguments
DATA_DIRECTORY="$1"
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

# Create a temporary directory for uncompressed files
TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX)
trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit

# Load data
counter=0
for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
echo "Processing file: $file"

# Uncompress the file into the TEMP_DIR
uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
gunzip -c "$file" > "$uncompressed_file"

if [[ $? -ne 0 ]]; then
echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
continue
fi
MAX_ATTEMPTS=10
attempt=0
while [[ $attempt -lt $MAX_ATTEMPTS ]]
do
# Attempt the import; write the response body straight to a temp file and capture the HTTP code
http_code=$(curl -s -w "%{http_code}" -o /tmp/curl_body --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
response_body="$(cat /tmp/curl_body)"
response_status="$(jq -r '.Status' /tmp/curl_body)"
echo "$response_status"
if [[ "$http_code" -ge 200 && "$http_code" -lt 300 && "$response_status" == "Success" ]]; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG"
rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
break
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Attempt $attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG"
attempt=$((attempt + 1))
sleep 2
fi
done

counter=$((counter + 1))
if [[ $counter -ge $MAX_FILES ]]; then
break
fi
done
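For reference, the .Status field that jq extracts comes from the JSON body StarRocks returns for each Stream Load request. A trimmed, illustrative success response (field values made up):

{
"Status": "Success",
"NumberLoadedRows": 1000000,
"LoadTimeMs": 2834
}

Any other Status value (e.g. "Fail" or "Label Already Exists") falls through to the retry branch above.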
76 changes: 76 additions & 0 deletions starrocks/main.sh
@@ -0,0 +1,76 @@
#!/bin/bash

# Default data directory
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${2:-success.log}"
ERROR_LOG="${3:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${4:-_m6i.8xlarge}"

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
exit 1
fi

echo "Select the dataset size to benchmark:"
echo "1) 1m (default)"
echo "2) 10m"
echo "3) 100m"
echo "4) 1000m"
echo "5) all"
read -p "Enter the number corresponding to your choice: " choice

./install.sh

benchmark() {
local size=$1
local suffix=$2
# Check DATA_DIRECTORY contains the required number of files to run the benchmark
file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
if (( file_count < size )); then
echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
exit 1
fi
./create_and_load.sh "bluesky_${size}m_${suffix}" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
./total_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.total_size"
./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count"
./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans"
./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage"
./drop_table.sh "bluesky_${size}m_${suffix}" bluesky
}

case $choice in
2)
benchmark 10 lz4
benchmark 10 zstd
;;
3)
benchmark 100 lz4
benchmark 100 zstd
;;
4)
benchmark 1000 lz4
benchmark 1000 zstd
;;
5)
benchmark 1 lz4
benchmark 1 zstd
benchmark 10 lz4
benchmark 10 zstd
benchmark 100 lz4
benchmark 100 zstd
benchmark 1000 lz4
benchmark 1000 zstd
;;
*)
benchmark 1 lz4
benchmark 1 zstd
;;
esac
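A typical invocation (illustrative paths; all four arguments are optional and fall back to the defaults above):

./main.sh ~/data/bluesky success.log error.log _m6i.8xlarge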
24 changes: 24 additions & 0 deletions starrocks/physical_query_plans.sh
@@ -0,0 +1,24 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
echo "Usage: $0 <DB_NAME>"
exit 1
fi

# Arguments
DB_NAME="$1"

QUERY_NUM=1

while read -r query; do

# Print the query number
echo "------------------------------------------------------------------------------------------------------------------------"
echo "Physical query plan for query Q$QUERY_NUM:"
echo
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "EXPLAIN $query"

# Increment the query number
QUERY_NUM=$((QUERY_NUM + 1))
done < queries.sql
5 changes: 5 additions & 0 deletions starrocks/queries.sql
@@ -0,0 +1,5 @@
SELECT cast(data->'commit.collection' AS VARCHAR) AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
SELECT cast(data->'commit.collection' AS VARCHAR) AS event, count() AS count, count(DISTINCT cast(data->'did' AS VARCHAR)) AS users FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') GROUP BY event ORDER BY count DESC;
SELECT cast(data->'commit.collection' AS VARCHAR) AS event, hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT cast(data->'$.did' as VARCHAR) as user_id, min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
SELECT cast(data->'$.did' as VARCHAR) as user_id, date_diff('millisecond', min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))),max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
66 changes: 66 additions & 0 deletions starrocks/queries_formatted.sql
@@ -0,0 +1,66 @@
------------------------------------------------------------------------------------------------------------------------
-- Q1 - Top event types
------------------------------------------------------------------------------------------------------------------------

SELECT cast(data->'commit.collection' AS VARCHAR) AS event,
count() AS count
FROM bluesky
GROUP BY event
ORDER BY count DESC;

------------------------------------------------------------------------------------------------------------------------
-- Q2 - Top event types together with unique users per event type
------------------------------------------------------------------------------------------------------------------------
SELECT
cast(data->'commit.collection' AS VARCHAR) AS event,
count() AS count,
count(DISTINCT cast(data->'did' AS VARCHAR)) AS users
FROM bluesky
WHERE (data->'kind' = 'commit')
AND (data->'commit.operation' = 'create')
GROUP BY event
ORDER BY count DESC;

------------------------------------------------------------------------------------------------------------------------
-- Q3 - When do people use BlueSky
------------------------------------------------------------------------------------------------------------------------
SELECT
cast(data->'commit.collection' AS VARCHAR) AS event,
hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day,
count() AS count
FROM bluesky
WHERE (data->'kind' = 'commit')
AND (data->'commit.operation' = 'create')
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR)))
GROUP BY event, hour_of_day
ORDER BY hour_of_day, event;

------------------------------------------------------------------------------------------------------------------------
-- Q4 - top 3 post veterans
------------------------------------------------------------------------------------------------------------------------
SELECT
cast(data->'$.did' as VARCHAR) as user_id,
min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date
FROM bluesky
WHERE (data->'kind' = 'commit')
AND (data->'commit.operation' = 'create')
AND (data->'commit.collection' = 'app.bsky.feed.post')
GROUP BY user_id
ORDER BY first_post_date ASC
LIMIT 3;

------------------------------------------------------------------------------------------------------------------------
-- Q5 - top 3 users with longest activity
------------------------------------------------------------------------------------------------------------------------
SELECT
cast(data->'$.did' as VARCHAR) as user_id,
date_diff('millisecond',
min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))),
max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span
FROM bluesky
WHERE (data->'kind' = 'commit')
AND (data->'commit.operation' = 'create')
AND (data->'commit.collection' = 'app.bsky.feed.post')
GROUP BY user_id
ORDER BY activity_span DESC
LIMIT 3;