Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ While the main benchmark uses a specific machine configuration for reproducibili
- [x] SingleStore
- [x] GreptimeDB
- [x] FerretDB
- [x] Apache Doris
- [ ] Quickwit
- [ ] Meilisearch
- [ ] Sneller
Expand Down
35 changes: 35 additions & 0 deletions doris/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

# If you change something in this file, please change also in starrocks/benchmark.sh.
#
# Runs the benchmark queries against a database and extracts per-query
# runtimes (three runs per query) into an array-of-arrays text format.

# Check if the required arguments are provided
if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <DB_NAME> <RESULT_FILE_RUNTIMES> <QUERIES_FILE>"
    exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE_RUNTIMES="$2"
QUERIES_FILE="$3"

# Query log file: captures the raw output of run_queries.sh
QUERY_LOG_FILE="query_log.txt"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output
# (fix: use "$QUERY_LOG_FILE" instead of a second hard-coded file name)
./run_queries.sh "$DB_NAME" "$QUERIES_FILE" 2>&1 | tee "$QUERY_LOG_FILE"

# Extract every "Response time: <seconds> s" value from the log and group
# them three per query: [t1,t2,t3],[t1,t2,t3],...
# (fix: grep reads the file directly — no useless 'cat')
RESULT=$(grep -oP 'Response time: \d+\.\d+ s' "$QUERY_LOG_FILE" \
    | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' \
    | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')

# Output the result: to the result file when one was given, else to stdout
if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then
    echo "$RESULT" > "$RESULT_FILE_RUNTIMES"
    echo "Result written to $RESULT_FILE_RUNTIMES"
else
    echo "$RESULT"
fi
15 changes: 15 additions & 0 deletions doris/count.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# If you change something in this file, please change also in starrocks/count.sh.
#
# Prints the row count of a table via the Doris MySQL-protocol frontend.

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

# fix: quote "$DB_NAME" so names with unusual characters don't word-split (SC2086)
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"
32 changes: 32 additions & 0 deletions doris/create_and_load.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Creates the benchmark database and table, then delegates the bulk load to
# load_data.sh and waits so the backend can report an accurate data size.

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DATA_DIRECTORY="$3"
NUM_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }


echo "Create database"
# fix: quote "$DB_NAME" (SC2086) and abort if the server is unreachable
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" || exit 1

echo "Execute DDL"
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" < "ddl.sql" || exit 1

echo "Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

# Give Doris time to finish compactions so total_size.sh reports a
# settled on-disk size.
echo "Sleep 120 sec to collect data size"
sleep 120s
8 changes: 8 additions & 0 deletions doris/ddl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Table for the bluesky benchmark dataset.
-- `data` holds the raw JSON event in a Doris VARIANT column; `id` is a
-- synthetic auto-increment key used only for hash distribution.
CREATE TABLE bluesky (
`id` BIGINT NOT NULL AUTO_INCREMENT,
`data` variant NOT NULL
)
-- 32 buckets spread the data across tablets on a single backend.
DISTRIBUTED BY HASH(id) BUCKETS 32
PROPERTIES (
-- Single-node benchmark: no replication.
"replication_num"="1"
);
15 changes: 15 additions & 0 deletions doris/drop_table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# If you change something in this file, please change also in starrocks/drop_table.sh.
#
# Drops a table (if it exists) from the given database.

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

DB_NAME="$1"
TABLE_NAME="$2"

echo "Dropping table: $DB_NAME.$TABLE_NAME"
# fix: quote "$DB_NAME" so the argument never word-splits (SC2086)
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"
8 changes: 8 additions & 0 deletions doris/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

# Downloads and unpacks the Apache Doris binary distribution and installs
# the client-side dependencies. Expects DORIS_FULL_NAME in the environment
# (exported by main.sh), e.g. "apache-doris-3.0.5-bin-x64".

# fix: fail loudly if DORIS_FULL_NAME is unset — otherwise wget would
# request a bare ".tar.gz" and tar would extract into the wrong place.
: "${DORIS_FULL_NAME:?DORIS_FULL_NAME must be set (exported by main.sh)}"

wget "https://apache-doris-releases.oss-accelerate.aliyuncs.com/${DORIS_FULL_NAME}.tar.gz" || exit 1
# fix: -p makes the script re-runnable when the directory already exists
mkdir -p "${DORIS_FULL_NAME}"
tar -xvf "${DORIS_FULL_NAME}.tar.gz" --strip-components 1 -C "${DORIS_FULL_NAME}"

sudo apt-get update
sudo apt-get install -y mysql-client openjdk-17-jre-headless # somehow _EXACTLY_ v17 is needed
75 changes: 75 additions & 0 deletions doris/load_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# Streams up to MAX_FILES gzipped JSON files into a Doris table via the
# Stream Load HTTP API, retrying each file up to 10 times before moving on.
# Successes and failures are appended to the given log files.

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi


# Arguments
DATA_DIRECTORY="$1"
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

# Create a temporary directory for uncompressed files
TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX)
trap 'rm -rf "$TEMP_DIR"' EXIT # Cleanup temp directory on script exit (fix: single-quoted so it is safe even if TEMP_DIR contains spaces)
# fix: keep curl's response body in our private temp dir instead of the
# predictable, world-shared /tmp/curl_body
CURL_BODY="$TEMP_DIR/curl_body"

# Load data
counter=0
start=0           # resume offset: files whose embedded number is <= start are skipped
shopt -s nullglob # an empty directory yields zero iterations, not a literal pattern
# fix: iterate over the glob directly instead of parsing `ls` (SC2045);
# the $counter check below enforces the MAX_FILES limit.
for file in "$DATA_DIRECTORY"/*.json.gz; do
    echo "Processing file: $file"
    # First run of digits in the file name, used as its sequence number.
    num=$(echo "$file" | sed -n 's/[^0-9]*\([0-9]\+\).*/\1/p')
    # fix: guard against names without digits — the original numeric test
    # errored on an empty $num
    if [[ -n "$num" && "$num" -le "$start" ]]; then
        continue
    fi

    # Uncompress the file into the TEMP_DIR
    uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
    if ! gunzip -c "$file" > "$uncompressed_file"; then
        echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
        continue
    fi

    MAX_ATTEMPT=10
    attempt=0
    while [[ $attempt -lt $MAX_ATTEMPT ]]; do
        # Attempt the import via the Stream Load API.
        # fix: write the body straight to a file — the original
        # `-o >(cat >/tmp/curl_body)` process substitution could race with
        # the reads below.
        http_code=$(curl -s -w "%{http_code}" -o "$CURL_BODY" --location-trusted -u root: \
            -H "max_filter_ratio: 0.1" -H "Expect:100-continue" -H "columns: data" \
            -T "$uncompressed_file" -XPUT "http://127.0.0.1:8030/api/$DB_NAME/$TABLE_NAME/_stream_load")
        response_body="$(cat "$CURL_BODY")"
        response_status="$(jq -r '.Status' "$CURL_BODY")"
        echo "$response_status"
        # Doris may answer HTTP 200 with a non-Success JSON status, so both
        # must be checked before declaring victory.
        if [[ "$http_code" -ge 200 && "$http_code" -lt 300 && "$response_status" == "Success" ]]; then
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG"
            rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
            break
        else
            # fix: the two identical failure branches are merged into one
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG"
            attempt=$((attempt + 1))
            sleep 2
        fi
    done

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done
79 changes: 79 additions & 0 deletions doris/main.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash

# If you change something in this file, please change also in starrocks/main.sh.
#
# Entry point: installs Doris, runs the benchmark for the chosen dataset
# size(s), then stops and uninstalls the server.

export DORIS_FULL_NAME="apache-doris-3.0.5-bin-x64"

DEFAULT_CHOICE=ask
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the scale factor ("choice") as an argument
CHOICE="${1:-$DEFAULT_CHOICE}"

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${3:-success.log}"
ERROR_LOG="${4:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
    echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
    exit 1
fi

if [ "$CHOICE" = "ask" ]; then
    echo "Select the dataset size to benchmark:"
    echo "1) 1m (default)"
    echo "2) 10m"
    echo "3) 100m"
    echo "4) 1000m"
    echo "5) all"
    # fix: -r keeps read from mangling backslashes in the input (SC2162)
    read -r -p "Enter the number corresponding to your choice: " CHOICE
fi;

./install.sh
./start.sh

# Runs one full benchmark cycle (load, measure size/count, query, drop)
# for the dataset of `size` million documents. One input file per million.
benchmark() {
    local size=$1
    local file_count
    # Check DATA_DIRECTORY contains the required number of files to run the benchmark
    file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
    if (( file_count < size )); then
        echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
        exit 1
    fi
    ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
    ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size"
    ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count"
    ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" "queries.sql"
    ./drop_table.sh "bluesky_${size}m" bluesky
}

case $CHOICE in
    2)
        benchmark 10
        ;;
    3)
        benchmark 100
        ;;
    4)
        benchmark 1000
        ;;
    5)
        benchmark 1
        benchmark 10
        benchmark 100
        benchmark 1000
        ;;
    *)
        benchmark 1
        ;;
esac

./stop.sh
./uninstall.sh
5 changes: 5 additions & 0 deletions doris/queries_default.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC;
SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count, COUNT(DISTINCT cast(data['did'] AS TEXT )) AS users FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' GROUP BY event ORDER BY count DESC;
SELECT cast(data['commit']['collection'] AS TEXT ) AS event, HOUR(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT cast(data['did'] AS TEXT ) AS user_id, MIN(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS first_post_ts FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
SELECT cast(data['did'] AS TEXT ) AS user_id, MILLISECONDS_DIFF(MAX(from_microsecond(CAST(data['time_us'] AS BIGINT))),MIN(from_microsecond(CAST(data['time_us'] AS BIGINT)))) AS activity_span FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
5 changes: 5 additions & 0 deletions doris/queries_materialized.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT collection AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC;
SELECT collection AS event, COUNT(*) AS count, COUNT(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND operation = 'create' GROUP BY event ORDER BY count DESC;
SELECT collection AS event, HOUR(time) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT did AS user_id, MIN(time) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
SELECT did AS user_id, MILLISECONDS_DIFF(MAX(time),MIN(time)) AS activity_span FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
71 changes: 71 additions & 0 deletions doris/results/_query_results/_m6i.8xlarge_bluesky_1m.query_results
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
------------------------------------------------------------------------------------------------------------------------
Result for query Q1:
+----------------------------+--------+
| event | count |
+----------------------------+--------+
| app.bsky.feed.like | 448944 |
| app.bsky.graph.follow | 360374 |
| app.bsky.feed.post | 90816 |
| app.bsky.feed.repost | 58540 |
| app.bsky.graph.block | 14040 |
| app.bsky.actor.profile | 11762 |
| app.bsky.graph.listitem | 8103 |
| NULL | 5328 |
| app.bsky.graph.listblock | 895 |
| app.bsky.graph.starterpack | 405 |
| app.bsky.graph.list | 356 |
| app.bsky.feed.threadgate | 255 |
| app.bsky.feed.postgate | 104 |
| app.bsky.feed.generator | 74 |
| app.bsky.labeler.service | 4 |
+----------------------------+--------+

------------------------------------------------------------------------------------------------------------------------
Result for query Q2:
+----------------------------+--------+--------+
| event | count | users |
+----------------------------+--------+--------+
| app.bsky.feed.like | 444523 | 117617 |
| app.bsky.graph.follow | 337978 | 63957 |
| app.bsky.feed.post | 86812 | 50464 |
| app.bsky.feed.repost | 56993 | 26581 |
| app.bsky.graph.block | 13838 | 5785 |
| app.bsky.graph.listitem | 7568 | 1078 |
| app.bsky.actor.profile | 5337 | 5337 |
| app.bsky.graph.listblock | 860 | 449 |
| app.bsky.graph.list | 259 | 218 |
| app.bsky.feed.threadgate | 228 | 196 |
| app.bsky.graph.starterpack | 104 | 101 |
| app.bsky.feed.postgate | 101 | 82 |
| app.bsky.feed.generator | 10 | 9 |
+----------------------------+--------+--------+

------------------------------------------------------------------------------------------------------------------------
Result for query Q3:
+----------------------+-------------+--------+
| event | hour_of_day | count |
+----------------------+-------------+--------+
| app.bsky.feed.like | 16 | 444523 |
| app.bsky.feed.post | 16 | 86812 |
| app.bsky.feed.repost | 16 | 56993 |
+----------------------+-------------+--------+

------------------------------------------------------------------------------------------------------------------------
Result for query Q4:
+----------------------------------+----------------------------+
| user_id | first_post_ts |
+----------------------------------+----------------------------+
| did:plc:yj3sjq3blzpynh27cumnp5ks | 2024-11-21 16:25:49.000167 |
| did:plc:l5o3qjrmfztir54cpwlv2eme | 2024-11-21 16:25:49.001905 |
| did:plc:s4bwqchfzm6gjqfeb6mexgbu | 2024-11-21 16:25:49.003907 |
+----------------------------------+----------------------------+

------------------------------------------------------------------------------------------------------------------------
Result for query Q5:
+----------------------------------+---------------+
| user_id | activity_span |
+----------------------------------+---------------+
| did:plc:tsyymlun4eqjuw7hqrhmwagd | 813006 |
| did:plc:3ug235sfy2pz7cawmpsftb65 | 811602 |
| did:plc:doxhhgtxqiv47tmcovpbcqai | 811404 |
+----------------------------------+---------------+
21 changes: 21 additions & 0 deletions doris/results/m6i.8xlarge_bluesky_1000m.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"system": "Apache Doris",
"version": "doris-3.0.5-rc01-e277cfb83f",
"os": "Ubuntu 24.04",
"date": "2025-05-13",
"machine": "m6i.8xlarge, 10000gib gp3",
"retains_structure": "yes",
"tags": [
],
"dataset_size": 1000000000,
"num_loaded_documents": 999999994,
"total_size": 214623810748,
"data_size": 214623810748,
"result": [
[4.24,1.67,1.66],
[222.33,11.01,10.16],
[27.82,7.47,7.45],
[235.20,6.41,6.07],
[234.71,6.37,5.78]
]
}
Loading