From c2a356191fe5b0053bd34e0d803ca97d09cc5bb4 Mon Sep 17 00:00:00 2001
From: Conor McCarter
Date: Thu, 6 Nov 2025 16:51:14 -0800
Subject: [PATCH 1/3] Add Databricks and benchmark results for most SQL warehouse options

---
 README.md                        |   1 -
 databricks/.env.example          |  22 ++
 databricks/NOTES.md              |   4 +
 databricks/README.md             |  47 ++++
 databricks/benchmark.py          | 361 +++++++++++++++++++++++++++++++
 databricks/benchmark.sh          |  22 ++
 databricks/create.sql            | 109 ++++++++++
 databricks/queries.sql           |  43 ++++
 databricks/query.py              |  88 ++++++++
 databricks/results/2x-large.json |  56 +++++
 databricks/results/2x-small.json |  56 +++++
 databricks/results/4x-large.json |  56 +++++
 databricks/results/large.json    |  56 +++++
 databricks/results/medium.json   |  56 +++++
 databricks/results/small.json    |  56 +++++
 databricks/results/x-large.json  |  56 +++++
 databricks/results/x-small.json  |  56 +++++
 databricks/run.sh                |  15 ++
 18 files changed, 1159 insertions(+), 1 deletion(-)
 create mode 100644 databricks/.env.example
 create mode 100644 databricks/NOTES.md
 create mode 100644 databricks/README.md
 create mode 100755 databricks/benchmark.py
 create mode 100755 databricks/benchmark.sh
 create mode 100644 databricks/create.sql
 create mode 100644 databricks/queries.sql
 create mode 100755 databricks/query.py
 create mode 100644 databricks/results/2x-large.json
 create mode 100644 databricks/results/2x-small.json
 create mode 100644 databricks/results/4x-large.json
 create mode 100644 databricks/results/large.json
 create mode 100644 databricks/results/medium.json
 create mode 100644 databricks/results/small.json
 create mode 100644 databricks/results/x-large.json
 create mode 100644 databricks/results/x-small.json
 create mode 100755 databricks/run.sh

diff --git a/README.md b/README.md
index 5661ebcf8..6c93c0464 100644
--- a/README.md
+++ b/README.md
@@ -230,7 +230,6 @@ Please help us add more systems and run the benchmarks on more types of VMs:
 - [ ] Azure Synapse
 - [ ] Boilingdata
 - [ ] CockroachDB Serverless
-- [ ] Databricks
 - [ ] DolphinDB
 - [ ] Dremio (without publishing)
 - [ ] DuckDB operating like "Athena" on remote Parquet files
diff --git a/databricks/.env.example b/databricks/.env.example
new file mode 100644
index 000000000..a0b353aab
--- /dev/null
+++ b/databricks/.env.example
@@ -0,0 +1,22 @@
+# Databricks Configuration
+# Copy this file to .env and fill in your actual values
+
+# Your Databricks workspace hostname (e.g., dbc-xxxxxxxx-xxxx.cloud.databricks.com)
+DATABRICKS_SERVER_HOSTNAME=your-workspace-hostname.cloud.databricks.com
+
+# SQL Warehouse HTTP path (found in your SQL Warehouse settings)
+# Point this at the warehouse size you want to benchmark
+DATABRICKS_HTTP_PATH=/sql/1.0/warehouses/your-warehouse-id
+
+# Instance type name for results file naming & results machine type label
+databricks_instance_type=Large
+
+# Your Databricks personal access token
+DATABRICKS_TOKEN=your-databricks-token
+
+# Unity Catalog and Schema names
+DATABRICKS_CATALOG=clickbench_catalog
+DATABRICKS_SCHEMA=clickbench_schema
+
+# Parquet data location
+DATABRICKS_PARQUET_LOCATION=s3://some/path/hits.parquet
diff --git a/databricks/NOTES.md b/databricks/NOTES.md
new file mode 100644
index 000000000..d29fb021e
--- /dev/null
+++ b/databricks/NOTES.md
@@ -0,0 +1,4 @@
+I created each warehouse in the Databricks UI.
+Besides the warehouse size, the only change I made to the default settings was lowering the auto-stop time to 5 minutes to save money (the 4X-Large warehouse is very expensive).
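+
+For reference, warehouse creation can also be scripted instead of clicking through the UI. Below is a minimal, untested sketch against the SQL Warehouses REST API (`/api/2.0/sql/warehouses`) that mirrors the settings I used; the warehouse name is made up, and the field names reflect my reading of the public API docs:
+
+```python
+import os
+import requests
+
+host = os.environ["DATABRICKS_SERVER_HOSTNAME"]
+token = os.environ["DATABRICKS_TOKEN"]
+
+# Create a warehouse with default settings except size and auto-stop
+resp = requests.post(
+    f"https://{host}/api/2.0/sql/warehouses",
+    headers={"Authorization": f"Bearer {token}"},
+    json={
+        "name": "clickbench",       # hypothetical name
+        "cluster_size": "4X-Large", # 2X-Small, X-Small, Small, ..., 4X-Large
+        "auto_stop_mins": 5,        # stop after 5 idle minutes to save money
+    },
+    timeout=30,
+)
+resp.raise_for_status()
+
+# The returned id is what goes into DATABRICKS_HTTP_PATH in .env
+print("/sql/1.0/warehouses/" + resp.json()["id"])
+```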
+
+Once the warehouse was created, I'd save its HTTP path in the .env file for each run.
\ No newline at end of file
diff --git a/databricks/README.md b/databricks/README.md
new file mode 100644
index 000000000..4b4bc47b3
--- /dev/null
+++ b/databricks/README.md
@@ -0,0 +1,47 @@
+## Setup
+
+1. Create a Databricks workspace and SQL Warehouse
+2. Generate a personal access token from your Databricks workspace
+3. Copy `.env.example` to `.env` and fill in your values:
+
+```bash
+cp .env.example .env
+# Edit .env with your actual credentials
+```
+
+Required environment variables:
+- `DATABRICKS_SERVER_HOSTNAME`: Your workspace hostname (e.g., `dbc-xxxxxxxx-xxxx.cloud.databricks.com`)
+- `DATABRICKS_HTTP_PATH`: SQL Warehouse path (e.g., `/sql/1.0/warehouses/your-warehouse-id`)
+- `DATABRICKS_TOKEN`: Your personal access token
+- `databricks_instance_type`: Instance type name for results file naming, e.g., "2X-Large"
+- `DATABRICKS_CATALOG`: Unity Catalog name
+- `DATABRICKS_SCHEMA`: Schema name
+- `DATABRICKS_PARQUET_LOCATION`: S3 path to the Parquet file
+
+## Running the Benchmark
+
+```bash
+./benchmark.sh
+```
+
+## How It Works
+
+1. **benchmark.sh**: Entry point that installs dependencies via `uv` and runs the benchmark
+2. **benchmark.py**: Orchestrates the full benchmark:
+   - Creates the catalog and schema
+   - Creates the `hits` table with an explicit schema (including TIMESTAMP conversion)
+   - Loads data from the Parquet file using `INSERT INTO` with type conversions
+   - Runs all queries via `run.sh`
+   - Collects timing metrics from the Databricks REST API
+   - Outputs results to JSON in the `results/` directory
+3. **run.sh**: Iterates through queries.sql and executes each query
+4. **query.py**: Executes individual queries and retrieves execution times from the Databricks REST API (`/api/2.0/sql/history/queries/{query_id}`)
+5. **queries.sql**: Contains the 43 benchmark queries
+
+## Notes
+
+- Query execution times are pulled from the Databricks REST API, which provides server-side metrics (see the sketch below)
+- The data is loaded from a Parquet file with explicit type conversions (Unix timestamps → TIMESTAMP, Unix dates → DATE)
+- The benchmark uses the Databricks SQL Connector for Python
+- Results include load time, data size, and individual query execution times (3 runs per query)
+- Results are saved to `results/{instance_type}.json`
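+
+For illustration, the timing lookup is roughly the following (a condensed sketch of what `query.py` does; the API reports `duration` in milliseconds):
+
+```python
+import os
+import requests
+
+def query_duration_seconds(query_id: str) -> float:
+    """Fetch the server-side duration of one query from the query history API."""
+    host = os.environ["DATABRICKS_SERVER_HOSTNAME"]
+    token = os.environ["DATABRICKS_TOKEN"]
+    resp = requests.get(
+        f"https://{host}/api/2.0/sql/history/queries/{query_id}",
+        headers={"Authorization": f"Bearer {token}"},
+        timeout=10,
+    )
+    resp.raise_for_status()
+    # 'duration' is in milliseconds; results are recorded in seconds
+    return round(resp.json()["duration"] / 1000.0, 3)
+```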
diff --git a/databricks/benchmark.py b/databricks/benchmark.py
new file mode 100755
index 000000000..497fee222
--- /dev/null
+++ b/databricks/benchmark.py
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+
+from databricks import sql
+import json
+import os
+import sys
+import subprocess
+import time
+import requests
+
+def write_result_to_file(run_metadata, query_results):
+    # Ensure results directory exists
+    os.makedirs('results', exist_ok=True)
+
+    # Get instance type and convert to lowercase for filename
+    instance_type = os.getenv('databricks_instance_type')
+    if not instance_type:
+        raise Exception("Missing required environment variable: databricks_instance_type")
+    filename = os.path.join('results', instance_type.lower() + ".json")
+    with open(filename, 'w') as f:
+        print("{", file=f)
+        for key in run_metadata:
+            print(f'\t"{key}": {json.dumps(run_metadata[key])},', file=f)
+
+        print('\t"result": [', file=f)
+        num_lines = len(query_results)
+        for i in range(num_lines):
+            print(f"\t\t{query_results[i]}", end='', file=f)
+            print("," if i < num_lines - 1 else "", file=f)
+
+        print("\t]\n}", file=f)
+
+def load_data(run_metadata):
+    server_hostname = os.getenv('DATABRICKS_SERVER_HOSTNAME')
+    http_path = os.getenv('DATABRICKS_HTTP_PATH')
+    access_token = os.getenv('DATABRICKS_TOKEN')
+    catalog = os.getenv('DATABRICKS_CATALOG')
+    schema = os.getenv('DATABRICKS_SCHEMA')
+    parquet_location = os.getenv('DATABRICKS_PARQUET_LOCATION')
+
+    if not all([server_hostname, http_path, access_token, catalog, schema, parquet_location]):
+        raise Exception("Missing required environment variables: DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN, DATABRICKS_CATALOG, DATABRICKS_SCHEMA, DATABRICKS_PARQUET_LOCATION")
+
+    print(f'Connecting to Databricks; loading the data into {catalog}.{schema}', file=sys.stderr)
+
+    connection = sql.connect(
+        server_hostname=server_hostname,
+        http_path=http_path,
+        access_token=access_token
+    )
+
+    cursor = connection.cursor()
+
+    # Create catalog and schema if they don't exist
+    cursor.execute(f'CREATE CATALOG IF NOT EXISTS {catalog}')
+    cursor.execute(f'USE CATALOG {catalog}')
+    cursor.execute(f'CREATE SCHEMA IF NOT EXISTS {schema}')
+    cursor.execute(f'USE SCHEMA {schema}')
+
+    print(f'Creating table and loading data from {parquet_location}...', file=sys.stderr)
+
+    # Drop table if exists
+    cursor.execute(f'DROP TABLE IF EXISTS {catalog}.{schema}.hits')
+
+    # Create table with explicit schema (EventTime as TIMESTAMP)
+    create_query = f"""
+    CREATE TABLE {catalog}.{schema}.hits (
+        WatchID BIGINT NOT NULL,
+        JavaEnable SMALLINT NOT NULL,
+        Title STRING,
+        GoodEvent SMALLINT NOT NULL,
+        EventTime TIMESTAMP NOT NULL,
+        EventDate DATE NOT NULL,
+        CounterID INT NOT NULL,
+        ClientIP INT NOT NULL,
+        RegionID INT NOT NULL,
+        UserID BIGINT NOT NULL,
+        CounterClass SMALLINT NOT NULL,
+        OS SMALLINT NOT NULL,
+        UserAgent SMALLINT NOT NULL,
+        URL STRING,
+        Referer STRING,
+        IsRefresh SMALLINT NOT NULL,
+        RefererCategoryID SMALLINT NOT NULL,
+        RefererRegionID INT NOT NULL,
+
URLCategoryID SMALLINT NOT NULL, + URLRegionID INT NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 STRING, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor STRING NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel STRING, + Params STRING, + IPNetworkID INT NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase STRING, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INT NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset STRING, + CodeVersion INT NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL STRING, + HID INT NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor STRING NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INT NOT NULL, + WindowName INT NOT NULL, + OpenerName INT NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage STRING, + BrowserCountry STRING, + SocialNetwork STRING, + SocialAction STRING, + HTTPError SMALLINT NOT NULL, + SendTiming INT NOT NULL, + DNSTiming INT NOT NULL, + ConnectTiming INT NOT NULL, + ResponseStartTiming INT NOT NULL, + ResponseEndTiming INT NOT NULL, + FetchTiming INT NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage STRING, + ParamPrice BIGINT NOT NULL, + ParamOrderID STRING, + ParamCurrency STRING, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName STRING, + OpenstatCampaignID STRING, + OpenstatAdID STRING, + OpenstatSourceID STRING, + UTMSource STRING, + UTMMedium STRING, + UTMCampaign STRING, + UTMContent STRING, + UTMTerm STRING, + FromTag STRING, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INT NOT NULL + ) + """ + cursor.execute(create_query) + + # Insert data from parquet file with type conversions + load_query = f""" + INSERT INTO {catalog}.{schema}.hits + SELECT + WatchID, + JavaEnable, + Title, + GoodEvent, + CAST(FROM_UNIXTIME(EventTime) AS TIMESTAMP) AS EventTime, + DATE_FROM_UNIX_DATE(EventDate) AS EventDate, + CounterID, + ClientIP, + RegionID, + UserID, + CounterClass, + OS, + UserAgent, + URL, + Referer, + IsRefresh, + RefererCategoryID, + RefererRegionID, + URLCategoryID, + URLRegionID, + ResolutionWidth, + ResolutionHeight, + ResolutionDepth, + FlashMajor, + FlashMinor, + FlashMinor2, + NetMajor, + NetMinor, + UserAgentMajor, + UserAgentMinor, + CookieEnable, + JavascriptEnable, + IsMobile, + MobilePhone, + MobilePhoneModel, + Params, + IPNetworkID, + TraficSourceID, + SearchEngineID, + SearchPhrase, + AdvEngineID, + IsArtifical, + WindowClientWidth, + WindowClientHeight, + ClientTimeZone, + CAST(FROM_UNIXTIME(ClientEventTime) AS 
TIMESTAMP) AS ClientEventTime, + SilverlightVersion1, + SilverlightVersion2, + SilverlightVersion3, + SilverlightVersion4, + PageCharset, + CodeVersion, + IsLink, + IsDownload, + IsNotBounce, + FUniqID, + OriginalURL, + HID, + IsOldCounter, + IsEvent, + IsParameter, + DontCountHits, + WithHash, + HitColor, + CAST(FROM_UNIXTIME(LocalEventTime) AS TIMESTAMP) AS LocalEventTime, + Age, + Sex, + Income, + Interests, + Robotness, + RemoteIP, + WindowName, + OpenerName, + HistoryLength, + BrowserLanguage, + BrowserCountry, + SocialNetwork, + SocialAction, + HTTPError, + SendTiming, + DNSTiming, + ConnectTiming, + ResponseStartTiming, + ResponseEndTiming, + FetchTiming, + SocialSourceNetworkID, + SocialSourcePage, + ParamPrice, + ParamOrderID, + ParamCurrency, + ParamCurrencyID, + OpenstatServiceName, + OpenstatCampaignID, + OpenstatAdID, + OpenstatSourceID, + UTMSource, + UTMMedium, + UTMCampaign, + UTMContent, + UTMTerm, + FromTag, + HasGCLID, + RefererHash, + URLHash, + CLID + FROM parquet.`{parquet_location}` + """ + + cursor.execute(load_query) + load_query_id = cursor.query_id + + # Get load time from REST API + print(f"Getting load time for query {load_query_id}...", file=sys.stderr) + max_retries = 3 + + for retry in range(max_retries): + time.sleep(2) + + url = f"https://{server_hostname}/api/2.0/sql/history/queries/{load_query_id}" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" + } + + try: + response = requests.get(url, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if 'duration' in data: + load_duration = round(data['duration'] / 1000.0, 3) + run_metadata["load_time"] = load_duration + print(f"Table created successfully in {load_duration}s", file=sys.stderr) + break + except Exception as api_error: + print(f"API error on retry {retry + 1}: {api_error}", file=sys.stderr) + + # Get table size from DESCRIBE DETAIL + cursor.execute(f"DESCRIBE DETAIL {catalog}.{schema}.hits") + result = cursor.fetchone() + run_metadata["data_size"] = result[10] # sizeInBytes column + print(f"Table size: {run_metadata['data_size']} bytes", file=sys.stderr) + + print(f'Finished loading the data in {run_metadata["load_time"]}s; data size = {run_metadata["data_size"]} bytes', file=sys.stderr) + + cursor.close() + connection.close() + +def run_queries(): + # Run the benchmark script + result = subprocess.run( + ["./run.sh"], + stdout=subprocess.PIPE, + text=True, + timeout=3600, # 1 hour timeout + ) + + if result.returncode != 0: + raise Exception(f"Benchmark failed with return code {result.returncode}") + + return result.stdout + +if __name__ == "__main__": + instance_type = os.getenv('databricks_instance_type') + if not instance_type: + raise Exception("Missing required environment variable: databricks_instance_type") + + run_metadata = { + "system": "Databricks", + "date": time.strftime("%Y-%m-%d"), + "machine": f"Databricks: {instance_type}", + "cluster_size": "", + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + } + + load_data(run_metadata) + + query_output = run_queries() + + write_result_to_file(run_metadata, query_output.strip().split('\n')) diff --git a/databricks/benchmark.sh b/databricks/benchmark.sh new file mode 100755 index 000000000..6928ba071 --- /dev/null +++ b/databricks/benchmark.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Load environment variables +if [ -f .env ]; then + set -a + source .env + set +a +else + echo "Error: .env file not found. 
Please copy .env.example to .env and fill in your credentials." + exit 1 +fi + +# Create virtual environment if it doesn't exist +if [ ! -d ".venv" ]; then + uv venv +fi + +# Install dependencies +uv pip install databricks-sql-connector + +# Run benchmark +uv run python ./benchmark.py 2>&1 diff --git a/databricks/create.sql b/databricks/create.sql new file mode 100644 index 000000000..729e40370 --- /dev/null +++ b/databricks/create.sql @@ -0,0 +1,109 @@ +-- This is not used in the setup script, but is included here for reference. +-- The actual table is created in benchmark.py +CREATE OR REPLACE TABLE hits ( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title STRING, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate DATE NOT NULL, + CounterID INT NOT NULL, + ClientIP INT NOT NULL, + RegionID INT NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL STRING, + Referer STRING, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INT NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INT NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 STRING, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor STRING NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel STRING, + Params STRING, + IPNetworkID INT NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase STRING, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INT NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset STRING, + CodeVersion INT NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL STRING, + HID INT NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor STRING NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INT NOT NULL, + WindowName INT NOT NULL, + OpenerName INT NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage STRING, + BrowserCountry STRING, + SocialNetwork STRING, + SocialAction STRING, + HTTPError SMALLINT NOT NULL, + SendTiming INT NOT NULL, + DNSTiming INT NOT NULL, + ConnectTiming INT NOT NULL, + ResponseStartTiming INT NOT NULL, + ResponseEndTiming INT NOT NULL, + FetchTiming INT NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage STRING, + ParamPrice BIGINT NOT NULL, + ParamOrderID STRING, + ParamCurrency STRING, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName STRING, + OpenstatCampaignID STRING, + OpenstatAdID STRING, + OpenstatSourceID STRING, + UTMSource STRING, + UTMMedium STRING, + UTMCampaign STRING, + UTMContent STRING, 
+ UTMTerm STRING, + FromTag STRING, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INT NOT NULL +); \ No newline at end of file diff --git a/databricks/queries.sql b/databricks/queries.sql new file mode 100644 index 000000000..8fafcbcf9 --- /dev/null +++ b/databricks/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '$1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), 
SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; 
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/databricks/query.py b/databricks/query.py new file mode 100755 index 000000000..0ec552c36 --- /dev/null +++ b/databricks/query.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +from databricks import sql +import os +import sys +import time +import requests + +query = sys.stdin.read() +print(f"running {query}", file=sys.stderr) + +# Get connection parameters from environment variables +server_hostname = os.getenv('DATABRICKS_SERVER_HOSTNAME') +http_path = os.getenv('DATABRICKS_HTTP_PATH') +access_token = os.getenv('DATABRICKS_TOKEN') +catalog = os.getenv('DATABRICKS_CATALOG', 'main') +schema = os.getenv('DATABRICKS_SCHEMA', 'clickbench') + +if not all([server_hostname, http_path, access_token]): + print("Error: Missing required environment variables:", file=sys.stderr) + print(" DATABRICKS_SERVER_HOSTNAME", file=sys.stderr) + print(" DATABRICKS_HTTP_PATH", file=sys.stderr) + print(" DATABRICKS_TOKEN", file=sys.stderr) + sys.exit(1) + +connection = sql.connect( + server_hostname=server_hostname, + http_path=http_path, + access_token=access_token, + catalog=catalog, + schema=schema +) + +print('[', end='') + +for try_num in range(3): + if try_num > 0: + print(',', end='') + + try: + cursor = connection.cursor() + + # Execute the query + cursor.execute(query) + results = cursor.fetchall() + query_id = cursor.query_id + + # Get execution time from REST API + duration = None + max_retries = 3 + + for retry in range(max_retries): + # Wait a moment for query to complete and be available + time.sleep(1 if retry == 0 else 2) + + # Call the query history API + url = f"https://{server_hostname}/api/2.0/sql/history/queries/{query_id}" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" + } + + try: + response = requests.get(url, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if 'duration' in data: + # Duration is in milliseconds, convert to seconds + duration = 
round(data['duration'] / 1000.0, 3) + break + except Exception as api_error: + print(f"API error on retry {retry + 1}: {api_error}", file=sys.stderr) + + if duration is None: + # Fallback: if metrics aren't available after retries, use null + duration = 'null' + print(f"Could not retrieve metrics for query_id {query_id} after {max_retries} retries", file=sys.stderr) + + print(duration if isinstance(duration, str) else duration, end='') + + cursor.close() + except Exception as e: + print('null', end='') + print(f"query <{query.strip()}> errored out on attempt <{try_num+1}>: {e}", file=sys.stderr) + +print(']') + +connection.close() diff --git a/databricks/results/2x-large.json b/databricks/results/2x-large.json new file mode 100644 index 000000000..0c4709633 --- /dev/null +++ b/databricks/results/2x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: 2X-Large", + "cluster_size": 64, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 25.978, + "data_size": 10219802927, + "result": [ + [0.747, 0.108, 0.098], + [2.372, 0.107, 0.109], + [0.601, 0.095, 0.459], + [0.606, 0.11, 0.123], + [0.806, 0.115, 0.108], + [0.934, 0.108, 0.112], + [0.72, 0.107, 0.114], + [0.444, 0.099, 0.111], + [1.023, 0.103, 0.115], + [1.094, 0.108, 0.113], + [0.674, 0.097, 0.101], + [0.688, 0.103, 0.104], + [0.755, 0.104, 0.098], + [0.874, 0.104, 0.102], + [0.844, 0.096, 0.097], + [0.669, 0.101, 0.112], + [0.894, 0.096, 0.102], + [0.614, 0.095, 0.094], + [1.254, 0.121, 0.111], + [0.382, 0.092, 0.094], + [0.908, 0.095, 0.091], + [0.617, 0.113, 0.1], + [1.029, 0.115, 0.104], + [1.392, 0.1, 0.114], + [0.389, 0.108, 0.1], + [0.359, 0.1, 0.09], + [0.386, 0.1, 0.096], + [0.616, 0.102, 0.099], + [3.37, 0.103, 0.108], + [0.639, 0.15, 0.136], + [0.643, 0.095, 0.096], + [0.688, 0.104, 0.098], + [0.914, 0.094, 0.109], + [1.573, 0.107, 0.098], + [1.605, 0.108, 0.109], + [0.646, 0.108, 0.099], + [0.66, 0.112, 0.104], + [0.426, 0.093, 0.095], + [0.546, 0.108, 0.101], + [1.081, 0.102, 0.129], + [0.53, 0.103, 0.101], + [0.418, 0.131, 0.103], + [0.443, 0.105, 0.107] + ] +} diff --git a/databricks/results/2x-small.json b/databricks/results/2x-small.json new file mode 100644 index 000000000..9de0aa0a3 --- /dev/null +++ b/databricks/results/2x-small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: 2X-Small", + "cluster_size": 1, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 125.99, + "data_size": 10219802927, + "result": [ + [0.714, 0.128, 0.13], + [1.382, 0.129, 0.128], + [0.883, 0.129, 0.117], + [0.984, 0.12, 0.127], + [1.447, 0.119, 0.121], + [2.362, 0.117, 0.101], + [0.848, 0.113, 0.103], + [0.509, 0.104, 0.108], + [2.435, 0.108, 0.107], + [3.127, 0.118, 0.108], + [1.063, 0.104, 0.109], + [1.015, 0.12, 0.103], + [2.953, 0.121, 0.098], + [3.39, 0.125, 0.115], + [3.569, 0.101, 0.135], + [2.122, 0.12, 0.102], + [5.774, 0.098, 0.132], + [3.587, 0.104, 0.103], + [9.929, 0.147, 0.101], + [0.389, 0.104, 0.099], + [3.301, 0.102, 0.099], + [2.708, 0.104, 0.099], + [5.225, 0.113, 0.133], + [13.016, 0.105, 0.106], + [1.329, 0.103, 0.107], + [0.903, 0.099, 0.097], + [1.249, 0.13, 0.093], + [2.484, 0.108, 0.109], + [20.366, 0.119, 0.104], + [1.298, 0.154, 0.147], + [1.915, 0.096, 0.11], + [2.117, 0.095, 0.104], + [7.453, 0.119, 0.092], + [11.578, 0.095, 0.093], + [10.26, 0.113, 0.097], + [2.217, 0.101, 0.095], + [0.664, 0.11, 0.101], + [0.441, 
0.092, 0.106], + [0.683, 0.111, 0.106], + [1.11, 0.106, 0.114], + [0.539, 0.101, 0.095], + [0.434, 0.361, 0.096], + [0.483, 0.097, 0.105] + ] +} diff --git a/databricks/results/4x-large.json b/databricks/results/4x-large.json new file mode 100644 index 000000000..62224ed5c --- /dev/null +++ b/databricks/results/4x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: 4X-Large", + "cluster_size": 256, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 38.981, + "data_size": 10219802927, + "result": [ + [0.597, 0.158, 0.116], + [3.236, 0.113, 0.103], + [0.619, 0.105, 0.101], + [0.681, 0.132, 0.107], + [1.405, 0.108, 0.099], + [1.105, 0.103, 0.095], + [0.719, 0.101, 0.1], + [3.267, 0.104, 0.104], + [4.144, 0.101, 0.115], + [1.431, 0.108, 0.114], + [2.427, 0.101, 0.119], + [6.409, 0.11, 0.101], + [4.826, 0.096, 0.103], + [1.097, 0.102, 0.108], + [1.116, 0.103, 0.103], + [0.794, 0.109, 0.14], + [0.962, 0.095, 0.096], + [0.626, 0.096, 0.096], + [1.732, 0.125, 0.102], + [0.319, 0.092, 0.105], + [0.981, 0.096, 0.123], + [0.614, 0.103, 0.111], + [1.069, 0.1, 0.104], + [1.419, 0.105, 0.107], + [0.382, 0.091, 0.105], + [0.321, 0.104, 0.099], + [0.368, 0.108, 0.09], + [0.71, 0.099, 0.095], + [3.437, 0.109, 0.112], + [0.685, 0.157, 0.166], + [0.743, 0.099, 0.159], + [1.045, 0.094, 0.097], + [1.117, 0.095, 0.102], + [1.796, 0.093, 0.099], + [1.855, 0.097, 0.112], + [0.746, 0.104, 0.097], + [0.67, 0.104, 0.099], + [0.432, 0.094, 0.094], + [0.596, 0.095, 0.125], + [1.126, 0.105, 0.104], + [0.522, 0.103, 0.103], + [0.453, 0.126, 0.107], + [0.454, 0.106, 0.106] + ] +} diff --git a/databricks/results/large.json b/databricks/results/large.json new file mode 100644 index 000000000..afc2cabdc --- /dev/null +++ b/databricks/results/large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: Large", + "cluster_size": 16, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 20.207, + "data_size": 10219802927, + "result": [ + [0.606, 0.113, 0.122], + [2.986, 0.111, 0.108], + [0.592, 0.108, 0.105], + [0.704, 0.103, 0.103], + [0.814, 0.105, 0.108], + [0.897, 0.1, 0.115], + [0.776, 0.1, 0.104], + [0.437, 0.105, 0.105], + [1.137, 0.104, 0.1], + [1.104, 0.111, 0.115], + [0.796, 0.105, 0.105], + [0.702, 0.113, 0.104], + [0.669, 0.099, 0.104], + [0.935, 0.102, 0.101], + [0.816, 0.106, 0.11], + [0.602, 0.103, 0.092], + [0.86, 0.11, 0.107], + [0.875, 0.1, 0.095], + [1.851, 0.116, 0.107], + [0.328, 0.096, 0.093], + [1.007, 0.11, 0.11], + [0.615, 0.108, 0.1], + [1.385, 0.109, 0.124], + [1.913, 0.108, 0.102], + [0.403, 0.092, 0.098], + [0.336, 0.09, 0.097], + [0.392, 0.271, 0.107], + [0.724, 0.103, 0.105], + [3.35, 0.1, 0.103], + [0.621, 0.147, 0.137], + [0.695, 0.094, 0.09], + [0.744, 0.092, 0.093], + [1.463, 0.106, 0.095], + [1.776, 0.107, 0.098], + [1.821, 0.112, 0.115], + [0.597, 0.106, 0.102], + [0.843, 0.118, 0.114], + [0.47, 0.095, 0.18], + [0.544, 0.109, 0.109], + [1.071, 0.119, 0.102], + [0.501, 0.114, 0.095], + [0.424, 0.12, 0.095], + [0.43, 0.101, 0.112] + ] +} diff --git a/databricks/results/medium.json b/databricks/results/medium.json new file mode 100644 index 000000000..f2fb57124 --- /dev/null +++ b/databricks/results/medium.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: Medium", + "cluster_size": 1, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", 
"column-oriented"], + "load_time": 58.818, + "data_size": 10219802927, + "result": [ + [0.655, 0.125, 0.12], + [2.588, 0.11, 0.114], + [0.872, 0.122, 0.109], + [0.895, 0.103, 0.117], + [0.829, 0.11, 0.108], + [1.106, 0.106, 0.108], + [0.934, 0.106, 0.101], + [0.607, 0.103, 0.11], + [1.303, 0.124, 0.109], + [1.28, 0.119, 0.143], + [0.836, 0.109, 0.103], + [0.818, 0.112, 0.103], + [0.796, 0.108, 0.105], + [0.941, 0.104, 0.111], + [1.004, 0.221, 0.105], + [0.72, 0.101, 0.107], + [1.32, 0.097, 0.106], + [0.872, 0.108, 0.1], + [2.514, 0.12, 0.097], + [0.374, 0.096, 0.096], + [1.302, 0.102, 0.095], + [0.784, 0.113, 0.106], + [1.676, 0.114, 0.103], + [2.723, 0.105, 0.117], + [0.499, 0.098, 0.095], + [0.407, 0.094, 0.108], + [0.482, 0.107, 0.102], + [0.792, 0.115, 0.111], + [3.748, 0.105, 0.113], + [0.695, 0.158, 0.15], + [0.702, 0.169, 0.12], + [0.818, 0.098, 0.11], + [2.375, 0.147, 0.099], + [2.13, 0.101, 0.099], + [2.07, 0.104, 0.108], + [0.729, 0.099, 0.101], + [0.699, 0.112, 0.112], + [0.488, 0.095, 0.097], + [0.62, 0.103, 0.106], + [1.167, 0.106, 0.109], + [0.525, 0.107, 0.109], + [0.454, 0.141, 0.119], + [0.477, 0.122, 0.101] + ] +} diff --git a/databricks/results/small.json b/databricks/results/small.json new file mode 100644 index 000000000..7a9721053 --- /dev/null +++ b/databricks/results/small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: Small", + "cluster_size": 4, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 57.949, + "data_size": 10219802927, + "result": [ + [0.61, 0.104, 0.114], + [1.338, 0.1, 0.108], + [0.627, 0.098, 0.112], + [0.752, 0.106, 0.097], + [0.9, 0.109, 0.095], + [1.183, 0.102, 0.099], + [0.782, 0.098, 0.098], + [0.427, 0.108, 0.116], + [1.279, 0.102, 0.106], + [1.459, 0.112, 0.113], + [0.799, 0.11, 0.108], + [0.795, 0.109, 0.097], + [1.054, 0.101, 0.102], + [1.199, 0.102, 0.108], + [1.138, 0.101, 0.1], + [0.841, 0.098, 0.097], + [3.005, 0.12, 0.096], + [1.282, 0.094, 0.09], + [4.636, 0.105, 0.131], + [0.385, 0.087, 0.099], + [1.598, 0.092, 0.096], + [1.028, 0.102, 0.105], + [1.797, 0.108, 0.101], + [4.426, 0.102, 0.105], + [0.746, 0.096, 0.114], + [0.452, 0.103, 0.095], + [0.576, 0.122, 0.103], + [0.984, 0.102, 0.107], + [6.114, 0.105, 0.112], + [0.751, 0.204, 0.13], + [0.827, 0.104, 0.093], + [1.169, 0.101, 0.091], + [4.099, 0.095, 0.092], + [3.929, 0.106, 0.09], + [3.93, 0.095, 0.099], + [0.971, 0.1, 0.096], + [0.692, 0.106, 0.097], + [0.457, 0.1, 0.099], + [0.571, 0.097, 0.107], + [1.181, 0.113, 0.099], + [0.498, 0.156, 0.11], + [0.467, 0.104, 0.1], + [0.436, 0.104, 0.115] + ] +} diff --git a/databricks/results/x-large.json b/databricks/results/x-large.json new file mode 100644 index 000000000..c2207e129 --- /dev/null +++ b/databricks/results/x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: X-Large", + "cluster_size": 32, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 30.201, + "data_size": 10219802927, + "result": [ + [0.625, 0.126, 0.119], + [1.28, 0.115, 0.11], + [0.765, 0.107, 0.111], + [0.678, 0.11, 0.128], + [0.771, 0.109, 0.106], + [0.879, 0.107, 0.114], + [0.714, 0.113, 0.105], + [2.3, 0.108, 0.111], + [3.976, 0.114, 0.106], + [1.284, 0.116, 0.114], + [0.716, 0.107, 0.103], + [0.738, 0.113, 0.106], + [0.778, 0.12, 0.108], + [0.818, 0.105, 0.116], + [0.839, 0.102, 0.124], + [2.784, 0.124, 0.099], + [1.041, 0.097, 0.099], + [2.169, 0.11, 0.094], 
+ [3.134, 0.122, 0.098], + [0.492, 0.093, 0.093], + [1.983, 0.102, 0.092], + [0.629, 0.107, 0.103], + [1.217, 0.1, 0.101], + [1.853, 0.11, 0.123], + [0.455, 0.105, 0.09], + [0.335, 0.099, 0.1], + [0.431, 0.094, 0.098], + [0.668, 0.096, 0.105], + [3.355, 0.109, 0.113], + [0.679, 0.157, 0.143], + [0.629, 0.097, 0.106], + [0.827, 0.11, 0.111], + [1.064, 0.102, 0.116], + [1.913, 0.106, 0.094], + [1.906, 0.111, 0.106], + [0.665, 0.108, 0.106], + [0.653, 0.1, 0.111], + [0.422, 0.102, 0.097], + [0.555, 0.113, 0.108], + [1.054, 0.103, 0.114], + [0.497, 0.105, 0.103], + [0.402, 0.139, 0.102], + [0.446, 0.108, 0.106] + ] +} diff --git a/databricks/results/x-small.json b/databricks/results/x-small.json new file mode 100644 index 000000000..65fe38aad --- /dev/null +++ b/databricks/results/x-small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: X-Small", + "cluster_size": 2, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 78.857, + "data_size": 10219802927, + "result": [ + [0.737, 0.228, 0.15], + [1.412, 0.131, 0.131], + [0.807, 0.123, 0.116], + [0.932, 0.115, 0.125], + [1.119, 0.116, 0.116], + [1.743, 0.158, 0.114], + [0.887, 0.152, 0.105], + [0.515, 0.102, 0.124], + [1.793, 0.104, 0.108], + [1.905, 0.132, 0.113], + [0.884, 0.104, 0.103], + [0.867, 0.125, 0.106], + [1.453, 0.117, 0.107], + [1.863, 0.108, 0.122], + [2.464, 0.126, 0.101], + [1.315, 0.103, 0.102], + [3.63, 0.102, 0.125], + [2.14, 0.114, 0.099], + [6.03, 0.11, 0.1], + [0.374, 0.106, 0.098], + [1.982, 0.105, 0.101], + [1.621, 0.121, 0.1], + [2.877, 0.107, 0.102], + [6.877, 0.112, 0.108], + [0.833, 0.099, 0.097], + [0.573, 0.097, 0.114], + [0.851, 0.105, 0.107], + [1.522, 0.11, 0.11], + [11.223, 0.126, 0.11], + [0.95, 0.142, 0.129], + [1.263, 0.102, 0.097], + [1.902, 0.103, 0.111], + [5.577, 0.113, 0.099], + [5.908, 0.103, 0.095], + [6.459, 0.122, 0.121], + [1.433, 0.111, 0.104], + [0.837, 0.098, 0.1], + [0.475, 0.102, 0.103], + [0.708, 0.109, 0.108], + [1.196, 0.109, 0.113], + [0.473, 0.114, 0.101], + [0.409, 0.128, 0.113], + [0.443, 0.11, 0.212] + ] +} diff --git a/databricks/run.sh b/databricks/run.sh new file mode 100755 index 000000000..b335f912b --- /dev/null +++ b/databricks/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Determine the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Use venv python if available, otherwise system python +if [ -f "$SCRIPT_DIR/.venv/bin/python3" ]; then + PYTHON="$SCRIPT_DIR/.venv/bin/python3" +else + PYTHON="python3" +fi + +cat queries.sql | while read -r query; do + $PYTHON ./query.py <<< "${query}" +done From 062b78938640ce374d160d59f69ce06867f5aabc Mon Sep 17 00:00:00 2001 From: Conor McCarter Date: Thu, 13 Nov 2025 12:50:09 -0500 Subject: [PATCH 2/3] Turn off query result cache --- databricks/query.py | 3 ++ databricks/results/2x-large.json | 90 +++++++++++++++---------------- databricks/results/2x-small.json | 90 +++++++++++++++---------------- databricks/results/4x-large.json | 90 +++++++++++++++---------------- databricks/results/large.json | 90 +++++++++++++++---------------- databricks/results/medium.json | 92 ++++++++++++++++---------------- databricks/results/small.json | 90 +++++++++++++++---------------- databricks/results/x-large.json | 90 +++++++++++++++---------------- databricks/results/x-small.json | 90 +++++++++++++++---------------- 9 files changed, 364 insertions(+), 361 deletions(-) diff --git a/databricks/query.py b/databricks/query.py index 
0ec552c36..09f25cc7f 100755 --- a/databricks/query.py +++ b/databricks/query.py @@ -40,6 +40,9 @@ try: cursor = connection.cursor() + # Disable query result caching for accurate benchmarking + cursor.execute("SET use_cached_result = false") + # Execute the query cursor.execute(query) results = cursor.fetchall() diff --git a/databricks/results/2x-large.json b/databricks/results/2x-large.json index 0c4709633..c6c68e508 100644 --- a/databricks/results/2x-large.json +++ b/databricks/results/2x-large.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: 2X-Large", "cluster_size": 64, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 25.978, + "load_time": 36.227, "data_size": 10219802927, "result": [ - [0.747, 0.108, 0.098], - [2.372, 0.107, 0.109], - [0.601, 0.095, 0.459], - [0.606, 0.11, 0.123], - [0.806, 0.115, 0.108], - [0.934, 0.108, 0.112], - [0.72, 0.107, 0.114], - [0.444, 0.099, 0.111], - [1.023, 0.103, 0.115], - [1.094, 0.108, 0.113], - [0.674, 0.097, 0.101], - [0.688, 0.103, 0.104], - [0.755, 0.104, 0.098], - [0.874, 0.104, 0.102], - [0.844, 0.096, 0.097], - [0.669, 0.101, 0.112], - [0.894, 0.096, 0.102], - [0.614, 0.095, 0.094], - [1.254, 0.121, 0.111], - [0.382, 0.092, 0.094], - [0.908, 0.095, 0.091], - [0.617, 0.113, 0.1], - [1.029, 0.115, 0.104], - [1.392, 0.1, 0.114], - [0.389, 0.108, 0.1], - [0.359, 0.1, 0.09], - [0.386, 0.1, 0.096], - [0.616, 0.102, 0.099], - [3.37, 0.103, 0.108], - [0.639, 0.15, 0.136], - [0.643, 0.095, 0.096], - [0.688, 0.104, 0.098], - [0.914, 0.094, 0.109], - [1.573, 0.107, 0.098], - [1.605, 0.108, 0.109], - [0.646, 0.108, 0.099], - [0.66, 0.112, 0.104], - [0.426, 0.093, 0.095], - [0.546, 0.108, 0.101], - [1.081, 0.102, 0.129], - [0.53, 0.103, 0.101], - [0.418, 0.131, 0.103], - [0.443, 0.105, 0.107] + [0.552, 0.314, 0.28], + [3.022, 0.463, 0.382], + [0.582, 0.37, 0.367], + [0.568, 0.36, 0.342], + [0.696, 0.589, 0.68], + [0.886, 0.706, 0.701], + [0.663, 0.468, 0.46], + [0.409, 0.344, 2.586], + [1.195, 0.896, 0.874], + [3.439, 1.187, 1.128], + [0.663, 0.505, 0.494], + [0.609, 0.498, 0.484], + [0.7, 0.632, 0.575], + [0.696, 0.652, 0.67], + [2.734, 0.784, 0.703], + [1.388, 2.152, 0.678], + [0.77, 3.236, 0.867], + [0.638, 2.923, 0.704], + [1.569, 3.399, 1.06], + [0.281, 3.047, 0.305], + [0.909, 0.507, 0.507], + [0.65, 0.544, 0.519], + [0.925, 0.668, 0.663], + [1.382, 0.93, 0.92], + [0.378, 0.318, 0.332], + [0.303, 0.299, 0.335], + [0.318, 0.314, 0.31], + [0.617, 0.589, 0.587], + [3.488, 3.276, 3.166], + [0.619, 2.071, 0.583], + [0.896, 0.524, 3.0], + [0.87, 0.691, 2.66], + [1.066, 1.825, 4.829], + [2.253, 2.929, 1.561], + [1.556, 1.491, 1.557], + [0.709, 0.559, 0.565], + [0.624, 0.607, 3.21], + [0.54, 0.456, 2.633], + [0.6, 0.423, 0.386], + [1.059, 0.848, 0.84], + [0.473, 0.342, 0.339], + [0.362, 0.355, 0.36], + [0.392, 0.341, 0.353] ] } diff --git a/databricks/results/2x-small.json b/databricks/results/2x-small.json index 9de0aa0a3..714e96ea6 100644 --- a/databricks/results/2x-small.json +++ b/databricks/results/2x-small.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-10", "machine": "Databricks: 2X-Small", "cluster_size": 1, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 125.99, + "load_time": 90.392, "data_size": 10219802927, "result": [ - [0.714, 0.128, 0.13], - [1.382, 0.129, 0.128], - [0.883, 0.129, 0.117], - [0.984, 0.12, 0.127], - [1.447, 0.119, 0.121], - 
[2.362, 0.117, 0.101], - [0.848, 0.113, 0.103], - [0.509, 0.104, 0.108], - [2.435, 0.108, 0.107], - [3.127, 0.118, 0.108], - [1.063, 0.104, 0.109], - [1.015, 0.12, 0.103], - [2.953, 0.121, 0.098], - [3.39, 0.125, 0.115], - [3.569, 0.101, 0.135], - [2.122, 0.12, 0.102], - [5.774, 0.098, 0.132], - [3.587, 0.104, 0.103], - [9.929, 0.147, 0.101], - [0.389, 0.104, 0.099], - [3.301, 0.102, 0.099], - [2.708, 0.104, 0.099], - [5.225, 0.113, 0.133], - [13.016, 0.105, 0.106], - [1.329, 0.103, 0.107], - [0.903, 0.099, 0.097], - [1.249, 0.13, 0.093], - [2.484, 0.108, 0.109], - [20.366, 0.119, 0.104], - [1.298, 0.154, 0.147], - [1.915, 0.096, 0.11], - [2.117, 0.095, 0.104], - [7.453, 0.119, 0.092], - [11.578, 0.095, 0.093], - [10.26, 0.113, 0.097], - [2.217, 0.101, 0.095], - [0.664, 0.11, 0.101], - [0.441, 0.092, 0.106], - [0.683, 0.111, 0.106], - [1.11, 0.106, 0.114], - [0.539, 0.101, 0.095], - [0.434, 0.361, 0.096], - [0.483, 0.097, 0.105] + [0.757, 0.404, 0.373], + [1.393, 0.478, 0.51], + [0.773, 0.43, 0.408], + [0.721, 0.438, 0.417], + [0.975, 0.918, 1.196], + [1.518, 1.277, 1.326], + [0.804, 0.667, 0.459], + [0.485, 0.421, 0.384], + [1.63, 1.433, 1.426], + [2.129, 1.77, 1.863], + [0.914, 0.694, 0.647], + [0.787, 0.701, 0.635], + [1.332, 1.231, 1.226], + [1.884, 1.836, 1.965], + [1.715, 1.311, 1.326], + [1.408, 1.203, 1.247], + [2.576, 2.475, 3.578], + [2.016, 2.042, 1.949], + [6.167, 4.53, 3.941], + [0.325, 0.277, 0.388], + [1.96, 1.419, 1.378], + [1.544, 1.504, 1.562], + [2.988, 2.461, 2.268], + [7.105, 3.983, 3.979], + [0.726, 0.769, 0.78], + [0.932, 0.621, 0.542], + [0.75, 0.678, 0.727], + [1.569, 1.497, 1.456], + [11.266, 11.2, 10.878], + [0.904, 0.834, 0.828], + [1.06, 1.091, 1.133], + [1.432, 1.224, 1.401], + [4.227, 3.462, 3.487], + [6.79, 7.119, 6.939], + [6.632, 6.893, 6.614], + [1.299, 1.274, 1.25], + [0.661, 0.558, 0.58], + [0.408, 0.412, 0.408], + [0.582, 0.405, 0.423], + [1.161, 0.928, 0.9], + [0.492, 0.39, 0.438], + [0.399, 0.367, 0.358], + [0.415, 0.416, 0.416] ] } diff --git a/databricks/results/4x-large.json b/databricks/results/4x-large.json index 62224ed5c..014b3d233 100644 --- a/databricks/results/4x-large.json +++ b/databricks/results/4x-large.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: 4X-Large", "cluster_size": 256, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 38.981, + "load_time": 29.748, "data_size": 10219802927, "result": [ - [0.597, 0.158, 0.116], - [3.236, 0.113, 0.103], - [0.619, 0.105, 0.101], - [0.681, 0.132, 0.107], - [1.405, 0.108, 0.099], - [1.105, 0.103, 0.095], - [0.719, 0.101, 0.1], - [3.267, 0.104, 0.104], - [4.144, 0.101, 0.115], - [1.431, 0.108, 0.114], - [2.427, 0.101, 0.119], - [6.409, 0.11, 0.101], - [4.826, 0.096, 0.103], - [1.097, 0.102, 0.108], - [1.116, 0.103, 0.103], - [0.794, 0.109, 0.14], - [0.962, 0.095, 0.096], - [0.626, 0.096, 0.096], - [1.732, 0.125, 0.102], - [0.319, 0.092, 0.105], - [0.981, 0.096, 0.123], - [0.614, 0.103, 0.111], - [1.069, 0.1, 0.104], - [1.419, 0.105, 0.107], - [0.382, 0.091, 0.105], - [0.321, 0.104, 0.099], - [0.368, 0.108, 0.09], - [0.71, 0.099, 0.095], - [3.437, 0.109, 0.112], - [0.685, 0.157, 0.166], - [0.743, 0.099, 0.159], - [1.045, 0.094, 0.097], - [1.117, 0.095, 0.102], - [1.796, 0.093, 0.099], - [1.855, 0.097, 0.112], - [0.746, 0.104, 0.097], - [0.67, 0.104, 0.099], - [0.432, 0.094, 0.094], - [0.596, 0.095, 0.125], - [1.126, 0.105, 0.104], - [0.522, 0.103, 0.103], - [0.453, 0.126, 0.107], - 
[0.454, 0.106, 0.106] + [0.635, 0.305, 0.275], + [3.117, 0.402, 0.401], + [0.679, 0.353, 0.336], + [0.559, 2.072, 0.481], + [4.6, 2.145, 0.664], + [1.137, 0.716, 0.727], + [3.011, 2.326, 2.373], + [0.531, 0.353, 0.375], + [1.614, 1.037, 0.853], + [1.31, 1.686, 1.062], + [4.223, 3.829, 2.826], + [0.688, 0.512, 0.678], + [1.626, 0.728, 0.715], + [0.722, 0.738, 0.757], + [0.901, 0.85, 0.885], + [0.612, 0.622, 0.632], + [0.879, 0.767, 0.791], + [0.545, 0.5, 0.505], + [1.378, 1.075, 1.084], + [0.258, 0.229, 0.225], + [0.952, 0.476, 0.494], + [0.525, 0.501, 0.501], + [0.997, 0.684, 0.677], + [1.355, 0.82, 0.798], + [0.328, 0.298, 0.308], + [0.288, 0.299, 0.268], + [0.34, 0.295, 0.315], + [0.584, 0.533, 0.539], + [3.315, 3.213, 3.167], + [0.606, 0.531, 0.469], + [0.708, 0.548, 0.575], + [0.765, 0.616, 0.663], + [0.906, 0.918, 0.901], + [1.51, 1.469, 1.486], + [1.508, 1.533, 1.496], + [0.683, 0.708, 0.627], + [0.57, 0.559, 0.576], + [0.392, 0.398, 0.396], + [0.556, 0.396, 0.393], + [1.016, 0.877, 0.849], + [0.475, 0.384, 0.387], + [0.486, 0.359, 0.376], + [0.41, 0.377, 0.345] ] } diff --git a/databricks/results/large.json b/databricks/results/large.json index afc2cabdc..14b121ae9 100644 --- a/databricks/results/large.json +++ b/databricks/results/large.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: Large", "cluster_size": 16, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 20.207, + "load_time": 45.462, "data_size": 10219802927, "result": [ - [0.606, 0.113, 0.122], - [2.986, 0.111, 0.108], - [0.592, 0.108, 0.105], - [0.704, 0.103, 0.103], - [0.814, 0.105, 0.108], - [0.897, 0.1, 0.115], - [0.776, 0.1, 0.104], - [0.437, 0.105, 0.105], - [1.137, 0.104, 0.1], - [1.104, 0.111, 0.115], - [0.796, 0.105, 0.105], - [0.702, 0.113, 0.104], - [0.669, 0.099, 0.104], - [0.935, 0.102, 0.101], - [0.816, 0.106, 0.11], - [0.602, 0.103, 0.092], - [0.86, 0.11, 0.107], - [0.875, 0.1, 0.095], - [1.851, 0.116, 0.107], - [0.328, 0.096, 0.093], - [1.007, 0.11, 0.11], - [0.615, 0.108, 0.1], - [1.385, 0.109, 0.124], - [1.913, 0.108, 0.102], - [0.403, 0.092, 0.098], - [0.336, 0.09, 0.097], - [0.392, 0.271, 0.107], - [0.724, 0.103, 0.105], - [3.35, 0.1, 0.103], - [0.621, 0.147, 0.137], - [0.695, 0.094, 0.09], - [0.744, 0.092, 0.093], - [1.463, 0.106, 0.095], - [1.776, 0.107, 0.098], - [1.821, 0.112, 0.115], - [0.597, 0.106, 0.102], - [0.843, 0.118, 0.114], - [0.47, 0.095, 0.18], - [0.544, 0.109, 0.109], - [1.071, 0.119, 0.102], - [0.501, 0.114, 0.095], - [0.424, 0.12, 0.095], - [0.43, 0.101, 0.112] + [0.534, 0.281, 0.266], + [2.414, 0.4, 0.399], + [0.656, 0.342, 0.39], + [0.745, 0.34, 0.333], + [0.731, 3.706, 0.815], + [1.408, 0.663, 0.679], + [0.666, 0.772, 0.534], + [0.583, 0.346, 0.325], + [1.063, 0.847, 0.825], + [1.203, 1.077, 1.054], + [0.653, 0.453, 0.492], + [0.635, 0.477, 0.47], + [0.585, 0.599, 0.604], + [0.7, 0.639, 0.668], + [0.766, 0.624, 0.611], + [0.58, 0.513, 0.579], + [0.955, 0.77, 0.763], + [0.597, 0.566, 0.567], + [1.731, 1.462, 1.221], + [0.297, 0.216, 0.243], + [1.036, 0.532, 0.543], + [0.605, 0.555, 0.587], + [1.265, 0.882, 0.826], + [2.062, 1.339, 1.312], + [0.425, 0.368, 0.348], + [0.313, 0.325, 0.296], + [0.349, 0.365, 0.338], + [0.591, 0.598, 0.611], + [3.375, 3.368, 3.253], + [0.567, 0.517, 0.488], + [0.684, 0.587, 0.488], + [0.82, 0.69, 2.796], + [2.041, 1.339, 1.212], + [1.95, 1.85, 2.056], + [1.787, 1.819, 1.882], + [0.599, 0.55, 0.546], + [0.591, 0.579, 0.565], + [0.36, 0.375, 
0.364], + [0.5, 0.37, 0.368], + [1.041, 0.849, 0.83], + [0.42, 0.342, 0.336], + [0.495, 0.331, 0.339], + [0.372, 0.329, 0.324] ] } diff --git a/databricks/results/medium.json b/databricks/results/medium.json index f2fb57124..593983b15 100644 --- a/databricks/results/medium.json +++ b/databricks/results/medium.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: Medium", - "cluster_size": 1, + "cluster_size": 8, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 58.818, + "load_time": 52.928, "data_size": 10219802927, "result": [ - [0.655, 0.125, 0.12], - [2.588, 0.11, 0.114], - [0.872, 0.122, 0.109], - [0.895, 0.103, 0.117], - [0.829, 0.11, 0.108], - [1.106, 0.106, 0.108], - [0.934, 0.106, 0.101], - [0.607, 0.103, 0.11], - [1.303, 0.124, 0.109], - [1.28, 0.119, 0.143], - [0.836, 0.109, 0.103], - [0.818, 0.112, 0.103], - [0.796, 0.108, 0.105], - [0.941, 0.104, 0.111], - [1.004, 0.221, 0.105], - [0.72, 0.101, 0.107], - [1.32, 0.097, 0.106], - [0.872, 0.108, 0.1], - [2.514, 0.12, 0.097], - [0.374, 0.096, 0.096], - [1.302, 0.102, 0.095], - [0.784, 0.113, 0.106], - [1.676, 0.114, 0.103], - [2.723, 0.105, 0.117], - [0.499, 0.098, 0.095], - [0.407, 0.094, 0.108], - [0.482, 0.107, 0.102], - [0.792, 0.115, 0.111], - [3.748, 0.105, 0.113], - [0.695, 0.158, 0.15], - [0.702, 0.169, 0.12], - [0.818, 0.098, 0.11], - [2.375, 0.147, 0.099], - [2.13, 0.101, 0.099], - [2.07, 0.104, 0.108], - [0.729, 0.099, 0.101], - [0.699, 0.112, 0.112], - [0.488, 0.095, 0.097], - [0.62, 0.103, 0.106], - [1.167, 0.106, 0.109], - [0.525, 0.107, 0.109], - [0.454, 0.141, 0.119], - [0.477, 0.122, 0.101] + [0.617, 0.278, 0.274], + [2.491, 0.452, 0.366], + [0.636, 0.43, 0.392], + [0.693, 0.35, 0.38], + [0.814, 0.954, 0.596], + [0.936, 0.747, 0.744], + [0.745, 0.457, 0.446], + [0.415, 0.358, 0.347], + [1.125, 0.898, 0.887], + [1.164, 1.093, 1.076], + [0.708, 0.464, 0.509], + [0.635, 0.515, 0.481], + [0.683, 0.659, 0.839], + [0.777, 0.917, 0.767], + [1.051, 0.727, 0.693], + [0.622, 0.617, 0.579], + [0.959, 1.206, 1.091], + [0.734, 0.688, 0.701], + [2.163, 1.896, 1.619], + [0.288, 0.275, 0.232], + [1.152, 0.576, 0.571], + [0.619, 0.652, 0.607], + [1.286, 0.877, 0.886], + [2.407, 1.429, 1.244], + [0.432, 0.586, 0.441], + [0.395, 0.353, 0.326], + [0.404, 0.38, 0.368], + [0.651, 0.646, 0.596], + [3.814, 3.827, 3.608], + [0.597, 0.559, 0.498], + [0.59, 0.523, 0.527], + [0.799, 0.609, 0.694], + [2.415, 1.809, 1.414], + [2.014, 2.302, 2.054], + [1.921, 1.973, 2.06], + [0.631, 0.604, 0.614], + [0.709, 0.563, 0.571], + [0.383, 0.378, 0.364], + [0.5, 0.392, 0.395], + [1.072, 0.84, 0.819], + [0.438, 0.347, 0.363], + [0.37, 0.45, 0.338], + [0.389, 0.333, 0.331] ] } diff --git a/databricks/results/small.json b/databricks/results/small.json index 7a9721053..36c13d900 100644 --- a/databricks/results/small.json +++ b/databricks/results/small.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: Small", "cluster_size": 4, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 57.949, + "load_time": 67.284, "data_size": 10219802927, "result": [ - [0.61, 0.104, 0.114], - [1.338, 0.1, 0.108], - [0.627, 0.098, 0.112], - [0.752, 0.106, 0.097], - [0.9, 0.109, 0.095], - [1.183, 0.102, 0.099], - [0.782, 0.098, 0.098], - [0.427, 0.108, 0.116], - [1.279, 0.102, 0.106], - [1.459, 0.112, 0.113], - [0.799, 0.11, 0.108], - [0.795, 0.109, 0.097], - 
[1.054, 0.101, 0.102], - [1.199, 0.102, 0.108], - [1.138, 0.101, 0.1], - [0.841, 0.098, 0.097], - [3.005, 0.12, 0.096], - [1.282, 0.094, 0.09], - [4.636, 0.105, 0.131], - [0.385, 0.087, 0.099], - [1.598, 0.092, 0.096], - [1.028, 0.102, 0.105], - [1.797, 0.108, 0.101], - [4.426, 0.102, 0.105], - [0.746, 0.096, 0.114], - [0.452, 0.103, 0.095], - [0.576, 0.122, 0.103], - [0.984, 0.102, 0.107], - [6.114, 0.105, 0.112], - [0.751, 0.204, 0.13], - [0.827, 0.104, 0.093], - [1.169, 0.101, 0.091], - [4.099, 0.095, 0.092], - [3.929, 0.106, 0.09], - [3.93, 0.095, 0.099], - [0.971, 0.1, 0.096], - [0.692, 0.106, 0.097], - [0.457, 0.1, 0.099], - [0.571, 0.097, 0.107], - [1.181, 0.113, 0.099], - [0.498, 0.156, 0.11], - [0.467, 0.104, 0.1], - [0.436, 0.104, 0.115] + [0.572, 0.302, 0.29], + [1.314, 0.379, 0.375], + [0.706, 0.519, 0.395], + [0.782, 0.421, 0.398], + [0.764, 0.724, 0.747], + [1.166, 0.925, 0.935], + [0.819, 0.464, 0.43], + [0.36, 0.331, 0.355], + [1.333, 1.048, 1.041], + [1.345, 1.356, 1.228], + [0.74, 0.506, 0.49], + [0.678, 0.501, 0.499], + [0.906, 0.85, 0.83], + [1.089, 1.048, 1.104], + [1.177, 0.917, 1.026], + [0.944, 0.861, 0.743], + [1.648, 1.871, 1.624], + [1.293, 1.176, 1.197], + [3.115, 2.425, 3.603], + [0.318, 0.247, 0.241], + [1.389, 0.789, 0.826], + [0.958, 0.88, 0.854], + [1.686, 1.259, 1.474], + [4.003, 2.139, 2.323], + [0.659, 0.681, 0.707], + [1.053, 0.458, 0.563], + [0.548, 0.896, 0.493], + [0.904, 0.888, 0.847], + [6.125, 5.512, 5.46], + [0.684, 0.798, 0.65], + [0.818, 0.72, 0.682], + [0.916, 0.965, 0.731], + [3.057, 2.656, 2.175], + [3.599, 3.405, 3.362], + [3.623, 3.872, 3.713], + [0.872, 0.891, 0.892], + [0.59, 0.639, 0.557], + [0.351, 0.358, 0.36], + [0.538, 0.376, 0.42], + [1.138, 0.856, 0.845], + [0.444, 0.376, 0.317], + [0.377, 0.338, 0.32], + [0.377, 0.359, 0.335] ] } diff --git a/databricks/results/x-large.json b/databricks/results/x-large.json index c2207e129..ff9cdebe3 100644 --- a/databricks/results/x-large.json +++ b/databricks/results/x-large.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: X-Large", "cluster_size": 32, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 30.201, + "load_time": 40.675, "data_size": 10219802927, "result": [ - [0.625, 0.126, 0.119], - [1.28, 0.115, 0.11], - [0.765, 0.107, 0.111], - [0.678, 0.11, 0.128], - [0.771, 0.109, 0.106], - [0.879, 0.107, 0.114], - [0.714, 0.113, 0.105], - [2.3, 0.108, 0.111], - [3.976, 0.114, 0.106], - [1.284, 0.116, 0.114], - [0.716, 0.107, 0.103], - [0.738, 0.113, 0.106], - [0.778, 0.12, 0.108], - [0.818, 0.105, 0.116], - [0.839, 0.102, 0.124], - [2.784, 0.124, 0.099], - [1.041, 0.097, 0.099], - [2.169, 0.11, 0.094], - [3.134, 0.122, 0.098], - [0.492, 0.093, 0.093], - [1.983, 0.102, 0.092], - [0.629, 0.107, 0.103], - [1.217, 0.1, 0.101], - [1.853, 0.11, 0.123], - [0.455, 0.105, 0.09], - [0.335, 0.099, 0.1], - [0.431, 0.094, 0.098], - [0.668, 0.096, 0.105], - [3.355, 0.109, 0.113], - [0.679, 0.157, 0.143], - [0.629, 0.097, 0.106], - [0.827, 0.11, 0.111], - [1.064, 0.102, 0.116], - [1.913, 0.106, 0.094], - [1.906, 0.111, 0.106], - [0.665, 0.108, 0.106], - [0.653, 0.1, 0.111], - [0.422, 0.102, 0.097], - [0.555, 0.113, 0.108], - [1.054, 0.103, 0.114], - [0.497, 0.105, 0.103], - [0.402, 0.139, 0.102], - [0.446, 0.108, 0.106] + [0.571, 0.315, 0.27], + [2.524, 0.445, 0.371], + [0.524, 0.361, 0.339], + [0.565, 0.323, 0.337], + [0.62, 0.587, 0.64], + [0.798, 0.608, 0.762], + [0.628, 0.437, 0.423], + 
[0.362, 0.326, 0.332], + [4.545, 0.993, 0.954], + [1.319, 1.239, 1.038], + [0.632, 0.444, 0.447], + [0.611, 0.485, 0.507], + [0.732, 0.621, 0.593], + [0.675, 0.657, 0.637], + [0.691, 0.661, 0.616], + [0.584, 0.531, 0.566], + [0.734, 0.71, 0.716], + [0.546, 0.544, 0.499], + [1.168, 0.927, 0.928], + [0.271, 0.208, 0.223], + [0.873, 0.493, 0.493], + [0.592, 0.51, 0.503], + [1.003, 0.71, 0.709], + [1.366, 0.939, 1.011], + [0.341, 0.322, 0.381], + [0.304, 0.292, 0.301], + [0.333, 0.331, 0.339], + [0.6, 0.52, 0.583], + [3.191, 3.114, 3.197], + [0.553, 0.498, 0.493], + [0.612, 0.547, 0.565], + [0.67, 0.527, 0.492], + [1.087, 1.368, 0.981], + [1.575, 1.544, 1.519], + [1.502, 1.66, 1.675], + [0.567, 0.542, 0.552], + [0.565, 0.57, 0.55], + [0.365, 0.366, 0.377], + [0.575, 0.382, 0.376], + [1.021, 0.817, 0.822], + [0.429, 0.339, 0.339], + [0.354, 0.335, 0.346], + [0.382, 0.326, 0.322] ] } diff --git a/databricks/results/x-small.json b/databricks/results/x-small.json index 65fe38aad..c3a680b2a 100644 --- a/databricks/results/x-small.json +++ b/databricks/results/x-small.json @@ -1,56 +1,56 @@ { "system": "Databricks", - "date": "2025-11-06", + "date": "2025-11-13", "machine": "Databricks: X-Small", "cluster_size": 2, "proprietary": "yes", "tuned": "no", "tags": ["managed", "column-oriented"], - "load_time": 78.857, + "load_time": 91.254, "data_size": 10219802927, "result": [ - [0.737, 0.228, 0.15], - [1.412, 0.131, 0.131], - [0.807, 0.123, 0.116], - [0.932, 0.115, 0.125], - [1.119, 0.116, 0.116], - [1.743, 0.158, 0.114], - [0.887, 0.152, 0.105], - [0.515, 0.102, 0.124], - [1.793, 0.104, 0.108], - [1.905, 0.132, 0.113], - [0.884, 0.104, 0.103], - [0.867, 0.125, 0.106], - [1.453, 0.117, 0.107], - [1.863, 0.108, 0.122], - [2.464, 0.126, 0.101], - [1.315, 0.103, 0.102], - [3.63, 0.102, 0.125], - [2.14, 0.114, 0.099], - [6.03, 0.11, 0.1], - [0.374, 0.106, 0.098], - [1.982, 0.105, 0.101], - [1.621, 0.121, 0.1], - [2.877, 0.107, 0.102], - [6.877, 0.112, 0.108], - [0.833, 0.099, 0.097], - [0.573, 0.097, 0.114], - [0.851, 0.105, 0.107], - [1.522, 0.11, 0.11], - [11.223, 0.126, 0.11], - [0.95, 0.142, 0.129], - [1.263, 0.102, 0.097], - [1.902, 0.103, 0.111], - [5.577, 0.113, 0.099], - [5.908, 0.103, 0.095], - [6.459, 0.122, 0.121], - [1.433, 0.111, 0.104], - [0.837, 0.098, 0.1], - [0.475, 0.102, 0.103], - [0.708, 0.109, 0.108], - [1.196, 0.109, 0.113], - [0.473, 0.114, 0.101], - [0.409, 0.128, 0.113], - [0.443, 0.11, 0.212] + [0.681, 0.306, 0.269], + [1.44, 0.393, 0.351], + [0.653, 0.492, 0.438], + [0.756, 0.418, 0.381], + [1.037, 0.889, 0.899], + [1.605, 1.297, 1.259], + [0.718, 0.459, 0.432], + [0.396, 0.371, 0.387], + [1.569, 1.378, 1.245], + [1.756, 1.946, 1.903], + [0.871, 0.591, 0.646], + [0.874, 0.62, 0.617], + [1.347, 1.466, 1.354], + [1.808, 1.643, 1.754], + [1.656, 1.571, 1.542], + [1.296, 1.291, 1.171], + [3.088, 2.775, 2.415], + [2.029, 2.052, 2.041], + [5.499, 4.354, 4.611], + [0.315, 0.271, 0.254], + [2.104, 1.22, 1.273], + [1.387, 1.392, 1.386], + [2.775, 2.336, 2.22], + [6.857, 3.872, 3.513], + [0.715, 0.866, 0.734], + [0.535, 0.676, 0.523], + [0.681, 0.755, 0.723], + [1.45, 1.337, 1.267], + [10.987, 11.04, 10.956], + [0.873, 0.811, 0.784], + [1.092, 1.178, 0.972], + [1.425, 1.512, 1.398], + [5.212, 4.141, 3.656], + [5.814, 5.669, 6.126], + [5.893, 5.416, 5.67], + [1.278, 1.239, 1.263], + [0.604, 0.539, 0.552], + [0.353, 0.371, 0.364], + [0.548, 0.387, 0.358], + [1.115, 0.894, 0.87], + [0.417, 0.362, 0.566], + [0.382, 0.347, 0.35], + [0.372, 0.343, 0.329] ] } From 
acdd41f2f30263a6f94efb24fc207522bcc7659f Mon Sep 17 00:00:00 2001
From: Conor McCarter
Date: Mon, 17 Nov 2025 16:41:04 -0500
Subject: [PATCH 3/3] Implement PR feedback

---
 databricks/.env.example |   5 +-
 databricks/NOTES.md     |   4 --
 databricks/README.md    |  11 +---
 databricks/benchmark.sh |  13 +++++
 databricks/create.sql   | 109 ----------------------------------------
 5 files changed, 17 insertions(+), 125 deletions(-)
 delete mode 100644 databricks/NOTES.md
 delete mode 100644 databricks/create.sql

diff --git a/databricks/.env.example b/databricks/.env.example
index a0b353aab..90ed60ba7 100644
--- a/databricks/.env.example
+++ b/databricks/.env.example
@@ -18,5 +18,6 @@ DATABRICKS_TOKEN=your-databricks-token
 DATABRICKS_CATALOG=clickbench_catalog
 DATABRICKS_SCHEMA=clickbench_schema
 
-# Parquet data location
-DATABRICKS_PARQUET_LOCATION=s3://some/path/hits.parquet
+# Parquet data location (must use the s3:// format)
+DATABRICKS_PARQUET_LOCATION=s3://clickhouse-public-datasets/hits_compatible/hits.parquet
+
diff --git a/databricks/NOTES.md b/databricks/NOTES.md
deleted file mode 100644
index d29fb021e..000000000
--- a/databricks/NOTES.md
+++ /dev/null
@@ -1,4 +0,0 @@
-I created each warehouse in the Databricks UI.
-Besides the warehouse size, the only other change I made to default settings was to set the sleep time to 5 minutes to save money (the 4x large warehouse is very expensive).
-
-Once the warehouse was created, I'd save the warehouse path to use in the .env file for each run.
\ No newline at end of file
diff --git a/databricks/README.md b/databricks/README.md
index 4b4bc47b3..5c51d21c7 100644
--- a/databricks/README.md
+++ b/databricks/README.md
@@ -1,6 +1,6 @@
 ## Setup
 
-1. Create a Databricks workspace and SQL Warehouse
+1. Create a Databricks workspace and SQL Warehouse (you can do this in the Databricks UI). Once the SQL Warehouse has been created, copy its HTTP path to use in the .env file.
 2. Generate a personal access token from your Databricks workspace
 3. Copy `.env.example` to `.env` and fill in your values:
 
@@ -9,15 +9,6 @@ cp .env.example .env
 # Edit .env with your actual credentials
 ```
 
-Required environment variables:
-- `DATABRICKS_SERVER_HOSTNAME`: Your workspace hostname (e.g., `dbc-xxxxxxxx-xxxx.cloud.databricks.com`)
-- `DATABRICKS_HTTP_PATH`: SQL Warehouse path (e.g., `/sql/1.0/warehouses/your-warehouse-id`)
-- `DATABRICKS_TOKEN`: Your personal access token
-- `databricks_instance_type`: Instance type name for results file naming, e.g., "2X-Large"
-- `DATABRICKS_CATALOG`: Unity Catalog name
-- `DATABRICKS_SCHEMA`: Schema name
-- `DATABRICKS_PARQUET_LOCATION`: S3 path to the parquet file
-
 ## Running the Benchmark
 
 ```bash
diff --git a/databricks/benchmark.sh b/databricks/benchmark.sh
index 6928ba071..67a7a63c2 100755
--- a/databricks/benchmark.sh
+++ b/databricks/benchmark.sh
@@ -1,5 +1,18 @@
 #!/bin/bash
 
+# Ensure uv is installed, using snap on Linux and the official installer on macOS
+if ! command -v uv >/dev/null 2>&1; then
+  if command -v snap >/dev/null 2>&1; then
+    sudo snap install --classic astral-uv
+  elif [[ "$OSTYPE" == "darwin"* ]]; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="$HOME/.local/bin:$PATH"
+  else
+    echo "Error: uv is not installed and snap is unavailable. Please install uv manually."
>&2 + exit 1 + fi +fi + # Load environment variables if [ -f .env ]; then set -a diff --git a/databricks/create.sql b/databricks/create.sql deleted file mode 100644 index 729e40370..000000000 --- a/databricks/create.sql +++ /dev/null @@ -1,109 +0,0 @@ --- This is not used in the setup script, but is included here for reference. --- The actual table is created in benchmark.py -CREATE OR REPLACE TABLE hits ( - WatchID BIGINT NOT NULL, - JavaEnable SMALLINT NOT NULL, - Title STRING, - GoodEvent SMALLINT NOT NULL, - EventTime TIMESTAMP NOT NULL, - EventDate DATE NOT NULL, - CounterID INT NOT NULL, - ClientIP INT NOT NULL, - RegionID INT NOT NULL, - UserID BIGINT NOT NULL, - CounterClass SMALLINT NOT NULL, - OS SMALLINT NOT NULL, - UserAgent SMALLINT NOT NULL, - URL STRING, - Referer STRING, - IsRefresh SMALLINT NOT NULL, - RefererCategoryID SMALLINT NOT NULL, - RefererRegionID INT NOT NULL, - URLCategoryID SMALLINT NOT NULL, - URLRegionID INT NOT NULL, - ResolutionWidth SMALLINT NOT NULL, - ResolutionHeight SMALLINT NOT NULL, - ResolutionDepth SMALLINT NOT NULL, - FlashMajor SMALLINT NOT NULL, - FlashMinor SMALLINT NOT NULL, - FlashMinor2 STRING, - NetMajor SMALLINT NOT NULL, - NetMinor SMALLINT NOT NULL, - UserAgentMajor SMALLINT NOT NULL, - UserAgentMinor STRING NOT NULL, - CookieEnable SMALLINT NOT NULL, - JavascriptEnable SMALLINT NOT NULL, - IsMobile SMALLINT NOT NULL, - MobilePhone SMALLINT NOT NULL, - MobilePhoneModel STRING, - Params STRING, - IPNetworkID INT NOT NULL, - TraficSourceID SMALLINT NOT NULL, - SearchEngineID SMALLINT NOT NULL, - SearchPhrase STRING, - AdvEngineID SMALLINT NOT NULL, - IsArtifical SMALLINT NOT NULL, - WindowClientWidth SMALLINT NOT NULL, - WindowClientHeight SMALLINT NOT NULL, - ClientTimeZone SMALLINT NOT NULL, - ClientEventTime TIMESTAMP NOT NULL, - SilverlightVersion1 SMALLINT NOT NULL, - SilverlightVersion2 SMALLINT NOT NULL, - SilverlightVersion3 INT NOT NULL, - SilverlightVersion4 SMALLINT NOT NULL, - PageCharset STRING, - CodeVersion INT NOT NULL, - IsLink SMALLINT NOT NULL, - IsDownload SMALLINT NOT NULL, - IsNotBounce SMALLINT NOT NULL, - FUniqID BIGINT NOT NULL, - OriginalURL STRING, - HID INT NOT NULL, - IsOldCounter SMALLINT NOT NULL, - IsEvent SMALLINT NOT NULL, - IsParameter SMALLINT NOT NULL, - DontCountHits SMALLINT NOT NULL, - WithHash SMALLINT NOT NULL, - HitColor STRING NOT NULL, - LocalEventTime TIMESTAMP NOT NULL, - Age SMALLINT NOT NULL, - Sex SMALLINT NOT NULL, - Income SMALLINT NOT NULL, - Interests SMALLINT NOT NULL, - Robotness SMALLINT NOT NULL, - RemoteIP INT NOT NULL, - WindowName INT NOT NULL, - OpenerName INT NOT NULL, - HistoryLength SMALLINT NOT NULL, - BrowserLanguage STRING, - BrowserCountry STRING, - SocialNetwork STRING, - SocialAction STRING, - HTTPError SMALLINT NOT NULL, - SendTiming INT NOT NULL, - DNSTiming INT NOT NULL, - ConnectTiming INT NOT NULL, - ResponseStartTiming INT NOT NULL, - ResponseEndTiming INT NOT NULL, - FetchTiming INT NOT NULL, - SocialSourceNetworkID SMALLINT NOT NULL, - SocialSourcePage STRING, - ParamPrice BIGINT NOT NULL, - ParamOrderID STRING, - ParamCurrency STRING, - ParamCurrencyID SMALLINT NOT NULL, - OpenstatServiceName STRING, - OpenstatCampaignID STRING, - OpenstatAdID STRING, - OpenstatSourceID STRING, - UTMSource STRING, - UTMMedium STRING, - UTMCampaign STRING, - UTMContent STRING, - UTMTerm STRING, - FromTag STRING, - HasGCLID SMALLINT NOT NULL, - RefererHash BIGINT NOT NULL, - URLHash BIGINT NOT NULL, - CLID INT NOT NULL -); \ No newline at end of file
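
Note for reviewers reproducing the run: since create.sql above is reference-only and the `hits` table is created and populated programmatically, a quick post-load sanity check can be useful. The sketch below (not part of this patch) compares the row count of the loaded table against the source parquet file, assuming the `hits` table name from create.sql, the S3 path set in `.env.example`, and the Databricks `read_files` table-valued function (available on SQL warehouses).

```sql
-- Illustrative sanity check, not part of this patch: the loaded Delta table
-- should contain exactly as many rows as the source parquet file.
-- Assumes the hits table created by benchmark.py and the parquet location
-- from .env.example.
SELECT
  (SELECT count(*) FROM hits) AS loaded_rows,
  (SELECT count(*)
   FROM read_files(
     's3://clickhouse-public-datasets/hits_compatible/hits.parquet',
     format => 'parquet')) AS source_rows;
```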