diff --git a/README.md b/README.md
index 5661ebcf8..6c93c0464 100644
--- a/README.md
+++ b/README.md
@@ -230,7 +230,6 @@ Please help us add more systems and run the benchmarks on more types of VMs:
 - [ ] Azure Synapse
 - [ ] Boilingdata
 - [ ] CockroachDB Serverless
-- [ ] Databricks
 - [ ] DolphinDB
 - [ ] Dremio (without publishing)
 - [ ] DuckDB operating like "Athena" on remote Parquet files
diff --git a/databricks/.env.example b/databricks/.env.example
new file mode 100644
index 000000000..90ed60ba7
--- /dev/null
+++ b/databricks/.env.example
@@ -0,0 +1,23 @@
+# Databricks Configuration
+# Copy this file to .env and fill in your actual values
+
+# Your Databricks workspace hostname (e.g., dbc-xxxxxxxx-xxxx.cloud.databricks.com)
+DATABRICKS_SERVER_HOSTNAME=your-workspace-hostname.cloud.databricks.com
+
+# SQL Warehouse HTTP path (found in your SQL Warehouse settings)
+# Use the path of the warehouse you want to benchmark
+DATABRICKS_HTTP_PATH=/sql/1.0/warehouses/your-warehouse-id
+
+# Instance type name, used for the results file name and the machine type label
+databricks_instance_type=Large
+
+# Your Databricks personal access token
+DATABRICKS_TOKEN=your-databricks-token
+
+# Unity Catalog and Schema names
+DATABRICKS_CATALOG=clickbench_catalog
+DATABRICKS_SCHEMA=clickbench_schema
+
+# Parquet data location (must use s3:// format)
+DATABRICKS_PARQUET_LOCATION=s3://clickhouse-public-datasets/hits_compatible/hits.parquet
+
diff --git a/databricks/README.md b/databricks/README.md
new file mode 100644
index 000000000..5c51d21c7
--- /dev/null
+++ b/databricks/README.md
@@ -0,0 +1,38 @@
+## Setup
+
+1. Create a Databricks workspace and SQL Warehouse (you can do this in the Databricks UI). Once the SQL Warehouse has been created, copy its HTTP path for use in the `.env` file.
+2. Generate a personal access token from your Databricks workspace.
+3. Copy `.env.example` to `.env` and fill in your values:
+
+```bash
+cp .env.example .env
+# Edit .env with your actual credentials
+```
+
+## Running the Benchmark
+
+```bash
+./benchmark.sh
+```
+
+## How It Works
+
+1. **benchmark.sh**: Entry point that installs dependencies via `uv` and runs the benchmark
+2. **benchmark.py**: Orchestrates the full benchmark:
+   - Creates the catalog and schema
+   - Creates the `hits` table with an explicit schema (including TIMESTAMP conversion)
+   - Loads data from the parquet file using `INSERT INTO` with type conversions
+   - Runs all queries via `run.sh`
+   - Collects timing metrics from the Databricks REST API
+   - Outputs results to JSON in the `results/` directory
+3. **run.sh**: Iterates through queries.sql and executes each query
+4. **query.py**: Executes individual queries and retrieves execution times from the Databricks REST API (`/api/2.0/sql/history/queries/{query_id}`)
+5. 
**queries.sql**: Contains the 43 benchmark queries + +## Notes + +- Query execution times are pulled from the Databricks REST API, which provides server-side metrics +- The data is loaded from a parquet file with explicit type conversions (Unix timestamps → TIMESTAMP, Unix dates → DATE) +- The benchmark uses Databricks SQL Connector for Python +- Results include load time, data size, and individual query execution times (3 runs per query) +- Results are saved to `results/{instance_type}.json` diff --git a/databricks/benchmark.py b/databricks/benchmark.py new file mode 100755 index 000000000..497fee222 --- /dev/null +++ b/databricks/benchmark.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 + +from databricks import sql +import json +import os +import sys +import subprocess +import time +import requests + +def write_result_to_file(run_metadata, query_results): + # Ensure results directory exists + os.makedirs('results', exist_ok=True) + + # Get instance type and convert to lowercase for filename + instance_type = os.getenv('databricks_instance_type') + if not instance_type: + raise Exception("Missing required environment variable: databricks_instance_type") + filename = os.path.join('results', instance_type.lower() + ".json") + with open(filename, 'w') as f: + print("{", file=f) + for key in run_metadata: + print(f'\t"{key}": {json.dumps(run_metadata[key])},', file=f) + + print('\t"result": [', file=f) + num_lines = len(query_results) + for i in range(num_lines): + print(f"\t\t{query_results[i]}", end='', file=f) + print("," if i < num_lines - 1 else "", file=f) + + print("\t]\n}", file=f) + +def load_data(run_metadata): + server_hostname = os.getenv('DATABRICKS_SERVER_HOSTNAME') + http_path = os.getenv('DATABRICKS_HTTP_PATH') + access_token = os.getenv('DATABRICKS_TOKEN') + catalog = os.getenv('DATABRICKS_CATALOG') + schema = os.getenv('DATABRICKS_SCHEMA') + parquet_location = os.getenv('DATABRICKS_PARQUET_LOCATION') + + if not all([server_hostname, http_path, access_token, catalog, schema, parquet_location]): + raise Exception("Missing required environment variables: DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN, DATABRICKS_CATALOG, DATABRICKS_SCHEMA, DATABRICKS_PARQUET_LOCATION") + + print(f'Connecting to Databricks; loading the data into {catalog}.{schema}', file=sys.stderr) + + connection = sql.connect( + server_hostname=server_hostname, + http_path=http_path, + access_token=access_token + ) + + cursor = connection.cursor() + + # Create catalog and schema if they don't exist + cursor.execute(f'CREATE CATALOG IF NOT EXISTS {catalog}') + cursor.execute(f'USE CATALOG {catalog}') + cursor.execute(f'CREATE SCHEMA IF NOT EXISTS {schema}') + cursor.execute(f'USE SCHEMA {schema}') + + print(f'Creating table and loading data from {parquet_location}...', file=sys.stderr) + + # Drop table if exists + cursor.execute(f'DROP TABLE IF EXISTS {catalog}.{schema}.hits') + + # Create table with explicit schema (EventTime as TIMESTAMP) + create_query = f""" + CREATE TABLE {catalog}.{schema}.hits ( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title STRING, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate DATE NOT NULL, + CounterID INT NOT NULL, + ClientIP INT NOT NULL, + RegionID INT NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL STRING, + Referer STRING, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INT NOT NULL, + 
URLCategoryID SMALLINT NOT NULL, + URLRegionID INT NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 STRING, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor STRING NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel STRING, + Params STRING, + IPNetworkID INT NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase STRING, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INT NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset STRING, + CodeVersion INT NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL STRING, + HID INT NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor STRING NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INT NOT NULL, + WindowName INT NOT NULL, + OpenerName INT NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage STRING, + BrowserCountry STRING, + SocialNetwork STRING, + SocialAction STRING, + HTTPError SMALLINT NOT NULL, + SendTiming INT NOT NULL, + DNSTiming INT NOT NULL, + ConnectTiming INT NOT NULL, + ResponseStartTiming INT NOT NULL, + ResponseEndTiming INT NOT NULL, + FetchTiming INT NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage STRING, + ParamPrice BIGINT NOT NULL, + ParamOrderID STRING, + ParamCurrency STRING, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName STRING, + OpenstatCampaignID STRING, + OpenstatAdID STRING, + OpenstatSourceID STRING, + UTMSource STRING, + UTMMedium STRING, + UTMCampaign STRING, + UTMContent STRING, + UTMTerm STRING, + FromTag STRING, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INT NOT NULL + ) + """ + cursor.execute(create_query) + + # Insert data from parquet file with type conversions + load_query = f""" + INSERT INTO {catalog}.{schema}.hits + SELECT + WatchID, + JavaEnable, + Title, + GoodEvent, + CAST(FROM_UNIXTIME(EventTime) AS TIMESTAMP) AS EventTime, + DATE_FROM_UNIX_DATE(EventDate) AS EventDate, + CounterID, + ClientIP, + RegionID, + UserID, + CounterClass, + OS, + UserAgent, + URL, + Referer, + IsRefresh, + RefererCategoryID, + RefererRegionID, + URLCategoryID, + URLRegionID, + ResolutionWidth, + ResolutionHeight, + ResolutionDepth, + FlashMajor, + FlashMinor, + FlashMinor2, + NetMajor, + NetMinor, + UserAgentMajor, + UserAgentMinor, + CookieEnable, + JavascriptEnable, + IsMobile, + MobilePhone, + MobilePhoneModel, + Params, + IPNetworkID, + TraficSourceID, + SearchEngineID, + SearchPhrase, + AdvEngineID, + IsArtifical, + WindowClientWidth, + WindowClientHeight, + ClientTimeZone, + CAST(FROM_UNIXTIME(ClientEventTime) AS 
TIMESTAMP) AS ClientEventTime, + SilverlightVersion1, + SilverlightVersion2, + SilverlightVersion3, + SilverlightVersion4, + PageCharset, + CodeVersion, + IsLink, + IsDownload, + IsNotBounce, + FUniqID, + OriginalURL, + HID, + IsOldCounter, + IsEvent, + IsParameter, + DontCountHits, + WithHash, + HitColor, + CAST(FROM_UNIXTIME(LocalEventTime) AS TIMESTAMP) AS LocalEventTime, + Age, + Sex, + Income, + Interests, + Robotness, + RemoteIP, + WindowName, + OpenerName, + HistoryLength, + BrowserLanguage, + BrowserCountry, + SocialNetwork, + SocialAction, + HTTPError, + SendTiming, + DNSTiming, + ConnectTiming, + ResponseStartTiming, + ResponseEndTiming, + FetchTiming, + SocialSourceNetworkID, + SocialSourcePage, + ParamPrice, + ParamOrderID, + ParamCurrency, + ParamCurrencyID, + OpenstatServiceName, + OpenstatCampaignID, + OpenstatAdID, + OpenstatSourceID, + UTMSource, + UTMMedium, + UTMCampaign, + UTMContent, + UTMTerm, + FromTag, + HasGCLID, + RefererHash, + URLHash, + CLID + FROM parquet.`{parquet_location}` + """ + + cursor.execute(load_query) + load_query_id = cursor.query_id + + # Get load time from REST API + print(f"Getting load time for query {load_query_id}...", file=sys.stderr) + max_retries = 3 + + for retry in range(max_retries): + time.sleep(2) + + url = f"https://{server_hostname}/api/2.0/sql/history/queries/{load_query_id}" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" + } + + try: + response = requests.get(url, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if 'duration' in data: + load_duration = round(data['duration'] / 1000.0, 3) + run_metadata["load_time"] = load_duration + print(f"Table created successfully in {load_duration}s", file=sys.stderr) + break + except Exception as api_error: + print(f"API error on retry {retry + 1}: {api_error}", file=sys.stderr) + + # Get table size from DESCRIBE DETAIL + cursor.execute(f"DESCRIBE DETAIL {catalog}.{schema}.hits") + result = cursor.fetchone() + run_metadata["data_size"] = result[10] # sizeInBytes column + print(f"Table size: {run_metadata['data_size']} bytes", file=sys.stderr) + + print(f'Finished loading the data in {run_metadata["load_time"]}s; data size = {run_metadata["data_size"]} bytes', file=sys.stderr) + + cursor.close() + connection.close() + +def run_queries(): + # Run the benchmark script + result = subprocess.run( + ["./run.sh"], + stdout=subprocess.PIPE, + text=True, + timeout=3600, # 1 hour timeout + ) + + if result.returncode != 0: + raise Exception(f"Benchmark failed with return code {result.returncode}") + + return result.stdout + +if __name__ == "__main__": + instance_type = os.getenv('databricks_instance_type') + if not instance_type: + raise Exception("Missing required environment variable: databricks_instance_type") + + run_metadata = { + "system": "Databricks", + "date": time.strftime("%Y-%m-%d"), + "machine": f"Databricks: {instance_type}", + "cluster_size": "", + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + } + + load_data(run_metadata) + + query_output = run_queries() + + write_result_to_file(run_metadata, query_output.strip().split('\n')) diff --git a/databricks/benchmark.sh b/databricks/benchmark.sh new file mode 100755 index 000000000..67a7a63c2 --- /dev/null +++ b/databricks/benchmark.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Ensure uv is installed, using snap on Linux and the official installer on macOS +if ! 
command -v uv >/dev/null 2>&1; then + if command -v snap >/dev/null 2>&1; then + sudo snap install --classic astral-uv + elif [[ "$OSTYPE" == "darwin"* ]]; then + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$HOME/.local/bin:$PATH" + else + echo "Error: uv is not installed and snap is unavailable. Please install uv manually." >&2 + exit 1 + fi +fi + +# Load environment variables +if [ -f .env ]; then + set -a + source .env + set +a +else + echo "Error: .env file not found. Please copy .env.example to .env and fill in your credentials." + exit 1 +fi + +# Create virtual environment if it doesn't exist +if [ ! -d ".venv" ]; then + uv venv +fi + +# Install dependencies +uv pip install databricks-sql-connector + +# Run benchmark +uv run python ./benchmark.py 2>&1 diff --git a/databricks/queries.sql b/databricks/queries.sql new file mode 100644 index 000000000..8fafcbcf9 --- /dev/null +++ b/databricks/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits 
WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '$1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP 
- 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/databricks/query.py b/databricks/query.py new file mode 100755 index 000000000..09f25cc7f --- /dev/null +++ b/databricks/query.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +from databricks import sql +import os +import sys +import time +import requests + +query = sys.stdin.read() +print(f"running {query}", file=sys.stderr) + +# Get connection parameters from environment variables +server_hostname = os.getenv('DATABRICKS_SERVER_HOSTNAME') +http_path = os.getenv('DATABRICKS_HTTP_PATH') +access_token = os.getenv('DATABRICKS_TOKEN') +catalog = os.getenv('DATABRICKS_CATALOG', 'main') +schema = os.getenv('DATABRICKS_SCHEMA', 'clickbench') + +if not all([server_hostname, http_path, access_token]): + print("Error: Missing required environment variables:", file=sys.stderr) + print(" DATABRICKS_SERVER_HOSTNAME", file=sys.stderr) + print(" DATABRICKS_HTTP_PATH", file=sys.stderr) + print(" DATABRICKS_TOKEN", file=sys.stderr) + sys.exit(1) + +connection = sql.connect( + server_hostname=server_hostname, + http_path=http_path, + access_token=access_token, + catalog=catalog, + schema=schema +) + +print('[', end='') + +for try_num in range(3): + if try_num > 0: + print(',', end='') + + try: + cursor = connection.cursor() + + # Disable query result caching for accurate benchmarking + cursor.execute("SET use_cached_result = false") + + # Execute the query + 
cursor.execute(query) + results = cursor.fetchall() + query_id = cursor.query_id + + # Get execution time from REST API + duration = None + max_retries = 3 + + for retry in range(max_retries): + # Wait a moment for query to complete and be available + time.sleep(1 if retry == 0 else 2) + + # Call the query history API + url = f"https://{server_hostname}/api/2.0/sql/history/queries/{query_id}" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" + } + + try: + response = requests.get(url, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if 'duration' in data: + # Duration is in milliseconds, convert to seconds + duration = round(data['duration'] / 1000.0, 3) + break + except Exception as api_error: + print(f"API error on retry {retry + 1}: {api_error}", file=sys.stderr) + + if duration is None: + # Fallback: if metrics aren't available after retries, use null + duration = 'null' + print(f"Could not retrieve metrics for query_id {query_id} after {max_retries} retries", file=sys.stderr) + + print(duration if isinstance(duration, str) else duration, end='') + + cursor.close() + except Exception as e: + print('null', end='') + print(f"query <{query.strip()}> errored out on attempt <{try_num+1}>: {e}", file=sys.stderr) + +print(']') + +connection.close() diff --git a/databricks/results/2x-large.json b/databricks/results/2x-large.json new file mode 100644 index 000000000..c6c68e508 --- /dev/null +++ b/databricks/results/2x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: 2X-Large", + "cluster_size": 64, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 36.227, + "data_size": 10219802927, + "result": [ + [0.552, 0.314, 0.28], + [3.022, 0.463, 0.382], + [0.582, 0.37, 0.367], + [0.568, 0.36, 0.342], + [0.696, 0.589, 0.68], + [0.886, 0.706, 0.701], + [0.663, 0.468, 0.46], + [0.409, 0.344, 2.586], + [1.195, 0.896, 0.874], + [3.439, 1.187, 1.128], + [0.663, 0.505, 0.494], + [0.609, 0.498, 0.484], + [0.7, 0.632, 0.575], + [0.696, 0.652, 0.67], + [2.734, 0.784, 0.703], + [1.388, 2.152, 0.678], + [0.77, 3.236, 0.867], + [0.638, 2.923, 0.704], + [1.569, 3.399, 1.06], + [0.281, 3.047, 0.305], + [0.909, 0.507, 0.507], + [0.65, 0.544, 0.519], + [0.925, 0.668, 0.663], + [1.382, 0.93, 0.92], + [0.378, 0.318, 0.332], + [0.303, 0.299, 0.335], + [0.318, 0.314, 0.31], + [0.617, 0.589, 0.587], + [3.488, 3.276, 3.166], + [0.619, 2.071, 0.583], + [0.896, 0.524, 3.0], + [0.87, 0.691, 2.66], + [1.066, 1.825, 4.829], + [2.253, 2.929, 1.561], + [1.556, 1.491, 1.557], + [0.709, 0.559, 0.565], + [0.624, 0.607, 3.21], + [0.54, 0.456, 2.633], + [0.6, 0.423, 0.386], + [1.059, 0.848, 0.84], + [0.473, 0.342, 0.339], + [0.362, 0.355, 0.36], + [0.392, 0.341, 0.353] + ] +} diff --git a/databricks/results/2x-small.json b/databricks/results/2x-small.json new file mode 100644 index 000000000..714e96ea6 --- /dev/null +++ b/databricks/results/2x-small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-10", + "machine": "Databricks: 2X-Small", + "cluster_size": 1, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 90.392, + "data_size": 10219802927, + "result": [ + [0.757, 0.404, 0.373], + [1.393, 0.478, 0.51], + [0.773, 0.43, 0.408], + [0.721, 0.438, 0.417], + [0.975, 0.918, 1.196], + [1.518, 1.277, 1.326], + [0.804, 0.667, 0.459], + [0.485, 0.421, 0.384], + [1.63, 1.433, 1.426], + 
[2.129, 1.77, 1.863], + [0.914, 0.694, 0.647], + [0.787, 0.701, 0.635], + [1.332, 1.231, 1.226], + [1.884, 1.836, 1.965], + [1.715, 1.311, 1.326], + [1.408, 1.203, 1.247], + [2.576, 2.475, 3.578], + [2.016, 2.042, 1.949], + [6.167, 4.53, 3.941], + [0.325, 0.277, 0.388], + [1.96, 1.419, 1.378], + [1.544, 1.504, 1.562], + [2.988, 2.461, 2.268], + [7.105, 3.983, 3.979], + [0.726, 0.769, 0.78], + [0.932, 0.621, 0.542], + [0.75, 0.678, 0.727], + [1.569, 1.497, 1.456], + [11.266, 11.2, 10.878], + [0.904, 0.834, 0.828], + [1.06, 1.091, 1.133], + [1.432, 1.224, 1.401], + [4.227, 3.462, 3.487], + [6.79, 7.119, 6.939], + [6.632, 6.893, 6.614], + [1.299, 1.274, 1.25], + [0.661, 0.558, 0.58], + [0.408, 0.412, 0.408], + [0.582, 0.405, 0.423], + [1.161, 0.928, 0.9], + [0.492, 0.39, 0.438], + [0.399, 0.367, 0.358], + [0.415, 0.416, 0.416] + ] +} diff --git a/databricks/results/4x-large.json b/databricks/results/4x-large.json new file mode 100644 index 000000000..014b3d233 --- /dev/null +++ b/databricks/results/4x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: 4X-Large", + "cluster_size": 256, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 29.748, + "data_size": 10219802927, + "result": [ + [0.635, 0.305, 0.275], + [3.117, 0.402, 0.401], + [0.679, 0.353, 0.336], + [0.559, 2.072, 0.481], + [4.6, 2.145, 0.664], + [1.137, 0.716, 0.727], + [3.011, 2.326, 2.373], + [0.531, 0.353, 0.375], + [1.614, 1.037, 0.853], + [1.31, 1.686, 1.062], + [4.223, 3.829, 2.826], + [0.688, 0.512, 0.678], + [1.626, 0.728, 0.715], + [0.722, 0.738, 0.757], + [0.901, 0.85, 0.885], + [0.612, 0.622, 0.632], + [0.879, 0.767, 0.791], + [0.545, 0.5, 0.505], + [1.378, 1.075, 1.084], + [0.258, 0.229, 0.225], + [0.952, 0.476, 0.494], + [0.525, 0.501, 0.501], + [0.997, 0.684, 0.677], + [1.355, 0.82, 0.798], + [0.328, 0.298, 0.308], + [0.288, 0.299, 0.268], + [0.34, 0.295, 0.315], + [0.584, 0.533, 0.539], + [3.315, 3.213, 3.167], + [0.606, 0.531, 0.469], + [0.708, 0.548, 0.575], + [0.765, 0.616, 0.663], + [0.906, 0.918, 0.901], + [1.51, 1.469, 1.486], + [1.508, 1.533, 1.496], + [0.683, 0.708, 0.627], + [0.57, 0.559, 0.576], + [0.392, 0.398, 0.396], + [0.556, 0.396, 0.393], + [1.016, 0.877, 0.849], + [0.475, 0.384, 0.387], + [0.486, 0.359, 0.376], + [0.41, 0.377, 0.345] + ] +} diff --git a/databricks/results/large.json b/databricks/results/large.json new file mode 100644 index 000000000..14b121ae9 --- /dev/null +++ b/databricks/results/large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: Large", + "cluster_size": 16, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 45.462, + "data_size": 10219802927, + "result": [ + [0.534, 0.281, 0.266], + [2.414, 0.4, 0.399], + [0.656, 0.342, 0.39], + [0.745, 0.34, 0.333], + [0.731, 3.706, 0.815], + [1.408, 0.663, 0.679], + [0.666, 0.772, 0.534], + [0.583, 0.346, 0.325], + [1.063, 0.847, 0.825], + [1.203, 1.077, 1.054], + [0.653, 0.453, 0.492], + [0.635, 0.477, 0.47], + [0.585, 0.599, 0.604], + [0.7, 0.639, 0.668], + [0.766, 0.624, 0.611], + [0.58, 0.513, 0.579], + [0.955, 0.77, 0.763], + [0.597, 0.566, 0.567], + [1.731, 1.462, 1.221], + [0.297, 0.216, 0.243], + [1.036, 0.532, 0.543], + [0.605, 0.555, 0.587], + [1.265, 0.882, 0.826], + [2.062, 1.339, 1.312], + [0.425, 0.368, 0.348], + [0.313, 0.325, 0.296], + [0.349, 0.365, 0.338], + [0.591, 0.598, 0.611], + [3.375, 3.368, 3.253], + [0.567, 0.517, 
0.488], + [0.684, 0.587, 0.488], + [0.82, 0.69, 2.796], + [2.041, 1.339, 1.212], + [1.95, 1.85, 2.056], + [1.787, 1.819, 1.882], + [0.599, 0.55, 0.546], + [0.591, 0.579, 0.565], + [0.36, 0.375, 0.364], + [0.5, 0.37, 0.368], + [1.041, 0.849, 0.83], + [0.42, 0.342, 0.336], + [0.495, 0.331, 0.339], + [0.372, 0.329, 0.324] + ] +} diff --git a/databricks/results/medium.json b/databricks/results/medium.json new file mode 100644 index 000000000..593983b15 --- /dev/null +++ b/databricks/results/medium.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: Medium", + "cluster_size": 8, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 52.928, + "data_size": 10219802927, + "result": [ + [0.617, 0.278, 0.274], + [2.491, 0.452, 0.366], + [0.636, 0.43, 0.392], + [0.693, 0.35, 0.38], + [0.814, 0.954, 0.596], + [0.936, 0.747, 0.744], + [0.745, 0.457, 0.446], + [0.415, 0.358, 0.347], + [1.125, 0.898, 0.887], + [1.164, 1.093, 1.076], + [0.708, 0.464, 0.509], + [0.635, 0.515, 0.481], + [0.683, 0.659, 0.839], + [0.777, 0.917, 0.767], + [1.051, 0.727, 0.693], + [0.622, 0.617, 0.579], + [0.959, 1.206, 1.091], + [0.734, 0.688, 0.701], + [2.163, 1.896, 1.619], + [0.288, 0.275, 0.232], + [1.152, 0.576, 0.571], + [0.619, 0.652, 0.607], + [1.286, 0.877, 0.886], + [2.407, 1.429, 1.244], + [0.432, 0.586, 0.441], + [0.395, 0.353, 0.326], + [0.404, 0.38, 0.368], + [0.651, 0.646, 0.596], + [3.814, 3.827, 3.608], + [0.597, 0.559, 0.498], + [0.59, 0.523, 0.527], + [0.799, 0.609, 0.694], + [2.415, 1.809, 1.414], + [2.014, 2.302, 2.054], + [1.921, 1.973, 2.06], + [0.631, 0.604, 0.614], + [0.709, 0.563, 0.571], + [0.383, 0.378, 0.364], + [0.5, 0.392, 0.395], + [1.072, 0.84, 0.819], + [0.438, 0.347, 0.363], + [0.37, 0.45, 0.338], + [0.389, 0.333, 0.331] + ] +} diff --git a/databricks/results/small.json b/databricks/results/small.json new file mode 100644 index 000000000..36c13d900 --- /dev/null +++ b/databricks/results/small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: Small", + "cluster_size": 4, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 67.284, + "data_size": 10219802927, + "result": [ + [0.572, 0.302, 0.29], + [1.314, 0.379, 0.375], + [0.706, 0.519, 0.395], + [0.782, 0.421, 0.398], + [0.764, 0.724, 0.747], + [1.166, 0.925, 0.935], + [0.819, 0.464, 0.43], + [0.36, 0.331, 0.355], + [1.333, 1.048, 1.041], + [1.345, 1.356, 1.228], + [0.74, 0.506, 0.49], + [0.678, 0.501, 0.499], + [0.906, 0.85, 0.83], + [1.089, 1.048, 1.104], + [1.177, 0.917, 1.026], + [0.944, 0.861, 0.743], + [1.648, 1.871, 1.624], + [1.293, 1.176, 1.197], + [3.115, 2.425, 3.603], + [0.318, 0.247, 0.241], + [1.389, 0.789, 0.826], + [0.958, 0.88, 0.854], + [1.686, 1.259, 1.474], + [4.003, 2.139, 2.323], + [0.659, 0.681, 0.707], + [1.053, 0.458, 0.563], + [0.548, 0.896, 0.493], + [0.904, 0.888, 0.847], + [6.125, 5.512, 5.46], + [0.684, 0.798, 0.65], + [0.818, 0.72, 0.682], + [0.916, 0.965, 0.731], + [3.057, 2.656, 2.175], + [3.599, 3.405, 3.362], + [3.623, 3.872, 3.713], + [0.872, 0.891, 0.892], + [0.59, 0.639, 0.557], + [0.351, 0.358, 0.36], + [0.538, 0.376, 0.42], + [1.138, 0.856, 0.845], + [0.444, 0.376, 0.317], + [0.377, 0.338, 0.32], + [0.377, 0.359, 0.335] + ] +} diff --git a/databricks/results/x-large.json b/databricks/results/x-large.json new file mode 100644 index 000000000..ff9cdebe3 --- /dev/null +++ b/databricks/results/x-large.json @@ -0,0 +1,56 
@@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: X-Large", + "cluster_size": 32, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 40.675, + "data_size": 10219802927, + "result": [ + [0.571, 0.315, 0.27], + [2.524, 0.445, 0.371], + [0.524, 0.361, 0.339], + [0.565, 0.323, 0.337], + [0.62, 0.587, 0.64], + [0.798, 0.608, 0.762], + [0.628, 0.437, 0.423], + [0.362, 0.326, 0.332], + [4.545, 0.993, 0.954], + [1.319, 1.239, 1.038], + [0.632, 0.444, 0.447], + [0.611, 0.485, 0.507], + [0.732, 0.621, 0.593], + [0.675, 0.657, 0.637], + [0.691, 0.661, 0.616], + [0.584, 0.531, 0.566], + [0.734, 0.71, 0.716], + [0.546, 0.544, 0.499], + [1.168, 0.927, 0.928], + [0.271, 0.208, 0.223], + [0.873, 0.493, 0.493], + [0.592, 0.51, 0.503], + [1.003, 0.71, 0.709], + [1.366, 0.939, 1.011], + [0.341, 0.322, 0.381], + [0.304, 0.292, 0.301], + [0.333, 0.331, 0.339], + [0.6, 0.52, 0.583], + [3.191, 3.114, 3.197], + [0.553, 0.498, 0.493], + [0.612, 0.547, 0.565], + [0.67, 0.527, 0.492], + [1.087, 1.368, 0.981], + [1.575, 1.544, 1.519], + [1.502, 1.66, 1.675], + [0.567, 0.542, 0.552], + [0.565, 0.57, 0.55], + [0.365, 0.366, 0.377], + [0.575, 0.382, 0.376], + [1.021, 0.817, 0.822], + [0.429, 0.339, 0.339], + [0.354, 0.335, 0.346], + [0.382, 0.326, 0.322] + ] +} diff --git a/databricks/results/x-small.json b/databricks/results/x-small.json new file mode 100644 index 000000000..c3a680b2a --- /dev/null +++ b/databricks/results/x-small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-13", + "machine": "Databricks: X-Small", + "cluster_size": 2, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 91.254, + "data_size": 10219802927, + "result": [ + [0.681, 0.306, 0.269], + [1.44, 0.393, 0.351], + [0.653, 0.492, 0.438], + [0.756, 0.418, 0.381], + [1.037, 0.889, 0.899], + [1.605, 1.297, 1.259], + [0.718, 0.459, 0.432], + [0.396, 0.371, 0.387], + [1.569, 1.378, 1.245], + [1.756, 1.946, 1.903], + [0.871, 0.591, 0.646], + [0.874, 0.62, 0.617], + [1.347, 1.466, 1.354], + [1.808, 1.643, 1.754], + [1.656, 1.571, 1.542], + [1.296, 1.291, 1.171], + [3.088, 2.775, 2.415], + [2.029, 2.052, 2.041], + [5.499, 4.354, 4.611], + [0.315, 0.271, 0.254], + [2.104, 1.22, 1.273], + [1.387, 1.392, 1.386], + [2.775, 2.336, 2.22], + [6.857, 3.872, 3.513], + [0.715, 0.866, 0.734], + [0.535, 0.676, 0.523], + [0.681, 0.755, 0.723], + [1.45, 1.337, 1.267], + [10.987, 11.04, 10.956], + [0.873, 0.811, 0.784], + [1.092, 1.178, 0.972], + [1.425, 1.512, 1.398], + [5.212, 4.141, 3.656], + [5.814, 5.669, 6.126], + [5.893, 5.416, 5.67], + [1.278, 1.239, 1.263], + [0.604, 0.539, 0.552], + [0.353, 0.371, 0.364], + [0.548, 0.387, 0.358], + [1.115, 0.894, 0.87], + [0.417, 0.362, 0.566], + [0.382, 0.347, 0.35], + [0.372, 0.343, 0.329] + ] +} diff --git a/databricks/run.sh b/databricks/run.sh new file mode 100755 index 000000000..b335f912b --- /dev/null +++ b/databricks/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Determine the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Use venv python if available, otherwise system python +if [ -f "$SCRIPT_DIR/.venv/bin/python3" ]; then + PYTHON="$SCRIPT_DIR/.venv/bin/python3" +else + PYTHON="python3" +fi + +cat queries.sql | while read -r query; do + $PYTHON ./query.py <<< "${query}" +done
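As a quick sanity check of the setup, a single query can be piped into `query.py` the same way `run.sh` does, without running the full suite. This is a minimal sketch, assuming `benchmark.sh` has already created the `.venv` virtual environment and `.env` has been filled in; the query shown is simply the first entry of `queries.sql`:

```bash
# Export the Databricks credentials and catalog/schema names that query.py
# reads from the environment (same pattern as benchmark.sh)
set -a; source .env; set +a

# query.py reads the query from stdin, runs it three times, and prints a JSON
# array with the three server-side durations in seconds, as reported by the
# /api/2.0/sql/history/queries/{query_id} endpoint
echo "SELECT COUNT(*) FROM hits;" | .venv/bin/python3 ./query.py
```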