diff --git a/hive/README.md b/hive/README.md
new file mode 100644
index 0000000000..b835b9ffb2
--- /dev/null
+++ b/hive/README.md
@@ -0,0 +1,35 @@
+## Apache Hive on a single Parquet file
+
+This setup runs Apache Hive 4 inside Docker, configured with:
+- HiveServer2 and an embedded Derby metastore in a single container, and
+- Tez as the execution engine (the upstream default for Hive 4),
+
+so the entire benchmark reproduces on a single VM with nothing beyond
+Docker installed.
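+
+For ad-hoc inspection (assuming the container name `hive` used by
+`./start` and the connection string from `./query`), a beeline shell
+can be opened with:
+
+```bash
+sudo docker exec -it hive beeline -u 'jdbc:hive2://localhost:10000/clickbench' -n hive
+```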
+
+The ClickBench `hits.parquet` file stores `EventTime`, `ClientEventTime`
+and `LocalEventTime` as Unix-epoch `BIGINT` values, and `EventDate` as an
+`INT` count of days since 1970-01-01. `create.sql` registers the parquet
+file as an external table (`hits_raw`) and then exposes a `hits` view
+that converts those columns to `TIMESTAMP` and `DATE`, so `queries.sql`
+matches the canonical ClickBench query text.
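+
+The conversion expressions, as used in the view definition:
+
+```sql
+SELECT
+  CAST(from_unixtime(EventTime) AS TIMESTAMP) AS EventTime,
+  date_add(DATE '1970-01-01', EventDate) AS EventDate
+FROM hits_raw;
+```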
+
+The `results/20130923/` directory contains historical 100M-row and
+10M-row results from 2013; the current run targets the standard 100M-row
+ClickBench dataset.
diff --git a/hive/benchmark.sh b/hive/benchmark.sh
new file mode 100755
index 0000000000..0fbc3b8465
--- /dev/null
+++ b/hive/benchmark.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Thin shim — actual flow is in lib/benchmark-common.sh.
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
+# Embedded Derby metastore lives in the container's writable layer;
+# the cold-cycle docker rm + docker run in ./start wipes it. ./load
+# is idempotent and reruns create.sql every cold cycle so the schema
+# is present before the first try; the load wall-clock rolls into the
+# cold-try timing per the standard BENCH_DURABLE=no contract.
+export BENCH_DURABLE=no
+exec ../lib/benchmark-common.sh
diff --git a/hive/check b/hive/check
new file mode 100755
index 0000000000..74dd460d8b
--- /dev/null
+++ b/hive/check
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+# HiveServer2 exposes a JSON status endpoint on its web UI port (10002).
+# /jmx works once the JVM is fully up; while the container is starting
+# (or stopped) curl fails. That makes it a clean readiness signal — no
+# log-tailing, no false positives across stop+start cycles.
+curl -sfo /dev/null http://localhost:10002/jmx
diff --git a/hive/create.sql b/hive/create.sql
new file mode 100644
index 0000000000..6bb33916a0
--- /dev/null
+++ b/hive/create.sql
@@ -0,0 +1,149 @@
+CREATE DATABASE IF NOT EXISTS clickbench;
+USE clickbench;
+
+DROP VIEW IF EXISTS hits;
+DROP TABLE IF EXISTS hits_raw;
+
+CREATE EXTERNAL TABLE hits_raw (
+ WatchID bigint,
+ JavaEnable smallint,
+ Title string,
+ GoodEvent smallint,
+ EventTime bigint,
+ EventDate int,
+ CounterID int,
+ ClientIP int,
+ RegionID int,
+ UserID bigint,
+ CounterClass smallint,
+ OS smallint,
+ UserAgent smallint,
+ URL string,
+ Referer string,
+ IsRefresh smallint,
+ RefererCategoryID smallint,
+ RefererRegionID int,
+ URLCategoryID smallint,
+ URLRegionID int,
+ ResolutionWidth smallint,
+ ResolutionHeight smallint,
+ ResolutionDepth smallint,
+ FlashMajor smallint,
+ FlashMinor smallint,
+ FlashMinor2 string,
+ NetMajor smallint,
+ NetMinor smallint,
+ UserAgentMajor smallint,
+ UserAgentMinor string,
+ CookieEnable smallint,
+ JavascriptEnable smallint,
+ IsMobile smallint,
+ MobilePhone smallint,
+ MobilePhoneModel string,
+ Params string,
+ IPNetworkID int,
+ TraficSourceID smallint,
+ SearchEngineID smallint,
+ SearchPhrase string,
+ AdvEngineID smallint,
+ IsArtifical smallint,
+ WindowClientWidth smallint,
+ WindowClientHeight smallint,
+ ClientTimeZone smallint,
+ ClientEventTime bigint,
+ SilverlightVersion1 smallint,
+ SilverlightVersion2 smallint,
+ SilverlightVersion3 int,
+ SilverlightVersion4 smallint,
+ PageCharset string,
+ CodeVersion int,
+ IsLink smallint,
+ IsDownload smallint,
+ IsNotBounce smallint,
+ FUniqID bigint,
+ OriginalURL string,
+ HID int,
+ IsOldCounter smallint,
+ IsEvent smallint,
+ IsParameter smallint,
+ DontCountHits smallint,
+ WithHash smallint,
+ HitColor string,
+ LocalEventTime bigint,
+ Age smallint,
+ Sex smallint,
+ Income smallint,
+ Interests smallint,
+ Robotness smallint,
+ RemoteIP int,
+ WindowName int,
+ OpenerName int,
+ HistoryLength smallint,
+ BrowserLanguage string,
+ BrowserCountry string,
+ SocialNetwork string,
+ SocialAction string,
+ HTTPError smallint,
+ SendTiming int,
+ DNSTiming int,
+ ConnectTiming int,
+ ResponseStartTiming int,
+ ResponseEndTiming int,
+ FetchTiming int,
+ SocialSourceNetworkID smallint,
+ SocialSourcePage string,
+ ParamPrice bigint,
+ ParamOrderID string,
+ ParamCurrency string,
+ ParamCurrencyID smallint,
+ OpenstatServiceName string,
+ OpenstatCampaignID string,
+ OpenstatAdID string,
+ OpenstatSourceID string,
+ UTMSource string,
+ UTMMedium string,
+ UTMCampaign string,
+ UTMContent string,
+ UTMTerm string,
+ FromTag string,
+ HasGCLID smallint,
+ RefererHash bigint,
+ URLHash bigint,
+ CLID int
+)
+STORED AS PARQUET
+LOCATION 'file:///clickbench/hits';
+
+-- The Parquet file stores EventTime/ClientEventTime/LocalEventTime as Unix epoch seconds (BIGINT)
+-- and EventDate as days since 1970-01-01 (INT). Wrap the raw table in a view that exposes the
+-- standard ClickBench types so the queries below need no further adaptation. CAST() of
+-- from_unixtime() turns Hive's "yyyy-MM-dd HH:mm:ss" string into a TIMESTAMP, and
+-- date_add(DATE'1970-01-01', n) yields a DATE.
+CREATE VIEW hits AS
+SELECT
+ WatchID, JavaEnable, Title, GoodEvent,
+ CAST(from_unixtime(EventTime) AS TIMESTAMP) AS EventTime,
+ date_add(DATE '1970-01-01', EventDate) AS EventDate,
+ CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent,
+ URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID,
+ URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight,
+ ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor,
+ UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable,
+ IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID,
+ TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical,
+ WindowClientWidth, WindowClientHeight, ClientTimeZone,
+ CAST(from_unixtime(ClientEventTime) AS TIMESTAMP) AS ClientEventTime,
+ SilverlightVersion1, SilverlightVersion2, SilverlightVersion3,
+ SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload,
+ IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent,
+ IsParameter, DontCountHits, WithHash, HitColor,
+ CAST(from_unixtime(LocalEventTime) AS TIMESTAMP) AS LocalEventTime,
+ Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName,
+ OpenerName, HistoryLength, BrowserLanguage, BrowserCountry,
+ SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming,
+ ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming,
+ SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID,
+ ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID,
+ OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign,
+ UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID
+FROM hits_raw;
diff --git a/hive/data-size b/hive/data-size
new file mode 100755
index 0000000000..265f258e5d
--- /dev/null
+++ b/hive/data-size
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+# External Parquet table — report the source file size.
+stat -c %s data/hits/hits.parquet
diff --git a/hive/install b/hive/install
new file mode 100755
index 0000000000..9a1dddaa45
--- /dev/null
+++ b/hive/install
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+HIVE_VERSION=4.0.1
+
+# Hive's official image bundles its own JRE; only Docker is needed.
+if ! command -v docker >/dev/null 2>&1; then
+ sudo apt-get update -y
+ sudo apt-get install -y docker.io
+fi
+sudo apt-get install -y curl
+
+sudo docker pull apache/hive:${HIVE_VERSION}
+
+# Hive's external-table LOCATION points at /clickbench/hits inside the
+# container; that path is the bind mount target for ./data on the host.
+# Create the directory now so ./start can mount it before the first
+# ./load.
+mkdir -p data/hits
+# apache/hive runs as uid 1000 ("hive") and writes the embedded Derby
+# metastore + warehouse dirs under /opt/hive; the container also reads
+# /clickbench/hits to list its external table. Make sure that uid can
+# both read and write the bind-mount even when cloud-init runs as root.
+sudo chown -R 1000:1000 data
diff --git a/hive/load b/hive/load
new file mode 100755
index 0000000000..3fab680321
--- /dev/null
+++ b/hive/load
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+
+# Stage hits.parquet under data/hits/ — that dir is the Hive external
+# table's LOCATION via the /clickbench bind mount inside the container.
+#
+# Idempotent: BENCH_DURABLE=no triggers ./load again on every cold
+# cycle, but the dataset is 14 GB and re-staging it every cycle would
+# blow up the run time without changing the measurement. The first
+# invocation moves hits.parquet (delivered into cwd by
+# download-hits-parquet-single) into data/hits/; subsequent invocations
+# find no source file and reuse the staged copy.
+if [ -f hits.parquet ]; then
+ mkdir -p data/hits
+ mv -f hits.parquet data/hits/hits.parquet
+fi
+sudo chown -R 1000:1000 data
+
+# Run create.sql via beeline inside the container. -n hive matches the
+# default container user so the external LOCATION is readable; --silent
+# suppresses beeline's prompt/timing chrome which would otherwise leak
+# into ./load's stdout and confuse the driver's load-time parser.
+sudo docker cp create.sql hive:/tmp/create.sql
+# `< /dev/null` is load-bearing: bench_main runs ./load inside
+# `while read query; do ... done < queries.sql`, so our inherited
+# stdin IS the queries.sql fd. With -i, docker exec forwards host
+# stdin into the container until EOF; beeline (running with -f) never
+# reads it, so docker silently drains queries.sql while waiting for
+# beeline to consume nothing. The next bench_main read then hits EOF
+# and the whole query loop exits after Q1, with no error message
+# (Q1's [t,t,t] is the only timing in the log, then it jumps straight
+# to data-size). Redirecting stdin from /dev/null isolates this
+# docker call from the surrounding loop's input.
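+# A minimal reproduction of the hazard (hypothetical container c):
+#   while read q; do docker exec -i c true; done < queries.sql
+# exits after one iteration: docker's stdin forwarding drains the file.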
+sudo docker exec -i hive beeline -u 'jdbc:hive2://localhost:10000/' -n hive \
+ --silent=true -f /tmp/create.sql < /dev/null
+
+sync
diff --git a/hive/queries.sql b/hive/queries.sql
new file mode 100644
index 0000000000..1954d91a02
--- /dev/null
+++ b/hive/queries.sql
@@ -0,0 +1,43 @@
+SELECT COUNT(*) FROM hits;
+SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;
+SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;
+SELECT AVG(UserID) FROM hits;
+SELECT COUNT(DISTINCT UserID) FROM hits;
+SELECT COUNT(DISTINCT SearchPhrase) FROM hits;
+SELECT MIN(EventDate), MAX(EventDate) FROM hits;
+SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
+SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
+SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
+SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
+SELECT UserID, EXTRACT(MINUTE FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, EXTRACT(MINUTE FROM EventTime), SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID FROM hits WHERE UserID = 435090932899640449;
+SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
+SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
+SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
+SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '$1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '$1') HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;
+SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
+SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
+SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
+SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+SELECT FLOOR_MINUTE(EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY FLOOR_MINUTE(EventTime) ORDER BY FLOOR_MINUTE(EventTime) LIMIT 10 OFFSET 1000;
diff --git a/hive/query b/hive/query
new file mode 100755
index 0000000000..9b13269aed
--- /dev/null
+++ b/hive/query
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Reads a SQL query from stdin, runs it via beeline against HiveServer2
+# in the running container.
+# Stdout: query result.
+# Stderr: query runtime in fractional seconds on the last line.
+# Exit non-zero on error.
+set -e
+
+query=$(cat)
+
+start=$(date +%s.%N)
+# `< /dev/null`: see hive/load for the long version. bench_run_query
+# pipes the query in via `printf | ./query`, so the printf pipe is
+# already drained by `query=$(cat)` above and our stdin is at EOF
+# here — but make the docker-exec stdin source explicit so this
+# script stays safe if anyone ever calls it without the printf-pipe
+# wrapping.
+sudo docker exec -i hive beeline -u 'jdbc:hive2://localhost:10000/clickbench' -n hive \
+ --silent=true --outputformat=tsv2 -e "$query" < /dev/null
+end=$(date +%s.%N)
+
+awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2
diff --git a/hive/start b/hive/start
new file mode 100755
index 0000000000..8b9aef7682
--- /dev/null
+++ b/hive/start
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+
+HIVE_VERSION=4.0.1
+
+if sudo docker ps --format '{{.Names}}' | grep -qx hive; then
+ exit 0
+fi
+
+# apache/hive's entrypoint writes /opt/hive/conf/hiveserver2.pid on first
+# launch and refuses to start HS2 again if the file already exists
+# ("HiveServer2 running as process 7. Stop it first."). After a
+# docker stop the file lives on in the writable layer, so `docker start`
+# on the same container never brings HS2 back up. Workaround: always
+# remove the previous container and run a fresh one. The embedded Derby
+# metastore lives inside the (ephemeral) container, so BENCH_DURABLE=no
+# in benchmark.sh re-runs ./load on every cold cycle to recreate the
+# schema after the restart — without that we'd lose the table catalog.
+sudo docker rm -f hive >/dev/null 2>&1 || true
+
+# Size HiveServer2's JVM heap to a fraction of host RAM. The image's
+# entrypoint exports HADOOP_CLIENT_OPTS="-Xmx1G $SERVICE_OPTS", so the
+# last -Xmx wins — that's how we override the 1 GB default. With 1 GB
+# the parquet vectorized reader OOMs on every non-trivial query
+# ("GC overhead limit exceeded" inside MapRecordProcessor). 60% of
+# host RAM is the same ratio presto/ uses, with a floor at 4 GB so
+# small VMs still get a usable size.
+RAM_GB=$(awk '/MemTotal/{ printf "%d", $2 / 1024 / 1024 }' /proc/meminfo)
+HEAP_GB=$(( RAM_GB * 60 / 100 ))
+[ "$HEAP_GB" -lt 4 ] && HEAP_GB=4
+
+# SERVICE_NAME=hiveserver2 launches an embedded Derby metastore alongside
+# HiveServer2 in the same JVM — single-node Hive without a separate
+# metastore service or HDFS.
+sudo docker run -d --name hive \
+ -p 10000:10000 -p 10002:10002 \
+ -e SERVICE_NAME=hiveserver2 \
+ -e SERVICE_OPTS="-Xmx${HEAP_GB}G" \
+ -v "$PWD/data:/clickbench" \
+ apache/hive:${HIVE_VERSION}
diff --git a/hive/stop b/hive/stop
new file mode 100755
index 0000000000..a2fc91022a
--- /dev/null
+++ b/hive/stop
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+sudo docker stop hive 2>/dev/null || true
+exit 0
diff --git a/hive/template.json b/hive/template.json
new file mode 100644
index 0000000000..6f13bc4944
--- /dev/null
+++ b/hive/template.json
@@ -0,0 +1,11 @@
+{
+ "system": "Hive (Parquet, single)",
+ "proprietary": "no",
+ "hardware": "cpu",
+ "tuned": "no",
+ "tags": [
+ "Java",
+ "MapReduce",
+ "Tez"
+ ]
+}
diff --git a/impala/README.md b/impala/README.md
new file mode 100644
index 0000000000..30294c64b1
--- /dev/null
+++ b/impala/README.md
@@ -0,0 +1,44 @@
+## Apache Impala on a single Parquet file
+
+This setup runs Apache Impala 4 via the upstream "quickstart" Docker
+images, orchestrated with Docker Compose (the v2 plugin):
+
+- `hms` — Hive Metastore (Derby-backed; no MySQL/Postgres required)
+- `statestored` — cluster-membership broker
+- `catalogd` — metadata cache
+- `impalad-1` — single combined coordinator + executor
+
+The benchmark therefore reproduces on a single VM with nothing beyond
+Docker installed.
+
+**Hardware requirement:** Impala's C++ daemons (`statestored`, `catalogd`,
+`impalad`) refuse to start on a CPU without AVX — they log
+> This machine does not meet the minimum requirements for Impala
+> functionality. The CPU does not support AVX
+
+and exit. Only AVX-capable x86_64 machines work; Graviton/aarch64 hosts
+(including under QEMU emulation, which doesn't expose AVX) are not
+supported.
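+
+A quick host-side check (the `avx` flag appears in `/proc/cpuinfo` on
+capable machines):
+
+```bash
+grep -qw avx /proc/cpuinfo && echo "AVX present" || echo "no AVX"
+```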
+
+The ClickBench `hits.parquet` file stores `EventTime`,
+`ClientEventTime` and `LocalEventTime` as Unix-epoch `BIGINT` values and
+`EventDate` as an `INT` count of days since 1970-01-01. `create.sql`
+registers the parquet file as an external table (`hits_raw`) and then
+exposes a `hits` view that converts those columns to `TIMESTAMP` and
+`DATE`, so `queries.sql` matches the canonical ClickBench query text.
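+
+The conversion expressions, as used in the view definition:
+
+```sql
+SELECT
+  CAST(from_unixtime(EventTime) AS TIMESTAMP) AS EventTime,
+  days_add(DATE '1970-01-01', EventDate) AS EventDate
+FROM hits_raw;
+```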
diff --git a/impala/benchmark.sh b/impala/benchmark.sh
new file mode 100755
index 0000000000..35f9302c79
--- /dev/null
+++ b/impala/benchmark.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Thin shim — actual flow is in lib/benchmark-common.sh.
+#
+# First-cold Impala start has to bootstrap the Hive metastore schema,
+# wait for catalogd to register with statestored, and only then does
+# impalad-1 publish /healthz=OK. On c6a.4xlarge this overruns the
+# default 300s check window; 900s gives the four-container compose
+# stack room without making real crashes wait forever before surfacing.
+#
+# BENCH_RESTARTABLE=no skips the stop+start cycle between cold tries.
+# Restarting catalogd wipes its in-memory catalog; with our
+# -catalog_topic_mode=minimal + hms_event_polling_interval_s=0 setup
+# (chosen to dodge the HMS notification-log issues at startup),
+# catalogd doesn't proactively reload metadata from HMS, so
+# `use clickbench` then fails with "Database does not exist" on every
+# query. drop_caches alone gives a cold parquet read — the actual
+# ClickBench bottleneck — without losing the catalog state that ./load
+# put into the running cluster.
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
+export BENCH_DURABLE=yes
+export BENCH_RESTARTABLE=no
+export BENCH_CHECK_TIMEOUT=900
+exec ../lib/benchmark-common.sh
diff --git a/impala/check b/impala/check
new file mode 100755
index 0000000000..746bdae8bc
--- /dev/null
+++ b/impala/check
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Impala's coordinator exposes its web UI on 25000. /healthz returns
+# 200 "OK" once the daemon has finished registering with statestored
+# and catalogd — i.e. the cluster is actually ready to accept queries,
+# not just listening on the SQL ports. While the stack is stopped the
+# port is closed and curl fails as it should.
+#
+# On failure dump container status to stderr — bench_check_loop captures
+# the last call's stderr and prints it after a readiness timeout, so this
+# turns "did not succeed within 300s" into something actionable
+# (which container is missing/crashed) without spamming the run log
+# on every iteration.
+out=$(curl -sSf -m 5 http://localhost:25000/healthz 2>&1) || {
+ {
+ printf 'impala/check: /healthz unreachable: %s\n' "$out"
+ sudo docker ps -a --filter 'name=impala-' \
+ --format ' {{.Names}}: {{.Status}}'
+ # Dump the tail of every impala container regardless of state.
+ # Exited containers tell us *why* a daemon died (SIGSEGV, OOM,
+ # config error); running containers tell us about server-side
+ # exceptions like "Internal error processing
+ # get_current_notificationEventId" that the client only sees as
+ # a generic TApplicationException — the actual stack trace is
+ # only in the HMS container's log.
+ for c in $(sudo docker ps -a --filter 'name=impala-' \
+ --format '{{.Names}}'); do
+ echo "--- $c (last 50 log lines) ---"
+ sudo docker logs --tail 50 "$c" 2>&1 | sed 's/^/ /'
+ done
+ } >&2
+ exit 1
+}
+[ "$out" = "OK" ] || {
+ echo "impala/check: /healthz returned: $out" >&2
+ exit 1
+}
diff --git a/impala/create.sql b/impala/create.sql
new file mode 100644
index 0000000000..324de3309e
--- /dev/null
+++ b/impala/create.sql
@@ -0,0 +1,156 @@
+CREATE DATABASE IF NOT EXISTS clickbench;
+USE clickbench;
+
+DROP VIEW IF EXISTS hits;
+DROP TABLE IF EXISTS hits_raw;
+
+CREATE EXTERNAL TABLE hits_raw (
+ WatchID bigint,
+ JavaEnable smallint,
+ Title string,
+ GoodEvent smallint,
+ EventTime bigint,
+ EventDate int,
+ CounterID int,
+ ClientIP int,
+ RegionID int,
+ UserID bigint,
+ CounterClass smallint,
+ OS smallint,
+ UserAgent smallint,
+ URL string,
+ Referer string,
+ IsRefresh smallint,
+ RefererCategoryID smallint,
+ RefererRegionID int,
+ URLCategoryID smallint,
+ URLRegionID int,
+ ResolutionWidth smallint,
+ ResolutionHeight smallint,
+ ResolutionDepth smallint,
+ FlashMajor smallint,
+ FlashMinor smallint,
+ FlashMinor2 string,
+ NetMajor smallint,
+ NetMinor smallint,
+ UserAgentMajor smallint,
+ UserAgentMinor string,
+ CookieEnable smallint,
+ JavascriptEnable smallint,
+ IsMobile smallint,
+ MobilePhone smallint,
+ MobilePhoneModel string,
+ Params string,
+ IPNetworkID int,
+ TraficSourceID smallint,
+ SearchEngineID smallint,
+ SearchPhrase string,
+ AdvEngineID smallint,
+ IsArtifical smallint,
+ WindowClientWidth smallint,
+ WindowClientHeight smallint,
+ ClientTimeZone smallint,
+ ClientEventTime bigint,
+ SilverlightVersion1 smallint,
+ SilverlightVersion2 smallint,
+ SilverlightVersion3 int,
+ SilverlightVersion4 smallint,
+ PageCharset string,
+ CodeVersion int,
+ IsLink smallint,
+ IsDownload smallint,
+ IsNotBounce smallint,
+ FUniqID bigint,
+ OriginalURL string,
+ HID int,
+ IsOldCounter smallint,
+ IsEvent smallint,
+ IsParameter smallint,
+ DontCountHits smallint,
+ WithHash smallint,
+ HitColor string,
+ LocalEventTime bigint,
+ Age smallint,
+ Sex smallint,
+ Income smallint,
+ Interests smallint,
+ Robotness smallint,
+ RemoteIP int,
+ WindowName int,
+ OpenerName int,
+ HistoryLength smallint,
+ BrowserLanguage string,
+ BrowserCountry string,
+ SocialNetwork string,
+ SocialAction string,
+ HTTPError smallint,
+ SendTiming int,
+ DNSTiming int,
+ ConnectTiming int,
+ ResponseStartTiming int,
+ ResponseEndTiming int,
+ FetchTiming int,
+ SocialSourceNetworkID smallint,
+ SocialSourcePage string,
+ ParamPrice bigint,
+ ParamOrderID string,
+ ParamCurrency string,
+ ParamCurrencyID smallint,
+ OpenstatServiceName string,
+ OpenstatCampaignID string,
+ OpenstatAdID string,
+ OpenstatSourceID string,
+ UTMSource string,
+ UTMMedium string,
+ UTMCampaign string,
+ UTMContent string,
+ UTMTerm string,
+ FromTag string,
+ HasGCLID smallint,
+ RefererHash bigint,
+ URLHash bigint,
+ CLID int
+)
+STORED AS PARQUET
+LOCATION '/clickbench/hits';
+
+REFRESH hits_raw;
+
+-- The Parquet file stores EventTime/ClientEventTime/LocalEventTime as Unix epoch seconds (BIGINT)
+-- and EventDate as days since 1970-01-01 (INT). Wrap the raw table in a view that exposes the
+-- standard ClickBench types so queries.sql matches the canonical text. CAST(from_unixtime(...) AS
+-- TIMESTAMP) is the idiomatic Impala BIGINT-epoch -> TIMESTAMP conversion, and days_add() yields a
+-- DATE from an int day count.
+CREATE VIEW hits AS
+SELECT
+ WatchID, JavaEnable, Title, GoodEvent,
+ CAST(from_unixtime(EventTime) AS TIMESTAMP) AS EventTime,
+ days_add(DATE '1970-01-01', EventDate) AS EventDate,
+ CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent,
+ URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID,
+ URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight,
+ ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor,
+ UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable,
+ IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID,
+ TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical,
+ WindowClientWidth, WindowClientHeight, ClientTimeZone,
+ CAST(from_unixtime(ClientEventTime) AS TIMESTAMP) AS ClientEventTime,
+ SilverlightVersion1, SilverlightVersion2, SilverlightVersion3,
+ SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload,
+ IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent,
+ IsParameter, DontCountHits, WithHash, HitColor,
+ CAST(from_unixtime(LocalEventTime) AS TIMESTAMP) AS LocalEventTime,
+ Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName,
+ OpenerName, HistoryLength, BrowserLanguage, BrowserCountry,
+ SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming,
+ ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming,
+ SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID,
+ ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID,
+ OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign,
+ UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID
+FROM hits_raw;
+
+-- Make sure Impala's planner has up-to-date stats; without this the
+-- first query pays the metadata-collection wall-clock as if it were
+-- query work, and pure scan queries plan as full broadcasts.
+COMPUTE STATS hits_raw;
diff --git a/impala/data-size b/impala/data-size
new file mode 100755
index 0000000000..265f258e5d
--- /dev/null
+++ b/impala/data-size
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+# External Parquet table — report the source file size.
+stat -c %s data/hits/hits.parquet
diff --git a/impala/docker-compose.yml b/impala/docker-compose.yml
new file mode 100644
index 0000000000..99701d97c6
--- /dev/null
+++ b/impala/docker-compose.yml
@@ -0,0 +1,133 @@
+# Apache Impala 4 quickstart cluster, in four containers on one
+# user-defined bridge network. Mirrors the upstream
+# docker/quickstart-compose layout from the Apache Impala repo, minus
+# HDFS — the table data is bind-mounted into every container at
+# /clickbench so Impala reads it through its local-filesystem driver
+# rather than through HDFS or S3.
+#
+# Pinned to 4.5.0, the newest tagged release. 4.4.1 has the same
+# jdo-api gap, so the pin alone doesn't fix that; the missing
+# jdo-api jar is instead supplied via a bind-mount under
+# /opt/impala/lib/ on the two daemons (./install fetches it from
+# Maven Central). Without the jar, both impalad_coord_exec and
+# catalogd crash at startup with NoClassDefFoundError:
+# javax/jdo/JDOException → SIGSEGV in the C++ "Check failure" abort
+# path, before /healthz ever opens.
+
+services:
+ hms:
+ image: apache/impala:4.5.0-impala_quickstart_hms
+ container_name: impala-hms
+ command: ["hms"]
+ volumes:
+ - hms_warehouse:/var/lib/hive
+ - ./data:/clickbench:rw
+ # Register DbNotificationListener server-side so client RPCs to
+ # get_current_notificationEventId succeed; without it, DDL like
+ # CREATE DATABASE in ./load fails with
+ # TApplicationException: Internal error processing
+ # get_current_notificationEventId
+ - ./hms-hive-site.xml:/opt/hive/conf/hive-site.xml:ro
+ networks:
+ - impala-net
+ hostname: hms
+
+ statestored:
+ image: apache/impala:4.5.0-statestored
+ container_name: impala-statestored
+ command: ["-redirect_stdout_stderr=false", "-logtostderr", "-v=1"]
+ networks:
+ - impala-net
+ hostname: statestored
+
+ catalogd:
+ image: apache/impala:4.5.0-catalogd
+ container_name: impala-catalogd
+ depends_on:
+ - statestored
+ - hms
+ command:
+ - "-redirect_stdout_stderr=false"
+ - "-logtostderr"
+ - "-v=1"
+ # 0 disables HMS event polling. The entrypoint defaults to 1
+ # (poll every second), which causes catalogd to subscribe to the
+ # metastore's notification log on startup via
+ # get_current_notificationEventId. The quickstart hms image
+ # ships without DbNotificationListener configured, so that RPC
+ # throws "Internal error processing get_current_notificationEventId"
+ # and the FATAL aborts catalogd before /healthz on impalad ever
+ # reaches OK. We don't need event-driven cache invalidation in a
+ # single-process benchmark — `invalidate metadata` after ./load
+ # is enough.
+ - "-hms_event_polling_interval_s=0"
+ - "-invalidate_tables_timeout_s=999999"
+ # Override entrypoint default. With it on, the daemon expands
+ # `hms` to its reverse-DNS FQDN. Even with the network renamed,
+ # docker still appends the network as a hostname suffix, and the
+ # resulting hostname can pick up illegal URI characters; keep
+ # the configured `hms` literal so the URI parses cleanly.
+ - "-use_resolved_hostname=false"
+ volumes:
+ - ./data:/clickbench:rw
+ - ./data/extra-lib/jdo-api-3.0.1.jar:/opt/impala/lib/jdo-api-3.0.1.jar:ro
+ - ./hive-site.xml:/opt/impala/conf/hive-site.xml:ro
+ networks:
+ - impala-net
+ hostname: catalogd
+
+ impalad-1:
+ image: apache/impala:4.5.0-impalad_coord_exec
+ container_name: impala-impalad-1
+ depends_on:
+ - statestored
+ - catalogd
+ command:
+ - "-v=1"
+ - "-redirect_stdout_stderr=false"
+ - "-logtostderr"
+ - "-kudu_master_hosts="
+ - "-mt_dop_auto_fallback=true"
+ - "--default_query_options=default_file_format=parquet"
+ # See catalogd above — same FQDN-underscore problem applies here.
+ - "-use_resolved_hostname=false"
+ ports:
+ - "21000:21000"
+ - "21050:21050"
+ - "25000:25000"
+ volumes:
+ - ./data:/clickbench:rw
+ - ./data/extra-lib/jdo-api-3.0.1.jar:/opt/impala/lib/jdo-api-3.0.1.jar:ro
+ - ./hive-site.xml:/opt/impala/conf/hive-site.xml:ro
+ networks:
+ - impala-net
+ hostname: impalad-1
+
+ # impala-shell is not bundled in the coordinator image — it lives in
+ # the separate quickstart_client image. Run that as a long-lived
+ # sidecar (sleep infinity) and `docker exec` impala-shell into it for
+ # ./load and ./query. The image's default entrypoint runs a one-shot
+ # data-load script; override it so the container just stays alive.
+ client:
+ image: apache/impala:4.5.0-impala_quickstart_client
+ container_name: impala-client
+ depends_on:
+ - impalad-1
+ entrypoint: ["sleep", "infinity"]
+ networks:
+ - impala-net
+
+networks:
+ impala-net:
+ # Pin the network name; without `name:` compose auto-prefixes the
+ # current project (= cwd basename), producing `impala_impala-net`.
+ # Catalogd then reverse-resolves the hms IP and gets back a
+ # hostname containing an underscore (`impala-hms.impala_impala-net`),
+ # which java.net.URI rejects with
+ # URISyntaxException: Illegal character in hostname at index 26
+ # before HiveMetaStoreClient can connect.
+ name: impala-net
+ driver: bridge
+
+volumes:
+ hms_warehouse:
diff --git a/impala/hive-site.xml b/impala/hive-site.xml
new file mode 100644
index 0000000000..797c2bf577
--- /dev/null
+++ b/impala/hive-site.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<configuration>
+  <!-- Point the Impala daemons' embedded HMS client at the hms container. -->
+  <property>
+    <name>hive.metastore.uris</name>
+    <value>thrift://hms:9083</value>
+  </property>
+</configuration>
diff --git a/impala/hms-hive-site.xml b/impala/hms-hive-site.xml
new file mode 100644
index 0000000000..7f2913b8ba
--- /dev/null
+++ b/impala/hms-hive-site.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<configuration>
+  <!-- Register DbNotificationListener server-side so catalogd's
+       get_current_notificationEventId RPC succeeds. -->
+  <property>
+    <name>hive.metastore.transactional.event.listeners</name>
+    <value>org.apache.hive.hcatalog.listener.DbNotificationListener</value>
+  </property>
+
+  <!-- Skip the authorization check on the notification APIs. -->
+  <property>
+    <name>hive.metastore.event.db.notification.api.auth</name>
+    <value>false</value>
+  </property>
+
+  <!-- Include full Thrift objects in the notification events. -->
+  <property>
+    <name>hive.metastore.notifications.add.thrift.objects</name>
+    <value>true</value>
+  </property>
+</configuration>
diff --git a/impala/install b/impala/install
new file mode 100755
index 0000000000..612a3f5f2c
--- /dev/null
+++ b/impala/install
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -e
+
+# Docker, the compose plugin, and curl. The compose v2 plugin lives in
+# the docker-compose-v2 package on Ubuntu 24.04 / Debian 12; the older
+# docker-compose Python package would also work but is deprecated.
+if ! command -v docker >/dev/null 2>&1; then
+ sudo apt-get update -y
+ sudo apt-get install -y docker.io docker-compose-v2
+else
+ sudo apt-get install -y docker-compose-v2
+fi
+sudo apt-get install -y curl
+
+# Pull all four service images up-front so the first ./start doesn't
+# include image-pull wall-clock — it stays a pure container-start
+# operation and ./check's readiness timeout is enough.
+sudo docker compose -f docker-compose.yml pull
+
+# Catalogd and impalad both bind-mount this dir as /clickbench. Create
+# it before the first ./start so the mount succeeds even when the
+# compose stack is brought up before the first ./load.
+mkdir -p data/hits
+
+# Apache Impala's published 4.4.x and 4.5.0 quickstart images ship the
+# embedded HMS client library but NOT the jdo-api jar it loads through
+# reflection, so impalad_coord_exec and catalogd both crash at startup
+# with NoClassDefFoundError: javax/jdo/JDOException → SIGSEGV in the
+# C++ "Check failure" abort path, before /healthz ever opens.
+# Pull jdo-api from Maven Central into a host-side dir; the compose
+# bind-mounts it under /opt/impala/lib/ in both daemon containers,
+# which is already on their Java classpath.
+JDO_API_VERSION=3.0.1
+mkdir -p data/extra-lib
+if [ ! -f "data/extra-lib/jdo-api-${JDO_API_VERSION}.jar" ]; then
+ curl -fsSL -o "data/extra-lib/jdo-api-${JDO_API_VERSION}.jar" \
+ "https://repo1.maven.org/maven2/javax/jdo/jdo-api/${JDO_API_VERSION}/jdo-api-${JDO_API_VERSION}.jar"
+fi
+
+# The Impala containers run their daemons as uid 1000 ("impala"); make
+# the bind-mount writable by that uid even when cloud-init runs as root.
+sudo chown -R 1000:1000 data
diff --git a/impala/load b/impala/load
new file mode 100755
index 0000000000..fc04e53bee
--- /dev/null
+++ b/impala/load
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+# Place hits.parquet inside the external table's LOCATION. The bind
+# mount on every container exposes this dir as /clickbench.
+ln -f hits.parquet data/hits/hits.parquet
+sudo chown -R 1000:1000 data
+
+# Apply the schema via impala-shell in the client sidecar container,
+# pointed at impalad-1:21050. -B (delimited output, no pretty tables)
+# + --quiet keeps stdout clean for the driver's load-time parser.
+sudo docker cp create.sql impala-client:/tmp/create.sql
+# `< /dev/null`: docker exec -i forwards host stdin to the container
+# until EOF, even when the inner program (impala-shell -f) never reads
+# it. If bench_main ever calls ./load while its `while read ... done <
+# queries.sql` redirect is active (BENCH_DURABLE=no, or any future use
+# in the loop), docker would drain the queries.sql fd and the loop
+# would exit silently after one query — exactly the failure mode hive
+# hit. Explicitly source stdin from /dev/null to keep this script
+# isolated.
+sudo docker exec -i impala-client impala-shell -i impalad-1:21050 -B --quiet -f /tmp/create.sql < /dev/null
+
+rm -f hits.parquet
+sync
diff --git a/impala/queries.sql b/impala/queries.sql
new file mode 100644
index 0000000000..348ce9a776
--- /dev/null
+++ b/impala/queries.sql
@@ -0,0 +1,43 @@
+SELECT COUNT(*) FROM hits;
+SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;
+SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;
+SELECT AVG(UserID) FROM hits;
+SELECT COUNT(DISTINCT UserID) FROM hits;
+SELECT COUNT(DISTINCT SearchPhrase) FROM hits;
+SELECT MIN(EventDate), MAX(EventDate) FROM hits;
+SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
+SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
+SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
+SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
+SELECT UserID, EXTRACT(MINUTE FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, EXTRACT(MINUTE FROM EventTime), SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID FROM hits WHERE UserID = 435090932899640449;
+SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
+SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
+SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
+SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;
+SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
+SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
+SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
+SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+SELECT DATE_TRUNC('MINUTE', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('MINUTE', EventTime) ORDER BY DATE_TRUNC('MINUTE', EventTime) LIMIT 10 OFFSET 1000;
diff --git a/impala/query b/impala/query
new file mode 100755
index 0000000000..587719dd96
--- /dev/null
+++ b/impala/query
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Reads a SQL query from stdin, runs it via impala-shell in the
+# client sidecar container against the coordinator (impalad-1:21050).
+# Stdout: query result.
+# Stderr: query runtime in fractional seconds on the last line.
+# Exit non-zero on error.
+set -e
+
+query=$(cat)
+
+start=$(date +%s.%N)
+# `< /dev/null`: see impala/load — keep docker exec from forwarding
+# whatever the caller's stdin is. bench_run_query already pipes the
+# query via `printf | ./query` so $(cat) drains stdin above, but make
+# the docker-exec stdin source explicit so the script stays safe if
+# anyone runs it outside that wrapper.
+sudo docker exec -i impala-client impala-shell -i impalad-1:21050 -d clickbench -B --quiet -q "$query" < /dev/null
+end=$(date +%s.%N)
+
+awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2
diff --git a/impala/start b/impala/start
new file mode 100755
index 0000000000..1373312a44
--- /dev/null
+++ b/impala/start
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+# Bring up the four-service quickstart stack. `docker compose up -d` is
+# idempotent — already-running services are left alone, stopped ones
+# are restarted, missing ones are created. We don't need a separate
+# branch for "already up".
+sudo docker compose -f docker-compose.yml up -d
diff --git a/impala/stop b/impala/stop
new file mode 100755
index 0000000000..3e5ed1387f
--- /dev/null
+++ b/impala/stop
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# `down` removes the containers but keeps the named volume (the HMS
+# Derby store), so the metastore catalog survives a cold cycle and the
+# next ./start does not need to re-CREATE the external table.
+sudo docker compose -f docker-compose.yml down 2>/dev/null || true
+exit 0
diff --git a/impala/template.json b/impala/template.json
new file mode 100644
index 0000000000..2bd8808526
--- /dev/null
+++ b/impala/template.json
@@ -0,0 +1,11 @@
+{
+ "system": "Impala (Parquet, single)",
+ "proprietary": "no",
+ "hardware": "cpu",
+ "tuned": "no",
+ "tags": [
+ "C++",
+ "MPP",
+ "column-oriented"
+ ]
+}