diff --git a/hive/README.md b/hive/README.md
new file mode 100644
index 0000000000..b835b9ffb2
--- /dev/null
+++ b/hive/README.md
@@ -0,0 +1,19 @@
+## Apache Hive on a single Parquet file
+
+This setup runs Apache Hive 4 inside Docker, configured with:
+- HiveServer2 and an embedded Derby metastore in a single container, and
+- Tez as the execution engine (the upstream default for Hive 4),
+
+so the entire benchmark reproduces on a single VM with nothing beyond
+Docker installed.
+
+The ClickBench `hits.parquet` file stores `EventTime`, `ClientEventTime`
+and `LocalEventTime` as Unix-epoch `BIGINT` values, and `EventDate` as an
+`INT` count of days since 1970-01-01. `create.sql` registers the parquet
+file as an external table (`hits_raw`) and then exposes a `hits` view
+that converts those columns to `TIMESTAMP` and `DATE`, so `queries.sql`
+matches the canonical ClickBench query text.
+
+The `results/20130923/` directory contains historical 100M-row and
+10M-row results from 2013; the current run targets the standard 100M-row
+ClickBench dataset.
diff --git a/hive/benchmark.sh b/hive/benchmark.sh
new file mode 100755
index 0000000000..0fbc3b8465
--- /dev/null
+++ b/hive/benchmark.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Thin shim — actual flow is in lib/benchmark-common.sh.
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
+# Embedded Derby metastore lives in the container's writable layer;
+# the cold-cycle docker rm + docker run in ./start wipes it. ./load
+# is idempotent and reruns create.sql every cold cycle so the schema
+# is present before the first try; the load wall-clock rolls into the
+# cold-try timing per the standard BENCH_DURABLE=no contract.
+export BENCH_DURABLE=no
+exec ../lib/benchmark-common.sh
diff --git a/hive/check b/hive/check
new file mode 100755
index 0000000000..74dd460d8b
--- /dev/null
+++ b/hive/check
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+# HiveServer2 exposes a JSON status endpoint on its web UI port (10002).
+# /jmx works once the JVM is fully up; while the container is starting
+# (or stopped) curl fails. That makes it a clean readiness signal — no
+# log-tailing, no false positives across stop+start cycles.
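+#
+# (A hand-rolled readiness wait is just a poll of this script, e.g.:
+#   until ./check; do sleep 1; done
+# The driver's bench_check_loop does the equivalent with a timeout.)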
+curl -sfo /dev/null http://localhost:10002/jmx diff --git a/hive/create.sql b/hive/create.sql new file mode 100644 index 0000000000..6bb33916a0 --- /dev/null +++ b/hive/create.sql @@ -0,0 +1,149 @@ +CREATE DATABASE IF NOT EXISTS clickbench; +USE clickbench; + +DROP VIEW IF EXISTS hits; +DROP TABLE IF EXISTS hits_raw; + +CREATE EXTERNAL TABLE hits_raw ( + WatchID bigint, + JavaEnable smallint, + Title string, + GoodEvent smallint, + EventTime bigint, + EventDate int, + CounterID int, + ClientIP int, + RegionID int, + UserID bigint, + CounterClass smallint, + OS smallint, + UserAgent smallint, + URL string, + Referer string, + IsRefresh smallint, + RefererCategoryID smallint, + RefererRegionID int, + URLCategoryID smallint, + URLRegionID int, + ResolutionWidth smallint, + ResolutionHeight smallint, + ResolutionDepth smallint, + FlashMajor smallint, + FlashMinor smallint, + FlashMinor2 string, + NetMajor smallint, + NetMinor smallint, + UserAgentMajor smallint, + UserAgentMinor string, + CookieEnable smallint, + JavascriptEnable smallint, + IsMobile smallint, + MobilePhone smallint, + MobilePhoneModel string, + Params string, + IPNetworkID int, + TraficSourceID smallint, + SearchEngineID smallint, + SearchPhrase string, + AdvEngineID smallint, + IsArtifical smallint, + WindowClientWidth smallint, + WindowClientHeight smallint, + ClientTimeZone smallint, + ClientEventTime bigint, + SilverlightVersion1 smallint, + SilverlightVersion2 smallint, + SilverlightVersion3 int, + SilverlightVersion4 smallint, + PageCharset string, + CodeVersion int, + IsLink smallint, + IsDownload smallint, + IsNotBounce smallint, + FUniqID bigint, + OriginalURL string, + HID int, + IsOldCounter smallint, + IsEvent smallint, + IsParameter smallint, + DontCountHits smallint, + WithHash smallint, + HitColor string, + LocalEventTime bigint, + Age smallint, + Sex smallint, + Income smallint, + Interests smallint, + Robotness smallint, + RemoteIP int, + WindowName int, + OpenerName int, + HistoryLength smallint, + BrowserLanguage string, + BrowserCountry string, + SocialNetwork string, + SocialAction string, + HTTPError smallint, + SendTiming int, + DNSTiming int, + ConnectTiming int, + ResponseStartTiming int, + ResponseEndTiming int, + FetchTiming int, + SocialSourceNetworkID smallint, + SocialSourcePage string, + ParamPrice bigint, + ParamOrderID string, + ParamCurrency string, + ParamCurrencyID smallint, + OpenstatServiceName string, + OpenstatCampaignID string, + OpenstatAdID string, + OpenstatSourceID string, + UTMSource string, + UTMMedium string, + UTMCampaign string, + UTMContent string, + UTMTerm string, + FromTag string, + HasGCLID smallint, + RefererHash bigint, + URLHash bigint, + CLID int +) +STORED AS PARQUET +LOCATION 'file:///clickbench/hits'; + +-- The Parquet file stores EventTime/ClientEventTime/LocalEventTime as Unix epoch seconds (BIGINT) +-- and EventDate as days since 1970-01-01 (INT). Wrap the raw table in a view that exposes the +-- standard ClickBench types so the queries below need no further adaptation. CAST() of +-- from_unixtime() turns Hive's "yyyy-MM-dd HH:mm:ss" string into a TIMESTAMP, and +-- date_add(DATE'1970-01-01', n) yields a DATE. 
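+--
+-- Worked example of both conversions (output shown for a UTC session;
+-- from_unixtime() renders in the server's time zone):
+--   SELECT CAST(from_unixtime(1372636800) AS TIMESTAMP);  -- 2013-07-01 00:00:00
+--   SELECT date_add(DATE '1970-01-01', 15887);            -- 2013-07-01
+--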
+CREATE VIEW hits AS +SELECT + WatchID, JavaEnable, Title, GoodEvent, + CAST(from_unixtime(EventTime) AS TIMESTAMP) AS EventTime, + date_add(DATE '1970-01-01', EventDate) AS EventDate, + CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, + URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, + URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, + ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, + UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, + IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, + TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, + WindowClientWidth, WindowClientHeight, ClientTimeZone, + CAST(from_unixtime(ClientEventTime) AS TIMESTAMP) AS ClientEventTime, + SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, + SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, + IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, + IsParameter, DontCountHits, WithHash, HitColor, + CAST(from_unixtime(LocalEventTime) AS TIMESTAMP) AS LocalEventTime, + Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, + OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, + SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, + ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, + SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, + ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, + OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, + UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID +FROM hits_raw; diff --git a/hive/data-size b/hive/data-size new file mode 100755 index 0000000000..265f258e5d --- /dev/null +++ b/hive/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# External Parquet table — report the source file size. +stat -c %s data/hits/hits.parquet diff --git a/hive/install b/hive/install new file mode 100755 index 0000000000..9a1dddaa45 --- /dev/null +++ b/hive/install @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +HIVE_VERSION=4.0.1 + +# Hive's official image bundles its own JRE; only Docker is needed. +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io +fi +sudo apt-get install -y curl + +sudo docker pull apache/hive:${HIVE_VERSION} + +# Hive's external-table LOCATION points at /clickbench/hits inside the +# container; that path is the bind mount target for ./data on the host. +# Create the directory now so ./start can mount it before the first +# ./load. +mkdir -p data/hits +# apache/hive runs as uid 1000 ("hive") and writes the embedded Derby +# metastore + warehouse dirs under /opt/hive; the container also reads +# /clickbench/hits to list its external table. Make sure that uid can +# both read and write the bind-mount even when cloud-init runs as root. +sudo chown -R 1000:1000 data diff --git a/hive/load b/hive/load new file mode 100755 index 0000000000..3fab680321 --- /dev/null +++ b/hive/load @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +# Stage hits.parquet under data/hits/ — that dir is the Hive external +# table's LOCATION via the /clickbench bind mount inside the container. +# +# Idempotent: BENCH_DURABLE=no triggers ./load again on every cold +# cycle, but the dataset is 14 GB and re-staging it every cycle would +# blow up the run time without changing the measurement. 
The first +# invocation moves hits.parquet (delivered into cwd by +# download-hits-parquet-single) into data/hits/; subsequent invocations +# find no source file and reuse the staged copy. +if [ -f hits.parquet ]; then + mkdir -p data/hits + mv -f hits.parquet data/hits/hits.parquet +fi +sudo chown -R 1000:1000 data + +# Run create.sql via beeline inside the container. -n hive matches the +# default container user so the external LOCATION is readable; --silent +# suppresses beeline's prompt/timing chrome which would otherwise leak +# into ./load's stdout and confuse the driver's load-time parser. +sudo docker cp create.sql hive:/tmp/create.sql +# `< /dev/null` is load-bearing: bench_main runs ./load inside +# `while read query; do ... done < queries.sql`, so our inherited +# stdin IS the queries.sql fd. With -i, docker exec forwards host +# stdin into the container until EOF; beeline (running with -f) never +# reads it, so docker silently drains queries.sql while waiting for +# beeline to consume nothing. The next bench_main read then hits EOF +# and the whole query loop exits after Q1, with no error message +# (Q1's [t,t,t] is the only timing in the log, then it jumps straight +# to data-size). Redirecting stdin from /dev/null isolates this +# docker call from the surrounding loop's input. +sudo docker exec -i hive beeline -u 'jdbc:hive2://localhost:10000/' -n hive \ + --silent=true -f /tmp/create.sql < /dev/null + +sync diff --git a/hive/queries.sql b/hive/queries.sql new file mode 100644 index 0000000000..1954d91a02 --- /dev/null +++ b/hive/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, EXTRACT(MINUTE FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, EXTRACT(MINUTE FROM EventTime), SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE 
URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '$1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '$1') HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), 
SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT FLOOR_MINUTE(EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY FLOOR_MINUTE(EventTime) ORDER BY FLOOR_MINUTE(EventTime) LIMIT 10 OFFSET 1000; diff --git a/hive/query b/hive/query new file mode 100755 index 0000000000..9b13269aed --- /dev/null +++ b/hive/query @@ -0,0 +1,22 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via beeline against HiveServer2 +# in the running container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +start=$(date +%s.%N) +# `< /dev/null`: see hive/load for the long version. 
bench_run_query +# pipes the query in via `printf | ./query`, so the printf pipe is +# already drained by `query=$(cat)` above and our stdin is at EOF +# here — but make the docker-exec stdin source explicit so this +# script stays safe if anyone ever calls it without the printf-pipe +# wrapping. +sudo docker exec -i hive beeline -u 'jdbc:hive2://localhost:10000/clickbench' -n hive \ + --silent=true --outputformat=tsv2 -e "$query" < /dev/null +end=$(date +%s.%N) + +awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/hive/start b/hive/start new file mode 100755 index 0000000000..8b9aef7682 --- /dev/null +++ b/hive/start @@ -0,0 +1,40 @@ +#!/bin/bash +set -e + +HIVE_VERSION=4.0.1 + +if sudo docker ps --format '{{.Names}}' | grep -qx hive; then + exit 0 +fi + +# apache/hive's entrypoint writes /opt/hive/conf/hiveserver2.pid on first +# launch and refuses to start HS2 again if the file already exists +# ("HiveServer2 running as process 7. Stop it first."). After a +# docker stop the file lives on in the writable layer, so `docker start` +# on the same container never brings HS2 back up. Workaround: always +# remove the previous container and run a fresh one. The embedded Derby +# metastore lives inside the (ephemeral) container, so BENCH_DURABLE=no +# in benchmark.sh re-runs ./load on every cold cycle to recreate the +# schema after the restart — without that we'd lose the table catalog. +sudo docker rm -f hive >/dev/null 2>&1 || true + +# Size HiveServer2's JVM heap to a fraction of host RAM. The image's +# entrypoint exports HADOOP_CLIENT_OPTS="-Xmx1G $SERVICE_OPTS", so the +# last -Xmx wins — that's how we override the 1 GB default. With 1 GB +# the parquet vectorized reader OOMs on every non-trivial query +# ("GC overhead limit exceeded" inside MapRecordProcessor). 60% of +# host RAM is the same ratio presto/ uses, with a floor at 4 GB so +# small VMs still get a usable size. +RAM_GB=$(awk '/MemTotal/{ printf "%d", $2 / 1024 / 1024 }' /proc/meminfo) +HEAP_GB=$(( RAM_GB * 60 / 100 )) +[ "$HEAP_GB" -lt 4 ] && HEAP_GB=4 + +# SERVICE_NAME=hiveserver2 launches an embedded Derby metastore alongside +# HiveServer2 in the same JVM — single-node Hive without a separate +# metastore service or HDFS. 
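+#
+# Worked example of the heap sizing above: with RAM_GB=32,
+# HEAP_GB = 32 * 60 / 100 = 19 (integer math), so the entrypoint ends up
+# exporting HADOOP_CLIENT_OPTS="-Xmx1G -Xmx19G"; the JVM honours the
+# last flag, giving a 19 GB heap.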
+sudo docker run -d --name hive \ + -p 10000:10000 -p 10002:10002 \ + -e SERVICE_NAME=hiveserver2 \ + -e SERVICE_OPTS="-Xmx${HEAP_GB}G" \ + -v "$PWD/data:/clickbench" \ + apache/hive:${HIVE_VERSION} diff --git a/hive/stop b/hive/stop new file mode 100755 index 0000000000..a2fc91022a --- /dev/null +++ b/hive/stop @@ -0,0 +1,4 @@ +#!/bin/bash + +sudo docker stop hive 2>/dev/null || true +exit 0 diff --git a/hive/template.json b/hive/template.json new file mode 100644 index 0000000000..6f13bc4944 --- /dev/null +++ b/hive/template.json @@ -0,0 +1,11 @@ +{ + "system": "Hive (Parquet, single)", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "Java", + "MapReduce", + "Tez" + ] +} diff --git a/impala/README.md b/impala/README.md new file mode 100644 index 0000000000..30294c64b1 --- /dev/null +++ b/impala/README.md @@ -0,0 +1,28 @@ +## Apache Impala on a single Parquet file + +This setup runs Apache Impala 4 via the upstream "quickstart" Docker +images, orchestrated with `docker-compose`: + +- `hms` — Hive Metastore (Derby-backed; no MySQL/Postgres required) +- `statestored` — cluster-membership broker +- `catalogd` — metadata cache +- `impalad-1` — single combined coordinator + executor + +The benchmark therefore reproduces on a single VM with nothing beyond +Docker installed. + +**Hardware requirement:** Impala's C++ daemons (`statestored`, `catalogd`, +`impalad`) refuse to start on a CPU without AVX — they log +> This machine does not meet the minimum requirements for Impala +> functionality. The CPU does not support AVX + +and exit. Only AVX-capable x86_64 machines work; Graviton/aarch64 hosts +(including under QEMU emulation, which doesn't expose AVX) are not +supported. + +The ClickBench `hits.parquet` file stores `EventTime`, +`ClientEventTime` and `LocalEventTime` as Unix-epoch `BIGINT` values and +`EventDate` as an `INT` count of days since 1970-01-01. `create.sql` +registers the parquet file as an external table (`hits_raw`) and then +exposes a `hits` view that converts those columns to `TIMESTAMP` and +`DATE`, so `queries.sql` matches the canonical ClickBench query text. diff --git a/impala/benchmark.sh b/impala/benchmark.sh new file mode 100755 index 0000000000..35f9302c79 --- /dev/null +++ b/impala/benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +# +# First-cold Impala start has to bootstrap the Hive metastore schema, +# wait for catalogd to register with statestored, and only then does +# impalad-1 publish /healthz=OK. On c6a.4xlarge this overruns the +# default 300s check window; 900s gives the four-container compose +# stack room without making real crashes wait forever before surfacing. +# +# BENCH_RESTARTABLE=no skips the stop+start cycle between cold tries. +# Restarting catalogd wipes its in-memory catalog; with our +# -catalog_topic_mode=minimal + hms_event_polling_interval_s=0 setup +# (chosen to dodge the HMS notification-log issues at startup), +# catalogd doesn't proactively reload metadata from HMS, so +# `use clickbench` then fails with "Database does not exist" on every +# query. drop_caches alone gives a cold parquet read — the actual +# ClickBench bottleneck — without losing the catalog state that ./load +# put into the running cluster. 
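+#
+# (The cold-read step referenced above is assumed to be the usual
+#   sync; echo 3 | sudo tee /proc/sys/vm/drop_caches
+# sequence inside lib/benchmark-common.sh; this shim only sets knobs.)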
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_DURABLE=yes +export BENCH_RESTARTABLE=no +export BENCH_CHECK_TIMEOUT=900 +exec ../lib/benchmark-common.sh diff --git a/impala/check b/impala/check new file mode 100755 index 0000000000..746bdae8bc --- /dev/null +++ b/impala/check @@ -0,0 +1,36 @@ +#!/bin/bash +# Impala's coordinator exposes its web UI on 25000. /healthz returns +# 200 "OK" once the daemon has finished registering with statestored +# and catalogd — i.e. the cluster is actually ready to accept queries, +# not just listening on the SQL ports. While the stack is stopped the +# port is closed and curl fails as it should. +# +# On failure dump container status to stderr — bench_check_loop captures +# the last call's stderr and prints it after a readiness timeout, so this +# turns "did not succeed within 300s" into something actionable +# (which container is missing/crashed) without spamming the run log +# on every iteration. +out=$(curl -sSf -m 5 http://localhost:25000/healthz 2>&1) || { + { + printf 'impala/check: /healthz unreachable: %s\n' "$out" + sudo docker ps -a --filter 'name=impala-' \ + --format ' {{.Names}}: {{.Status}}' + # Dump the tail of every impala container regardless of state. + # Exited containers tell us *why* a daemon died (SIGSEGV, OOM, + # config error); running containers tell us about server-side + # exceptions like "Internal error processing + # get_current_notificationEventId" that the client only sees as + # a generic TApplicationException — the actual stack trace is + # only in the HMS container's log. + for c in $(sudo docker ps -a --filter 'name=impala-' \ + --format '{{.Names}}'); do + echo "--- $c (last 50 log lines) ---" + sudo docker logs --tail 50 "$c" 2>&1 | sed 's/^/ /' + done + } >&2 + exit 1 +} +[ "$out" = "OK" ] || { + echo "impala/check: /healthz returned: $out" >&2 + exit 1 +} diff --git a/impala/create.sql b/impala/create.sql new file mode 100644 index 0000000000..324de3309e --- /dev/null +++ b/impala/create.sql @@ -0,0 +1,156 @@ +CREATE DATABASE IF NOT EXISTS clickbench; +USE clickbench; + +DROP VIEW IF EXISTS hits; +DROP TABLE IF EXISTS hits_raw; + +CREATE EXTERNAL TABLE hits_raw ( + WatchID bigint, + JavaEnable smallint, + Title string, + GoodEvent smallint, + EventTime bigint, + EventDate int, + CounterID int, + ClientIP int, + RegionID int, + UserID bigint, + CounterClass smallint, + OS smallint, + UserAgent smallint, + URL string, + Referer string, + IsRefresh smallint, + RefererCategoryID smallint, + RefererRegionID int, + URLCategoryID smallint, + URLRegionID int, + ResolutionWidth smallint, + ResolutionHeight smallint, + ResolutionDepth smallint, + FlashMajor smallint, + FlashMinor smallint, + FlashMinor2 string, + NetMajor smallint, + NetMinor smallint, + UserAgentMajor smallint, + UserAgentMinor string, + CookieEnable smallint, + JavascriptEnable smallint, + IsMobile smallint, + MobilePhone smallint, + MobilePhoneModel string, + Params string, + IPNetworkID int, + TraficSourceID smallint, + SearchEngineID smallint, + SearchPhrase string, + AdvEngineID smallint, + IsArtifical smallint, + WindowClientWidth smallint, + WindowClientHeight smallint, + ClientTimeZone smallint, + ClientEventTime bigint, + SilverlightVersion1 smallint, + SilverlightVersion2 smallint, + SilverlightVersion3 int, + SilverlightVersion4 smallint, + PageCharset string, + CodeVersion int, + IsLink smallint, + IsDownload smallint, + IsNotBounce smallint, + FUniqID bigint, + OriginalURL string, + HID int, + IsOldCounter smallint, + 
IsEvent smallint, + IsParameter smallint, + DontCountHits smallint, + WithHash smallint, + HitColor string, + LocalEventTime bigint, + Age smallint, + Sex smallint, + Income smallint, + Interests smallint, + Robotness smallint, + RemoteIP int, + WindowName int, + OpenerName int, + HistoryLength smallint, + BrowserLanguage string, + BrowserCountry string, + SocialNetwork string, + SocialAction string, + HTTPError smallint, + SendTiming int, + DNSTiming int, + ConnectTiming int, + ResponseStartTiming int, + ResponseEndTiming int, + FetchTiming int, + SocialSourceNetworkID smallint, + SocialSourcePage string, + ParamPrice bigint, + ParamOrderID string, + ParamCurrency string, + ParamCurrencyID smallint, + OpenstatServiceName string, + OpenstatCampaignID string, + OpenstatAdID string, + OpenstatSourceID string, + UTMSource string, + UTMMedium string, + UTMCampaign string, + UTMContent string, + UTMTerm string, + FromTag string, + HasGCLID smallint, + RefererHash bigint, + URLHash bigint, + CLID int +) +STORED AS PARQUET +LOCATION '/clickbench/hits'; + +REFRESH hits_raw; + +-- The Parquet file stores EventTime/ClientEventTime/LocalEventTime as Unix epoch seconds (BIGINT) +-- and EventDate as days since 1970-01-01 (INT). Wrap the raw table in a view that exposes the +-- standard ClickBench types so queries.sql matches the canonical text. CAST(from_unixtime(...) AS +-- TIMESTAMP) is the idiomatic Impala BIGINT-epoch -> TIMESTAMP conversion, and days_add() yields a +-- DATE from an int day count. +CREATE VIEW hits AS +SELECT + WatchID, JavaEnable, Title, GoodEvent, + CAST(from_unixtime(EventTime) AS TIMESTAMP) AS EventTime, + days_add(DATE '1970-01-01', EventDate) AS EventDate, + CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, + URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, + URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, + ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, + UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, + IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, + TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, + WindowClientWidth, WindowClientHeight, ClientTimeZone, + CAST(from_unixtime(ClientEventTime) AS TIMESTAMP) AS ClientEventTime, + SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, + SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, + IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, + IsParameter, DontCountHits, WithHash, HitColor, + CAST(from_unixtime(LocalEventTime) AS TIMESTAMP) AS LocalEventTime, + Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, + OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, + SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, + ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, + SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, + ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, + OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, + UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID +FROM hits_raw; + +-- Make sure Impala's planner has up-to-date stats; without this the +-- first query pays the metadata-collection wall-clock as if it were +-- query work, and pure scan queries plan as full broadcasts. 
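+-- A quick way to confirm it took effect: SHOW TABLE STATS hits_raw;
+-- reports #Rows = -1 before the statement below runs and the real row
+-- count after it.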
+COMPUTE STATS hits_raw; diff --git a/impala/data-size b/impala/data-size new file mode 100755 index 0000000000..265f258e5d --- /dev/null +++ b/impala/data-size @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +# External Parquet table — report the source file size. +stat -c %s data/hits/hits.parquet diff --git a/impala/docker-compose.yml b/impala/docker-compose.yml new file mode 100644 index 0000000000..99701d97c6 --- /dev/null +++ b/impala/docker-compose.yml @@ -0,0 +1,133 @@ +# Apache Impala 4 quickstart cluster, in four containers on one +# user-defined bridge network. Mirrors the upstream +# docker/quickstart-compose layout from the Apache Impala repo, minus +# HDFS — the table data is bind-mounted into every container at +# /clickbench so Impala reads it through its local-filesystem driver +# rather than through HDFS or S3. +# +# Pinned to 4.5.0; 4.4.1 has the same jdo-api gap so the pin alone +# doesn't matter, but 4.5.0 is the newest tagged release. The +# missing jdo-api jar is supplied via a bind-mount under +# /opt/impala/lib/ on the two daemons (./install fetches it from +# Maven Central); without that, both impalad_coord_exec and catalogd +# crash at startup with NoClassDefFoundError: javax/jdo/JDOException → +# SIGSEGV in the C++ "Check failure" abort path, before /healthz ever +# opens. + +services: + hms: + image: apache/impala:4.5.0-impala_quickstart_hms + container_name: impala-hms + command: ["hms"] + volumes: + - hms_warehouse:/var/lib/hive + - ./data:/clickbench:rw + # Register DbNotificationListener server-side so client RPCs to + # get_current_notificationEventId succeed; without it, DDL like + # CREATE DATABASE in ./load fails with + # TApplicationException: Internal error processing + # get_current_notificationEventId + - ./hms-hive-site.xml:/opt/hive/conf/hive-site.xml:ro + networks: + - impala-net + hostname: hms + + statestored: + image: apache/impala:4.5.0-statestored + container_name: impala-statestored + command: ["-redirect_stdout_stderr=false", "-logtostderr", "-v=1"] + networks: + - impala-net + hostname: statestored + + catalogd: + image: apache/impala:4.5.0-catalogd + container_name: impala-catalogd + depends_on: + - statestored + - hms + command: + - "-redirect_stdout_stderr=false" + - "-logtostderr" + - "-v=1" + # 0 disables HMS event polling. The entrypoint defaults to 1 + # (poll every second), which causes catalogd to subscribe to the + # metastore's notification log on startup via + # get_current_notificationEventId. The quickstart hms image + # ships without DbNotificationListener configured, so that RPC + # throws "Internal error processing get_current_notificationEventId" + # and the FATAL aborts catalogd before /healthz on impalad ever + # reaches OK. We don't need event-driven cache invalidation in a + # single-process benchmark — `invalidate metadata` after ./load + # is enough. + - "-hms_event_polling_interval_s=0" + - "-invalidate_tables_timeout_s=999999" + # Override entrypoint default. With it on, the daemon expands + # `hms` to its reverse-DNS FQDN. Even with the network renamed, + # docker still appends the network as a hostname suffix, and the + # resulting hostname can pick up illegal URI characters; keep + # the configured `hms` literal so the URI parses cleanly. 
+ - "-use_resolved_hostname=false" + volumes: + - ./data:/clickbench:rw + - ./data/extra-lib/jdo-api-3.0.1.jar:/opt/impala/lib/jdo-api-3.0.1.jar:ro + - ./hive-site.xml:/opt/impala/conf/hive-site.xml:ro + networks: + - impala-net + hostname: catalogd + + impalad-1: + image: apache/impala:4.5.0-impalad_coord_exec + container_name: impala-impalad-1 + depends_on: + - statestored + - catalogd + command: + - "-v=1" + - "-redirect_stdout_stderr=false" + - "-logtostderr" + - "-kudu_master_hosts=" + - "-mt_dop_auto_fallback=true" + - "--default_query_options=default_file_format=parquet" + # See catalogd above — same FQDN-underscore problem applies here. + - "-use_resolved_hostname=false" + ports: + - "21000:21000" + - "21050:21050" + - "25000:25000" + volumes: + - ./data:/clickbench:rw + - ./data/extra-lib/jdo-api-3.0.1.jar:/opt/impala/lib/jdo-api-3.0.1.jar:ro + - ./hive-site.xml:/opt/impala/conf/hive-site.xml:ro + networks: + - impala-net + hostname: impalad-1 + + # impala-shell is not bundled in the coordinator image — it lives in + # the separate quickstart_client image. Run that as a long-lived + # sidecar (sleep infinity) and `docker exec` impala-shell into it for + # ./load and ./query. The image's default entrypoint runs a one-shot + # data-load script; override it so the container just stays alive. + client: + image: apache/impala:4.5.0-impala_quickstart_client + container_name: impala-client + depends_on: + - impalad-1 + entrypoint: ["sleep", "infinity"] + networks: + - impala-net + +networks: + impala-net: + # Pin the network name; without `name:` compose auto-prefixes the + # current project (= cwd basename), producing `impala_impala-net`. + # Catalogd then reverse-resolves the hms IP and gets back a + # hostname containing an underscore (`impala-hms.impala_impala-net`), + # which java.net.URI rejects with + # URISyntaxException: Illegal character in hostname at index 26 + # before HiveMetaStoreClient can connect. + name: impala-net + driver: bridge + +volumes: + hms_warehouse: diff --git a/impala/hive-site.xml b/impala/hive-site.xml new file mode 100644 index 0000000000..797c2bf577 --- /dev/null +++ b/impala/hive-site.xml @@ -0,0 +1,17 @@ + + + + + + hive.metastore.uris + thrift://hms:9083 + + diff --git a/impala/hms-hive-site.xml b/impala/hms-hive-site.xml new file mode 100644 index 0000000000..7f2913b8ba --- /dev/null +++ b/impala/hms-hive-site.xml @@ -0,0 +1,50 @@ + + + + + + + hive.metastore.transactional.event.listeners + org.apache.hive.hcatalog.listener.DbNotificationListener + + + + + hive.metastore.event.db.notification.api.auth + false + + + + + hive.metastore.notifications.add.thrift.objects + true + + diff --git a/impala/install b/impala/install new file mode 100755 index 0000000000..612a3f5f2c --- /dev/null +++ b/impala/install @@ -0,0 +1,42 @@ +#!/bin/bash +set -e + +# Docker, the compose plugin, and curl. The compose v2 plugin lives in +# the docker-compose-v2 package on Ubuntu 24.04 / Debian 12; the older +# docker-compose Python package would also work but is deprecated. +if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io docker-compose-v2 +else + sudo apt-get install -y docker-compose-v2 +fi +sudo apt-get install -y curl + +# Pull all four service images up-front so the first ./start doesn't +# include image-pull wall-clock — it stays a pure container-start +# operation and ./check's readiness timeout is enough. 
+sudo docker compose -f docker-compose.yml pull + +# Catalogd and impalad both bind-mount this dir as /clickbench. Create +# it before the first ./start so the mount succeeds even when the +# compose stack is brought up before the first ./load. +mkdir -p data/hits + +# Apache Impala's published 4.4.x and 4.5.0 quickstart images ship the +# embedded HMS client library but NOT the jdo-api jar it loads through +# reflection, so impalad_coord_exec and catalogd both crash at startup +# with NoClassDefFoundError: javax.jdo.JDOException → SIGSEGV in the +# C++ "Check failure" abort path, before /healthz ever opens. +# Pull jdo-api from Maven Central into a host-side dir; the compose +# bind-mounts it under /opt/impala/lib/ in both daemon containers, +# which is already on their Java classpath. +JDO_API_VERSION=3.0.1 +mkdir -p data/extra-lib +if [ ! -f "data/extra-lib/jdo-api-${JDO_API_VERSION}.jar" ]; then + curl -fsSL -o "data/extra-lib/jdo-api-${JDO_API_VERSION}.jar" \ + "https://repo1.maven.org/maven2/javax/jdo/jdo-api/${JDO_API_VERSION}/jdo-api-${JDO_API_VERSION}.jar" +fi + +# The Impala containers run their daemons as uid 1000 ("impala"); make +# the bind-mount writable by that uid even when cloud-init runs as root. +sudo chown -R 1000:1000 data diff --git a/impala/load b/impala/load new file mode 100755 index 0000000000..fc04e53bee --- /dev/null +++ b/impala/load @@ -0,0 +1,24 @@ +#!/bin/bash +set -e + +# Place hits.parquet inside the external table's LOCATION. The bind +# mount on every container exposes this dir as /clickbench. +ln -f hits.parquet data/hits/hits.parquet +sudo chown -R 1000:1000 data + +# Apply the schema via impala-shell inside the running coordinator +# container. -B (no-header) + --quiet keeps the output free of pretty +# tables so the driver's load-time parser is not confused. +sudo docker cp create.sql impala-client:/tmp/create.sql +# `< /dev/null`: docker exec -i forwards host stdin to the container +# until EOF, even when the inner program (impala-shell -f) never reads +# it. If bench_main ever calls ./load while its `while read ... done < +# queries.sql` redirect is active (BENCH_DURABLE=no, or any future use +# in the loop), docker would drain the queries.sql fd and the loop +# would exit silently after one query — exactly the failure mode hive +# hit. Explicitly source stdin from /dev/null to keep this script +# isolated. 
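+#
+# (After the impala-shell run below succeeds, a quick sanity check is:
+#   sudo docker exec impala-client impala-shell -i impalad-1:21050 \
+#       -B --quiet -q 'SHOW TABLES IN clickbench'
+# which should list hits and hits_raw.)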
+sudo docker exec -i impala-client impala-shell -i impalad-1:21050 -B --quiet -f /tmp/create.sql < /dev/null + +rm -f hits.parquet +sync diff --git a/impala/queries.sql b/impala/queries.sql new file mode 100644 index 0000000000..348ce9a776 --- /dev/null +++ b/impala/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, EXTRACT(MINUTE FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, EXTRACT(MINUTE FROM EventTime), SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), 
SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND DontCountHits = 0 AND IsRefresh 
= 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('MINUTE', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('MINUTE', EventTime) ORDER BY DATE_TRUNC('MINUTE', EventTime) LIMIT 10 OFFSET 1000; diff --git a/impala/query b/impala/query new file mode 100755 index 0000000000..587719dd96 --- /dev/null +++ b/impala/query @@ -0,0 +1,20 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via impala-shell in the running +# coordinator container. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +set -e + +query=$(cat) + +start=$(date +%s.%N) +# `< /dev/null`: see impala/load — keep docker exec from forwarding +# whatever the caller's stdin is. bench_run_query already pipes the +# query via `printf | ./query` so $(cat) drains stdin above, but make +# the docker-exec stdin source explicit so the script stays safe if +# anyone runs it outside that wrapper. +sudo docker exec -i impala-client impala-shell -i impalad-1:21050 -d clickbench -B --quiet -q "$query" < /dev/null +end=$(date +%s.%N) + +awk -v s="$start" -v e="$end" 'BEGIN { printf "%.3f\n", e - s }' >&2 diff --git a/impala/start b/impala/start new file mode 100755 index 0000000000..1373312a44 --- /dev/null +++ b/impala/start @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +# Bring up the four-service quickstart stack. `docker compose up -d` is +# idempotent — already-running services are left alone, stopped ones +# are restarted, missing ones are created. We don't need a separate +# branch for "already up". +sudo docker compose -f docker-compose.yml up -d diff --git a/impala/stop b/impala/stop new file mode 100755 index 0000000000..3e5ed1387f --- /dev/null +++ b/impala/stop @@ -0,0 +1,7 @@ +#!/bin/bash + +# `down` removes the containers but keeps the named volume (the HMS +# Derby store), so the metastore catalog survives a cold cycle and the +# next ./start does not need to re-CREATE the external table. 
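+#
+# (To see the surviving volume after a cycle: `sudo docker volume ls`
+# lists it project-prefixed, e.g. impala_hms_warehouse, since unlike
+# the network its name is not pinned in docker-compose.yml.)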
+sudo docker compose -f docker-compose.yml down 2>/dev/null || true +exit 0 diff --git a/impala/template.json b/impala/template.json new file mode 100644 index 0000000000..2bd8808526 --- /dev/null +++ b/impala/template.json @@ -0,0 +1,11 @@ +{ + "system": "Impala (Parquet, single)", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "MPP", + "column-oriented" + ] +}