# Teradata GroupBy Runner — Quick Start (VS Code)

This notebook is a compact, click-run walkthrough against a real Teradata DB.

Run cells in order:
1. Setup + parameters
2. Connect + validate session
3. Run a few direct SQL examples
4. Compare the sequential and parallel groupby runners, then try the server-side runner

In [15]:
import os
import sys
import time
from pathlib import Path
from urllib.parse import quote

from sqlalchemy import create_engine, text
from sqlalchemy.pool import QueuePool

sys.path.insert(0, str(Path.cwd()))

from runner import (
    SequentialGroupByRunner,
    ParallelGroupByRunner,
)

# ---- Quick params (edit if needed) ----
# Connection settings come from the environment so that no credential is
# stored in the notebook itself. Only the database name has a default.
TERADATA_HOST = os.getenv('TERADATA_HOST')
TERADATA_USER = os.getenv('TERADATA_USER')
TERADATA_PASSWORD = os.getenv('TERADATA_PASSWORD')
TERADATA_DATABASE = os.getenv('TERADATA_DATABASE', 'DEMO_USER')
MAX_WORKERS = 4

# Fail fast, naming every required variable that is unset or empty.
missing = [
    k for k, v in {
        'TERADATA_HOST': TERADATA_HOST,
        'TERADATA_USER': TERADATA_USER,
        'TERADATA_PASSWORD': TERADATA_PASSWORD,
    }.items() if not v
]

if missing:
    raise RuntimeError(f'Missing required env vars: {", ".join(missing)}')

# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':', '/', '%' or '#' in a user name or password would otherwise
# be read as URL delimiters. For plain alphanumeric credentials the resulting
# string is identical to the unencoded form.
quoted_user = quote(TERADATA_USER, safe='')
quoted_password = quote(TERADATA_PASSWORD, safe='')

connection_string = (
    f'teradatasql://{quoted_user}:{quoted_password}'
    f'@{TERADATA_HOST}/{TERADATA_DATABASE}'
)

# One pooled engine is shared by every cell below.
engine = create_engine(
    connection_string,
    echo=False,
    poolclass=QueuePool,
    pool_size=8,
    max_overflow=4,
    pool_pre_ping=True,   # check a pooled connection is alive before handing it out
    pool_recycle=1800,    # seconds; older connections are replaced on checkout
)

# The password is deliberately left out of the summary line.
print('Setup complete')
print(f'host={TERADATA_HOST}, user={TERADATA_USER}, db={TERADATA_DATABASE}')

Setup complete
host=test-l36lujzkc0420a7n.env.clearscape.teradata.com, user=demo_user, db=DEMO_USER


In [16]:
# Validate the session by asking Teradata to describe it.
# The rows are fetched inside the `with` block: once the block exits the
# connection is released, and reading from the result afterwards only works
# if the driver happens to have buffered the rows already.
with engine.connect() as conn:
    res = conn.execute(text('HELP SESSION;'))
    results = res.fetchall()
results

[('DEMO_USER', 'DBC', '26/03/01        ', '21:45:40        ', 'DEMO_USER', 'ASCII', 'UTF8', 'Teradata', 'IntegerDate', ' 00:00', 'LATIN', '2 ', '3 ', '0 ', '1 ', '3 ', 'YY/MM/DD', '.', ',', '3', '.', ',', '3', 'US Dollars', '$', 'USD', 'US Dollars', '$', 'USD', '-(3)9', '-(10)9', '-(5)9', '--(I).9(F)', '-9.99999999999999E-999', 'HH:MI:SS.S(F)Z', 'YYYY-MM-DDBHH:MI:SS.S(F)Z', '', 'DBC', '', 'N ', 'DEMO_USER', 'SR  ', '-(19)9', '', '', '', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'ANSIQUALIFIER', 'TERADATA', '1112211111222232222211121111112222322222                                        ', 'FN9', 'Transaction', 'None', 'DEMO_USER', 'DEMO_USER', None, 'DBC', 'DBC', None, 'DEMO_USER', 'DEMO_USER', None, None, None, None, 'DBC', 'DBC', None, None, None, None, 'DEMO_USER', 'DEMO_USER', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

In [17]:
# Confirm which database and session this engine lands in.
with engine.connect() as conn:
    database_result = conn.execute(text('SELECT DATABASE'))
    current_db = database_result.scalar()
    session_result = conn.execute(text('SELECT SESSION'))
    session_id = session_result.scalar()

print(f'Connected: current_database={current_db}, session_id={session_id}')

# Each entry is (label, SQL text, bind parameters).
queries = [
    ('Session', 'SELECT SESSION AS session_id', {}),
    ('Database', 'SELECT DATABASE AS current_database', {}),
    (
        'Top tables',
        '''
        SELECT TOP 5 TableName, TableKind
        FROM DBC.TablesV
        WHERE DataBaseName = :db
        ORDER BY TableName
        ''',
        {'db': TERADATA_DATABASE},
    ),
]

# Run the examples on a single connection, timing execute + fetch for each
# and printing at most the first three rows.
with engine.connect() as conn:
    for name, sql, params in queries:
        started = time.perf_counter()
        result = conn.execute(text(sql), params)
        rows = result.fetchall()
        elapsed = time.perf_counter() - started

        print(f'\n{name} ({elapsed:.3f}s)')
        preview = rows[:3]
        for row in preview:
            print(' ', row)
        if len(rows) > 3:
            print(f'  ... ({len(rows)} rows total)')

Connected: current_database=DEMO_USER, session_id=1214

Session (0.042s)
  (1214,)

Database (0.042s)
  ('DEMO_USER',)

Top tables (0.104s)
  ('get_data', 'P ')
  ('remove_data', 'P ')
  ('space_report', 'P ')


In [3]:
# Double any single quote so the database name can sit inside the SQL string
# literals below; the queries are handed to the runners as plain text.
db = TERADATA_DATABASE.replace("'", "''")
groupby_queries = [
    f'''
    SELECT TableKind, COUNT(*) AS cnt
    FROM DBC.TablesV
    WHERE DataBaseName = '{db}'
    GROUP BY TableKind
    ''',
    f'''
    SELECT COUNT(*) AS total_tables
    FROM DBC.TablesV
    WHERE DataBaseName = '{db}'
    ''',
    f'''
    SELECT ColumnType, COUNT(*) AS cnt
    FROM DBC.ColumnsV
    WHERE DataBaseName = '{db}'
    GROUP BY ColumnType
    ''',
    f'''
    SELECT TOP 50 TableName, ColumnName
    FROM DBC.ColumnsV
    WHERE DataBaseName = '{db}'
    ORDER BY TableName, ColumnId
    ''',
]

# Both runners are configured identically; only the parallel one takes a
# worker count.
shared_runner_args = dict(
    engine=engine,
    base_table='DBC.TablesV',
    subset_filter=None,
    subset_table='vt_subset',
)
seq_runner = SequentialGroupByRunner(**shared_runner_args)
par_runner = ParallelGroupByRunner(**shared_runner_args, max_workers=MAX_WORKERS)

# Time the same query list through each runner, sequential first.
seq_started = time.perf_counter()
seq_results = seq_runner.run(groupby_queries, materialise_subset=False)
seq_time = time.perf_counter() - seq_started

par_started = time.perf_counter()
par_results = par_runner.run(groupby_queries, materialise_subset=False)
par_time = time.perf_counter() - par_started

print(f'Sequential runner: {seq_time:.3f}s')
print(f'Parallel runner  : {par_time:.3f}s')
if par_time > 0:
    # Guarded so a zero-duration measurement cannot divide by zero.
    print(f'Speedup (seq/par): {seq_time / par_time:.2f}x')

# Side-by-side row counts as a quick consistency check between the runners.
print('\nRow counts per query (seq vs par):')
paired_results = zip(seq_results, par_results)
for i, (seq_df, par_df) in enumerate(paired_results, start=1):
    print(f'Q{i}: {len(seq_df)} vs {len(par_df)}')

[32m2026-03-01 21:31:37.814[0m | [1mINFO    [0m | [36mrunner[0m:[36mrun[0m:[36m232[0m - [1mSequentialGroupByRunner: running 4 queries[0m
[32m2026-03-01 21:31:37.856[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mrun[0m:[36m239[0m - [34m[1mExecuting query 1/4[0m
[32m2026-03-01 21:31:37.923[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mrun[0m:[36m242[0m - [34m[1mQuery 1/4 returned 4 rows[0m
[32m2026-03-01 21:31:37.925[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mrun[0m:[36m239[0m - [34m[1mExecuting query 2/4[0m
[32m2026-03-01 21:31:37.991[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mrun[0m:[36m242[0m - [34m[1mQuery 2/4 returned 1 rows[0m
[32m2026-03-01 21:31:37.991[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mrun[0m:[36m239[0m - [34m[1mExecuting query 3/4[0m
[32m2026-03-01 21:31:38.097[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mrun[0m:[36m242[0m - [34m[1mQuery 3/4 returned 17 rows[0m
[32m2026-03-01 

Sequential runner: 0.436s
Parallel runner  : 0.836s
Speedup (seq/par): 0.52x

Row counts per query (seq vs par):
Q1: 4 vs 4
Q2: 1 vs 1
Q3: 17 vs 17
Q4: 50 vs 50


In [18]:
from runner import ServerSideGroupByRunner

In [19]:
# Double any single quote so the database name can sit inside the SQL string
# literal used for subset_filter below.
db = TERADATA_DATABASE.replace("'", "''")

# These queries contain no interpolated values, so they are plain string
# literals (no f-prefix); they read from the subset table, not the DBC views.
# NOTE(review): Q3/Q4 select ColumnType / ColumnName / ColumnId, which the
# sequential run read from DBC.ColumnsV, while this runner's base_table is
# DBC.TablesV - confirm vt_subset actually exposes these columns.
groupby_queries = [
    '''
    SELECT TableKind, COUNT(*) AS cnt
    FROM vt_subset
    GROUP BY TableKind
    ''',
    '''
    SELECT COUNT(*) AS total_tables
    FROM vt_subset
    ''',
    '''
    SELECT ColumnType, COUNT(*) AS cnt
    FROM vt_subset
    GROUP BY ColumnType
    ''',
    '''
    SELECT TOP 50 TableName, ColumnName
    FROM vt_subset
    ORDER BY TableName, ColumnId
    ''',
]

serverside_runner = ServerSideGroupByRunner(
    engine=engine,
    base_table='DBC.TablesV',
    subset_filter=f"DatabaseName = '{db}'",
    subset_table='vt_subset',
)

# NOTE(review): the queries read from vt_subset while materialise_subset=False
# - confirm the runner still creates the subset table in this mode.
t2 = time.perf_counter()
serverside_results = serverside_runner.run(groupby_queries, materialise_subset=False)
serverside_time = time.perf_counter() - t2

# seq_time and seq_results come from the sequential/parallel comparison cell,
# which therefore has to be run before this one.
print(f'Serverside runner  : {serverside_time:.3f}s')
if serverside_time > 0:
    # Guarded so a zero-duration measurement cannot divide by zero.
    print(f'Speedup (seq/serverside): {seq_time / serverside_time:.2f}x')

print('\nRow counts per query (seq vs serverside):')
for i, (seq_df, serverside_df) in enumerate(zip(seq_results, serverside_results), start=1):
    print(f'Q{i}: {len(seq_df)} vs {len(serverside_df)}')

[32m2026-03-01 21:45:48.723[0m | [34m[1mDEBUG   [0m | [36mrunner[0m:[36mdrop_tables_and_proc[0m:[36m368[0m - [34m[1mDropping existing tables and procedure if they exist[0m
 at gosqldriver/teradatasql.formatError ErrorUtil.go:85
 at gosqldriver/teradatasql.(*teradataConnection).formatDatabaseError ErrorUtil.go:223
 at gosqldriver/teradatasql.(*teradataConnection).makeChainedDatabaseError ErrorUtil.go:239
 at gosqldriver/teradatasql.(*teradataConnection).processErrorParcel TeradataConnection.go:816
 at gosqldriver/teradatasql.(*TeradataRows).processResponseBundle TeradataRows.go:2494
 at gosqldriver/teradatasql.(*TeradataRows).executeSQLRequest TeradataRows.go:970
 at gosqldriver/teradatasql.newTeradataRows TeradataRows.go:791
 at gosqldriver/teradatasql.(*teradataStatement).QueryContext TeradataStatement.go:122
 at gosqldriver/teradatasql.(*teradataConnection).QueryContext TeradataConnection.go:1335
 at database/sql.ctxDriverQuery ctxutil.go:48
 at database/sql.(*DB).query

OperationalError: (teradatasql.OperationalError) [Version 20.0.0.32] [Session 1214] [Teradata Database] [Error 3706] Syntax error: Invalid  SQL Statement.
 at gosqldriver/teradatasql.formatError ErrorUtil.go:85
 at gosqldriver/teradatasql.(*teradataConnection).formatDatabaseError ErrorUtil.go:223
 at gosqldriver/teradatasql.(*teradataConnection).makeChainedDatabaseError ErrorUtil.go:239
 at gosqldriver/teradatasql.(*teradataConnection).processErrorParcel TeradataConnection.go:816
 at gosqldriver/teradatasql.(*TeradataRows).processResponseBundle TeradataRows.go:2494
 at gosqldriver/teradatasql.(*TeradataRows).executeSQLRequest TeradataRows.go:970
 at gosqldriver/teradatasql.newTeradataRows TeradataRows.go:791
 at gosqldriver/teradatasql.(*teradataStatement).QueryContext TeradataStatement.go:122
 at gosqldriver/teradatasql.(*teradataConnection).QueryContext TeradataConnection.go:1335
 at database/sql.ctxDriverQuery ctxutil.go:48
 at database/sql.(*DB).queryDC.func1 sql.go:1786
 at database/sql.withLock sql.go:3574
 at database/sql.(*DB).queryDC sql.go:1781
 at database/sql.(*Conn).QueryContext sql.go:2037
 at main.createRows goside.go:1080
 at main.goCreateRows goside.go:959
 at _cgoexp_ff5e33a08e40_goCreateRows _cgo_gotypes.go:417
 at runtime.cgocallbackg1 cgocall.go:446
 at runtime.cgocallbackg cgocall.go:350
 at runtime.cgocallback asm_amd64.s:1084
 at runtime.goexit asm_amd64.s:1700
[SQL: 
                    REPLACE PROCEDURE test_run_dynamic_aggs()
                    BEGIN
                        DECLARE stmt VARCHAR(32000);
                        FOR cur AS c1 CURSOR FOR
                            SELECT sql_text FROM test_agg_jobs ORDER BY job_id
                        DO
                            SET stmt = cur.sql_text;
                            CALL DBC.SysExecSQL(stmt);
                        END FOR;
                    END;
                    ]
(Background on this error at: https://sqlalche.me/e/20/e3q8)