# [SETUP] 

In [1]:
! python ./setup.py

Cleaning up (if any existing) tpch db file
Creating TPCH input data


## Connect to DuckDB

In [2]:
import duckdb
import pandas as pd

%load_ext sql
conn = duckdb.connect("tpch.db")
%sql conn --alias duckdb

In [3]:
%%sql
show tables;

name
customer
lineitem
nation
orders
part
partsupp
region
supplier


# [Data Model]

The TPC-H data represents a car parts seller’s data warehouse, where we record orders, items that make up that order (lineitem), supplier, customer, part (parts sold), region, nation, and partsupp (parts supplier). 

Note: Have a copy of the data model as you follow along; this will help in understanding the examples provided and in answering exercise questions.

![](../../tpch_erd.png)


# [Structs] are like dictionaries with strong typing

In [8]:
%%sql
SELECT 
    o.*, 
    struct_pack(
    id := c.c_custkey,
    name := c.c_name,
    address := c.c_address,
    nationkey := c.c_nationkey,
    phone := c.c_phone,
    acctbal := c.c_acctbal,
    mktsegment := c.c_mktsegment,
    comment := c.c_comment
    ) AS customer
FROM 
    orders o
LEFT JOIN 
    customer c
ON 
    o.o_custkey = c.c_custkey LIMIT 5;

o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,customer
1,370,O,172799.49,1996-01-02,5-LOW,Clerk#000000951,0,ly express platelets. deposits acc,"{'id': 370, 'name': 'Customer#000000370', 'address': 'DtqbDO5rid', 'nationkey': 12, 'phone': '22-524-280-8721', 'acctbal': Decimal('8982.79'), 'mktsegment': 'FURNITURE', 'comment': 'nstructions. furious requests across the ironic'}"
2,781,O,38426.09,1996-12-01,1-URGENT,Clerk#000000880,0,ve the furiously fluffy dependencies. carefully regular,"{'id': 781, 'name': 'Customer#000000781', 'address': 'maoqGuL5,rHfX0leqZcFqHqpQH', 'nationkey': 18, 'phone': '28-478-388-5881', 'acctbal': Decimal('6403.62'), 'mktsegment': 'MACHINERY', 'comment': 'ld packages detect against the slyly ironic platelets. special, regular instructions haggle. carefu'}"
3,1234,F,205654.3,1993-10-14,5-LOW,Clerk#000000955,0,after the asymptotes. instructions cajole after the foxes. carefully unu,"{'id': 1234, 'name': 'Customer#000001234', 'address': 'qXcJPk,e9hn0l,fPThA,8ywX', 'nationkey': 1, 'phone': '11-742-434-6436', 'acctbal': Decimal('-982.32'), 'mktsegment': 'FURNITURE', 'comment': 'ironic, ironic requests cajole around the bold, express dinos. foxes print quickly against the'}"
4,1369,O,56000.91,1995-10-11,5-LOW,Clerk#000000124,0,st the furiously bold pinto beans. furiously pending theodolites cajol,"{'id': 1369, 'name': 'Customer#000001369', 'address': 'A37vct6,QYRGWdHWfVPZRfIH', 'nationkey': 10, 'phone': '20-232-617-7418', 'acctbal': Decimal('498.77'), 'mktsegment': 'AUTOMOBILE', 'comment': 'heodolites need to are; blithely regular excuses use final, unusual requests. iron'}"
6,557,F,45523.1,1992-02-21,4-NOT SPECIFIED,Clerk#000000058,0,furiously ironic accounts haggle blithely carefully regular de,"{'id': 557, 'name': 'Customer#000000557', 'address': 'eyUl6yn9Tw', 'nationkey': 15, 'phone': '25-390-153-6699', 'acctbal': Decimal('9559.04'), 'mktsegment': 'BUILDING', 'comment': 'nce the carefully unusual ideas doubt bold dependencies. furiously bold packages affix blithely. furiously'}"


## Use structs for better logical grouping of data

### Use struct for one-to-one & hierarchical relationships

In [9]:
%%sql
SELECT 
    l.*, 
    struct_pack(
        id := c.c_custkey,
        name := c.c_name,
        address := c.c_address,
        nationkey := c.c_nationkey,
        phone := c.c_phone,
        acctbal := c.c_acctbal,
        mktsegment := c.c_mktsegment,
        comment := c.c_comment
    ) AS customer,
    struct_pack(
        id := s.s_suppkey,
        name := s.s_name,
        address := s.s_address,
        nationkey := s.s_nationkey,
        phone := s.s_phone,
        acctbal := s.s_acctbal,
        comment := s.s_comment
    ) AS supplier
FROM 
    lineitem l
LEFT JOIN 
    orders o ON l.l_orderkey = o.o_orderkey
LEFT JOIN 
    customer c ON o.o_custkey = c.c_custkey
LEFT JOIN 
    supplier s ON l.l_suppkey = s.s_suppkey
LIMIT 5;

l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment,customer,supplier
1,1552,93,1,17.0,24710.35,0.04,0.02,N,O,1996-03-13,1996-02-12,1996-03-22,DELIVER IN PERSON,TRUCK,to beans x-ray carefull,"{'id': 370, 'name': 'Customer#000000370', 'address': 'DtqbDO5rid', 'nationkey': 12, 'phone': '22-524-280-8721', 'acctbal': Decimal('8982.79'), 'mktsegment': 'FURNITURE', 'comment': 'nstructions. furious requests across the ironic'}","{'id': 93, 'name': 'Supplier#000000093', 'address': 'wNZNHIg370XspE', 'nationkey': 16, 'phone': '26-528-528-1157', 'acctbal': Decimal('368.76'), 'comment': 'instructions mold slyly special dolphins. quickly regular instru'}"
1,674,75,2,36.0,56688.12,0.09,0.06,N,O,1996-04-12,1996-02-28,1996-04-20,TAKE BACK RETURN,MAIL,according to the final foxes. qui,"{'id': 370, 'name': 'Customer#000000370', 'address': 'DtqbDO5rid', 'nationkey': 12, 'phone': '22-524-280-8721', 'acctbal': Decimal('8982.79'), 'mktsegment': 'FURNITURE', 'comment': 'nstructions. furious requests across the ironic'}","{'id': 75, 'name': 'Supplier#000000075', 'address': 'ULXKdX bZFJwii', 'nationkey': 18, 'phone': '28-716-704-8686', 'acctbal': Decimal('-224.84'), 'comment': 'platelets cajole. sentiments '}"
1,637,38,3,8.0,12301.04,0.1,0.02,N,O,1996-01-29,1996-03-05,1996-01-31,TAKE BACK RETURN,REG AIR,ourts cajole above the furiou,"{'id': 370, 'name': 'Customer#000000370', 'address': 'DtqbDO5rid', 'nationkey': 12, 'phone': '22-524-280-8721', 'acctbal': Decimal('8982.79'), 'mktsegment': 'FURNITURE', 'comment': 'nstructions. furious requests across the ironic'}","{'id': 38, 'name': 'Supplier#000000038', 'address': 'vmOuWWwn,l4j7TOSEw4l3', 'nationkey': 4, 'phone': '14-361-296-6426', 'acctbal': Decimal('2512.41'), 'comment': 'inst the slyly final pinto beans. carefully un'}"
1,22,48,4,28.0,25816.56,0.09,0.06,N,O,1996-04-21,1996-03-30,1996-05-16,NONE,AIR,s cajole busily above t,"{'id': 370, 'name': 'Customer#000000370', 'address': 'DtqbDO5rid', 'nationkey': 12, 'phone': '22-524-280-8721', 'acctbal': Decimal('8982.79'), 'mktsegment': 'FURNITURE', 'comment': 'nstructions. furious requests across the ironic'}","{'id': 48, 'name': 'Supplier#000000048', 'address': 'IK,6rmdbeaoxtxgx7Df34QfLX5FUcyddqUEbF4h', 'nationkey': 14, 'phone': '24-722-551-9498', 'acctbal': Decimal('5630.62'), 'comment': 'ecial excuses detect blithely. slyly regular requests '}"
1,241,23,5,24.0,27389.76,0.1,0.04,N,O,1996-03-30,1996-03-14,1996-04-01,NONE,FOB,"the regular, regular pa","{'id': 370, 'name': 'Customer#000000370', 'address': 'DtqbDO5rid', 'nationkey': 12, 'phone': '22-524-280-8721', 'acctbal': Decimal('8982.79'), 'mktsegment': 'FURNITURE', 'comment': 'nstructions. furious requests across the ironic'}","{'id': 23, 'name': 'Supplier#000000023', 'address': ' zMyxL7O3O0SUAFoTrY7gVO mnM8XsH', 'nationkey': 9, 'phone': '19-559-422-5776', 'acctbal': Decimal('5926.41'), 'comment': 'sual asymptotes affix quickly at the instructions. deposits '}"


### Use list[struct] for one-to-many relationships

In [16]:
%%sql
WITH line_items as (
SELECT 
    l_orderkey as orderkey,
    array_agg(struct_pack(
        lineitemkey := l.l_linenumber,
        partkey := l.l_partkey,
        suppkey := l.l_suppkey,
        quantity := l.l_quantity,
        extendedprice := l.l_extendedprice,
        discount := l.l_discount,
        tax := l.l_tax,
        returnflag := l.l_returnflag,
        linestatus := l.l_linestatus,
        shipdate := l.l_shipdate,
        commitdate := l.l_commitdate,
        receiptdate := l.l_receiptdate,
        shipinstruct := l.l_shipinstruct,
        shipmode := l.l_shipmode,
        comment := l.l_comment
    )) AS lineitems
FROM 
    lineitem l 
GROUP BY 
    l_orderkey)
SELECT o.*,
len(l.lineitems) as num_lineitems,
l.lineitems
FROM orders o
LEFT JOIN line_items l
on o.o_orderkey = l.orderkey
LIMIT 5;

o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,num_lineitems,lineitems
1,370,O,172799.49,1996-01-02,5-LOW,Clerk#000000951,0,ly express platelets. deposits acc,6,"[{'lineitemkey': 1, 'partkey': 1552, 'suppkey': 93, 'quantity': Decimal('17.00'), 'extendedprice': Decimal('24710.35'), 'discount': Decimal('0.04'), 'tax': Decimal('0.02'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 3, 13), 'commitdate': datetime.date(1996, 2, 12), 'receiptdate': datetime.date(1996, 3, 22), 'shipinstruct': 'DELIVER IN PERSON', 'shipmode': 'TRUCK', 'comment': 'to beans x-ray carefull'}, {'lineitemkey': 2, 'partkey': 674, 'suppkey': 75, 'quantity': Decimal('36.00'), 'extendedprice': Decimal('56688.12'), 'discount': Decimal('0.09'), 'tax': Decimal('0.06'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 4, 12), 'commitdate': datetime.date(1996, 2, 28), 'receiptdate': datetime.date(1996, 4, 20), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'MAIL', 'comment': ' according to the final foxes. qui'}, {'lineitemkey': 3, 'partkey': 637, 'suppkey': 38, 'quantity': Decimal('8.00'), 'extendedprice': Decimal('12301.04'), 'discount': Decimal('0.10'), 'tax': Decimal('0.02'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 1, 29), 'commitdate': datetime.date(1996, 3, 5), 'receiptdate': datetime.date(1996, 1, 31), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'REG AIR', 'comment': 'ourts cajole above the furiou'}, {'lineitemkey': 4, 'partkey': 22, 'suppkey': 48, 'quantity': Decimal('28.00'), 'extendedprice': Decimal('25816.56'), 'discount': Decimal('0.09'), 'tax': Decimal('0.06'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 4, 21), 'commitdate': datetime.date(1996, 3, 30), 'receiptdate': datetime.date(1996, 5, 16), 'shipinstruct': 'NONE', 'shipmode': 'AIR', 'comment': 's cajole busily above t'}, {'lineitemkey': 5, 'partkey': 241, 'suppkey': 23, 'quantity': Decimal('24.00'), 'extendedprice': Decimal('27389.76'), 'discount': Decimal('0.10'), 'tax': Decimal('0.04'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 3, 30), 'commitdate': datetime.date(1996, 3, 14), 'receiptdate': datetime.date(1996, 4, 1), 'shipinstruct': 'NONE', 'shipmode': 'FOB', 'comment': ' the regular, regular pa'}, {'lineitemkey': 6, 'partkey': 157, 'suppkey': 10, 'quantity': Decimal('32.00'), 'extendedprice': Decimal('33828.80'), 'discount': Decimal('0.07'), 'tax': Decimal('0.02'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 1, 30), 'commitdate': datetime.date(1996, 2, 7), 'receiptdate': datetime.date(1996, 2, 3), 'shipinstruct': 'DELIVER IN PERSON', 'shipmode': 'MAIL', 'comment': 'rouches. special '}]"
2,781,O,38426.09,1996-12-01,1-URGENT,Clerk#000000880,0,ve the furiously fluffy dependencies. carefully regular,1,"[{'lineitemkey': 1, 'partkey': 1062, 'suppkey': 33, 'quantity': Decimal('38.00'), 'extendedprice': Decimal('36596.28'), 'discount': Decimal('0.00'), 'tax': Decimal('0.05'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1997, 1, 28), 'commitdate': datetime.date(1997, 1, 14), 'receiptdate': datetime.date(1997, 2, 2), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'RAIL', 'comment': 're. enticingly regular instruct'}]"
3,1234,F,205654.3,1993-10-14,5-LOW,Clerk#000000955,0,after the asymptotes. instructions cajole after the foxes. carefully unu,6,"[{'lineitemkey': 1, 'partkey': 43, 'suppkey': 19, 'quantity': Decimal('45.00'), 'extendedprice': Decimal('42436.80'), 'discount': Decimal('0.06'), 'tax': Decimal('0.00'), 'returnflag': 'R', 'linestatus': 'F', 'shipdate': datetime.date(1994, 2, 2), 'commitdate': datetime.date(1994, 1, 4), 'receiptdate': datetime.date(1994, 2, 23), 'shipinstruct': 'NONE', 'shipmode': 'AIR', 'comment': 's cajole above the pinto beans. iro'}, {'lineitemkey': 2, 'partkey': 191, 'suppkey': 70, 'quantity': Decimal('49.00'), 'extendedprice': Decimal('53468.31'), 'discount': Decimal('0.10'), 'tax': Decimal('0.00'), 'returnflag': 'R', 'linestatus': 'F', 'shipdate': datetime.date(1993, 11, 9), 'commitdate': datetime.date(1993, 12, 20), 'receiptdate': datetime.date(1993, 11, 24), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'RAIL', 'comment': 'ecial pinto beans. sly'}, {'lineitemkey': 3, 'partkey': 1285, 'suppkey': 60, 'quantity': Decimal('27.00'), 'extendedprice': Decimal('32029.56'), 'discount': Decimal('0.06'), 'tax': Decimal('0.07'), 'returnflag': 'A', 'linestatus': 'F', 'shipdate': datetime.date(1994, 1, 16), 'commitdate': datetime.date(1993, 11, 22), 'receiptdate': datetime.date(1994, 1, 23), 'shipinstruct': 'DELIVER IN PERSON', 'shipmode': 'SHIP', 'comment': 'e carefully fina'}, {'lineitemkey': 4, 'partkey': 294, 'suppkey': 22, 'quantity': Decimal('2.00'), 'extendedprice': Decimal('2388.58'), 'discount': Decimal('0.01'), 'tax': Decimal('0.06'), 'returnflag': 'A', 'linestatus': 'F', 'shipdate': datetime.date(1993, 12, 4), 'commitdate': datetime.date(1994, 1, 7), 'receiptdate': datetime.date(1994, 1, 1), 'shipinstruct': 'NONE', 'shipmode': 'TRUCK', 'comment': 'ackages boost across '}, {'lineitemkey': 5, 'partkey': 1831, 'suppkey': 61, 'quantity': Decimal('28.00'), 'extendedprice': Decimal('48519.24'), 'discount': Decimal('0.04'), 'tax': Decimal('0.00'), 'returnflag': 'R', 'linestatus': 'F', 'shipdate': datetime.date(1993, 12, 14), 'commitdate': datetime.date(1994, 1, 10), 'receiptdate': datetime.date(1994, 1, 1), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'FOB', 'comment': 'heodolites haggle blit'}, {'lineitemkey': 6, 'partkey': 622, 'suppkey': 16, 'quantity': Decimal('26.00'), 'extendedprice': Decimal('39588.12'), 'discount': Decimal('0.10'), 'tax': Decimal('0.02'), 'returnflag': 'A', 'linestatus': 'F', 'shipdate': datetime.date(1993, 10, 29), 'commitdate': datetime.date(1993, 12, 18), 'receiptdate': datetime.date(1993, 11, 4), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'RAIL', 'comment': 'telets x-ray quickly mult'}]"
4,1369,O,56000.91,1995-10-11,5-LOW,Clerk#000000124,0,st the furiously bold pinto beans. furiously pending theodolites cajol,1,"[{'lineitemkey': 1, 'partkey': 881, 'suppkey': 81, 'quantity': Decimal('30.00'), 'extendedprice': Decimal('53456.40'), 'discount': Decimal('0.03'), 'tax': Decimal('0.08'), 'returnflag': 'N', 'linestatus': 'O', 'shipdate': datetime.date(1996, 1, 10), 'commitdate': datetime.date(1995, 12, 14), 'receiptdate': datetime.date(1996, 1, 18), 'shipinstruct': 'DELIVER IN PERSON', 'shipmode': 'REG AIR', 'comment': 's. even ideas are above the accounts. '}]"
6,557,F,45523.1,1992-02-21,4-NOT SPECIFIED,Clerk#000000058,0,furiously ironic accounts haggle blithely carefully regular de,1,"[{'lineitemkey': 1, 'partkey': 1397, 'suppkey': 36, 'quantity': Decimal('37.00'), 'extendedprice': Decimal('48040.43'), 'discount': Decimal('0.08'), 'tax': Decimal('0.03'), 'returnflag': 'A', 'linestatus': 'F', 'shipdate': datetime.date(1992, 4, 27), 'commitdate': datetime.date(1992, 5, 15), 'receiptdate': datetime.date(1992, 5, 2), 'shipinstruct': 'TAKE BACK RETURN', 'shipmode': 'TRUCK', 'comment': 'ly silent ideas! carefull'}]"


## Struct schema evolution handles constantly changing upstream

## Using complex data types in data processing

### Elements in structs can be used in the same way as a normal column

### Convert rows to list of structs or vice versa

## Performance concerns

# Recap