In [1]:
import duckdb as dd
from pathlib import Path

### Staging Tabelle

In [91]:
connect = dd.connect("data\\warehouse.duckdb")

In [9]:
connect.execute(
    "CREATE TABLE IF NOT EXISTS staging_data AS " \
    "SELECT * FROM read_csv_auto('data\\example.csv')")

<_duckdb.DuckDBPyConnection at 0x11fd243d8b0>

In [10]:
connect.sql("select * from staging_data limit 5")

┌─────────────────────┬─────────────┬────────────┬───────────┬─────────┬────────────┬───────────────┬───────────────────┬────────────────────┬────────┬────────────────┬──────────────┬───────────────────────┐
│    Purchase date    │ Customer ID │ First_Name │ Last_Name │ Gender  │  Birthday  │ Support Level │       City        │    Servicename     │ Costs  │ payment_method │ Sales Canal  │ Customer Satisfaction │
│      timestamp      │    int64    │  varchar   │  varchar  │ varchar │    date    │    varchar    │      varchar      │      varchar       │ double │    varchar     │   varchar    │         int64         │
├─────────────────────┼─────────────┼────────────┼───────────┼─────────┼────────────┼───────────────┼───────────────────┼────────────────────┼────────┼────────────────┼──────────────┼───────────────────────┤
│ 2024-06-06 19:13:54 │  4152186239 │ Nicholas   │ Avila     │ M       │ 1992-05-30 │ Premium       │ North Danielton   │ Custom Development │ 199.99 │ PayPal         │

In [11]:
connect.sql("describe select * from staging_data limit 5")

┌───────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│      column_name      │ column_type │  null   │   key   │ default │  extra  │
│        varchar        │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ Purchase date         │ TIMESTAMP   │ YES     │ NULL    │ NULL    │ NULL    │
│ Customer ID           │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ First_Name            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Last_Name             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Gender                │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Birthday              │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ Support Level         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ City                  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Servicename           │ VARCHAR     │ 

### Erste kleine Analysen

In [20]:
# Wie viele unterschiedliche Kunden gibt es?
query = """
SELECT 
  count(DISTINCT "Customer ID") as Anzahl
FROM staging_data  
"""
connect.sql(query)

┌────────┐
│ Anzahl │
│ int64  │
├────────┤
│    100 │
└────────┘

In [29]:
# Wie oft kommen die 7 Services vor?
query = """
SELECT
  Servicename,
  count(Servicename) as Anzahl
FROM staging_data
GROUP BY Servicename
ORDER BY Anzahl DESC
"""
connect.sql(query)

┌────────────────────┬────────┐
│    Servicename     │ Anzahl │
│      varchar       │ int64  │
├────────────────────┼────────┤
│ Hardware           │  14409 │
│ Custom Development │  14309 │
│ Streaming          │  14306 │
│ Automation         │  14299 │
│ E-Commerce         │  14237 │
│ Databases          │  14225 │
│ Bi Tooling         │  14215 │
└────────────────────┴────────┘

In [69]:
# Welche Services haben wir?
query = """
WITH cte AS (
SELECT DISTINCT
  Servicename,
  Costs
FROM staging_data
GROUP BY Servicename, Costs
ORDER BY Costs)
SELECT 
  ROW_NUMBER() OVER () AS ID,
  Servicename, 
  Costs
FROM CTE
"""
connect.sql(query)

┌───────┬────────────────────┬────────┐
│  ID   │    Servicename     │ Costs  │
│ int64 │      varchar       │ double │
├───────┼────────────────────┼────────┤
│     1 │ Bi Tooling         │   8.99 │
│     2 │ Databases          │   9.99 │
│     3 │ Hardware           │  12.69 │
│     4 │ Streaming          │  19.99 │
│     5 │ E-Commerce         │  39.99 │
│     6 │ Automation         │  89.99 │
│     7 │ Custom Development │ 199.99 │
└───────┴────────────────────┴────────┘

In [39]:
query = """
SELECT DISTINCT
  Servicename,
  AVG(Costs)
FROM staging_data
GROUP BY Costs, Servicename
ORDER BY Costs
"""
connect.sql(query)

┌────────────────────┬────────────────────┐
│    Servicename     │     avg(Costs)     │
│      varchar       │       double       │
├────────────────────┼────────────────────┤
│ Bi Tooling         │   8.99000000000221 │
│ Databases          │  9.990000000001386 │
│ Hardware           │ 12.690000000001808 │
│ Streaming          │ 19.989999999996005 │
│ E-Commerce         │  39.98999999999336 │
│ Automation         │  89.98999999999188 │
│ Custom Development │  199.9900000000534 │
└────────────────────┴────────────────────┘

In [48]:
query = """
SELECT 
  EXTRACT(YEAR FROM "Purchase date") AS Year,
  COUNT(*) as Verkäufe
FROM staging_data
GROUP BY Year
ORDER BY Year
"""
connect.sql(query)

┌───────┬──────────┐
│ Year  │ Verkäufe │
│ int64 │  int64   │
├───────┼──────────┤
│  2010 │     6333 │
│  2011 │     6308 │
│  2012 │     6285 │
│  2013 │     6269 │
│  2014 │     6398 │
│  2015 │     6193 │
│  2016 │     6347 │
│  2017 │     6324 │
│  2018 │     6330 │
│  2019 │     6332 │
│  2020 │     6405 │
│  2021 │     6248 │
│  2022 │     6252 │
│  2023 │     6285 │
│  2024 │     6342 │
│  2025 │     5349 │
├───────┴──────────┤
│     16 rows      │
└──────────────────┘

In [59]:
# Verteilung auf Monate
query = """
SELECT
  STRFTIME("Purchase date", '%Y-%m') AS Jahr_Monat,
  COUNT(*) as Verkäufe
FROM staging_data
GROUP BY Jahr_Monat
ORDER BY Verkäufe DESC
LIMIT 10
"""
connect.sql(query)

┌────────────┬──────────┐
│ Jahr_Monat │ Verkäufe │
│  varchar   │  int64   │
├────────────┼──────────┤
│ 2022-05    │      589 │
│ 2010-08    │      581 │
│ 2019-01    │      580 │
│ 2024-08    │      580 │
│ 2014-12    │      578 │
│ 2019-05    │      578 │
│ 2025-01    │      576 │
│ 2024-10    │      574 │
│ 2012-07    │      572 │
│ 2012-08    │      571 │
├────────────┴──────────┤
│ 10 rows     2 columns │
└───────────────────────┘

In [61]:
# Verteilung auf Wochen
query = """
SELECT
  STRFTIME("Purchase date", '%Y-%W') AS Jahr_Woche,
  COUNT(*) as Verkäufe
FROM staging_data
GROUP BY Jahr_Woche
ORDER BY Verkäufe DESC
LIMIT 10
"""
connect.sql(query)

┌────────────┬──────────┐
│ Jahr_Woche │ Verkäufe │
│  varchar   │  int64   │
├────────────┼──────────┤
│ 2018-35    │      151 │
│ 2019-19    │      149 │
│ 2014-50    │      148 │
│ 2022-33    │      147 │
│ 2022-19    │      147 │
│ 2014-37    │      146 │
│ 2016-49    │      146 │
│ 2016-12    │      146 │
│ 2016-08    │      145 │
│ 2012-33    │      144 │
├────────────┴──────────┤
│ 10 rows     2 columns │
└───────────────────────┘

### Dimensionstabellen erstellen

In [92]:
# Services
query = """
CREATE OR REPLACE TABLE tblServices AS
WITH cte AS (
SELECT DISTINCT
  Servicename,
  Costs
FROM staging_data
ORDER BY Costs)
SELECT 
  ROW_NUMBER() OVER () AS ID,
  Servicename, 
  Costs
FROM CTE
"""
connect.sql(query)

In [93]:
# Customers
query = """
CREATE OR REPLACE TABLE tblCustomers AS
SELECT DISTINCT
  "Customer ID" as customers_id,
  First_Name,
  Last_Name,
  Gender,
  City,
  "Support Level" as support_level,
  Birthday,
  date_diff('year', Birthday, today()) as Age
FROM staging_data
ORDER BY Last_Name
"""
connect.sql(query)

In [94]:
connect.close()