In [16]:
import pandas as pd
import duckdb as dd

# Daten laden und normalisieren
- Lade die CSV-Datei in DuckDB.
- Normalisiere die Daten in separate Tabellen, z. B.:
- Tabelle für Nutzerinformationen.
- Tabelle für Dienste.
- Tabelle für Nutzungsprotokolle.

### Tabellenfunktionen

In [17]:

def create_city(con):
    """Rebuild the ``City`` lookup table from the distinct cities in the CSV.

    Drops any existing ``City`` table and recreates it from the distinct
    ``city`` values returned by ``read_csv('data.csv')``, attaching a
    generated ``city_id`` to each.

    Args:
        con: Connection object exposing ``execute(sql)``.
    """
    con.execute("DROP TABLE IF EXISTS City")
    # Number the cities in name order: an empty OVER () leaves the numbering
    # to whatever order DISTINCT happens to emit rows in, so the same CSV
    # could yield different city_id values from one rebuild to the next.
    con.execute("""
    CREATE TABLE IF NOT EXISTS City AS
    WITH unique_cities AS (
    SELECT DISTINCT city FROM read_csv('data.csv')
    ) SELECT row_number() OVER (ORDER BY city) AS city_id, city FROM unique_cities;
  """)

def create_customer(con):
    """Rebuild the ``Customer`` table from the CSV joined against ``City``.

    Drops any existing ``Customer`` table, then recreates it with the
    distinct combinations of ``"Customer ID"`` (as ``customer_id``),
    ``first_name``, ``last_name`` and the ``city_id`` of the ``City`` row
    whose name equals the CSV row's city. The ``City`` table must already
    exist because the statement joins against it.

    Args:
        con: Connection object exposing ``execute(sql)``.
    """
    drop_stmt = "drop table if exists Customer"
    build_stmt = """
        CREATE TABLE IF NOT EXISTS 
        Customer 
        AS 
        SELECT DISTINCT 
        "Customer ID" AS customer_id, 
        first_name, 
        last_name,
        c.city_id
        FROM read_csv('data.csv') d
        JOIN City c on d.City = c.city
        """
    # Drop first, then rebuild, so re-running the cell starts from scratch.
    for stmt in (drop_stmt, build_stmt):
        con.execute(stmt)

def create_service(con):
    """Rebuild the ``Service`` table with one row per distinct service name.

    Drops any existing ``Service`` table, then recreates it from
    ``read_csv('data.csv')``: one row per ``Servicename`` (exposed as
    ``service_name``) together with its ``"Costs per Month"`` value
    (exposed as ``costs_per_month``) and a generated ``service_id``.

    Args:
        con: Connection object exposing ``execute(sql)``.
    """
    con.execute("drop table if exists Service")
    # DISTINCT ON keeps a single row per Servicename. The ORDER BY inside the
    # CTE pins which row survives (the first one by "Costs per Month")
    # instead of leaving that choice to the engine, and numbering by name
    # makes service_id reproducible for the same CSV contents.
    con.execute("""
        CREATE TABLE IF NOT EXISTS
        Service
        AS
        WITH unique_service AS (
        SELECT DISTINCT ON (Servicename)
        Servicename,
        "Costs per Month" as costs_per_month
        FROM read_csv('data.csv')
        ORDER BY Servicename, "Costs per Month")
        SELECT row_number() OVER (ORDER BY Servicename) AS service_id,
        Servicename as service_name,
        costs_per_month
        FROM
        unique_service;
        """)

### Die Zwischentabelle über JOIN
Dabei müssen die Tabellen einen Identifikator haben. Bei der Tabelle Service ist das lediglich der Name.

In [18]:
def create_usage(con):
  """Rebuild the ``Usage`` link table between customers and services.

  Drops any existing ``Usage`` table, then recreates it from
  ``read_csv('data.csv')`` joined to ``Customer`` (on the customer id) and
  to ``Service`` (on the service name). Each row carries ``customer_id``,
  ``service_id`` and the CSV columns ``"Usage Time"`` and
  ``"Status last use"``. Both joined tables must already exist.

  Args:
      con: Connection object exposing ``execute(sql)``.
  """
  drop_stmt = "drop table if exists Usage"
  build_stmt = """
    CREATE TABLE IF NOT EXISTS Usage AS
    SELECT 
      c.customer_id,
      s.service_id,
      d."Usage Time",
      d."Status last use"
    FROM 
      read_csv('data.csv') d
    JOIN Customer c ON d."Customer ID" = c.customer_id
    JOIN Service s ON d.Servicename = s.service_name
  """
  # Drop first, then rebuild, so re-running the cell starts from scratch.
  for stmt in (drop_stmt, build_stmt):
    con.execute(stmt)

## Die Datenbank erstellen
Da die Tabelle Customer die City Tabelle referenziert, muss diese vorher erstellt werden. 
Der Pfad ist eigentlich unnötig. 

In [19]:
# Build every table over a single connection. Order matters: Customer joins
# against City, and Usage joins against both Customer and Service, so each
# builder runs only after the tables it reads from have been created.
with dd.connect(r"data.db") as con:
    for build_table in (create_city, create_customer, create_service, create_usage):
        build_table(con)