In [None]:
import duckdb

In [2]:
# DuckDB database file opened by the cells in this notebook
# (a relative path, so it is resolved against the kernel's working directory).
DB_PATH = 'taxi_rides_ny.duckdb'

In [None]:
def analyze_duckdb_database(db_path=None):
    """Print the row count of every base table in each non-default schema.

    Opens the DuckDB database, lists all schemas except ``main``,
    ``information_schema`` and ``pg_catalog``, and for each of them prints
    a header, one line per base table with its row count, and a separator.

    Args:
        db_path: Path of the DuckDB database file. When omitted (``None``),
            the notebook-level ``DB_PATH`` constant is used.

    Returns:
        None. The report is written to standard output.
    """

    def _quote_identifier(identifier):
        """Return ``identifier`` double-quoted, with embedded quotes doubled."""
        return '"' + identifier.replace('"', '""') + '"'

    # Connect to the database
    conn = duckdb.connect(DB_PATH if db_path is None else db_path)

    # try/finally guarantees the connection is closed even if a query raises.
    try:
        # Get list of all schemas; the default `main` schema is excluded
        # together with the system schemas.
        schemas = conn.execute("""
            SELECT schema_name
            FROM information_schema.schemata
            WHERE schema_name NOT IN ('information_schema', 'pg_catalog', 'main')
            ORDER BY schema_name
        """).fetchall()

        for (schema_name,) in schemas:
            print(f"Schema: {schema_name}")
            print("-" * 40)

            # Get list of tables in the schema. The schema name is bound as a
            # query parameter rather than spliced into the SQL text.
            tables = conn.execute("""
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema = ?
                AND table_type = 'BASE TABLE'
                ORDER BY table_name
            """, [schema_name]).fetchall()

            # For each table, get records count
            for (table_name,) in tables:
                # Identifiers cannot be bound as parameters, so they are
                # quoted and escaped explicitly.
                qualified_name = (
                    f"{_quote_identifier(schema_name)}.{_quote_identifier(table_name)}"
                )
                row_count = conn.execute(
                    f"SELECT COUNT(*) FROM {qualified_name}"
                ).fetchone()[0]

                print(f"  Table: {table_name}: row count {row_count:,}")

            print("-" * 40)
    finally:
        conn.close()

In [4]:
# Table row counts before the dbt build step below.
analyze_duckdb_database()

Schema: prod
----------------------------------------
  Table: fhv_tripdata: row count 43,244,696
  Table: green_tripdata: row count 7,778,101
  Table: yellow_tripdata: row count 109,047,518
----------------------------------------


### Run dbt

`dbt build --target prod`

In [5]:
# Same report again after the dbt build, to see the tables it added.
analyze_duckdb_database()

Schema: prod
----------------------------------------
  Table: dim_vendors: row count 3
  Table: dim_zones: row count 265
  Table: fct_monthly_zone_revenue: row count 12,184
  Table: fct_trips: row count 112,086,662
  Table: fhv_tripdata: row count 43,244,696
  Table: green_tripdata: row count 7,778,101
  Table: int_trips: row count 112,086,662
  Table: int_trips_unioned: row count 114,827,251
  Table: payment_type_lookup: row count 7
  Table: taxi_zone_lookup: row count 265
  Table: yellow_tripdata: row count 109,047,518
----------------------------------------


In [6]:
# Connection shared by the ad-hoc query cells below; closed in the last cell.
conn = duckdb.connect(DB_PATH)

In [7]:
# Count the rows of one table and print the result.
schema_name, table_name = 'prod', 'fct_monthly_zone_revenue'
count_sql = f"""
                SELECT COUNT(*) FROM "{schema_name}"."{table_name}"
            """
# The query returns a single one-column row; unpack its only value.
(row_count,) = conn.execute(count_sql).fetchone()

print(f"  Table: {table_name}: row count {row_count:,}")

  Table: fct_monthly_zone_revenue: row count 12,184


In [12]:
# Preview the first five rows of the table as a DataFrame.
schema_name, table_name = 'prod', 'fct_monthly_zone_revenue'
preview_sql = f"""
                SELECT 
                  * 
                FROM "{schema_name}"."{table_name}"
                LIMIT 5
                """
df = conn.execute(preview_sql).df()
df.head()


Unnamed: 0,pickup_zone,revenue_month,service_type,revenue_monthly_fare,revenue_monthly_extra,revenue_monthly_mta_tax,revenue_monthly_tip_amount,revenue_monthly_tolls_amount,revenue_monthly_ehail_fee,revenue_monthly_improvement_surcharge,revenue_monthly_total_amount,total_monthly_trips,avg_monthly_passenger_count,avg_monthly_trip_distance
0,Lincoln Square West,2019-01-01,Yellow,914763.99,23976.7,46753.5,136833.27,8103.35,0.0,28123.5,1159590.51,93827,1.567459,1.983532
1,Hamilton Heights,2019-01-01,Yellow,106269.93,2246.5,4151.0,10594.23,1615.6,0.0,2510.1,127528.96,8379,1.587421,3.068887
2,Upper East Side South,2019-01-01,Yellow,3404904.18,93330.84,158049.0,425494.42,17163.4,0.0,94938.9,4194450.29,316687,1.539309,1.64004
3,East New York,2019-01-01,Green,106287.85,568.5,2139.5,202.07,2783.4,,893.1,112899.77,4349,1.251322,6.751065
4,Brownsville,2019-01-01,Yellow,23027.62,131.0,435.0,78.13,586.39,0.0,273.0,24538.94,912,1.349781,7.094265


In [13]:
# Five pickup zones with the highest summed Green revenue
# for revenue months from 2020-01-01 onward.
schema_name, table_name = 'prod', 'fct_monthly_zone_revenue'
top_zones_sql = f"""
                SELECT 
                  pickup_zone,
                  SUM(revenue_monthly_total_amount) as revenue_monthly_total_amount 
                FROM "{schema_name}"."{table_name}"
                WHERE service_type = 'Green' 
                    AND revenue_month >= '2020-01-01'
                GROUP BY pickup_zone
                ORDER BY revenue_monthly_total_amount desc
                LIMIT 5
                """
df = conn.execute(top_zones_sql).df()
df.head()

Unnamed: 0,pickup_zone,revenue_monthly_total_amount
0,East Harlem North,1817359.35
1,East Harlem South,1653113.71
2,Central Harlem,1097546.92
3,Washington Heights South,880070.2
4,Morningside Heights,764231.64


In [14]:
# Total number of Green trips for the revenue month 2019-10-01.
schema_name, table_name = 'prod', 'fct_monthly_zone_revenue'
monthly_trips_sql = f"""
                SELECT 
                  revenue_month,
                  SUM(total_monthly_trips) as total_monthly_trips 
                FROM "{schema_name}"."{table_name}"
                WHERE service_type = 'Green' 
                    AND revenue_month = '2019-10-01'
                GROUP BY revenue_month
                """
df = conn.execute(monthly_trips_sql).df()
df.head()

Unnamed: 0,revenue_month,total_monthly_trips
0,2019-10-01,384624.0


In [16]:
# Count the rows of one table and print the result.
schema_name, table_name = 'prod', 'fhv_tripdata'
count_sql = f"""
                SELECT COUNT(*) FROM "{schema_name}"."{table_name}"
            """
# The query returns a single one-column row; unpack its only value.
(row_count,) = conn.execute(count_sql).fetchone()

print(f"  Table: {table_name}: row count {row_count:,}")

  Table: fhv_tripdata: row count 43,244,696


In [17]:
# Release the connection used by the ad-hoc query cells above.
conn.close()