# Chapter 1. Importing Data into DuckDB

- https://learning.oreilly.com/library/view/duckdb-up-and/9781098159689/ch01.html#creating_duckdb_databases

# --CSV FILES

## CREATE DATABASE

In [3]:
# Open an in-memory DuckDB database.
import duckdb

# Calling duckdb.connect() with no arguments is equivalent to passing ":memory:".
conn_memory = duckdb.connect(database=":memory:")

In [6]:
# Open a persistent, file-backed database in read-write mode.
conn_manning = duckdb.connect("../data/manning.duckdb", read_only=False)

## CREATE TABLE

### LOAD - EXECUTE

In [7]:
import duckdb

# NOTE(review): this rebinds `conn_manning` to a new in-memory database
# (duckdb.connect() with no arguments), replacing the file-backed connection
# to ../data/manning.duckdb opened earlier. Tables created from here on live
# in memory only -- confirm that is intended.
conn_manning = duckdb.connect()

# CREATE TABLE ... AS SELECT: build `flights` directly from the CSV file.
create_flights_sql = """
CREATE TABLE flights
AS
SELECT * FROM read_csv_auto('../data/flights.csv')
"""

# .df() returns the statement's result (a one-row `Count` frame) as a DataFrame.
conn_manning.execute(create_flights_sql).df()

Unnamed: 0,Count
0,5819079


In [10]:
# List the tables defined on this connection, rendered as a DataFrame.
display(conn_manning.sql("SHOW TABLES").df())

Unnamed: 0,name
0,flights


In [16]:
# Preview the flights table. `.sql()` is left as the cell's last expression,
# so the notebook renders it as an abbreviated text table with column types.
conn_manning.sql("SELECT * from flights;")

┌───────┬───────┬───────┬─────────────┬───┬────────────────┬───────────────┬─────────────────────┬───────────────┐
│ YEAR  │ MONTH │  DAY  │ DAY_OF_WEEK │ … │ SECURITY_DELAY │ AIRLINE_DELAY │ LATE_AIRCRAFT_DELAY │ WEATHER_DELAY │
│ int64 │ int64 │ int64 │    int64    │   │     int64      │     int64     │        int64        │     int64     │
├───────┼───────┼───────┼─────────────┼───┼────────────────┼───────────────┼─────────────────────┼───────────────┤
│  2015 │     1 │     1 │           4 │ … │           NULL │          NULL │                NULL │          NULL │
│  2015 │     1 │     1 │           4 │ … │           NULL │          NULL │                NULL │          NULL │
│  2015 │     1 │     1 │           4 │ … │           NULL │          NULL │                NULL │          NULL │
│  2015 │     1 │     1 │           4 │ … │           NULL │          NULL │                NULL │          NULL │
│  2015 │     1 │     1 │           4 │ … │           NULL │          NULL │    

In [17]:
# Create the airports table with an explicit schema, then bulk-load the CSV
# into it with COPY.
#
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE raises
# as soon as `airports` already exists on the connection, which happens on
# every second execution of the cell.
#
# NOTE(review): LATITUDE and LONGITUDE are declared VARCHAR, so numeric work
# on them needs a cast -- confirm whether DOUBLE was intended.
conn_manning.execute(
    """
CREATE OR REPLACE TABLE airports(
                    IATA_CODE VARCHAR,
                    AIRPORT VARCHAR,
                    CITY VARCHAR,
                    STATE VARCHAR,
                    COUNTRY VARCHAR,
                    LATITUDE VARCHAR,
                    LONGITUDE VARCHAR
                    );
COPY airports FROM '../data/airports.csv' (AUTO_DETECT TRUE);
"""
)

# Show the loaded rows as a DataFrame.
conn_manning.execute("SELECT * FROM airports").df()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


In [22]:
# List the tables defined on this connection (table names, not a count).
conn_manning.sql("SHOW TABLES")
# Alternative that returns a DataFrame: conn_manning.execute("SHOW TABLES").df()

┌──────────┐
│   name   │
│ varchar  │
├──────────┤
│ airports │
│ flights  │
└──────────┘

### LOAD - REGISTER

In [33]:
# Read airlines.csv with an explicit header flag and explicit column types,
# and materialize the result as a DataFrame.
airlines_query = """
    SELECT
        *
    FROM read_csv('../data/airlines.csv',
                  Header = True,
                  Columns = {'IATA_CODE': 'VARCHAR', 'AIRLINE': 'VARCHAR'})
"""
airlines = conn_manning.execute(airlines_query).df()
airlines

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [36]:
# Register the `airlines` DataFrame on the connection under the name
# "airlines" so it can be queried with SQL like the other tables.
conn_manning.register("airlines", airlines)

<duckdb.duckdb.DuckDBPyConnection at 0x1065875b0>

In [37]:
# The registered DataFrame now appears alongside the tables created with SQL.
conn_manning.sql("SHOW TABLES").df()

Unnamed: 0,name
0,airlines
1,airports
2,flights


In [38]:
# Query the registered "airlines" name through SQL to confirm it is usable
# like a regular table.
conn_manning.sql("SELECT * FROM airlines")

┌───────────┬──────────────────────────────┐
│ IATA_CODE │           AIRLINE            │
│  varchar  │           varchar            │
├───────────┼──────────────────────────────┤
│ UA        │ United Air Lines Inc.        │
│ AA        │ American Airlines Inc.       │
│ US        │ US Airways Inc.              │
│ F9        │ Frontier Airlines Inc.       │
│ B6        │ JetBlue Airways              │
│ OO        │ Skywest Airlines Inc.        │
│ AS        │ Alaska Airlines Inc.         │
│ NK        │ Spirit Air Lines             │
│ WN        │ Southwest Airlines Co.       │
│ DL        │ Delta Air Lines Inc.         │
│ EV        │ Atlantic Southeast Airlines  │
│ HA        │ Hawaiian Airlines Inc.       │
│ MQ        │ American Eagle Airlines Inc. │
│ VX        │ Virgin America               │
├───────────┴──────────────────────────────┤
│ 14 rows                        2 columns │
└──────────────────────────────────────────┘

## EXPORT DATA

In [40]:
# Pull the airports table into a DataFrame and show only its first rows.
airports_df = conn_manning.sql("FROM airports").df()
airports_df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [41]:
# Export the airport code and coordinate columns to a comma-delimited CSV
# file with a header row.
export_airports_sql = """
COPY (SELECT IATA_CODE, LATITUDE, LONGITUDE from airports)
TO '../data/airports_location.csv' WITH (HEADER 1, DELIMITER ',');
"""
conn_manning.execute(export_airports_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x1065875b0>

In [42]:
# Close the connection now that the CSV examples are finished.
conn_manning.close()

# --PARQUET FILES

In [45]:
import pandas as pd

# Convert the airports CSV to Parquet: load it with pandas, then write it
# back out using the fastparquet engine.
airports_csv_path = "../data/airports.csv"
airports_parquet_path = "../data/airports.parquet"

df_airports = pd.read_csv(airports_csv_path)
df_airports.to_parquet(airports_parquet_path, engine="fastparquet")