# Chapter 3. EDA w/ DuckDB

- [https://learning.oreilly.com/library/view/duckdb-up-and/9781098159689/ch03.html](https://learning.oreilly.com/library/view/duckdb-up-and/9781098159689/ch03.html#our_dataset_the_2015_flights_delay_dataset)

## LOAD DATA

In [1]:
import duckdb

# Open a DuckDB connection backed by an in-memory database (nothing is
# written to disk; all tables vanish when the kernel stops).
conn = duckdb.connect(database=":memory:")

In [2]:
# Load the flights CSV into a DuckDB table named `flights`; column names and
# types are inferred by read_csv_auto.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails
# with "table already exists" the second time the cell is executed.
conn.execute(
    """
CREATE OR REPLACE TABLE flights
AS
SELECT *
FROM read_csv_auto('../data/flights.csv')
"""
)

<duckdb.duckdb.DuckDBPyConnection at 0x105c2f6b0>

In [3]:
# Load the airports CSV into a DuckDB table named `airports`; column names and
# types are inferred by read_csv_auto.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails
# with "table already exists" the second time the cell is executed.
conn.execute(
    """
CREATE OR REPLACE TABLE airports
AS
SELECT *
FROM read_csv_auto('../data/airports.csv')
"""
)

<duckdb.duckdb.DuckDBPyConnection at 0x105c2f6b0>

In [4]:
# Load the airlines CSV into a DuckDB table named `airlines`. Unlike the other
# two loads, the schema is declared explicitly (both columns as VARCHAR)
# instead of being auto-detected.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails
# with "table already exists" the second time the cell is executed.
conn.execute(
    """
    CREATE OR REPLACE TABLE airlines
    AS
    SELECT
        *
    FROM read_csv('../data/airlines.csv',
                  Header = True,
                  Columns = {
                    'IATA_CODE': 'VARCHAR',
                    'AIRLINE': 'VARCHAR'
                  })
"""
)

<duckdb.duckdb.DuckDBPyConnection at 0x105c2f6b0>

In [5]:
# Show tables: list the tables known to this connection, returned as a
# DataFrame so the notebook renders it as a table.
conn.execute("SHOW TABLES;").df()

Unnamed: 0,name
0,airlines
1,airports
2,flights


## EDA - GEOSPATIAL DATA

In [6]:
import folium

In [7]:
# Show a base map centred on the US.
# The longitude previously read -101.2999591 (a duplicated digit); it now
# matches the -101.299591 used by the following map cell so both maps share
# the same centre.
mymap = folium.Map(
    location=[47.116386, -101.299591],  # [lat, lng] of the map centre
    width=950,
    height=550,
    zoom_start=3,
    tiles="openstreetmap",
)

mymap

In [8]:
import folium

# Rebuild the base map, then attach two alternative tile layers and a layer
# control so the viewer can switch between them.
mymap = folium.Map(
    location=[47.116386, -101.299591],
    width=950,
    height=550,
    zoom_start=3,
    tiles="openstreetmap",
)

# Each extra layer uses its own tile-set name as the attribution text.
for tile_name in ("cartodbpositron", "cartodbdark_matter"):
    extra_layer = folium.TileLayer(tile_name, attr=tile_name)
    extra_layer.add_to(mymap)

layer_switcher = folium.LayerControl()
layer_switcher.add_to(mymap)

mymap

### Display all airports on the map

In [9]:
# Fetch every row of the airports table into a DataFrame.
airports_sql = """
SELECT *
FROM airports
"""
df = conn.execute(airports_sql).df()

# (rows, columns) of the fetched frame
df.shape

(322, 7)

In [10]:
# Preview the first rows of the airports frame.
df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [11]:
import math

# Draw one circle marker per airport on `mymap`.
# The loop variable holding the airport's name and the marker object now have
# distinct names; previously `airport` was rebound from the name string to the
# CircleMarker inside the loop body.
for lat, lng, airport_name in zip(df["LATITUDE"], df["LONGITUDE"], df["AIRPORT"]):
    # Rows with a missing coordinate cannot be placed on the map.
    if math.isnan(lat) or math.isnan(lng):
        continue

    airport_marker = folium.CircleMarker(
        location=[lat, lng],
        radius=4,
        color="red",
        fill=True,
        fill_color="yellow",
        fill_opacity=0.5,
        popup=airport_name,  # airport name shown when the marker is clicked
    )

    airport_marker.add_to(mymap)

mymap

In [12]:
import math

# Re-fetch the airports table, then draw one icon marker per airport on
# `mymap`.
df = conn.execute(
    """
    SELECT
    *
    FROM airports
"""
).df()

# The loop variable holding the airport's name and the marker object now have
# distinct names; previously `airport` was rebound from the name string to the
# Marker inside the loop body.
for lat, lng, airport_name in zip(df["LATITUDE"], df["LONGITUDE"], df["AIRPORT"]):
    # Rows with a missing coordinate cannot be placed on the map.
    if math.isnan(lat) or math.isnan(lng):
        continue

    airport_marker = folium.Marker(
        location=[lat, lng],
        popup=airport_name,  # airport name shown when the marker is clicked
        icon=folium.Icon(
            color="lightgray",  # colour of the marker
            icon="plane-arrival",  # icon to display in the marker
            prefix="fa",
        ),
    )
    airport_marker.add_to(mymap)

mymap

### DuckDB spatial Extension

In [13]:
import pandas as pd

# Read the airports CSV directly with pandas. This rebinds `df`, which earlier
# cells filled from the DuckDB `airports` table.
df = pd.read_csv("../data/airports.csv")
df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [14]:
from shapely.geometry import Point

# Add a `geometry` column holding each airport's location as a WKT point
# string. Point takes (x, y), i.e. (longitude, latitude).
df["geometry"] = [
    Point(lng, lat).wkt for lng, lat in zip(df["LONGITUDE"], df["LATITUDE"])
]

df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE,geometry
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404,POINT (-75.4404 40.65236)
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819,POINT (-99.6819 32.41132)
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919,POINT (-106.60919 35.04022)
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183,POINT (-98.42183 45.44906)
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447,POINT (-84.19447 31.53552)


In [15]:
# Expose the pandas frame (including its WKT `geometry` column) to DuckDB SQL
# under the name `airports_2`.
conn.register("airports_2", df)

<duckdb.duckdb.DuckDBPyConnection at 0x105c2f6b0>

In [16]:
# Install DuckDB's `spatial` extension (presumably the source of the ST_*
# functions used in the query further down — confirm against DuckDB docs).
conn.install_extension("spatial")

In [17]:
# Load the installed `spatial` extension into this connection.
conn.load_extension("spatial")

In [18]:
import leafmap

# Convert the airports frame to a GeoDataFrame, using the WKT strings in the
# `geometry` column. Source and destination CRS are both EPSG:4326, so
# presumably no reprojection takes place.
df_airports_gdf = leafmap.df_to_gdf(
    df, geometry="geometry", src_crs="EPSG:4326", dst_crs="EPSG:4326"
)

In [21]:
# Display all airports via the GeoDataFrame's explore() view.
df_airports_gdf.explore()

In [26]:
# Look up Louisville's row to read off its coordinates (LATITUDE / LONGITUDE).
df.query("CITY=='Louisville'")

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE,geometry
276,SDF,Louisville International Airport (Standiford F...,Louisville,KY,USA,38.17439,-85.736,POINT (-85.736 38.17439)


In [30]:
# Louisville's coordinates, stored as (longitude, latitude).
LOUISVILLE_LNGLAT = (-85.736, 38.17439)

# Select every airport whose point lies within a distance of 3 from Louisville.
# The distance is in the same units as the coordinates, i.e. degrees.
center_lng, center_lat = LOUISVILLE_LNGLAT
nearby_airports_sql = f"""
    SELECT *
    FROM airports_2
    WHERE ST_DWithin(
        ST_GeomFromText(geometry),
        ST_GeomFromText('POINT ({center_lng} {center_lat})'),
        3);
"""
df_airports_near_louisville = conn.sql(nearby_airports_sql).df()

In [32]:
# Show the nearby airports ordered south to north (ascending latitude).
df_airports_near_louisville.sort_values(by="LATITUDE")

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE,geometry
8,TYS,McGhee Tyson Airport,Knoxville,TN,USA,35.81249,-83.99286,POINT (-83.99286 35.81249)
0,BNA,Nashville International Airport,Nashville,TN,USA,36.12448,-86.67818,POINT (-86.67818 36.12448)
6,LEX,Blue Grass Airport,Lexington,KY,USA,38.03697,-84.60539,POINT (-84.60539 38.03697)
3,EVV,Evansville Regional Airport,Evansville,IN,USA,38.03799,-87.53063,POINT (-87.53063 38.03799)
7,SDF,Louisville International Airport (Standiford F...,Louisville,KY,USA,38.17439,-85.736,POINT (-85.736 38.17439)
1,CVG,Cincinnati/Northern Kentucky International Air...,Covington,KY,USA,39.04614,-84.66217,POINT (-84.66217 39.04614)
5,IND,Indianapolis International Airport,Indianapolis,IN,USA,39.71733,-86.29438,POINT (-86.29438 39.71733)
2,DAY,James M. Cox Dayton International Airport,Dayton,OH,USA,39.90238,-84.21938,POINT (-84.21938 39.90238)
4,FWA,Fort Wayne International Airport,Fort Wayne,IN,USA,40.97847,-85.19515,POINT (-85.19515 40.97847)


In [33]:
# Plot the airports near Louisville.
import leafmap

# Convert the query result to a GeoDataFrame. No `geometry` argument is
# passed here, so df_to_gdf's default is used — presumably the `geometry`
# column; confirm against the leafmap docs.
df_airports_near_louisville_gdf = leafmap.df_to_gdf(
    df_airports_near_louisville, src_crs="EPSG:4326", dst_crs="EPSG:4326"
)

# Keep the map returned by explore() so a later cell can add to it.
folium_map = df_airports_near_louisville_gdf.explore()
folium_map

In [34]:
import folium

# Pin Louisville itself on the nearby-airports map. The constant is stored as
# (lng, lat) while the marker location is given in [lat, lng] order, as in the
# other marker cells of this notebook.
louisville_lng, louisville_lat = LOUISVILLE_LNGLAT
louisville_marker = folium.Marker(
    location=[louisville_lat, louisville_lng], popup="Louisville"
)
louisville_marker.add_to(folium_map)

folium_map

## PERFORMING DESCRIPTIVE ANALYTICS