# SQL

In [1]:
import pandas as pd
import sqlalchemy

## Connect to database

In [2]:
# Connect to the movie db
engine = sqlalchemy.create_engine(
    "postgresql+psycopg2://", 
    connect_args={"database": "postgres", "user": "faculty", "host": "/var/run/postgresql"}
)
con = engine.connect()
con.execute("SET schema 'input'")

<sqlalchemy.engine.result.ResultProxy at 0x7fbf2136e490>

## Create tables
Create simple `customers` table from slides:

In [3]:
con.execute('''
    DROP TABLE IF EXISTS customers CASCADE;
    DROP SEQUENCE IF EXISTS customers_id_seq;
''')

<sqlalchemy.engine.result.ResultProxy at 0x7fbf218a1130>

In [4]:
con.execute("CREATE SEQUENCE customers_id_seq")
con.execute("""
CREATE TABLE customers (
    id BIGINT DEFAULT NEXTVAL('customers_id_seq') PRIMARY KEY,
    name TEXT NOT NULL,
    address TEXT
)
""")

<sqlalchemy.engine.result.ResultProxy at 0x7fbf21d6e8e0>

In [5]:
con.execute("""
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname = 'input'
""")

<sqlalchemy.engine.result.ResultProxy at 0x7fbf4875a370>

Create `product` table from slides:

In [6]:
con.execute('''
    DROP TABLE IF EXISTS products CASCADE;
    DROP SEQUENCE IF EXISTS products_id_seq;
''')

<sqlalchemy.engine.result.ResultProxy at 0x7fbf4875f8b0>

In [7]:
con.execute("CREATE SEQUENCE products_id_seq")
con.execute("""
CREATE TABLE products (
    id BIGINT DEFAULT NEXTVAL('products_id_seq') PRIMARY KEY,
    name TEXT NOT NULL
)
""")

<sqlalchemy.engine.result.ResultProxy at 0x7fbf21388d90>

Create `sales` table from slides:

In [8]:
con.execute('''
    DROP TABLE IF EXISTS sales CASCADE
''')

<sqlalchemy.engine.result.ResultProxy at 0x7fbf4875fc70>

In [9]:
con.execute("CREATE SEQUENCE sales_id_seq")
con.execute("""
CREATE TABLE sales (
    id BIGINT DEFAULT NEXTVAL('sales_id_seq') PRIMARY KEY,
    customer_id BIGINT,
    product_id BIGINT,
    quantity BIGINT,
    FOREIGN KEY (customer_id) REFERENCES customers(id) ON DELETE CASCADE,
    FOREIGN KEY (product_id) REFERENCES products(id) ON DELETE CASCADE
)
""")

ProgrammingError: (psycopg2.errors.DuplicateTable) relation "sales_id_seq" already exists

[SQL: CREATE SEQUENCE sales_id_seq]
(Background on this error at: http://sqlalche.me/e/f405)

# Insert values

In [None]:
con.execute("INSERT INTO customers (name, address) VALUES ('Andy Brookes', 'London')")
con.execute("INSERT INTO customers (name, address) VALUES ('Jake Dunn', 'London')")
con.execute("INSERT INTO customers (name, address) VALUES ('Bill Gates', 'Seattle')")
con.execute("INSERT INTO customers (name, address) VALUES ('Steve Jobs', 'Palo Alto')")
con.execute("INSERT INTO customers (name, address) VALUES ('Random guy', NULL)")

In [None]:
con.execute("INSERT INTO products (name) VALUES ('iPhone')")
con.execute("INSERT INTO products (name) VALUES ('Xbox')")
con.execute("INSERT INTO products (name) VALUES ('PlayStation')")
con.execute("INSERT INTO products (name) VALUES ('Wii')")

In [None]:
con.execute("""
    INSERT INTO sales (customer_id, product_id, quantity)
    VALUES (1, 1, 1), (2, 2, 3), (3, 1, 2), (5, 3, 10)
""")

## Select results

The full `customers` table:

In [None]:
q = """
SELECT * 
FROM customers 
"""
df = pd.read_sql(q, con)
df

The full `products` table:

In [None]:
q = """
SELECT * 
FROM products 
"""
df = pd.read_sql(q, con)
df

The full `sales` table:

In [None]:
q = """
SELECT * 
FROM sales 
"""
df = pd.read_sql(q, con)
df

Perform a projection, i.e. taking vertical slices of a table, and only recording values once:

In [None]:
q = """
SELECT DISTINCT address 
FROM customers 
"""
df = pd.read_sql(q, con)
df

Perform a selection, i.e. filter rows of a table based on some predicate you are interested in:

In [None]:
q = """
SELECT name 
FROM customers 
WHERE address = 'London'
"""
df = pd.read_sql(q, con)
df

You can order your results:

In [None]:
q = """
SELECT name 
FROM customers 
ORDER BY name ASC
"""
df = pd.read_sql(q, con)
df

You can also aggregate results:

In [None]:
q = """
SELECT product_id, SUM(quantity) 
FROM sales 
GROUP BY product_id
"""
df = pd.read_sql(q, con)
df

In [None]:
q = """
SELECT product_id, SUM(quantity) AS orders
FROM sales 
GROUP BY product_id
"""
df = pd.read_sql(q, con)
df

Using `IN` in predicates, to check membership to a collection:

In [None]:
q = """
SELECT name 
FROM customers 
WHERE address IN ('Palo Alto', 'Seattle')
"""
df = pd.read_sql(q, con)
df

And of course if we negate...

In [None]:
q = """
SELECT name 
FROM customers 
WHERE address NOT IN ('Palo Alto', 'Seattle')
"""
df = pd.read_sql(q, con)
df

WÄT???? :|

What happened to `Random guy`?

NULL in SQL can be confusing. This is because you shouldn't think of them as a `None`, but rather as an `Unknown` value. As a result, e.g., `NULL IN <anything>` is `NULL`.

When writing predicates involving NULL remember not to use `=` but `IS`, this will make your life easier:

In [None]:
q = """
SELECT name 
FROM customers 
WHERE address IS NOT NULL
"""
df = pd.read_sql(q, con)
df

Set operations: `INTERSECT` and `UNION`

In [None]:
q = """
(SELECT name, address 
FROM customers
WHERE address IS NOT NULL)
INTERSECT
(SELECT name, address
FROM customers
WHERE id > 1)
"""
df = pd.read_sql(q, con)
df

In [None]:
q = """
(SELECT name, address 
FROM customers
WHERE address IS NOT NULL)
UNION
(SELECT name, address
FROM customers
WHERE id > 1)
"""
df = pd.read_sql(q, con)
df

Aggregating results from multiple tables with JOIN. 

In [None]:
q = """
SELECT products.name, sales.quantity 
FROM products
JOIN sales
ON products.id = sales.product_id
"""
df = pd.read_sql(q, con)
df

Example of LEFT JOIN

In [None]:
q = """
SELECT products.name, sales.quantity 
FROM products
LEFT OUTER JOIN sales
ON products.id = sales.product_id
"""
df = pd.read_sql(q, con)
df

Example of less common `JOIN`:

In [None]:
q = """
SELECT products.name, sales.quantity 
FROM products
FULL OUTER JOIN sales
ON products.id = sales.product_id
WHERE products.id IS NULL OR sales.id IS NULL
"""
df = pd.read_sql(q, con)
df

## Update values

Update statements can grow more complicated, but here is a taster:

In [None]:
con.execute("UPDATE customers SET name = 'Andrew Brookes' WHERE name = 'Andy Brookes'")
q = """
SELECT *
FROM customers
"""
df = pd.read_sql(q, con)
df

## Delete entries

A simple example of how to delete rows matching a filer:

In [None]:
con.execute("DELETE FROM customers WHERE name = 'Andrew Brookes'")
q = """
SELECT *
FROM customers
"""
df = pd.read_sql(q, con)
df

## Drop tables -- BE CAREFUL!!! :@

Deleting tables is not something you will have to do too often, so when doing this it is worth being extra careful. You do NOT want to be dropping tables late on a Friday night, especially if you want to avoid explaining a client/manager why you lost all your data :)

Let's remove the tables we have created so far, to leave us with a clean state for the exercise:

In [None]:
con.execute("DROP TABLE sales")
con.execute("DROP SEQUENCE sales_id_seq")

con.execute("DROP TABLE products")
con.execute("DROP SEQUENCE products_id_seq")

con.execute("DROP TABLE customers")
con.execute("DROP SEQUENCE customers_id_seq")