In [1]:
from sqlalchemy import create_engine
import pandas as pd
from warnings import filterwarnings
import pymysql
filterwarnings('ignore', category=pymysql.Warning)
import os


In [2]:
engine = create_engine('mysql+pymysql://root:kcmo1728@localhost')  # connect to server
engine.execute("create database if not exists food_truck") #create db

<sqlalchemy.engine.result.ResultProxy at 0x221c3cb0898>

In [3]:
engine = create_engine('mysql+pymysql://root:kcmo1728@localhost/food_truck')

In [4]:
def RunSQL(sql_command):
    connection = pymysql.connect(host='localhost',
                             user='root',
                             password='kcmo1728',
                             db='food_truck',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            commands = sql_command.split(';')
            for command in commands:
                if command == '\n': continue
                cursor.execute(command + ';')
                connection.commit()
    except Exception as e: 
        print(e)
    finally:
        connection.close()

In [5]:
sql_query = """
drop table if exists OrderItems;
drop table if exists Products;
drop table if exists Orders; 
drop table if exists Customers;
"""
RunSQL(sql_query)

# Normalization

When we import a csv file, usually it is what is called a 'denormalized form'. 

For instance, pretend we start a food truck. We don't have time to create a database, so we'll make a simple csv file to get things rolling:



In [6]:
file_name = os.path.join('resources', 'food_truck.csv')
data_frame = pd.read_csv(file_name)
data_frame.head()

data_frame.to_sql(con=engine, name='orders', if_exists='replace', index=False)
orders = pd.read_sql_query('select * from Orders', engine)
orders.head()

Unnamed: 0,order_id,email,name,items,price
0,1,joe@example.com,Joe Tonks,'pollo burrito. diet coke',12.5
1,2,jill@example.com,Jill Jones,'carne asada. sprite',14.5


In [7]:
sql_query = """
select sum(price) as Total from Orders 
"""
order_items = pd.read_sql_query(sql_query, engine)
order_items.head()

Unnamed: 0,Total
0,27.0


* There are a couple issues with maintaining the order database above. Any thoughts?

* Our friend, a DBA, tells us we should follow the 3 Step Process for Normalizing our database.

## First Normal Form. Atomic Values

* First normal form says that values in a record need to be atomic, and not composed of embedded arrays. Eliminate all repeating groups. 

* We used an array for `items`, so that should be broken out.

In [8]:
sql_query = """
drop table orders;
create table Orders(
 order_id integer,
 email varchar(1000),
 name varchar(1000),
 item varchar(1000),
 price decimal(10,2)
);

insert into Orders(order_id, email, name, item, price)
values (1, 'joe@example.com', 'Joe Tonks', 'pollo burrito', 8.00),
(1, 'joe@example.com', 'Joe Tonks', 'diet coke', 2.50),
(2, 'jill@example.com', 'Jill Jones', 'Carne Asada', 12.00),
(2, 'jill@example.com', 'Jill Jones', 'Sprite', 2.50);
"""
RunSQL(sql_query)

* Notice that in 1NF - First Normal Form, it's a little harder to type in orders. I can't just chain them on one line, I have to **repeat** the email and name information each time.

In [9]:
orders = pd.read_sql_query('select * from Orders', engine)
orders.head()

Unnamed: 0,order_id,email,name,item,price
0,1,joe@example.com,Joe Tonks,pollo burrito,8.0
1,1,joe@example.com,Joe Tonks,diet coke,2.5
2,2,jill@example.com,Jill Jones,Carne Asada,12.0
3,2,jill@example.com,Jill Jones,Sprite,2.5


## Second Normal Form. Columns Depend on Single Primary Key

* 2NF assumes that you have achieved 1NF. 

* 2NF is about eliminating redundant data. A single row is an entity - it is one thing. For 2NF we have to uniquely identify columns that define the `entities` in our table. 

* order_id, items and price have nothing to do with email or name. So we are not in 2NF

In [14]:
sql_query = """
drop table if exists Orders;
drop table if exists Customers;

create table customers (
 email varchar(254),
 name varchar(1000),
 primary key (email)
);

create table orders (
 order_id integer,
 customer_id varchar(254),
 item varchar(1000),
 price decimal(10,2),
 foreign key (customer_id) references Customers(email)
);

insert into customers(email, name) values 
('joe@example.com', 'Joe Tonks'),
('jill@example.com', 'Jill Jones');

insert into Orders(order_id, customer_id, item, price) values 
 (1, 'joe@example.com', 'pollo burrito', 8.00),
 (1, 'joe@example.com', 'diet coke', 2.50),
 (2, 'jill@example.com', 'Carne Asada', 12.00),
 (2, 'jill@example.com', 'Sprite', 2.50);
"""
RunSQL(sql_query)

In [15]:
orders = pd.read_sql_query('select * from orders', engine)
orders.head()

Unnamed: 0,order_id,customer_id,item,price
0,1,joe@example.com,pollo burrito,8.0
1,1,joe@example.com,diet coke,2.5
2,2,jill@example.com,Carne Asada,12.0
3,2,jill@example.com,Sprite,2.5


In [16]:
customers = pd.read_sql_query('select * from customers', engine)
customers.head()
# Here, email is the primary key. email isn't great as a primary key because people change their email.
# That said, we are moving in the right direction and are in 2NF...

Unnamed: 0,email,name
0,jill@example.com,Jill Jones
1,joe@example.com,Joe Tonks


## Third Normal Form. Eliminate Non-dependent Columns.

* non-keys describe the key and nothing else
* our orders table has repeated values for the key (which is a no-no) and the price of each item has nothing to do with the order.


In [17]:
sql_query = """
drop table if exists OrderItems;
drop table if exists Products;
drop table if exists Orders;
drop table if exists Customers;


create table Products (
 sku varchar(254) not null,
 price decimal(10,2),
 name varchar(254),
 primary key (sku)
);

create table Customers (
 customer_id integer auto_increment not null, # better to use a unique id than email
 email varchar(254),
 name varchar(1000),
 primary key (customer_id)
);

create table Orders (
 order_id integer not null,
 customer_id integer not null,
 date datetime,
 primary key(order_id),
 foreign key (customer_id) references Customers(customer_id)
);

create table OrderItems(
 order_item_id integer auto_increment not null,
 order_id integer not null,
 sku varchar(254) not null,
 primary key(order_item_id),
 foreign key (order_id) references Orders(order_id)
);

# email address changed to integer.
insert into Customers(email, name) values 
 ('joe@example.com', 'Joe Tonks'),
 ('jill@example.com', 'Jill Jones');
    
insert into Products(sku, price, name) values 
 ('PB', 8.00, 'pollo burrito'),
 ('DC',2.50, 'diet coke'),
 ('CA',12.00, 'carne asada'),
 ('S', 2.50, 'sprite');
    
insert into Orders(order_id, customer_id, date) values 
 (1, 1, '2012-06-18'),
 (2, 2,'2012-11-12');
    
insert into OrderItems(order_id, sku) values 
 (1, 'PB'),
 (1, 'DC'),
 (2, 'CA'),
 (2, 'S');
"""
RunSQL(sql_query)

In [18]:
customers = pd.read_sql_query('select * from Customers', engine)
customers.head()

Unnamed: 0,customer_id,email,name
0,1,joe@example.com,Joe Tonks
1,2,jill@example.com,Jill Jones


In [19]:
orders = pd.read_sql_query('select * from Orders', engine)
orders.head()

Unnamed: 0,order_id,customer_id,date
0,1,1,2012-06-18
1,2,2,2012-11-12


In [20]:
products = pd.read_sql_query('select * from Products', engine)
products.head()

Unnamed: 0,sku,price,name
0,CA,12.0,carne asada
1,DC,2.5,diet coke
2,PB,8.0,pollo burrito
3,S,2.5,sprite


In [21]:
order_items = pd.read_sql_query('select * from OrderItems', engine)
order_items.head()

Unnamed: 0,order_item_id,order_id,sku
0,1,1,PB
1,2,1,DC
2,3,2,CA
3,4,2,S


## Is that all? 

* Wait, there's more.

* Let's say we wanted to see our total sales. We might do a query like this:

In [128]:
sql_query = """
select sum(price) from OrderItems 
inner join Products on 
Products.sku = OrderItems.sku
"""
order_items = pd.read_sql_query(sql_query, engine)
order_items.head()

Unnamed: 0,sum(price)
0,25.0


## Great! We have our Normalized Database

* Now, we've been using our database for a while and business is a success!
* We want to raise the price of our pollo burrito to $9.00

* What happens to our database?

In [129]:
sql_query = """
update products
set price=9.00
where sku = 'PB';
"""
RunSQL(sql_query)

In [130]:
products = pd.read_sql_query('select * from Products', engine)
products.head()

Unnamed: 0,sku,price,name
0,CA,12.0,carne asada
1,DC,2.5,diet coke
2,PB,9.0,pollo burrito
3,S,2.5,sprite


In [131]:
sql_query = """
select sum(price) from OrderItems 
inner join Products on 
Products.sku = OrderItems.sku
"""
order_items = pd.read_sql_query(sql_query, engine)
order_items.head()

Unnamed: 0,sum(price)
0,26.0


### Yikes!! We just messed up our total sales value!!

* We have a database with `slowly changing dimensions`. That is, our price isn't one price for eternity, but a snapshot at a moment in time. We need to fix our prices table!
 
* There are historically, a bunch of ways to fix this. See https://en.wikipedia.org/wiki/Slowly_changing_dimension. Also, new databases like Microsoft SQL Server have Temporal Tables specifically for this issue. To the best of my knowledge MySQL does not have this feature, so we'll do it the formal way.

* I solved with type 2 slowly changing dimension.

In [132]:
sql_query = """
drop table if exists OrderItems;
drop table if exists Products;
drop table if exists Orders;
drop table if exists Customers;


create table Products (
 sku varchar(254) not null,
 price decimal(10,2),
 name varchar(254),
 valid_from datetime,
 valid_to datetime,
 index (sku)
);

create table Customers (
 customer_id integer auto_increment not null, # better to use a unique id than email
 email varchar(254),
 name varchar(1000),
 primary key (customer_id)
);

create table Orders (
 order_id integer not null,
 customer_id integer not null,
 date datetime,
 primary key(order_id),
 foreign key (customer_id) references Customers(customer_id)
);

create table OrderItems(
 order_item_id integer auto_increment not null,
 order_id integer not null,
 sku varchar(254) not null,
 primary key(order_item_id),
 foreign key (order_id) references Orders(order_id)
);

# email address changed to integer.
insert into Customers(email, name) values 
 ('joe@example.com', 'Joe Tonks'),
 ('jill@example.com', 'Jill Jones');
    
insert into Products(sku, price, name, valid_from, valid_to) values 
 ('PB', 8.00, 'pollo burrito', '2012-05-01', '2018-02-01'),
 ('PB', 8.00, 'pollo burrito', '2018-02-01', '9999-12-31'),
 ('DC',2.50, 'diet coke', '2012-05-01', '9999-12-31'),
 ('CA',12.00, 'carne asada', '2012-05-01', '9999-12-31'),
 ('S', 2.50, 'sprite', '2012-05-01', '9999-12-31');
    
insert into Orders(order_id, customer_id, date) values 
 (1, 1, '2012-06-18'),
 (2, 2,'2012-11-12');
    
insert into OrderItems(order_id, sku) values 
 (1, 'PB'),
 (1, 'DC'),
 (2, 'CA'),
 (2, 'S');
"""
RunSQL(sql_query)

In [133]:
sql_query = """
select * from OrderItems 
inner join Products on 
Products.sku = OrderItems.sku
inner join Orders on
Orders.order_id = OrderItems.order_id
"""
order_items = pd.read_sql_query(sql_query, engine)
order_items.head()

Unnamed: 0,order_item_id,order_id,sku,sku.1,price,name,valid_from,valid_to,order_id.1,customer_id,date
0,1,1,PB,PB,8.0,pollo burrito,2012-05-01,2018-02-01 00:00:00,1,1,2012-06-18
1,1,1,PB,PB,8.0,pollo burrito,2018-02-01,9999-12-31 00:00:00,1,1,2012-06-18
2,2,1,DC,DC,2.5,diet coke,2012-05-01,9999-12-31 00:00:00,1,1,2012-06-18
3,3,2,CA,CA,12.0,carne asada,2012-05-01,9999-12-31 00:00:00,2,2,2012-11-12
4,4,2,S,S,2.5,sprite,2012-05-01,9999-12-31 00:00:00,2,2,2012-11-12


In [134]:
sql_query = """
select sum(price) from OrderItems 
inner join Orders on
Orders.order_id = OrderItems.order_id
inner join Products on 
Products.sku = OrderItems.sku
where Orders.date <= Products.valid_to and Orders.date >= Products.valid_from
"""
order_items = pd.read_sql_query(sql_query, engine)
order_items.head()

Unnamed: 0,sum(price)
0,25.0
