In [1]:
from sqlalchemy import create_engine
import pandas as pd 
import json

with open('/home/douglas/postgres_credentials.json') as f:
    data = json.load(f)
    username = data['username']
    password = data['password']
    address = data['address']

conn = create_engine('postgresql://{}:{}@{}:5432/ex_orders_random'.format(username, password, address))

  """)


# Check out table format

In [2]:
pd.read_sql_query("SELECT * FROM customer LIMIT 5", conn)
# emails don't match customer name - randomized dataset 

Unnamed: 0,customer_id,name,email,address,state
0,1000,Kristin Chan,alvarezlawrence@wolf.com,414 Scott Port,NE
1,1001,Valerie Dean,gardnernicole@hawkins.com,1172 Hudson Mission,UT
2,1002,Cristian Patterson,bruce24@hotmail.com,874 Scott Valley,WI
3,1003,Richard Johnson,williamsjoseph@gmail.com,4810 Compton Coves,WA
4,1004,Mark Rodriguez,christiangaines@harris.com,54950 Cheryl Pine,NY


In [3]:
pd.read_sql_query("SELECT * FROM product LIMIT 5", conn)

Unnamed: 0,product_id,product_name,price
0,8001,Rustic Steel Pizza,46.0
1,8002,Ergonomic Granite Soap,113.0
2,8003,Rustic Wooden Sausages,16.0
3,8004,Ergonomic Metal Gloves,17.0
4,8005,Awesome Cotton Ball,68.0


In [4]:
pd.read_sql_query("SELECT max(product_id), min(product_id) FROM product", conn)
# missing product id 8000, present in order_product

Unnamed: 0,max,min
0,8149,8000


In [5]:
# add 8000 -to allow copy of extraneous id generated in order_details.csv
# conn.execute("INSERT INTO product VALUES (8000, 'Missing', 88)")

In [6]:
# confirm added 
pd.read_sql_query("SELECT * FROM product ORDER BY product_id LIMIT 5", conn)

Unnamed: 0,product_id,product_name,price
0,8000,Missing,88.0
1,8001,Rustic Steel Pizza,46.0
2,8002,Ergonomic Granite Soap,113.0
3,8003,Rustic Wooden Sausages,16.0
4,8004,Ergonomic Metal Gloves,17.0


In [7]:
pd.read_sql_query("SELECT * FROM customer_order LIMIT 5", conn)

Unnamed: 0,order_id,customer_id,date_ordered,date_delivered
0,0,1006,2016-10-21,2016-10-31
1,1,1050,2016-10-21,2016-10-29
2,2,1152,2016-10-21,2016-10-27
3,3,1167,2016-10-21,2016-10-27
4,4,1191,2016-10-22,2016-10-27


In [8]:
pd.read_sql_query("SELECT * FROM order_product LIMIT 5", conn)

Unnamed: 0,order_id,product_id,qty
0,0,8048,9
1,1,8110,5
2,1,8051,9
3,1,8032,4
4,1,8091,4


In [9]:
pd.read_sql_query("SELECT count(*) FROM order_product WHERE product_id = 8000", conn)
# a good amount of orders involving product 8000

Unnamed: 0,count
0,48


In [10]:
pd.read_sql_query("SELECT max(product_id), min(product_id) FROM order_product", conn)
#confirm 8000 is the only extraneous product_id 

Unnamed: 0,max,min
0,8149,8000


In [11]:
# notably missing - total $ spent in an order

# Questions

In [12]:
# List the 10 most expensive products for sale, and their prices
pd.read_sql_query("SELECT * FROM product ORDER BY price DESC LIMIT 10", conn)

Unnamed: 0,product_id,product_name,price
0,8028,Incredible Granite Keyboard,115.0
1,8112,Ergonomic Concrete Cheese,114.0
2,8134,Fantastic Steel Towels,113.0
3,8002,Ergonomic Granite Soap,113.0
4,8035,Generic Concrete Soap,113.0
5,8091,Sleek Frozen Shirt,113.0
6,8009,Generic Metal Hat,113.0
7,8130,Small Metal Bike,112.0
8,8057,Awesome Metal Salad,111.0
9,8051,Handcrafted Frozen Pants,110.0


In [13]:
# Which states have more than 5 customers? Use the state column on the customer table. Count each customer on the table, regardless of whether they have ever bought anything.
pd.read_sql_query('''
SELECT state,
count(customer_id) 
FROM customer
GROUP BY state 
HAVING count(customer_id) > 5 
ORDER BY count DESC''', conn)
# HAVING has access to aggregate functions but not SELECT aliases
# ORDER for SQL execution
# 1. FROM 2. ON 3. OUTER 4. WHERE 5. GROUP BY 6. CUBE | ROLLUP 7. HAVING 8. SELECT 9. DISTINCT 10 ORDER BY 11. TOP

Unnamed: 0,state,count
0,AL,9
1,WY,8
2,IL,7
3,WV,7
4,FL,6
5,MS,6
6,ME,6


In [14]:
# Get the 17 customers that have made the largest number of orders. Include the name, address, state, and number of orders made.
pd.read_sql_query('''
SELECT c.name,
c.address,
c.state,
count(o.order_id) as num_orders
FROM customer c
JOIN customer_order o
ON c.customer_id = o.customer_id
GROUP BY 1,2,3 
ORDER BY num_orders DESC
LIMIT 17''', conn)

Unnamed: 0,name,address,state,num_orders
0,Joseph Ponce,93874 Esparza Mountain,KS,19
1,Andrew Fischer,7764 Brown Divide,ME,18
2,Sabrina Foster,5075 Mullins Drive Apt. 298,MD,17
3,George Davis MD,439 Chan Route,IL,16
4,Benjamin Brown,598 Moore Ports,TN,16
5,Eric Erickson,7751 Clark Lane,VA,15
6,Michelle Wright,565 Manuel Road Suite 343,IN,15
7,Emily Fritz,918 Renee Lights,AL,15
8,Jessica Burke,68160 Amanda Pike,NM,15
9,Edgar Perry,333 Jenna Bridge,AL,15


In [15]:
# Get all orders by customer 1026. Include the amount spent in each order, the order id, and the total number of distinct products purchased.
# 1. join order_product and product to calc quantity x price
# 2. join customer order to order_product to get all products ordered (count and sum) 

In [16]:
# filter to only orders made by customer 1026 
pd.read_sql_query("SELECT * FROM customer_order WHERE customer_id = 1026", conn)

Unnamed: 0,order_id,customer_id,date_ordered,date_delivered
0,59,1026,2016-11-16,2016-11-21
1,274,1026,2017-02-04,2017-02-11
2,387,1026,2017-03-17,2017-03-19
3,622,1026,2017-06-16,2017-06-21
4,844,1026,2017-09-01,2017-09-05
5,1795,1026,2018-08-09,2018-08-16
6,1992,1026,2018-10-18,2018-10-24


In [17]:
# calc product_totals
pd.read_sql_query("SELECT o.order_id, o.product_id, o.qty, p.price, p.price * o.qty total FROM order_product o JOIN product p ON o.product_id = p.product_id LIMIT 5", conn)

Unnamed: 0,order_id,product_id,qty,price,total
0,0,8048,9,100.0,900.0
1,1,8110,5,90.0,450.0
2,1,8051,9,110.0,990.0
3,1,8032,4,102.0,408.0
4,1,8091,4,113.0,452.0


In [18]:
# create previous query as view for easy access
conn.execute('''CREATE OR REPLACE VIEW product_totals AS 
SELECT o.order_id, 
o.product_id, 
o.qty, 
p.price, 
p.price * o.qty total 
FROM order_product o 
JOIN product p 
ON o.product_id = p.product_id ''')

<sqlalchemy.engine.result.ResultProxy at 0x7fe3876ffc18>

In [19]:
pd.read_sql_query('''
SELECT s.order_id, 
SUM(t.total) AS spent,
COUNT(t.product_id) AS num_distinct_product
FROM (SELECT * FROM customer_order WHERE customer_id = 1026) s
JOIN product_totals t
ON s.order_id = t.order_id
GROUP BY s.order_id
ORDER BY order_id
''', conn)

Unnamed: 0,order_id,spent,num_distinct_product
0,59,1086.0,5
1,274,912.0,4
2,387,190.0,1
3,622,1148.0,2
4,844,870.0,1
5,1795,317.0,2
6,1992,285.0,1


In [20]:
# Get the 10 customers that have spent the most. Give the customer_id and amount spent
pd.read_sql_query('''
SELECT o.customer_id, 
SUM(t.total) AS total_spent,
COUNT(o.order_id) AS num_distinct_orders
FROM customer_order o
JOIN product_totals t
ON o.order_id = t.order_id
GROUP BY o.customer_id
ORDER BY total_spent DESC
LIMIT 10
''', conn)
# SINCE JOIN will match all rows on right table if not unique, the single join gets all rows corresponding to each customer

Unnamed: 0,customer_id,total_spent,num_distinct_orders
0,1087,22632.0,68
1,1178,21972.0,59
2,1013,20568.0,60
3,1139,19881.0,63
4,1153,19791.0,55
5,1106,19182.0,57
6,1140,18979.0,53
7,1042,18091.0,53
8,1190,17990.0,56
9,1029,17958.0,60


In [21]:
# Repeat the previous question, but include the customer's name, address, and state, in addition to the customer id and total amount spent
pd.read_sql_query('''
SELECT c.customer_id,
c.name,
c.address,
c.state,
s.total_spent
FROM customer c
JOIN (SELECT o.customer_id, 
    SUM(t.total) AS total_spent,
    COUNT(o.order_id) AS num_distinct_orders
    FROM customer_order o
    JOIN product_totals t
    ON o.order_id = t.order_id
    GROUP BY o.customer_id
    ORDER BY total_spent DESC
    LIMIT 10) s 
ON c.customer_id = s.customer_id
ORDER BY s.total_spent DESC''', conn)
# order doesn't match up with expected output --FIXED 

Unnamed: 0,customer_id,name,address,state,total_spent
0,1087,Allison Hoffman,55218 Lam Key,KY,22632.0
1,1178,Jacqueline Frazier,85471 Davis Viaduct Suite 294,AK,21972.0
2,1013,Timothy Robertson,72067 Bridget Loaf Apt. 580,PA,20568.0
3,1139,Joseph Ponce,93874 Esparza Mountain,KS,19881.0
4,1153,Johnathan Charles,22678 Hartman Mission,HI,19791.0
5,1106,Andrew Fischer,7764 Brown Divide,ME,19182.0
6,1140,Jennifer Blake,9201 Andrea Courts Apt. 332,MI,18979.0
7,1042,Jessica Burke,68160 Amanda Pike,NM,18091.0
8,1190,Michelle Austin,856 Mills Lakes,MI,17990.0
9,1029,Jordan Rose,0537 Joel Ferry,MT,17958.0


In [22]:
# totals off because of missing product 8000 -- FIXED
# pd.read_sql_query('''
# SELECT co.customer_id,
# co.order_id,
# op.product_id
# FROM customer_order co
# JOIN order_product op
# ON co.order_id = op.order_id
# WHERE co.customer_id = 1153
# AND op.product_id = 8000''', conn)

In [23]:
# Find the 10 customers that spent the most in 2017. Give the name and amount spent. Take the date to be the order date (not the delivery date)
pd.read_sql_query('''
SELECT c.customer_id,
c.name,
t.total_spent as amt_spent
FROM customer c 
JOIN (SELECT s.customer_id, 
    SUM(t.total) AS total_spent,
    COUNT(s.order_id) AS num_distinct_orders
    FROM (SELECT * FROM customer_order 
            WHERE EXTRACT('year' FROM date_ordered) = 2017) s
    JOIN product_totals t
    ON s.order_id = t.order_id
    GROUP BY s.customer_id
    ORDER BY total_spent DESC
    LIMIT 10) t
ON c.customer_id = t.customer_id
ORDER BY amt_spent DESC
''', conn)
# again, missing product 8000 price -- FIXED

Unnamed: 0,customer_id,name,amt_spent
0,1120,Sabrina Foster,14986.0
1,1115,Emily Nelson,13480.0
2,1014,Timothy Marks,13266.0
3,1087,Allison Hoffman,11928.0
4,1181,Jeanne Casey,11789.0
5,1143,Dana Kline,11312.0
6,1103,Kristen Davies,11125.0
7,1106,Andrew Fischer,10659.0
8,1135,Emily Fritz,10628.0
9,1139,Joseph Ponce,10439.0


In [24]:
# TROUBLESHOOTING MISSING ENTRY
# pd.read_sql_query('''SELECT s.customer_id, 
#     s.order_id,
#     t.product_id,
#     t.qty,
#     t.total
#     FROM (SELECT * FROM customer_order 
#             WHERE EXTRACT('year' FROM date_ordered) = 2017) s
#     JOIN product_totals t
#     ON s.order_id = t.order_id
#     WHERE t.product_id = 8000
#     AND customer_id = 1120 
#     ''', conn)

In [25]:
# TROUBLESHOOTING MISSING ENTRY
# (14986-14458)/6 # 14986 is what Sabrina Foster spends in solution output
# 8000 should cost $88 each

In [26]:
# Which three products have we sold the most of? i.e. the greatest number of units?
pd.read_sql_query('''
SELECT product_id,
sum(qty) AS num_sold
FROM order_product
GROUP BY product_id
ORDER BY num_sold DESC
LIMIT 3''', conn)

Unnamed: 0,product_id,num_sold
0,8020,344
1,8070,315
2,8009,311


In [27]:
# What is the average number of days between order and delivery?
pd.read_sql_query('''
SELECT AVG(date_delivered - date_ordered)
FROM customer_order
''', conn)

Unnamed: 0,avg
0,5 days 21:48:14.400000


In [28]:
# What is the average number of days between order and delivery for each year? Take the year from the order date.
pd.read_sql_query('''
SELECT EXTRACT('year' FROM date_ordered) as order_year,
AVG(date_delivered - date_ordered) AS avg_delivery_time
FROM customer_order
GROUP BY order_year 
ORDER BY order_year
''', conn)

Unnamed: 0,order_year,avg_delivery_time
0,2016.0,5 days 19:12:00
1,2017.0,5 days 21:05:17.025440
2,2018.0,5 days 23:15:26.732673
