In [1]:
import json
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Read the database credentials from a JSON file kept outside the notebook so
# that no secrets are stored in the notebook itself.
with open('/home/douglas/postgres_credentials.json') as f:
    data = json.load(f)

username = data['username']
password = data['password']
address = data['address']

# Percent-encode the user name and password before building the URL: characters
# such as '@', ':' or '/' would otherwise be read as URL delimiters and corrupt
# the connection string. str() keeps non-string JSON values (e.g. a numeric
# password) working as they did with plain str.format().
# NOTE: despite its name, `conn` is a SQLAlchemy Engine, not an open connection.
conn = create_engine(
    'postgresql://{}:{}@{}:5432/ex_seattle_weather'.format(
        quote_plus(str(username)), quote_plus(str(password)), address
    )
)


# Check out table format

In [2]:
# Peek at five rows to see the table's columns and how the values are formatted.
pd.read_sql_query(sql='''SELECT * FROM weather LIMIT 5''', con=conn)

Unnamed: 0,date_weather,inches_rain,temp_max,temp_min,did_rain
0,1948-01-01,0.47,51.0,42.0,True
1,1948-01-02,0.59,45.0,36.0,True
2,1948-01-03,0.42,45.0,35.0,True
3,1948-01-04,0.31,45.0,34.0,True
4,1948-01-05,0.17,45.0,32.0,True


In [3]:
# Column names and data types of the weather table, read from information_schema.
column_types_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'weather'"
pd.read_sql_query(column_types_sql, conn)

Unnamed: 0,column_name,data_type
0,date_weather,timestamp without time zone
1,inches_rain,real
2,temp_max,real
3,temp_min,real
4,did_rain,boolean


In [4]:
# Total number of rows in the weather table.
pd.read_sql_query(sql="SELECT count(*) FROM weather", con=conn)

Unnamed: 0,count
0,25548


# SQL_practice Problems

In [5]:
# Select all rows from December 1st, 2000 to December 15th, 2000 (inclusive).
# date_weather is a timestamp, so BETWEEN ... AND '2000-12-15' would stop at
# 2000-12-15 00:00:00 and miss any later reading taken that day. A half-open
# range that ends at midnight of the 16th covers the whole of the 15th.
pd.read_sql_query(
    "SELECT * FROM weather "
    "WHERE date_weather >= '2000-12-01' AND date_weather < '2000-12-16'",
    conn,
)

Unnamed: 0,date_weather,inches_rain,temp_max,temp_min,did_rain
0,2000-12-01,0.04,55.0,39.0,True
1,2000-12-02,0.18,51.0,37.0,True
2,2000-12-03,0.0,44.0,34.0,False
3,2000-12-04,0.0,51.0,37.0,False
4,2000-12-05,0.0,50.0,36.0,False
5,2000-12-06,0.0,50.0,35.0,False
6,2000-12-07,0.0,40.0,34.0,False
7,2000-12-08,0.02,45.0,30.0,True
8,2000-12-09,0.06,43.0,36.0,True
9,2000-12-10,0.0,40.0,30.0,False


In [6]:
# Average daily maximum temperature per calendar year, for 2000 and later,
# sorted by year ascending. The subquery exposes the year of each row so the
# outer query can filter and group on it.
yearly_avg_high_sql = '''SELECT y.year::INTEGER, 
       avg(y.temp_max) as avg_high_temp
FROM (
    SELECT EXTRACT('year' FROM date_weather) as year, 
       temp_max 
    FROM weather) y
WHERE y.year >= 2000 
GROUP BY 1
ORDER BY 1'''
pd.read_sql_query(yearly_avg_high_sql, conn)


Unnamed: 0,year,avg_high_temp
0,2000,58.674863
1,2001,58.473973
2,2002,58.893151
3,2003,60.441096
4,2004,60.622951
5,2005,60.148352
6,2006,61.038356
7,2007,59.20274
8,2008,58.494536
9,2009,59.912329


In [7]:
# Standard deviation of the daily maximum temperature per calendar year, for
# 2000 and later, sorted by year ascending.
yearly_std_high_sql = '''SELECT y.year::INTEGER, 
       stddev(y.temp_max) std_dev_temp_max
FROM (
    SELECT EXTRACT('year' FROM date_weather) as year, 
       temp_max 
    FROM weather) y
WHERE y.year >= 2000 
GROUP BY 1
ORDER BY 1'''
pd.read_sql_query(yearly_std_high_sql, conn)

Unnamed: 0,year,std_dev_temp_max
0,2000,11.4863
1,2001,11.175302
2,2002,12.306171
3,2003,12.872151
4,2004,12.611146
5,2005,11.893002
6,2006,13.048248
7,2007,12.917
8,2008,12.997101
9,2009,14.232874


In [8]:
# Row count of the weather table.
pd.read_sql_query(sql="SELECT count (*) FROM weather", con=conn)

Unnamed: 0,count
0,25548


In [9]:
# The ten rows with the highest temp_max, i.e. the ten hottest days on record
# when "hottest" means the highest daily maximum temperature.
hottest_days_sql = '''SELECT * FROM weather ORDER BY temp_max desc LIMIT 10'''
pd.read_sql_query(hottest_days_sql, conn)

Unnamed: 0,date_weather,inches_rain,temp_max,temp_min,did_rain
0,2009-07-29,0.0,103.0,71.0,False
1,1994-07-20,0.0,100.0,65.0,False
2,1991-07-23,0.0,99.0,65.0,False
3,1960-08-09,0.0,99.0,59.0,False
4,1981-08-09,0.0,99.0,68.0,False
5,1960-08-08,0.0,98.0,66.0,False
6,1981-08-10,0.0,98.0,67.0,False
7,1979-07-16,0.0,98.0,63.0,False
8,1967-08-16,0.0,98.0,59.0,False
9,2007-07-11,0.0,98.0,61.0,False


In [10]:
# In 2016, what fraction of days did it rain?

# Method 1: numpy way
#import numpy as np
#np.mean(pd.read_sql_query('''SELECT did_rain FROM weather WHERE EXTRACT('year' FROM date_weather) = 2016 ''', conn))

In [11]:
# Method 2: restrict to 2016 in a subquery, then divide the number of rainy
# days by the number of days. The ::FLOAT cast prevents integer division.
rain_fraction_sql = '''
SELECT SUM(CASE WHEN did_rain THEN 1 ELSE 0 END) / COUNT(*)::FLOAT as rainy_days
FROM (SELECT did_rain FROM weather WHERE EXTRACT('year' FROM date_weather) = 2016) s '''
pd.read_sql_query(rain_fraction_sql, conn)

Unnamed: 0,rainy_days
0,0.469945


In [12]:
%%time 
# What is the 75th percentile for the amount of rain that fell on a day where there was some rain in 2016?
# Method 1 : count rows and take row#(count*0.75)
pd.read_sql_query('''
SELECT inches_rain 
FROM (
SELECT inches_rain, row_number() OVER (ORDER BY inches_rain)
FROM weather
WHERE EXTRACT('year' FROM date_weather) = 2016
AND did_rain) s
WHERE row_number = floor(172*0.75) ''', conn)
# needs magic number though 

CPU times: user 1.71 ms, sys: 1.14 ms, total: 2.85 ms
Wall time: 12.4 ms


Unnamed: 0,inches_rain
0,0.33


In [13]:
%%time 
#Method 2: find percent_rank closest to 0.75 
pd.read_sql_query('''
SELECT inches_rain, abs(percentile - 0.75) as dist 
FROM (SELECT inches_rain, percent_rank() OVER (ORDER BY inches_rain) percentile
FROM weather
WHERE EXTRACT('year' FROM date_weather) = 2016
AND did_rain  ORDER BY percentile) s 
ORDER BY dist LIMIT 1''', conn)

CPU times: user 4.86 ms, sys: 468 µs, total: 5.33 ms
Wall time: 11.1 ms


Unnamed: 0,inches_rain,dist
0,0.33,0.00731


In [14]:
# 75th percentile of daily rainfall over every day of 2016 (dry days included):
# rank the days with percent_rank() and keep the row closest to 0.75.
rain_p75_all_days_sql = '''
SELECT inches_rain, abs(percentile - 0.75) as dist 
FROM (SELECT inches_rain, percent_rank() OVER (ORDER BY inches_rain) percentile
FROM weather
WHERE EXTRACT('year' FROM date_weather) = 2016 ORDER BY percentile) s 
ORDER BY dist LIMIT 1'''
pd.read_sql_query(rain_p75_all_days_sql, conn)

Unnamed: 0,inches_rain,dist
0,0.15,0.002055


In [15]:
# The ten years whose July had the highest average daily maximum temperature,
# hottest first. The subquery exposes year and month so the outer query can
# filter on July and group by year.
hottest_julys_sql = '''
SELECT s.year::INTEGER,
       avg(temp_max) as avg_july_high_temp 
   FROM ( SELECT EXTRACT('year' FROM date_weather) as year,
       EXTRACT('month' FROM date_weather) as month, 
       temp_max 
       FROM weather ) s
   WHERE s.month = 7
   GROUP BY s.year
   ORDER BY 2 desc
   LIMIT 10
   '''
pd.read_sql_query(hottest_julys_sql, conn)

Unnamed: 0,year,avg_july_high_temp
0,2015,82.580645
1,1958,81.419355
2,2009,80.967742
3,1985,80.935484
4,2014,80.419355
5,1960,79.645161
6,1965,79.451613
7,1990,79.193548
8,2013,78.967742
9,2003,78.967742


In [16]:
# The ten years whose December had the lowest average daily minimum
# temperature, coldest first.
coldest_decembers_sql = '''
SELECT s.year::INTEGER,
       avg(temp_min) as avg_dec_min_temp 
   FROM ( SELECT EXTRACT('year' FROM date_weather) as year,
       EXTRACT('month' FROM date_weather) as month, 
       temp_min 
       FROM weather ) s
   WHERE s.month = 12
   GROUP BY s.year
   ORDER BY 2
   LIMIT 10
   '''
pd.read_sql_query(coldest_decembers_sql, conn)

Unnamed: 0,year,avg_dec_min_temp
0,1990,30.387097
1,1948,30.806452
2,1985,30.935484
3,1951,31.225806
4,1964,31.483871
5,1983,31.516129
6,1968,32.032258
7,2009,32.096774
8,1984,32.096774
9,1978,32.16129


In [17]:
# Same ranking of the ten coldest Decembers, with the averages rounded to three
# decimal places. temp_min is cast to numeric in the subquery because
# PostgreSQL's two-argument ROUND() takes a numeric value, not a real.
coldest_decembers_rounded_sql = '''
SELECT s.year::INTEGER,
       ROUND(avg(temp_min), 3) as avg_dec_min_temp
   FROM ( SELECT EXTRACT('year' FROM date_weather) as year,
       EXTRACT('month' FROM date_weather) as month, 
       temp_min::numeric
       FROM weather ) s
   WHERE s.month = 12
   GROUP BY s.year
   ORDER BY 2
   LIMIT 10
   '''
pd.read_sql_query(coldest_decembers_rounded_sql, conn)

Unnamed: 0,year,avg_dec_min_temp
0,1990,30.387
1,1948,30.806
2,1985,30.935
3,1951,31.226
4,1964,31.484
5,1983,31.516
6,1968,32.032
7,2009,32.097
8,1984,32.097
9,1978,32.161


In [18]:
# Given the results of the previous queries, would it be fair to use this data to claim that 2015 had the "hottest July on record"? Why or why not?
# Only for Seattle, if at all. The query ranks years solely by the average daily maximum temperature (temp_max); other metrics, such as sustained heat over consecutive days or the number of days above a threshold, might rank other years higher.

In [19]:
# The ten years with the hottest average July maximum temperature, hottest
# first, together with the standard deviation of the July daily maxima for
# each of those years.
hottest_julys_spread_sql = '''
SELECT s.year::INTEGER,
       avg(temp_max) as avg_july_high_temp,
       stddev(temp_max) as std_july_high_temp 
   FROM ( SELECT EXTRACT('year' FROM date_weather) as year,
       EXTRACT('month' FROM date_weather) as month, 
       temp_max 
       FROM weather ) s
   WHERE s.month = 7
   GROUP BY s.year
   ORDER BY 2 desc
   LIMIT 10
   '''
pd.read_sql_query(hottest_julys_spread_sql, conn)

Unnamed: 0,year,avg_july_high_temp,std_july_high_temp
0,2015,82.580645,7.944701
1,1958,81.419355,7.957278
2,2009,80.967742,10.077976
3,1985,80.935484,6.752459
4,2014,80.419355,7.21468
5,1960,79.645161,8.097524
6,1965,79.451613,7.579968
7,1990,79.193548,8.565121
8,2013,78.967742,6.20475
9,2003,78.967742,6.615557


In [20]:
# Average inches of rain per day for each calendar month, where the average is
# taken over all days from 2000 through 2010 (inclusive), ordered by month.
monthly_avg_rain_sql = '''
SELECT s.month::INTEGER,
       avg(s.inches_rain) as avg_daily_inches_rain 
   FROM ( SELECT EXTRACT('year' FROM date_weather) as year,
       EXTRACT('month' FROM date_weather) as month, 
       inches_rain
       FROM weather) s
   WHERE s.year BETWEEN 2000 AND 2010
   GROUP BY s.month
   ORDER BY s.month   '''
pd.read_sql_query(monthly_avg_rain_sql, conn)

Unnamed: 0,month,avg_daily_inches_rain
0,1,0.191613
1,2,0.094277
2,3,0.113578
3,4,0.085364
4,5,0.068035
5,6,0.050182
6,7,0.016129
7,8,0.03437
8,9,0.05693
9,10,0.115543
