In [6]:
import pandas as pd
from env import get_db_url

In [19]:
# `url` is what later cells hand to `pd.read_sql` as the connection argument.
# Presumably a connection URL for the `employees` database -- confirm against env.get_db_url.
url = get_db_url("employees")

In [9]:
# Preview query: ten rows from `employees`. There is no ORDER BY, so which
# ten rows come back is left to the database.
query = "\nSELECT * FROM employees LIMIT 10\n"

In [10]:
# Run the preview query and render the resulting DataFrame as the cell output.
pd.read_sql(sql=query, con=url)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12
5,10006,1953-04-20,Anneke,Preusig,F,1989-06-02
6,10007,1957-05-23,Tzvetan,Zielinski,F,1989-02-10
7,10008,1958-02-19,Saniya,Kalloufi,M,1994-09-15
8,10009,1952-04-19,Sumant,Peac,F,1985-02-18
9,10010,1963-06-01,Duangkaew,Piveteau,F,1989-08-24


In [15]:
# Load the whole employees table and keep it under a name: a later cell calls
# `employees.describe(...)`, which fails on a fresh kernel if the result is
# only displayed and never assigned.
# `url` is already defined by an earlier cell, so it is reused rather than rebuilt.
employees = pd.read_sql("""SELECT * FROM employees""", url)
employees

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12
...,...,...,...,...,...,...
300019,499995,1958-09-24,Dekang,Lichtner,F,1993-01-12
300020,499996,1953-03-07,Zito,Baaz,M,1990-09-27
300021,499997,1961-08-03,Berhard,Lenart,M,1986-04-21
300022,499998,1956-09-05,Patricia,Breugel,M,1993-10-13


In [20]:
# Load the whole titles table into memory; later cells inspect it with pandas.
# Reuse the `url` built earlier (as the other query cells do) instead of
# calling get_db_url a second time.
titles = pd.read_sql("""SELECT * FROM titles""", url)

In [21]:
# Bare expression: renders the DataFrame as the cell output.
titles

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01
...,...,...,...,...
443303,499997,Engineer,1987-08-30,1992-08-29
443304,499997,Senior Engineer,1992-08-29,9999-01-01
443305,499998,Senior Staff,1998-12-27,9999-01-01
443306,499998,Staff,1993-12-27,1998-12-27


In [None]:
# How many unique titles are in the titles DataFrame?
# Do we use SQL? Do we use Pandas? When do we use both? Is it 50/50? 
# Welcome to Analysis Paralysis!
# "It depends"
# but WHAT does our answer depend on?

### How many unique titles are there?
- We could solve with SQL in one or two ways:
    - SELECT DISTINCT title FROM titles
    - SELECT title FROM titles GROUP BY title
- Or we could read the entire table into a pandas DataFrame
    - then call `.unique()` on the `title` column
   
- If you know you're going to need the whole titles table and its historic data for the rest of your notebook:
    - Then read the entire titles table into a dataframe
    - We can join dataframes together exactly how we joined tables together
    - Once you save a query to a dataframe, that dataframe exists in memory

In [22]:
# SQL approach: let the database de-duplicate, so only the distinct titles
# are transferred into the DataFrame.
sql = """
SELECT DISTINCT title from titles;
"""
unique_titles = pd.read_sql(sql=sql, con=url)
unique_titles

Unnamed: 0,title
0,Senior Engineer
1,Staff
2,Engineer
3,Senior Staff
4,Assistant Engineer
5,Technique Leader
6,Manager


In [23]:
# pandas approach, step 1: fetch the `title` value of every row (duplicates
# included) so that de-duplication can happen on the DataFrame side.
sql = """
SELECT title from titles;
"""
all_titles = pd.read_sql(sql=sql, con=url)
all_titles

Unnamed: 0,title
0,Senior Engineer
1,Staff
2,Senior Engineer
3,Engineer
4,Senior Engineer
...,...
443303,Engineer
443304,Senior Engineer
443305,Senior Staff
443306,Staff


In [24]:
# pandas approach, step 2: de-duplicate the column in memory.
all_titles["title"].unique()

array(['Senior Engineer', 'Staff', 'Engineer', 'Senior Staff',
       'Assistant Engineer', 'Technique Leader', 'Manager'], dtype=object)

Display the summary statistics for each DataFrame.

- I can look at all of my columns no matter the data type if I like.

In [21]:
# include="all" summarises every column, not only the numeric ones.
titles.describe(include="all")

Unnamed: 0,emp_no,title,from_date,to_date
count,443308.0,443308,443308,443308
unique,,7,6393,5888
top,,Engineer,1998-10-25,9999-01-01
freq,,115003,132,240124
mean,253075.03443,,,
std,161853.292613,,,
min,10001.0,,,
25%,84855.75,,,
50%,249847.5,,,
75%,424891.25,,,


In [22]:
# Same summary for the employees frame: every column, whatever its dtype.
employees.describe(include="all")

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
count,300024.0,300024,300024,300024,300024,300024
unique,,4750,1275,1637,2,5434
top,,1952-03-08,Shahab,Baba,M,1985-06-20
freq,,95,295,226,179973,132
mean,253321.763392,,,,,
std,161828.23554,,,,,
min,10001.0,,,,,
25%,85006.75,,,,,
50%,249987.5,,,,,
75%,424993.25,,,,,


___

In [24]:
# Summary of the single `title` column: count, number of distinct values,
# most frequent value and its frequency.
titles["title"].describe()

count       443308
unique           7
top       Engineer
freq        115003
Name: title, dtype: object

___

What is the oldest date in the to_date column?

In [25]:
# Sort ascending and keep only the first entry: the smallest `to_date`,
# shown together with the index label of the row it came from.
titles["to_date"].sort_values().iloc[:1]

16064    1985-03-01
Name: to_date, dtype: object