## Advanced Dataframes (review and practice)

In [24]:
import pandas as pd
import numpy as np
from pydataset import data
import matplotlib.pyplot as plt

## Exercise I

1. Create a function named get_db_url. It should accept a username, hostname, password, and database name and return a url connection string formatted like in the example at the start of this lesson.

In [2]:
def get_db_url(host, user, password, database):
    url = f'mysql+pymysql://{user}:{password}@{host}/{database}'
    return url

In [3]:
from env import host, user, password
url = get_db_url(host, user, password, 'employees')

In [4]:
pd.read_sql('SELECT * FROM employees LIMIT 5 OFFSET 50', url)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10051,1953-07-28,Hidefumi,Caine,M,1992-10-15
1,10052,1961-02-26,Heping,Nitsch,M,1988-05-21
2,10053,1954-09-13,Sanjiv,Zschoche,F,1986-02-04
3,10054,1957-04-04,Mayumi,Schueller,M,1995-03-13
4,10055,1956-06-06,Georgy,Dredge,M,1992-04-27


4. Read the employees and titles tables into two separate DataFrames.

In [5]:
titles_df = pd.read_sql('SELECT * FROM titles', url)
titles_df.sample(5)

Unnamed: 0,emp_no,title,from_date,to_date
177001,219498,Senior Staff,2000-03-18,9999-01-01
356779,441355,Engineer,1995-03-20,9999-01-01
57836,49093,Engineer,1996-07-01,1997-01-22
604,10416,Senior Staff,1996-03-24,9999-01-01
73704,59772,Senior Engineer,2001-04-02,9999-01-01


In [6]:
employees_df = pd.read_sql('SELECT * FROM employees', url)
employees_df.sample(5)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
29485,39486,1954-06-27,Brewster,Brandsma,M,1996-06-22
161468,261444,1961-03-31,Peer,Rabejac,F,1985-11-13
125320,225296,1956-11-21,Phuoc,Pulkowski,M,1993-12-15
208272,408248,1963-01-27,Fusako,Thiran,F,1989-10-13
1287,11288,1953-03-23,Domenico,Eastman,F,1989-01-22


5. How many rows and columns do you have in each DataFrame? 

In [7]:
titles_df.shape

(443308, 4)

In [8]:
employees_df.shape

(300024, 6)

6. Display the summary statistics for each DataFrame.

In [9]:
titles_df.describe()

Unnamed: 0,emp_no
count,443308.0
mean,253075.03443
std,161853.292613
min,10001.0
25%,84855.75
50%,249847.5
75%,424891.25
max,499999.0


In [10]:
employees_df.describe()

Unnamed: 0,emp_no
count,300024.0
mean,253321.763392
std,161828.23554
min,10001.0
25%,85006.75
50%,249987.5
75%,424993.25
max,499999.0


7. How many unique titles are in the titles DataFrame?

In [11]:
titles_df.title.value_counts() # or
titles_df['title'].value_counts()

Engineer              115003
Staff                 107391
Senior Engineer        97750
Senior Staff           92853
Technique Leader       15159
Assistant Engineer     15128
Manager                   24
Name: title, dtype: int64

In [12]:
# for an array
titles_df.title.unique()

array(['Senior Engineer', 'Staff', 'Engineer', 'Senior Staff',
       'Assistant Engineer', 'Technique Leader', 'Manager'], dtype=object)

8. What is the oldest date in the to_date column?

In [13]:
titles_df.to_date.sort_values().head(1)

16064    1985-03-01
Name: to_date, dtype: object

9. What is the most recent date in the to_date column?

In [14]:
titles_df.to_date.sort_values(ascending=False).head()

443307    9999-01-01
191723    9999-01-01
191707    9999-01-01
191712    9999-01-01
191714    9999-01-01
Name: to_date, dtype: object

#### Exercise II

1. Copy the users and roles DataFrames from the examples above.

In [15]:
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
users

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [16]:
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


2. What is the result of using a right join on the DataFrames?

In [19]:
# right join
(users.merge(roles, 
            left_on='role_id', 
            right_on='id', 
            how='right')
            )

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1,admin
1,2.0,joe,2.0,2,author
2,3.0,sally,3.0,3,reviewer
3,4.0,adam,3.0,3,reviewer
4,,,,4,commenter


In [21]:
# left join
(users.merge(roles, 
            left_on='role_id', 
            right_on='id', 
            how='left')
            )

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1,bob,1.0,1.0,admin
1,2,joe,2.0,2.0,author
2,3,sally,3.0,3.0,reviewer
3,4,adam,3.0,3.0,reviewer
4,5,jane,,,
5,6,mike,,,


3. What is the result of using an outer join on the DataFrames?

In [18]:
# outer join
(users.merge(roles, 
            left_on='role_id', 
            right_on='id', 
            how='outer')
            )

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1.0,admin
1,2.0,joe,2.0,2.0,author
2,3.0,sally,3.0,3.0,reviewer
3,4.0,adam,3.0,3.0,reviewer
4,5.0,jane,,,
5,6.0,mike,,,
6,,,,4.0,commenter


4. What happens if you drop the foreign keys from the DataFrames and try to merge them?

In [22]:
(users.merge(roles, 
            left_on='role_id', 
            right_on='id', 
            how='outer')
    .drop(columns='role_id')
    .drop(columns='id')
            )

KeyError: "['id'] not found in axis"

5. Load the mpg dataset from PyDataset.

In [27]:
mpg_df = data('mpg')
mpg_df.sample(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
159,pontiac,grand prix,5.3,2008,8,auto(s4),f,16,25,p,midsize
64,dodge,durango 4wd,5.9,1999,8,auto(l4),4,11,15,r,suv
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact
210,volkswagen,gti,2.0,2008,4,manual(m6),f,21,29,p,compact
152,nissan,pathfinder 4wd,3.3,1999,6,manual(m5),4,15,17,r,suv


6. Output and read the documentation for the mpg dataset.

In [28]:
data('mpg', show_doc=True)

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




7. How many rows and columns are in the dataset?

In [31]:
mpg_df.shape

(234, 11)

8. Check out your column names and perform any cleanup you may want on them.

In [35]:
mpg_df.rename(columns={'displ': 'display', 
                     'cyl': 'cylinder',
                     'trans': 'transmission',
                     'drv': 'drive',
                     'cty': 'city',
                     'hwy': 'highway',
                     'fl': 'fuel'}, inplace=True
            )

In [37]:
mpg_df.sample(5)

Unnamed: 0,manufacturer,model,display,year,cylinder,transmission,drive,city,highway,fuel,class
8,audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
33,chevrolet,malibu,2.4,1999,4,auto(l4),f,19,27,r,midsize
121,hyundai,tiburon,2.7,2008,6,manual(m6),f,16,24,r,subcompact
122,hyundai,tiburon,2.7,2008,6,manual(m5),f,17,24,r,subcompact


9. Display the summary statistics for the dataset.

In [33]:
mpg_df.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


10. How many different manufacturers are there?

In [43]:
len(mpg_df.manufacturer.unique())

15

12. Create a column named mileage_difference like you did in the DataFrames exercises;
    this column should contain the difference between highway and city mileage for each car.
