In [6]:
import pandas as pd
import numpy as np
from pydataset import data

In [7]:
from env import host, user, password

url = f'mysql+pymysql://{user}:{password}@{host}/employees'

1. Load the `mpg` dataset. Read the documentation for it, and use the data to answer these questions:

In [19]:
mpg = data('mpg')
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


    - On average, which manufacturer has the best miles per gallon?

In [32]:
#create a variable avg_mpg to get average of hwy and cty
avg_mpg = mpg[['hwy', 'cty']].agg('mean', axis = 1)
#add a column to represent the average mileage for all vehicles
mpg['average_mileage'] = avg_mpg
#set a variable to hold a series grouped by manufacturer showing each average mileage from highest to lowest
mpg_manufacturer = mpg.groupby('manufacturer').average_mileage.max().sort_values(ascending=False)
#return the manufacturer with best mpg
mpg_manufacturer.nlargest().head(1)

manufacturer
volkswagen    39.5
Name: average_mileage, dtype: float64

    - How many different manufacturers are there?

In [40]:
#alternate solution
#len(mpg_manufacturer)
len(mpg['manufacturer'].value_counts())

15

    - How many different models are there?

In [48]:
#alternate solution
#len(mpg['model'].value_counts())
len(mpg.groupby('model'))

38

    - Do automatic or manual cars have better miles per gallon?

In [78]:
trans_avg_mpg = mpg.groupby(['trans']).average_mileage.agg(['mean'])
auto = trans_avg_mpg.iloc[:8].mean()
manual = trans_avg_mpg.iloc[8:].mean()
print(f'{manual}: manual')
print(f'{auto}: auto')

mean    21.664247
dtype: float64: manual
mean    20.773042
dtype: float64: auto


2. Joining and Merging

    Copy the `users` and `roles` dataframes from the examples above. What do you think a `right` join would look like? An `outer` join? What happens if you drop the foreign keys from the dataframes and try to merge them?

In [79]:
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
users

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [80]:
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


In [82]:
# a right join would exclude the values from users that dont have users.role_id = roles.id
# the commenter would not have values for users.id and users.name
pd.merge(users,roles, left_on='role_id', right_on='id', how='right')

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1,admin
1,2.0,joe,2.0,2,author
2,3.0,sally,3.0,3,reviewer
3,4.0,adam,3.0,3,reviewer
4,,,,4,commenter


In [83]:
# an outer join will include all values from each including nulls/NaNs
pd.merge(users,roles, left_on='role_id', right_on='id', how='outer')

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1.0,admin
1,2.0,joe,2.0,2.0,author
2,3.0,sally,3.0,3.0,reviewer
3,4.0,adam,3.0,3.0,reviewer
4,5.0,jane,,,
5,6.0,mike,,,
6,,,,4.0,commenter


In [91]:
# if you dropped the foreign keys from the dataframes they wouldn't be able to merge
# because they have no place to merge from

3. Getting data from SQL databases

        a. Create a function named get_db_url. It should accept a username, hostname, password, and database name and return a url formatted like in the examples in this lesson.

        b. Use your function to obtain a connection to the employees database.

        c. Once you have successfully run a query:
            - Intentionally make a typo in the database url. What kind of error message do you see?
            - Intentionally make an error in your SQL query. What does the error message look like?

        d. Read the employees and titles tables into two separate dataframes

        e. Visualize the number of employees with each title.

        f. Join the employees and titles dataframes together.

        g. Visualize how frequently employees change titles.

        h. For each title, find the hire date of the employee that was hired most recently with that title.

        i. Write the code necessary to create a cross tabulation of the number of titles by department. (Hint: this will involve a combination of SQL and python/pandas code)

4. Use your get_db_url function to help you explore the data from the chipotle database. Use the data to answer the following questions:

    - What is the total price for each order?

    - What are the most popular 3 items?

    - Which item has produced the most revenue?