In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pydataset import data

Attendance Data

Load the attendance.csv file and calculate an attendance percentage for each student.
One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

In [11]:
attendance = pd.read_csv('untidy-data/attendance.csv')
attendance.head()

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [12]:
attendance.shape

(4, 9)

In [13]:
attendance.info

<bound method DataFrame.info of   Unnamed: 0 2018-01-01 2018-01-02 2018-01-03 2018-01-04 2018-01-05  \
0      Sally          P          T          T          H          P   
1       Jane          A          P          T          T          T   
2      Billy          A          T          A          A          H   
3       John          P          T          H          P          P   

  2018-01-06 2018-01-07 2018-01-08  
0          A          T          T  
1          T          A          T  
2          T          P          T  
3          T          P          P  >

In [14]:
attendance = attendance.rename(columns={'Unnamed: 0': 'Name'})

In [15]:
attendance = pd.melt(attendance, id_vars='Name', var_name = 'date', value_name= 'attendance')
attendance

Unnamed: 0,Name,date,attendance
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T
5,Jane,2018-01-02,P
6,Billy,2018-01-02,T
7,John,2018-01-02,T
8,Sally,2018-01-03,T
9,Jane,2018-01-03,T


In [24]:
# One method
# df.loc[df['att'] == 'P', 'percent'] = 1

# other method
conditions = [
    (attendance['attendance'] == 'P'),
    (attendance['attendance'] == 'T'),
    (attendance['attendance'] == 'A'),
    (attendance['attendance'] == 'H')
]

values = [1,.9,0,0.5]

In [25]:
attendance['attendance_value'] = np.select(conditions,values)

In [26]:
attendance.head()

Unnamed: 0,Name,date,attendance,attendance_value
0,Sally,2018-01-01,P,1.0
1,Jane,2018-01-01,A,0.0
2,Billy,2018-01-01,A,0.0
3,John,2018-01-01,P,1.0
4,Sally,2018-01-02,T,0.9


In [30]:
attendance_percent = attendance.groupby('Name').attendance_value.mean()
attendance_percent

Name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: attendance_value, dtype: float64

Coffee Levels

- Read the coffee_levels.csv file.
- Transform the data so that each carafe is in it's own column.
- Is this the best shape for the data?

In [35]:
# Read the coffee_levels.csv file.
coffee = pd.read_csv('untidy-data/coffee_levels.csv')
coffee.sample(5)

Unnamed: 0,hour,coffee_carafe,coffee_amount
7,15,x,0.215043
3,11,x,0.335533
12,10,y,0.023163
0,8,x,0.816164
2,10,x,0.843279


In [37]:
# Transform the data so that each carafe is in it's own column.
coffee = coffee.pivot_table(index='hour', columns='coffee_carafe')
coffee.sample(5)

Unnamed: 0_level_0,coffee_amount,coffee_amount,coffee_amount
coffee_carafe,x,y,z
hour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
10,0.843279,0.023163,0.144928
8,0.816164,0.189297,0.999264
15,0.215043,0.144644,0.436364
14,0.507288,0.058361,0.864464
9,0.451018,0.521502,0.91599


This gives us a good visual to the eye but it is not good for programs to read.

Cake Recipes

- Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
- Tidy the data as necessary.
- Which recipe, on average, is the best? recipe b
- Which oven temperature, on average, produces the best results? 275
- Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [38]:
# melt is best for aggregation, pivot best for presentation
cake_recipe = pd.read_csv('untidy-data/cake_recipes.csv')
cake_recipe.sample(5)

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
7,d:top,96.873178,76.101363,59.57162,50.971626
1,a:top,51.709751,52.009735,68.576858,50.22847
5,c:top,71.306308,82.795477,92.098049,53.960273
4,c:bottom,96.470207,52.001358,92.893227,65.473084


In [39]:
# Tidy the data as necessary.
cake_recipe = cake_recipe.melt(id_vars='recipe:position', var_name='temp')
cake_recipe.sample(5)

Unnamed: 0,recipe:position,temp,value
12,c:bottom,250,52.001358
26,b:bottom,300,99.248541
8,a:bottom,250,53.912627
9,a:top,250,52.009735
18,b:bottom,275,61.19698


In [40]:
# here we splut the recipe:position columns into two separate columns
cake_recipe[['recipe','position']] = cake_recipe['recipe:position'].str.split(':', expand=True)
cake_recipe.sample(5)

Unnamed: 0,recipe:position,temp,value,recipe,position
14,d:bottom,250,58.670419,d,bottom
16,a:bottom,275,74.41473,a,bottom
11,b:top,250,95.224151,b,top
1,a:top,225,51.709751,a,top
27,b:top,300,58.169349,b,top


In [41]:
# here we will drop the recipe:position column
cake_recipe.drop(columns='recipe:position', inplace=True)

In [42]:
cake_recipe.sample(5)

Unnamed: 0,temp,value,recipe,position
25,300,50.22847,a,top
23,275,59.57162,d,top
26,300,99.248541,b,bottom
27,300,58.169349,b,top
29,300,53.960273,c,top


In [47]:
# Which recipe, on average, is the best? recipe b
cake_recipe.groupby('recipe').value.mean().nlargest()

recipe
b    76.736074
c    75.874748
a    63.922201
d    62.864844
Name: value, dtype: float64

In [49]:
# Which oven temperature, on average, produces the best results? 275
cake_recipe.groupby('temp').value.mean().nlargest()

temp
275    74.886754
225    71.306022
300    66.627655
250    66.577437
Name: value, dtype: float64

In [58]:
# Which combination of recipe, rack position, and temperature gives the best result? 
# recipe b, bottom rack, 300 degrees
# attempt 1: cake_recipe.groupby('recipe').value.nlargest(1)


cake_recipe.value_counts().nlargest(1)

temp  value      recipe  position
300   99.248541  b       bottom      1
dtype: int64

### Extra Practice: Gapminder

In [59]:
gm1 = pd.read_csv('untidy-data/gapminder1.csv')
gm2 = pd.read_csv('untidy-data/gapminder2.csv')

In [60]:
gm1.sample(5)

Unnamed: 0,year,country,measure,measurement
1989,2000,Rwanda,fertility,6.01
552,1965,Peru,pop,11467300.0
2078,2005,Venezuela,fertility,2.547
49,1980,Austria,pop,7549433.0
1911,1995,Nigeria,fertility,6.246


In [61]:
gm2.sample(5)

Unnamed: 0,country,life_expect_1955,life_expect_1960,life_expect_1965,life_expect_1970,life_expect_1975,life_expect_1980,life_expect_1985,life_expect_1990,life_expect_1995,...,pop_1960,pop_1965,pop_1970,pop_1975,pop_1980,pop_1985,pop_1990,pop_1995,pop_2000,pop_2005
46,North Korea,52.681,55.292,57.716,62.612,64.766,67.123,69.81,72.244,74.647,...,24784140,28705000,32241000,35281000,38124000,40806000,42869000,45264146,47351083,48640671
14,Colombia,55.118,57.863,59.963,61.623,63.837,66.653,67.768,68.421,70.313,...,15952727,18646175,21429658,24114177,26582811,29678395,32858579,36280883,39685655,42954279
24,Georgia,62.625,64.644,66.654,68.158,69.634,69.638,70.45,70.465,70.49,...,4146570,4464959,4694491,4897656,5045697,5192957,5426207,5012952,4777209,4677401
44,New Zealand,70.26,71.24,71.52,71.89,72.22,73.84,74.32,76.33,77.55,...,2371746,2640400,2828050,3117800,3170150,3298050,3359604,3565990,3819762,4035461
33,Iran,47.181,49.325,52.469,55.234,57.702,59.62,63.04,65.742,68.042,...,21577000,25000000,28933000,33379000,39583397,48439952,57035717,61628116,65660289,68017860
