# Tidy Data Lesson
#### Corey Solitaire
#### 9/10/2020

In [1]:
import pandas as pd
import numpy as np
from pydataset import data
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Attendance Data

#### - Load the attendance.csv file.

In [2]:
# Read the raw attendance sheet and print its column/dtype summary.
attendance_path = 'attendance.csv'
df = pd.read_csv(attendance_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4 non-null      object
 1   2018-01-01  4 non-null      object
 2   2018-01-02  4 non-null      object
 3   2018-01-03  4 non-null      object
 4   2018-01-04  4 non-null      object
 5   2018-01-05  4 non-null      object
 6   2018-01-06  4 non-null      object
 7   2018-01-07  4 non-null      object
 8   2018-01-08  4 non-null      object
dtypes: object(9)
memory usage: 416.0+ bytes


In [3]:
# Preview the first five rows of the raw attendance data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [4]:
# The student names arrive under pandas' placeholder header; give that
# column a descriptive name.
df = df.rename(columns={'Unnamed: 0': 'name'})
df.head()

Unnamed: 0,name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


#### - Calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

In [5]:
# Daily credit for each attendance code: present is a full day, absent is
# none, a half day is 0.5, and a tardy is 0.9 (ten tardies cost one absence).
attendance_credit = {'P': 1.0, 'A': 0.0, 'T': 0.9, 'H': 0.5}
df = df.replace(attendance_credit)
df

Unnamed: 0,name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,1.0,0.9,0.9,0.5,1.0,0.0,0.9,0.9
1,Jane,0.0,1.0,0.9,0.9,0.9,0.9,0.0,0.9
2,Billy,0.0,0.9,0.0,0.0,0.5,0.9,1.0,0.9
3,John,1.0,0.9,0.5,1.0,1.0,0.9,1.0,1.0


In [6]:
# Average the daily credit over the date columns only. Summing the whole
# frame would also pull in the string `name` column (a TypeError on
# pandas >= 2.0) and, on a re-run, the previously computed percentage.
# Taking the mean also avoids hard-coding the number of school days.
daily_credit = df.drop(columns=['name', 'attend_percent'], errors='ignore')
df['attend_percent'] = daily_credit.astype(float).mean(axis=1)
df

Unnamed: 0,name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08,attend_percent
0,Sally,1.0,0.9,0.9,0.5,1.0,0.0,0.9,0.9,0.7625
1,Jane,0.0,1.0,0.9,0.9,0.9,0.9,0.0,0.9,0.6875
2,Billy,0.0,0.9,0.0,0.0,0.5,0.9,1.0,0.9,0.525
3,John,1.0,0.9,0.5,1.0,1.0,0.9,1.0,1.0,0.9125


In [None]:
# alternate answer via zach walkthrough:
# Melt to one row per student per day, score each attendance code, then
# average the scores per student.
grades = {
    'P': 1,
    'A': 0,
    'H': 0.5,
    'T': 0.9,
}

# Same file (and location) as the load at the top of this section.
df = pd.read_csv('attendance.csv')
df = df.rename(columns={'Unnamed: 0': 'student'})
df = df.melt(id_vars='student', var_name='date')
# Look each code up in `grades` directly; this replaces a call to a
# `get_attendance_grade` helper that is not defined in the cells shown.
df['grade'] = df['value'].map(grades)
df.groupby('student').grade.mean()

# Coffee Levels

#### - Read the coffee_levels.csv file.

In [47]:
# Load the coffee-level data and preview the first five rows.
coffee_path = 'coffee_levels.csv'
df = pd.read_csv(coffee_path)
df.head(n=5)

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291


#### - Transform the data so that each carafe is in its own column.

In [48]:
# Spread the carafes into columns: one row per hour, one column per carafe,
# with the coffee amount as the cell value.
df = df.pivot_table(
    values='coffee_amount',
    index='hour',
    columns='coffee_carafe',
)

In [49]:
# Move `hour` back into a regular column and blank out the label that the
# pivot left on the columns axis.
df = df.reset_index().rename_axis(columns='')

In [52]:
# Prefix each carafe letter so the column names are self-describing.
carafe_labels = {letter: 'carafe_' + letter for letter in ('x', 'y', 'z')}
df = df.rename(columns=carafe_labels)
df

Unnamed: 0,hour,carafe_x,carafe_y,carafe_z
0,8,0.816164,0.189297,0.999264
1,9,0.451018,0.521502,0.91599
2,10,0.843279,0.023163,0.144928
3,11,0.335533,0.235529,0.311495
4,12,0.898291,0.017009,0.771947
5,13,0.310711,0.997464,0.39852
6,14,0.507288,0.058361,0.864464
7,15,0.215043,0.144644,0.436364
8,16,0.183891,0.544676,0.280621
9,17,0.39156,0.594126,0.436677


#### - Is this the best shape for the data?

In [None]:
'''
I don't really think this is the best shape for this data. I understand that at the 8 hour there was X 
volume of coffee remaining, and you could plot how that volume decreased over time; however, I would have
kept the carafes as rows.  
'''

# Cake Recipes

#### - Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

In [73]:
# Read the cake recipe scores and print the column/dtype summary.
cake_path = 'cake_recipes.csv'
df = pd.read_csv(cake_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   recipe:position  8 non-null      object 
 1   225              8 non-null      float64
 2   250              8 non-null      float64
 3   275              8 non-null      float64
 4   300              8 non-null      float64
dtypes: float64(4), object(1)
memory usage: 448.0+ bytes


#### - Tidy the data as necessary.

In [74]:
# Pull the combined "recipe:position" column out of the frame and split it
# into its two underlying variables.
labels = df.pop('recipe:position').str.split(':', expand=True)
labels.columns = ['recipe', 'position']
# Put the two label columns in front of the temperature columns, then melt
# so each row is one (recipe, position, temperature) observation.
df = labels.join(df).melt(
    id_vars=['recipe', 'position'],
    var_name='cook_temp',
    value_name='overall_rating',
)
df

Unnamed: 0,recipe,position,cook_temp,overall_rating
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207
5,c,top,225,71.306308
6,d,bottom,225,52.799753
7,d,top,225,96.873178
8,a,bottom,250,53.912627
9,a,top,250,52.009735


#### - Which recipe, on average, is the best? recipe b

In [75]:
# Average rating per recipe. Select the rating column explicitly: since
# pandas 2.0, `mean()` raises a TypeError on non-numeric columns such as
# `position` instead of silently dropping them.
df.groupby('recipe')[['overall_rating']].mean()

Unnamed: 0_level_0,overall_rating
recipe,Unnamed: 1_level_1
a,63.922201
b,76.736074
c,75.874748
d,62.864844


In [None]:
'''
Yes it was recipe b
'''

#### - Which oven temperature, on average, produces the best results? 275

In [76]:
# Average rating per oven temperature. Select the rating column explicitly:
# since pandas 2.0, `mean()` raises a TypeError on non-numeric columns such
# as `recipe` and `position` instead of silently dropping them.
df.groupby('cook_temp')[['overall_rating']].mean()

Unnamed: 0_level_0,overall_rating
cook_temp,Unnamed: 1_level_1
225,71.306022
250,66.577437
275,74.886754
300,66.627655


In [None]:
'''
Yes it was 275
'''

#### - Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [82]:
# Rating for every recipe / rack position / temperature combination.
combo_keys = ['recipe', 'position', 'cook_temp']
df.groupby(combo_keys)[['overall_rating']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,overall_rating
recipe,position,cook_temp,Unnamed: 3_level_1
a,bottom,225,61.738655
a,bottom,250,53.912627
a,bottom,275,74.41473
a,bottom,300,98.786784
a,top,225,51.709751
a,top,250,52.009735
a,top,275,68.576858
a,top,300,50.22847
b,bottom,225,57.09532
b,bottom,250,61.904369


In [None]:
'''
Yes it was recipe b, bottom rack, 300 degrees
'''
# The melted frame stores the scores in `overall_rating`; it has no
# `total_score` column. `idxmax` returns the index *label* of the top score,
# so `.loc` can select that row directly without comparing a positional
# `argmax` result against the index.
index_of_highest_score = df['overall_rating'].idxmax()
df.loc[[index_of_highest_score]]


In [None]:
# alternate via zach walkthrough:
# Sort ascending by rating so the best combinations are the last rows shown.
# The score column in this notebook is `overall_rating`, not `tastiness`.
df.sort_values(by='overall_rating').tail()