# Tidy Data Exercises

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pydataset import data

### 1. Attendance Data

Load the `attendance.csv` file and calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

You should end up with something like this:

<code>name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: grade, dtype: float64</code>



### Result:


| name   |   status |
|:-------|---------:|
| Billy  |   0.525  |
| Jane   |   0.6875 |
| John   |   0.9125 |
| Sally  |   0.7625 |


In [2]:
# Load the wide attendance sheet (one row per student, one column per date).
# The student names sit in the CSV's unnamed first column, which pandas labels
# 'Unnamed: 0'; give it a meaningful name.
attendance_df = pd.read_csv('untidy-data/attendance.csv')
attendance_df = attendance_df.rename(columns={'Unnamed: 0': 'name'})

# Reshape to long form: one row per (name, date) pair holding its status code.
att_cln = attendance_df.melt(id_vars=['name'],
                             var_name='date',
                             value_name='status')

# Fraction of a day credited for each status code. The exercise states that a
# half day is worth 50% of a day and that 10 tardies equal one absence, hence
# H = 0.5 and T = 0.9. P and A are taken to mean present / absent.
status_credit = {'P': 1, 'H': .5, 'T': .9, 'A': 0}

# map() + astype(float) guarantees a numeric column regardless of pandas
# version. A code that is not in the table becomes NaN (and is skipped by
# mean()) instead of lingering in the column as a string.
att_cln['status'] = att_cln['status'].map(status_credit).astype(float)

# Attendance percentage per student. Selecting 'status' explicitly keeps the
# string 'date' column out of the aggregation.
attendance_pct = att_cln.groupby('name')['status'].mean()
attendance_pct

### 2. Coffee Levels

*  Read the `coffee_levels.csv` file.

|    |   hour | coffee_carafe   |   coffee_amount |
|---:|-------:|:----------------|----------------:|
|  0 |      8 | x               |        0.816164 |
|  1 |      9 | x               |        0.451018 |
|  2 |     10 | x               |        0.843279 |
|  3 |     11 | x               |        0.335533 |
|  4 |     12 | x               |        0.898291 |
    
    
* Transform the data so that each carafe is in its own column.


|   hour |        x |         y |        z |
|-------:|---------:|----------:|---------:|
|      8 | 0.816164 | 0.189297  | 0.999264 |
|      9 | 0.451018 | 0.521502  | 0.91599  |
|     10 | 0.843279 | 0.0231628 | 0.144928 |
|     11 | 0.335533 | 0.235529  | 0.311495 |
|     12 | 0.898291 | 0.0170092 | 0.771947 |
|     13 | 0.310711 | 0.997464  | 0.39852  |
|     14 | 0.507288 | 0.0583609 | 0.864464 |
|     15 | 0.215043 | 0.144644  | 0.436364 |
|     16 | 0.183891 | 0.544676  | 0.280621 |
|     17 | 0.39156  | 0.594126  | 0.436677 |

    
    
* Is this the best shape for the data?
    * ___Answer:___ I'm still not exactly sure what the data is derived from, particularly the `hour` column. However, it does seem to be an easier format to deal with.

In [3]:
# Load the coffee readings in long form: columns hour, coffee_carafe and
# coffee_amount, one measurement per row.
coff_df = pd.read_csv('untidy-data/coffee_levels.csv')

# Spread the carafes into their own columns, indexed by hour.
# pivot_table aggregates with the mean by default, so a repeated
# (hour, carafe) pair would be averaged rather than raising an error.
coff_wide = coff_df.pivot_table(index=['hour'],
                                columns='coffee_carafe',
                                values='coffee_amount')
coff_wide


### 3. Cake Recipes

* Read the `cake_recipes.csv` data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

|    | recipe:position   |     225 |     250 |     275 |     300 |
|---:|:------------------|--------:|--------:|--------:|--------:|
|  0 | a:bottom          | 61.7387 | 53.9126 | 74.4147 | 98.7868 |
|  1 | a:top             | 51.7098 | 52.0097 | 68.5769 | 50.2285 |
|  2 | b:bottom          | 57.0953 | 61.9044 | 61.197  | 99.2485 |
|  3 | b:top             | 82.455  | 95.2242 | 98.5949 | 58.1693 |
|  4 | c:bottom          | 96.4702 | 52.0014 | 92.8932 | 65.4731 |


* Tidy the data as necessary.


* Which recipe, on average, is the best? ___Answer:___ Recipe b.

|    | recipe   |   score |
|---:|:---------|--------:|
|  1 | b        | 76.7361 |

* Which oven temperature, on average, produces the best results? ___Answer:___ 275 degrees.

|    |   oven_temp |   score |
|---:|------------:|--------:|
|  2 |         275 | 74.8868 |

* Which combination of recipe, rack position, and temperature gives the best result? ___Answer:___ Recipe b on the bottom rack at 300 degrees.

|    | recipe   | position   |   oven_temp |   score |
|---:|:---------|:-----------|------------:|--------:|
| 26 | b        | bottom     |         300 | 99.2485 |



In [4]:
# Load the cake scores: one row per 'recipe:position' pair, one column per
# oven temperature.
cake_df = pd.read_csv('untidy-data/cake_recipes.csv')

# The first column packs two variables into one string ("a:bottom"); split it
# into separate columns and discard the combined original.
cake_df[['recipe', 'position']] = cake_df['recipe:position'].str.split(':', expand=True)
cake_df = cake_df.drop(columns='recipe:position')

# Melt the temperature columns into rows: one row per
# (recipe, position, oven_temp) with its tastiness score.
# NOTE: oven_temp holds the original column labels, which are strings.
tidy_cake = cake_df.melt(id_vars=['recipe', 'position'],
                         var_name='oven_temp',
                         value_name='score')

# Best recipe on average. Aggregating only 'score' keeps the other string
# columns out of the mean.
best_recipe = (tidy_cake.groupby('recipe', as_index=False)['score']
               .mean()
               .sort_values(by='score', ascending=False)
               .head(n=1))

# Best oven temperature on average.
best_temp = (tidy_cake.groupby('oven_temp', as_index=False)['score']
             .mean()
             .sort_values(by='score', ascending=False)
             .head(n=1))

# Single best (recipe, position, oven_temp) combination.
best_combo = tidy_cake.sort_values(by='score', ascending=False).head(n=1)

# The cell displays the best combination; best_recipe and best_temp hold the
# answers to the other two questions.
best_combo
