# Tidy Data

- The data is tabular
- Each cell holds exactly one value
- Each variable has its own column
- Each observation has its own row

In [1]:
import pandas as pd

## One Variable in Multiple Columns

- Fix with `melt`, which gathers the spread-out columns into rows

In [30]:
# Load the wide treatments table and name its four columns: one identifier
# column followed by one column per treatment.
df = pd.read_csv('./untidy_data/treatments.csv')
df.columns = ['subject_name', 'treatment_a', 'treatment_b', 'treatment_c']

# melt keeps subject_name as the identifier and stacks the treatment_*
# columns into (treatment, response) pairs: one row per subject/treatment.
df = df.melt(id_vars='subject_name', var_name='treatment', value_name='response')

# Strip the literal 'treatment_' prefix so only the treatment letter remains.
# regex=False is spelled out because the default of `regex` differs between
# pandas versions and the pattern here is plain text, not a regular expression.
df['treatment'] = df['treatment'].str.replace('treatment_', '', regex=False)
df

Unnamed: 0,subject_name,treatment,response
0,John Smith,a,
1,Jane Doe,a,16.0
2,Mary Johnson,a,3.0
3,John Smith,b,2.0
4,Jane Doe,b,11.0
5,Mary Johnson,b,1.0
6,John Smith,c,0.0
7,Jane Doe,c,3.0
8,Mary Johnson,c,4.0


## One Column With Multiple Variables

- Fix with `pivot_table`, which spreads the variable names out into their own columns

In [32]:
# Load the students data and preview the first rows. The preview shows a
# `var` column holding variable names and a `val` column holding their values.
df = pd.read_csv('./untidy_data/students.csv')
df.head()

Unnamed: 0,date,var,val
0,2019-02-04,n_late_from_break,4.02812
1,2019-02-04,coffee_consumption,5255.40974
2,2019-02-04,classroom_temp,67.0
3,2019-02-05,n_late_from_break,2.101998
4,2019-02-05,coffee_consumption,8603.704719


In [34]:
# One row per date, one column per distinct entry of `var`, filled from `val`.
# pivot_table aggregates with the mean by default; it is stated explicitly here.
df.pivot_table(
    index='date',
    columns='var',
    values='val',
    aggfunc='mean',
)

var,classroom_temp,coffee_consumption,n_late_from_break
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-02-04,67.0,5255.40974,4.02812
2019-02-05,73.0,8603.704719,2.101998
2019-02-06,81.0,1801.49805,4.941244
2019-02-07,62.0,9282.959741,1.419342
2019-02-08,72.0,7558.270659,1.808919
2019-02-11,60.0,5731.008713,4.831584
2019-02-12,52.0,9547.673484,3.55083
2019-02-13,56.0,7114.931847,2.550362
2019-02-14,72.0,1977.295513,3.507548
2019-02-15,75.0,2254.674679,2.114379


## More complex example

In [39]:
# Load the sales data and display it. Besides `Product`, every column label
# combines a year with a measure name (e.g. '2016 Sales', '2016 PPU').
df = pd.read_csv('./untidy_data/sales.csv')
df

Unnamed: 0,Product,2016 Sales,2016 PPU,2017 Sales,2017 PPU,2018 Sales,2018 PPU
0,A,673,5,231,7,173,9
1,B,259,3,748,5,186,8
2,C,644,3,863,5,632,5
3,D,508,9,356,11,347,14


In [40]:
# Stack every non-Product column into label/value pairs. The output column
# names are melt's defaults, written out because later cells read `variable`
# and `value` by name.
df = df.melt(id_vars=['Product'], var_name='variable', value_name='value')
df.head(7)

Unnamed: 0,Product,variable,value
0,A,2016 Sales,673
1,B,2016 Sales,259
2,C,2016 Sales,644
3,D,2016 Sales,508
4,A,2016 PPU,5
5,B,2016 PPU,3
6,C,2016 PPU,3


In [42]:
# The leading run of digits in each label (e.g. '2016 Sales') is the year.
# expand=False returns the single capture group as a Series of extracted text.
df['year'] = df['variable'].str.extract(r'^(\d+)', expand=False)
df

Unnamed: 0,Product,variable,value,year
0,A,2016 Sales,673,2016
1,B,2016 Sales,259,2016
2,C,2016 Sales,644,2016
3,D,2016 Sales,508,2016
4,A,2016 PPU,5,2016
5,B,2016 PPU,3,2016
6,C,2016 PPU,3,2016
7,D,2016 PPU,9,2016
8,A,2017 Sales,231,2017
9,B,2017 Sales,748,2017


In [44]:
# Everything after the leading digits and one whitespace character is the
# measure name (e.g. 'Sales' or 'PPU'); expand=False yields it as a Series.
df['var_name'] = df['variable'].str.extract(r'^\d+\s(.*)$', expand=False)

In [46]:
# The combined label has been split into `year` and `var_name`, so drop it.
df = df.drop(columns=['variable'])

In [49]:
# Spread the measure names in `var_name` into their own columns, keyed by
# (Product, year) and filled from `value`.
df = df.pivot_table(
    values='value',
    index=['Product', 'year'],
    columns='var_name',
)
df

Unnamed: 0_level_0,var_name,PPU,Sales
Product,year,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2016,5,673
A,2017,7,231
A,2018,9,173
B,2016,3,259
B,2017,5,748
B,2018,8,186
C,2016,3,644
C,2017,5,863
C,2018,5,632
D,2016,9,508


In [None]:
# Move the index levels back into ordinary columns (drop=False keeps them
# rather than discarding them) and restore a default integer index.
df = df.reset_index(drop=False)

In [58]:
# The pivot left the columns axis labelled 'var_name'. Assign None to remove
# that label entirely; an empty string would still count as a name and would
# be picked up as the default variable column name by a later melt.
df.columns.name = None

In [59]:
# Display the result: one row per (Product, year) with PPU and Sales columns.
df

Unnamed: 0,Product,year,PPU,Sales
,,,,
0.0,A,2016.0,5.0,673.0
1.0,A,2017.0,7.0,231.0
2.0,A,2018.0,9.0,173.0
3.0,B,2016.0,3.0,259.0
4.0,B,2017.0,5.0,748.0
5.0,B,2018.0,8.0,186.0
6.0,C,2016.0,3.0,644.0
7.0,C,2017.0,5.0,863.0
8.0,C,2018.0,5.0,632.0
