In [211]:
import pandas as pd
from statsmodels.multivariate.pca import PCA
import dcor
import seaborn as sns
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Legend
from bokeh.palettes import Spectral6, Dark2, inferno
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models import Range1d
output_notebook()

In [238]:
# Load the exercise log and weight every feature column by training volume
# (wt * rep). Missing values become 0 before weighting, and the descriptive
# fields (ex, day, wt, rep) are moved into the index so only features remain.
df = (
    pd.read_csv('../data/demo1.csv')
    .assign(vol=lambda raw: raw['wt'] * raw['rep'])
    .fillna(0)
    .set_index(['ex', 'day', 'wt', 'rep'])
    .pipe(lambda indexed: indexed.multiply(indexed['vol'], axis='index'))
    .drop(columns=['vol'])
)

In [239]:
# Show the feature table after every feature column has been scaled by wt * rep.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,push,pull,chest,back,shoulder,bi,tri,quad,ham,hip,core,bilateral,unilateral
ex,day,wt,rep,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
push up,0,155,200,31000.0,0.0,31000.0,0.0,0.0,0.0,31000.0,0.0,0.0,0.0,0.0,31000.0,0.0
pull up,0,155,100,0.0,15500.0,0.0,15500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15500.0,0.0
air squat,0,155,300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46500.0,46500.0,46500.0,0.0,46500.0,0.0
bicep curls,1,25,32,0.0,0.0,0.0,0.0,0.0,800.0,0.0,0.0,0.0,0.0,0.0,800.0,0.0
hspu,1,155,20,3100.0,0.0,0.0,0.0,3100.0,0.0,0.0,0.0,0.0,0.0,0.0,3100.0,0.0
hollow rocks,1,155,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1550.0,0.0,0.0
bench press,2,135,40,5400.0,0.0,5400.0,0.0,0.0,0.0,5400.0,0.0,0.0,0.0,0.0,5400.0,0.0
dumbbell rows,2,50,40,0.0,2000.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0
back squat,2,225,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5625.0,5625.0,5625.0,0.0,5625.0,0.0
Shoulder press,3,105,25,2625.0,0.0,0.0,0.0,2625.0,0.0,0.0,0.0,0.0,0.0,0.0,2625.0,0.0


## Visualize exercise data using `PCA`
This gives some insight into whether the features are useful for separating similar exercises from dissimilar ones.

In [241]:
# Reduce the volume-weighted feature matrix with PCA and keep the first two
# component scores so each exercise can be placed on a 2-D scatter.
pc = PCA(df)
df_pca = pc.factors.iloc[:, 0:2]
x_col, y_col = df_pca.columns[0], df_pca.columns[1]

# Only days 1-5 are plotted. reset_index() turns the index levels
# (ex, day, wt, rep) into ordinary columns so the hover tool can read them.
cds = ColumnDataSource(data=df_pca.query('day in (2,1,3,4,5)').reset_index())

# Both axes hold PCA scores, not timestamps, so the default linear x-axis is
# used instead of a datetime axis (which would format the ticks as dates/times).
p = figure(
    plot_height=300,
    plot_width=500,
    title='Exercise Similarity',
    x_axis_label=str(x_col),
    y_axis_label=str(y_col),
)

p.circle(x=x_col, y=y_col, source=cds, size=10, alpha=0.5)

# Show the exercise name, weight and rep count when hovering over a point.
hovers = [(field, f'@{field}') for field in ['ex', 'wt', 'rep']]
p.add_tools(HoverTool(tooltips=hovers))

show(p)

## Using `dCorr` to measure workout similarity

The fundamental challenge here is comparing two matrices that are potentially of different dimensions (e.g. one workout might have 3 exercises while another has 4). Is there a single metric to judge similarities between WODs? After doing some research I found `dCorr` to be a promising lead!

- Days 0, 2 and 4 are very similar. All have chest, back and squat in common; day 4 also has deadlifts. All pairwise dCorr values among these are high.
- Days 1 and 3 are similar to each other. These are bicep, shoulder and abs workouts. dCorr(1,3) is high, while dCorr between either of 1, 3 and any of 0, 2, 4 is low. This makes sense.
- Day 5 is unlike any of them. It has shoulder, back and squat. It's not obvious which of the other days is "closest" to 5. It turns out that 0, 2, 4 are closer to 5 than 1, 3 are. This makes sense because 5 shares back and squat with 0, 2, 4 and only shoulder with 1, 3. It also turns out that 4 is most like 5 by this definition. Does this make sense?

In [219]:
# Volume-weighted feature rows for the day-0 workout.
df.query('day==0')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,push,pull,chest,back,shoulder,bi,tri,quad,ham,hip,core,bilateral,unilateral
ex,day,wt,rep,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
push up,0,155,200,31000.0,0.0,31000.0,0.0,0.0,0.0,31000.0,0.0,0.0,0.0,0.0,31000.0,0.0
pull up,0,155,100,0.0,15500.0,0.0,15500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15500.0,0.0
air squat,0,155,300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46500.0,46500.0,46500.0,0.0,46500.0,0.0


In [220]:
# Volume-weighted feature rows for the day-2 workout.
df.query('day==2')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,push,pull,chest,back,shoulder,bi,tri,quad,ham,hip,core,bilateral,unilateral
ex,day,wt,rep,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
bench press,2,135,40,5400.0,0.0,5400.0,0.0,0.0,0.0,5400.0,0.0,0.0,0.0,0.0,5400.0,0.0
dumbbell rows,2,50,40,0.0,2000.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.0
back squat,2,225,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5625.0,5625.0,5625.0,0.0,5625.0,0.0


In [221]:
# Volume-weighted feature rows for the day-4 workout.
df.query('day==4')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,push,pull,chest,back,shoulder,bi,tri,quad,ham,hip,core,bilateral,unilateral
ex,day,wt,rep,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
bench press,4,95,80,7600.0,0.0,7600.0,0.0,0.0,0.0,7600.0,0.0,0.0,0.0,0.0,7600.0,0.0
dumbbell rows,4,35,100,0.0,3500.0,0.0,3500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3500.0
back squat,4,135,75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10125.0,10125.0,10125.0,0.0,10125.0,0.0
deadlift,4,135,30,0.0,4050.0,0.0,4050.0,0.0,0.0,0.0,4050.0,4050.0,4050.0,0.0,4050.0,0.0


### Compare pairwise WOD `dCorr` values

In [252]:
# Distance correlation between the day-0 and day-2 workouts. Transposing makes
# each feature column a row, so both inputs have the same number of rows even
# when the two days contain a different number of exercises.
dcor.distance_correlation(df.query('day==0').transpose(), df.query('day==2').transpose())

0.9792639009643077

In [255]:
# Distance correlation between the day-1 and day-4 workouts. Transposing makes
# each feature column a row, so both inputs have the same number of rows even
# when the two days contain a different number of exercises.
dcor.distance_correlation(df.query('day==1').transpose(), df.query('day==4').transpose())

0.40510039452277813

In [257]:
# Distance correlation between the day-3 and day-5 workouts. Transposing makes
# each feature column a row, so both inputs have the same number of rows even
# when the two days contain a different number of exercises.
dcor.distance_correlation(df.query('day==3').transpose(), df.query('day==5').transpose())

0.4110448147888916

In [259]:
# Distance correlation between the day-4 and day-5 workouts. Transposing makes
# each feature column a row, so both inputs have the same number of rows even
# when the two days contain a different number of exercises.
dcor.distance_correlation(df.query('day==4').transpose(), df.query('day==5').transpose())

0.8763185070707223

## Conclusion

I think `dCorr` might be a viable path to `AUTOWOD`!!!!!