In [None]:
import pymc3 as pm
import pandas as pd
import matplotlib 

%matplotlib inline

Here, we make a few modeling choices.

1. We care only about the `normalized_measurement` column, and so we choose the t-distribution to model it, as we don't have a good "mechanistic" model that incorporates measurement error of OD600 and 'measurement'.

In [None]:
# Read the biofilm measurements into a DataFrame. Later cells use its
# 'isolate' and 'normalized_measurement' columns.
df = pd.read_csv('biofilm.csv')

# Show the first five rows as a quick sanity check of the loaded table.
df.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct isolate label to an integer code. `le` is kept around
# because later cells read `le.classes_` for the number of groups and for
# the plot labels.
le = LabelEncoder()

# Fit the encoder and store the per-row integer code in one step; the codes
# are used to index per-isolate parameters in the model.
df['indices'] = le.fit_transform(df['isolate'])

In [None]:
with pm.Model() as best:
    # Each isolate gets its own location ('fold') and scale ('var') parameter;
    # the degrees of freedom are shared across all isolates.
    n_isolates = len(le.classes_)

    # Per-row integer group index as a plain NumPy array, so the tensor
    # indexing below does not depend on how a pandas Series is coerced.
    isolate_idx = df['indices'].values

    # nu = 1 + Exponential(rate=1/30). The float literal keeps the rate at
    # 1/30 even under Python 2, where the integer expression 1/30 evaluates
    # to 0 and would give an invalid rate.
    nu = pm.Exponential('nu_minus_one', lam=1 / 30.0) + 1

    # Improper flat prior on each isolate's location.
    fold = pm.Flat('fold', shape=n_isolates)

    # NOTE: despite its name, 'var' is used as the StudentT `sd` (a scale),
    # not a variance. The name is kept so existing traces/plots still match.
    var = pm.HalfCauchy('var', beta=1, shape=n_isolates)

    # Expand the per-isolate parameters to one value per observed row.
    mu = fold[isolate_idx]
    sd = var[isolate_idx]

    # Student-t likelihood on the normalized measurements.
    like = pm.StudentT('like', mu=mu, sd=sd, nu=nu, observed=df['normalized_measurement'])

In [None]:
with best:
    # Draw 2000 posterior samples. The fixed seed (an arbitrary value) makes
    # the trace reproducible across "Restart Kernel -> Run All".
    trace = pm.sample(draws=2000, random_seed=42)

In [None]:
# Forest plot of the per-isolate 'fold' posteriors, with each row labelled
# by the isolate name recovered from the label encoder.
pm.forestplot(
    trace,
    varnames=['fold'],
    ylabels=le.classes_,
)