# Linear Mixed Effects Models -- modified Edward tutorial for grouped model

With linear mixed effects models, we wish to model a linear
relationship for data points with inputs of varying type, categorized
into subgroups, and associated to a real-valued output.

We demonstrate with an example in Edward. A webpage version is available 
[here](http://edwardlib.org/tutorials/linear-mixed-effects-models).

In [1]:
%matplotlib inline
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from edward.models import Normal, BernoulliWithSigmoidProbs, Bernoulli
from observations import insteval

import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.stats import logistic


plt.style.use('ggplot')
ed.set_seed(42)

## Data: Clusters and site parameters

We will define 3 clusters, each with some number of member sites.

The data-generating model follows a very simple premise:
> Within each cluster, the sites will have a "default" or "baseline" RR (response-rate), and sites within a cluster will exhibit variation around this cluster-default.


In [2]:

# site, cluster occurrence probabilities, to generate the dataset
# as well as their "true" logit-weights 

clusters = {0: dict(prob = 0.2, w = -0.5,   # cluster weight in logistic
                    sites = np.arange(6),     
                    site_probs = [0.1, 0.1, 0.1, 0.1, 0.1, 0.5],
                    w_s =        [0.0, 0.0, 0.0, 5.0, 0.0, 0.0]),  # weights of sites in logistic
            1: dict(prob = 0.5, w = -1.0,  
                    sites = 6 + np.arange(3), 
                    site_probs = [0.1, 0.3, 0.6],
                    w_s        = [0.0, 0.0, 0.0]),                    
            2: dict(prob = 0.3, w = -0.2,  
                    sites = 9 + np.arange(3), 
                    site_probs = [0.3, 0.3, 0.4] ,
                    w_s =        [0.0, -4, 0.0 ]
                   )}


# num clusters
n_c = len(clusters)

# num sites
n_s = sum(list( map(len, list( map( lambda d: d['sites'], clusters.values()))))) # num sites

# site to cluster map
s2c = dict( [ (s,c) for c in range(n_c) for s in clusters[c]['sites']] )

# prob of each cluster occurring
p_c = [c['prob'] for c in clusters.values()]

# prob of site occurrence, within a cluster

p_s = ( [  dict( zip (d['sites'], d['site_probs']))  for d in clusters.values() ])

# "true" weights of clusters in logit model
w_c = np.array( [d['w'] for d in clusters.values()] )

# "true" weights of sites in logit model
w_s = np.concatenate( [d['w_s'] for d in clusters.values() ])


## True weights of logistic model

In [3]:
# logit for a site: sum of site-weight and its cluster-weight
def logit_site(site):
    return w_c[s2c[site]] + w_s[site]

# site response_rate
def rr_s(site):
    return logistic.cdf(logit_site(site))


### Site response-rates
Note how this matches what we wanted to model, i.e. in each cluster sites have a certain "baseline" response rate (RR), and some have much higher or much lower RR.

In [4]:
[np.round(rr_s(s),4) for s in range(n_s) ]

[0.3775,
 0.3775,
 0.3775,
 0.989,
 0.3775,
 0.3775,
 0.2689,
 0.2689,
 0.2689,
 0.4502,
 0.0148,
 0.4502]

### Data gen 
We generate N rows with `[cluster_id, site_id, abel]`

In [5]:
def gen_prob_label(site):
    p = rr_s(site)
    return p, (np.random.uniform() < p)*1


def gen_row(cluster):
    site2prob = p_s[cluster]
    site_ids = list( site2prob.keys())
    probs = list(site2prob.values())
    site = site_ids[ np.random.choice(len(probs), 1, probs ) [0] ]
    prob, label = gen_prob_label(site)
    return [ cluster, site, prob, label ] 

def gen_data(N=100):
    clusters = np.random.choice(n_c, N, list(p_c))
    data = list(map(gen_row, clusters))
    df = pd.DataFrame( data, columns = ['cluster', 'site', 'prob', 'label'])
    features = dict(df[['cluster', 'site', 'prob']])
    labels = np.array(list( df['label']))
    return features, labels

def log_loss(labels, probs):
    return -np.mean(labels * np.log(probs) + (1-labels)*np.log(1-probs))

def rig(labels, probs):
    p = np.mean(labels)
    ent = -p*np.log(p) - (1-p)*np.log(1-p)
    loss = log_loss(labels, probs)
    return np.round(100*(ent - loss)/ent, 2)

x_train, y_train = gen_data(N=800)
x_test, y_test = gen_data(N=800)


In [6]:
pd.DataFrame(x_train).head(5)

Unnamed: 0,cluster,prob,site
0,2,0.014774,10
1,0,0.377541,5
2,2,0.450166,11
3,2,0.450166,9
4,0,0.377541,4


### Quickly train + eval with MLE/TensorFlow

In [7]:

def input_fn(features, labels,  batch_size = 100, test = False):
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if not test:
        dataset = dataset.shuffle(1000).repeat()
    return dataset.batch(batch_size)


feature_cols = [tf.feature_column.categorical_column_with_identity(key = 'cluster', num_buckets=n_c),
                tf.feature_column.categorical_column_with_identity(key = 'site', num_buckets = n_s)
               ]

classifier = tf.estimator.LinearClassifier(feature_columns=feature_cols, optimizer=tf.train.AdamOptimizer(learning_rate=0.01))

classifier.train(input_fn=lambda: input_fn(x_train, y_train), steps=3000)

# Evaluate the model.
eval_result = classifier.evaluate(input_fn=lambda: input_fn(x_test, y_test, test=True))

y_pred = classifier.predict(input_fn=lambda: input_fn(x_test, y_test, test=True))

probs = np.array( [yp['probabilities'][1] for yp in y_pred] )  # predicted probabilities for class[1] 





INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/8t/pw3z265s66x05f3yfpynnr5s3_vhgw/T/tmpupocc4kz', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c1c3e5198>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/8t/pw3z265s66x05f3yfpynnr5s3_vhgw/T/tmpupocc4kz/model.ckpt.
INFO:tensorflow:loss = 69.31474, step = 1
INFO:tensorflow:global_step/sec: 505.026
INFO:tensorflow:loss = 50.97311, step = 101 (0.199 sec)
INFO:tensorflow:global_step/sec: 771.992
INFO:tensorflow:loss = 59.639168, st

In [8]:

RIG = rig(y_test, probs)

# true logistic model probabilities 
true_probs = np.array(list(x_test['prob']))

# RIG if we knew the true logistic model
ideal_RIG = rig(y_test, true_probs)

print("%RIG = ", RIG, "%RIG_ideal = ", ideal_RIG)


%RIG =  16.7 %RIG_ideal =  17.07


In [9]:
c_train = x_train['cluster']
s_train = x_train['site']
n_obs_train = len(c_train)

c_test = x_test['cluster']
s_test = x_test['site']
n_obs_test = len(c_test)



In [10]:
n_s = max(s_train) + 1  # number of sites
n_c = max(c_train) + 1  # number of clusters
n_obs = len(c_train)  # number of observations

print("Number of sites: {}".format(n_s))
print("Number of clusters: {}".format(n_c))
print("Number of observations: {}".format(n_obs))

Number of sites: 12
Number of clusters: 3
Number of observations: 800


## Model

Since our problem is binary classification (convert or not), we use a logistic regression where we model the _log-odds_ as a linear function of predictors.

In what follows we let $z$ denote the log-odds, and the actual prediction itself will be $1/(1+e^{-z})$.


```
z ~ (1|site) + (1|cluster)
```


In [11]:
# Set up placeholders for the data inputs.
s_ph = tf.placeholder(tf.int32, [None])
c_ph = tf.placeholder(tf.int32, [None])

# Set up random effects.

sigma_s = tf.sqrt(tf.exp(tf.get_variable("sigma_s", [])))
sigma_c = tf.sqrt(tf.exp(tf.get_variable("sigma_c", [])))

eta_s = Normal(loc=tf.zeros(n_s), scale=sigma_s * tf.ones(n_s))
eta_c = Normal(loc=tf.zeros(n_c), scale=sigma_c * tf.ones(n_c))

yhat = (tf.gather(eta_s, s_ph) + # pick the entry from eta_s using site-index fed into placeholder s_ph 
        tf.gather(eta_c, c_ph))  # same thing with cluster-index fed into placeholder c_ph

yhat_s = tf.gather(eta_s, s_ph) # site_only model

# y_logit = Normal(loc=yhat, scale=tf.ones(n_obs))

y = Bernoulli(logits = yhat)
y_s = Bernoulli(logits = yhat_s)  # site_only model


# y = tf.sigmoid(y_logit)


## Inference

Given data, we aim to infer the model's fixed and random effects.
In this analysis, we use variational inference with the
$\text{KL}(q\|p)$ divergence measure. We specify fully factorized
normal approximations for the random effects and pass in all training
data for inference. Under the algorithm, the fixed effects will be
estimated under a variational EM scheme.

In [12]:
q_eta_s = Normal(
    loc=tf.get_variable("q_eta_s/loc", [n_s]),
    scale=tf.nn.softplus(tf.get_variable("q_eta_s/scale", [n_s])))
q_eta_c = Normal(
    loc=tf.get_variable("q_eta_c/loc", [n_c]),
    scale=tf.nn.softplus(tf.get_variable("q_eta_c/scale", [n_c])))

latent_vars = {
    eta_s: q_eta_s,
    eta_c: q_eta_c}

data = {
    y: y_train,
    s_ph: s_train,
    c_ph: c_train}

data_s = {
    y_s: y_train,
    s_ph: s_train}


inference = ed.KLqp(latent_vars, data)

inference_s = ed.KLqp({eta_s: q_eta_s}, data_s)

  not np.issubdtype(value.dtype, np.float) and \
  not np.issubdtype(value.dtype, np.int) and \


### Criticism

We will evaluate the inferred distributions by computing logits from the means of the inferred posterior distributions of the latent vars. From the logits we can compute the log-loss relative to the observed 0/1 labels, and compute the RIG from there.

In [13]:
yhat_test = ed.copy(yhat, {
    eta_s: q_eta_s.mean(),
    eta_c: q_eta_c.mean()})

yhat_test_s = ed.copy(yhat_s, {
    eta_s: q_eta_s.mean()})



In [14]:
inference.initialize(n_print=2000, n_iter=10000)

tf.global_variables_initializer().run()


for _ in range(inference.n_iter):
  # Update and print progress of algorithm.
  info_dict = inference.update()

  inference.print_progress(info_dict)

  t = info_dict['t']
  if t == 1 or t % inference.n_print == 0:
    # Make predictions on test data.
    yhat_vals = yhat_test.eval(feed_dict={
        s_ph: s_test,
        c_ph: c_test})

    probs = logistic.cdf(yhat_vals)
    rg  = rig(y_test, probs)
    
    print('rig=', rg)


    1/10000 [  0%]                                ETA: 9737s | Loss: 580.906rig= -14.19
 2000/10000 [ 20%] ██████                         ETA: 12s | Loss: 458.938  rig= 16.34
 4000/10000 [ 40%] ████████████                   ETA: 7s | Loss: 454.474 rig= 16.38
 6000/10000 [ 60%] ██████████████████             ETA: 4s | Loss: 452.398rig= 16.38
 8000/10000 [ 80%] ████████████████████████       ETA: 2s | Loss: 451.070rig= 16.39
10000/10000 [100%] ██████████████████████████████ Elapsed: 11s | Loss: 453.829
rig= 16.39


In [16]:
inference_s.initialize(n_print=2000, n_iter=10000)

tf.global_variables_initializer().run()


for _ in range(inference_s.n_iter):
  # Update and print progress of algorithm.
  info_dict = inference_s.update()

  inference_s.print_progress(info_dict)

  t = info_dict['t']
  if t == 1 or t % inference.n_print == 0:
    # Make predictions on test data.
    yhat_vals = yhat_test_s.eval(feed_dict={
        s_ph: s_test})


    probs = logistic.cdf(yhat_vals)
    rg  = rig(y_test, probs)
    
    print('rig=', rg)

    1/10000 [  0%]                                ETA: 6797s | Loss: 45143.887rig= -6.08
 2000/10000 [ 20%] ██████                         ETA: 18s | Loss: 27015.959  rig= 16.06
 4000/10000 [ 40%] ████████████                   ETA: 12s | Loss: 27015.629rig= 16.07
 6000/10000 [ 60%] ██████████████████             ETA: 7s | Loss: 27010.602 rig= 16.07
 8000/10000 [ 80%] ████████████████████████       ETA: 3s | Loss: 27013.801rig= 16.07
10000/10000 [100%] ██████████████████████████████ Elapsed: 19s | Loss: 27010.979
rig= 16.07
