In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.stats import logistic

## Clusters and site parameters

We will define 3 clusters, each with 3 sites.

We will think of each cluster as having a "baseline" response-rate, and the sites within each cluster vary "around" that baseline. We cannot do this in probability space (since we need to stay within [0,1]), but we can easily do this "variation from baseline" in logit space.

In [2]:
# --- Cluster-level parameters ---
# Probability of each cluster occurring, and the number of clusters.
p_c = np.array([0.2, 0.2, 0.6])
n_c = len(p_c)

# Baseline response-rate of each cluster, and the corresponding baseline
# logit (log-odds): logit(r) = log(r / (1 - r)).
rr_c = np.array([0.2, 0.6, 0.1])
w_c = np.log(rr_c / (1.0 - rr_c))

# --- Site-level parameters ---
# p_s[k][j]: probability that site j occurs, given that cluster k occurred.
p_s = [
    np.array(site_probs)
    for site_probs in (
        [0.1, 0.4, 0.5],  # cluster 1
        [0.3, 0.3, 0.4],  # cluster 2
        [0.2, 0.2, 0.6],  # cluster 3
    )
]

# d_w_s[k][j]: "delta site weight" -- the increment that site j adds to the
# baseline logit of its cluster k.
# Most sites share their cluster's baseline response-rate (delta = 0), while a
# few are "much higher" or "much lower". Such shifts are awkward to express as
# probabilities (which must stay in [0, 1]) but are simple additive offsets in
# logit (log-odds) space.
d_w_s = [
    np.array(site_deltas)
    for site_deltas in (
        [0.0, 0.0, 2.0],   # cluster 1
        [0.0, 0.0, 0.0],   # cluster 2
        [0.0, -5.0, 0.0],  # cluster 3
    )
]

### True model with site + cluster indicators

In [71]:
# Site + cluster model: one weight per indicator variable, in this order:
# c_1, c_2, c_3, s_11, s_12, s_13, s_21, s_22, s_23, s_31, s_32, s_33

# Flatten the per-cluster site deltas over every cluster in d_w_s
# (no hard-coded cluster count, so no cluster can be silently dropped).
w_site = np.concatenate(d_w_s)
# Cluster baseline logits first, then the site deltas.
w_sc = np.concatenate((w_c, w_site))
w_sc

array([-1.38629436,  0.40546511, -2.19722458,  0.        ,  0.        ,
        2.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -5.        ,  0.        ])

### Equivalent true model with sites only

In [72]:
## convert (cluster + sites) weight-vector to (site-only) weights
def site_weights(wts):
    """Convert a (cluster + site) weight vector into site-only weights.

    The first `n_c` entries of `wts` are read as cluster weights and the rest
    as site weights, with sites grouped by cluster in the order given by `p_s`.
    Each returned entry is a site weight plus the weight of its cluster.
    """
    cluster_part, site_part = wts[:n_c], wts[n_c:]
    # Expand each cluster weight to one copy per site in that cluster, so the
    # expanded vector lines up element-wise with the site weights.
    cluster_expanded = np.concatenate(
        [np.full(len(p_s[k]), cluster_part[k]) for k in range(n_c)]
    )
    return site_part + cluster_expanded

# Site-only weights obtained from the true (cluster + site) weight vector;
# the bare name on the last line displays the result.
w_s = site_weights(w_sc )
w_s

array([-1.38629436, -1.38629436,  0.61370564,  0.40546511,  0.40546511,
        0.40546511, -2.19722458, -7.19722458, -2.19722458])

### Data gen 
We generate N rows with one-hot encoded clusters and sites:

`c_1, c_2, c_3, s_11, s_12, s_13, s_21, s_22, s_23, s_31, s_32, s_33, label`

In [None]:
def one_hot(n, i):
    """Return a length-`n` integer vector holding 1 at index `i`, 0 elsewhere."""
    vec = np.array([0 for _ in range(n)])
    vec[i] = 1
    return vec


# site-indicator variables for all clusters,
# given that a specific cluster and its site occur
def site_hot(cluster, site):
    """Return the site-indicator vector spanning the sites of all clusters.

    The segment belonging to `cluster` is one-hot at position `site`; the
    segment of every other cluster is all zeros. Segments are concatenated in
    cluster order.
    """
    segments = []
    for c in range(len(p_c)):
        n_sites = len(p_s[c])
        if c == cluster:
            segments.append(one_hot(n_sites, site))
        else:
            # Plain integer zeros (same dtype as one_hot): avoids building a
            # one-hot vector only to multiply it by 0, which would also raise
            # for a cluster that has no sites.
            segments.append(np.zeros(n_sites, dtype=int))
    return np.concatenate(segments)


# encode data-row given cluster index, site_index in cluster
def encode(cluster, site):
    """Build a feature row: cluster one-hot followed by the site indicators.

    `cluster` is a cluster index and `site` is the index of a site within
    that cluster.
    """
    cluster_part = one_hot(len(p_c), cluster)
    site_part = site_hot(cluster, site)
    return np.concatenate((cluster_part, site_part))

# logit for a given cluster and one of its sites
def logit_site(cluster, site):
    """Return the logit of `site` in `cluster`: cluster weight plus site delta."""
    baseline = w_c[cluster]
    delta = d_w_s[cluster][site]
    return baseline + delta


def gen_label(cluster, site):
    """Draw a 0/1 label that is 1 with the site's response probability."""
    # The logistic CDF maps the site's logit back to a probability.
    response_prob = logistic.cdf(logit_site(cluster, site))
    draw = np.random.uniform()
    # Multiplying the boolean by 1 turns it into an integer label.
    return (draw < response_prob) * 1


def gen_row(cluster):
    """Generate one data row for the given cluster index.

    A site is sampled from the cluster's site-occurrence probabilities
    `p_s[cluster]`; the row is the one-hot encoding of (cluster, site)
    followed by a randomly drawn 0/1 label.
    """
    p_sites = p_s[cluster]
    # The probabilities must be passed as the keyword `p`: the third positional
    # parameter of np.random.choice is `replace`, so passing them positionally
    # would sample the sites uniformly and ignore p_sites.
    site = np.random.choice(len(p_sites), p=p_sites)
    return np.concatenate((encode(cluster, site), [gen_label(cluster, site)]))

def gen_data(N=100):
    """Generate `N` data rows as a 2-D array (features first, label last).

    Each row's cluster is sampled according to the cluster-occurrence
    probabilities `p_c`, then the row itself is produced by `gen_row`.
    """
    # The probabilities must be passed as the keyword `p` (the third positional
    # parameter of np.random.choice is `replace`); the number of clusters is
    # taken from p_c rather than hard-coded.
    clusters = np.random.choice(len(p_c), N, p=p_c)
    return np.array([gen_row(cluster) for cluster in clusters])

# Seed NumPy's global RNG (used by the data-generation functions) so that
# Restart & Run All reproduces the same observed data.
np.random.seed(42)
obs_data = gen_data(N=1000)

### LR model with site + cluster indicators

In [None]:
#### LR with site + cluster indicators
lr_sc = LogisticRegression()
# Fit on all indicator columns (clusters + sites); the last column is the label.
lr_sc.fit(obs_data[:, :-1], obs_data[:, -1])
# Equivalent site-only weights: fold each cluster coefficient into its sites.
# LogisticRegression fits an intercept by default and that intercept is part
# of every site's logit, so it must be added before comparing with w_s.
lr_w_s = site_weights(lr_sc.coef_[0]) + lr_sc.intercept_[0]
# NOTE: LogisticRegression is L2-regularized by default, so the learned logits
# are expected to be shrunk relative to the true ones.
# Display learned (row 0) and true (row 1) site logits together; a bare
# expression is only shown when it is the last line of the cell.
np.vstack((lr_w_s, w_s))

### LR model with only site indicators


In [None]:
# LR with only site indicators
n_c = len(p_c)
lr_s = LogisticRegression()
# Skip the leading n_c cluster-indicator columns and the trailing label column.
lr_s.fit(obs_data[:, n_c:-1], obs_data[:, -1])

# Learned per-site logits. LogisticRegression fits an intercept by default and
# that intercept is part of every site's logit, so it is added here.
lr_s_w = lr_s.coef_[0] + lr_s.intercept_[0]
# Display learned (row 0) and true (row 1) site logits together; a bare
# expression is only shown when it is the last line of the cell.
np.vstack((lr_s_w, w_s))

### Questions to look into

Recall the 2 x 2 x 2 combination of approaches:

- TrainModel: Train_LR / Train_Complex
- ScoreModel: Score_LR / Score_Complex
- ScoreFeatures: Score_Clusters / Score_NoClusters

Specific question:
- **Train_LR, Score_LR, Score_NoClusters**: Is it possible to use LR for training (with both site_id and cluster_id indicators) and get a useful ScoreModel with NoClusters, essentially by augmenting each learned site_weight with its cluster_weight?
- If not, can **Train_Complex, Score_LR, Score_NoClusters** give a good model?

