In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.stats import logistic

## Clusters and site parameters

We will define 3 clusters, each with some number of member sites.

The data-generating model follows a very simple premise:
> Within each cluster, the sites will have a "default" or "baseline" RR (response-rate), except that certain sites will have much higher or much lower RR. 


In [3]:
# prob of each cluster occurring
p_c = np.array([0.2, 0.2, 0.6]) 

n_c = len(p_c)  # num clusters

# default/baseline response-rate for sites in each cluster
rr_c_def = np.array([0.2, 0.6, 0.1]) 

# convert these to logits, think of them as "cluster weights"
w_c = np.log(rr_c_def / (1 - rr_c_def)) 

p_s = [ np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.5]),  # site occurrence probabilities, in each cluster
        np.array([0.3, 0.3, 0.4]),
        np.array([0.2, 0.2, 0.6])
        ]


# Each site_id adds to the cluster's baseline logit;
# Cluster members generally have cluster's baseline RR,
# except for some that are "much higher" or "much lower".
# We cannot express this in probabilities easily (since they need to be in [0,1]
# but in logit (or log-oods) space this is easy.
# Think of these as "incremental" site_weights, relative to their cluster-weight
# So we name these as "d_w_s" , signifying "delta site weight"
d_w_s = [ np.array( [0.0, 0.0, 2.0, 0.0, 0.0, 0.0]),  # cluster 1
          np.array( [0.0, 0.0, 0.0]),  # cluster 2
          np.array( [0.0, -5.0, 0.0])  # cluster 3
          ]

### True model with site + cluster indicators

In [4]:
# site + cluster model with these indicator variables:
# c_1, c_2, c_3, 
# s_11, s_12, s_13, s_14, s_15, s_16, 
# s_21, s_21, s_23, 
# s_31, s_32, s_33

w_site = np.concatenate( [ d_w_s[i] for i in range(n_c)] )
w_sc = np.concatenate((w_c, w_site))
w_sc

array([-1.38629436,  0.40546511, -2.19722458,  0.        ,  0.        ,
        2.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -5.        ,  0.        ])

### Equivalent true model with sites only

In [5]:
## convert (cluster + sites) weight-vector to (site-only) weights
def site_weights(wts):
    w_clusters = wts[:n_c]
    w_sites = wts[n_c:]
    w_clusters_rep = np.concatenate( [  [ w_clusters[i] ] * len (p_s[i]) for i in range(n_c)])
    return w_sites + w_clusters_rep

w_s = site_weights(w_sc )
w_s

array([-1.38629436, -1.38629436,  0.61370564, -1.38629436, -1.38629436,
       -1.38629436,  0.40546511,  0.40546511,  0.40546511, -2.19722458,
       -7.19722458, -2.19722458])

### True logistic model

Now each site's response-rate is a function of its logit (weight) `w_s`  , and a cluster response rate is given by this expression, but we don't really care about the "cluster response rate" since it's just a blend of its member sites' responses rates

$$
\begin{align}
p(y=1 | c=1) &= p(y=1 | s=11) p(s=11 | c=1) +  \\
             &= p(y=1 | s=12) p(s=12 | c=1) + \\
             &= p(y=1 | s=13) p(s=13 | c=1)
\end{align}
$$
   

In [6]:
# site response-rate, based on logistic with weights w_s
# logit for a given cluster and one of its sites
def logit_site(cluster, site):
    return w_c[cluster] + d_w_s[cluster][site]

# site response_rate, according to logistic model with true weights
def rr_s(cluster, site):
    return logistic.cdf(logit_site(cluster, site))

# cluster response rate, according to logistic model, and site-occurrence probs
def rr_c(cluster):
    return sum( [ rr_s(cluster,i) * p_s[cluster][i] for i in range(len(p_s[cluster]))]  ) 
    

### Site response-rates
Note how this matches what we wanted to model, i.e. in each cluster sites have a certain "baseline" response rate (RR), and some have much higher or much lower RR.

In [7]:
[ [ np.round(rr_s(c, i),4) for i in range( len(p_s[c]))] for c in range(n_c) ]

[[0.2, 0.2, 0.6488, 0.2, 0.2, 0.2], [0.6, 0.6, 0.6], [0.1, 0.0007, 0.1]]

### Cluster response-rates


             
              


In [8]:
# RR of each cluster, from model

[ rr_c(c) for c in range(n_c) ]

[0.24487856442839395, 0.6, 0.08014962014080428]

### Data gen 
We generate N rows with one-hot encoded clusters and sites:

`c_1, c_2, c_3, s_11, s_12, s_13, s_21, s_21, s_23, s_31, s_32, s_33, label`

In [15]:
def one_hot(n, i):
    x = np.array([0] * n)
    x[i] = 1.0
    return x


# site-indicator variables for all clusters,
# given that a specific cluster and its site occur
def site_hot(cluster, site):
    nc = len(p_c)
    ns = len(p_s[cluster])  # num sites in this cluster
    return np.concatenate(
        [one_hot(len(p_s[c]), site)
         if c == cluster
         else one_hot(len(p_s[c]), 0) * 0
         for c in range(nc)]
    )


# encode data-row given cluster index, site_index in cluster
def encode(cluster, site):
    nc = len(p_c)
    cluster_hot = one_hot(nc, cluster)
    return np.concatenate((cluster_hot, site_hot(cluster, site)))


def gen_label(cluster, site):
    p = rr_s(cluster, site)
    return (np.random.uniform() < p)*1


def gen_row(cluster):
    p_sites = p_s[cluster]
    site = np.random.choice(len(p_sites), 1, list(p_sites))[0]
    return np.concatenate( (encode(cluster, site), [gen_label(cluster, site)] ))

def gen_data(N=100):
    clusters = np.random.choice(3, N, list(p_c))
    return np.array(list(map(gen_row, clusters)))

obs_data = gen_data(N=100)

### LR model with site + cluster indicators

In [16]:
#### LR with site + cluster indicators
lr_sc = LogisticRegression(fit_intercept = False)
# model with cluster_ids
lr_sc.fit(obs_data[:,:-1], obs_data[:,-1])
# get the equivalent site-only weights:
lr_w_s = site_weights(lr_sc.coef_[0])

# compare these with w_s (true weights with sites-only)

np.c_[ lr_w_s, w_s]


array([[-0.51976795, -1.38629436],
       [-0.55311066, -1.38629436],
       [-0.02933708,  0.61370564],
       [-1.0696598 , -1.38629436],
       [-0.98446862, -1.38629436],
       [-0.59977762, -1.38629436],
       [-0.10441875,  0.40546511],
       [-0.10441875,  0.40546511],
       [ 0.8519067 ,  0.40546511],
       [-1.97519099, -2.19722458],
       [-2.4380698 , -7.19722458],
       [-2.126541  , -2.19722458]])

### LR model with only site indicators


In [18]:
# LR with only site indicators
lr_s = LogisticRegression(fit_intercept = False)
lr_s.fit(obs_data[:,n_c:-1], obs_data[:,-1])

# compare these with w_s (true weights with just sites)
np.c_[ lr_s.coef_[0], w_s]


array([[-0.33540625, -1.38629436],
       [-0.40396376, -1.38629436],
       [ 0.40105801,  0.61370564],
       [-0.82641745, -1.38629436],
       [-0.79666131, -1.38629436],
       [-0.28654723, -1.38629436],
       [-0.15405659,  0.40546511],
       [-0.15405659,  0.40546511],
       [ 0.79666131,  0.40546511],
       [-1.31964724, -2.19722458],
       [-1.63350602, -7.19722458],
       [-1.51753773, -2.19722458]])

### Questions to look into

Recall the 2 x 2 x 2 combination of approaches:

- TrainModel: Train_LR / Train_Complex
- ScoreModel: Score_LR / Score_Complex
- ScoreFeatures: Score_Clusters / Score_NoClusters

Specific question:
- **Train_LR, Score_LR, Score_NoClusters** Is it possible to use LR for training (with both site_ids and cluster_id indicators) and get a useful ScoreModel with NoClusters, by essentially augmenting the learned site_weight with its cluster_weight.
- If not, then can **Train_Complex, Score_LR, Score_NoClusters** give a good model

