In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell is executed (presumably so local edits to the crowdnalysis
# sources are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [49]:

from crowdnalysis.factory import Factory

from crowdnalysis.dawid_skene import DawidSkene
from crowdnalysis.simple import MajorityVoting
import numpy as np
import json

In [50]:
# Consensus models compared in this notebook. All but the last are obtained
# by registered name from the crowdnalysis Factory; DawidSkene is instantiated
# directly.
mv = Factory.make("MajorityVoting")
smoc = Factory.make("StanMultinomialOptimize")
smetaoc = Factory.make("StanMultinomialEtaOptimize")
sdsoc = Factory.make("StanDSOptimize")
sdsetaoc = Factory.make("StanDSEtaHOptimize")
# `ds` is also the object used further down to build the generating
# parameters and to sample the synthetic tasks/annotations.
ds = DawidSkene()

In [120]:


# Synthetic experiment set-up: draw tasks and annotations from known
# parameters so that every model's estimate can be compared with the truth.
# NOTE(review): no random seed is set anywhere in view, so each run produces
# different samples. Whether `linked_samples` draws from NumPy's global RNG is
# not visible here -- confirm and seed accordingly for reproducible results.
t = 10000  # number of tasks to simulate (passed as n_tasks)
num_annotations_per_task = 3
real_tau = np.array([0.1, 0.3, 0.6])  # passed as `tau`; presumably the true-class prior
k = len(real_tau)  # number of classes
# Passed as `pi`: a stack of two k x k row-stochastic matrices. Presumably one
# confusion matrix per worker, indexed [worker, true class, label] -- confirm
# against DawidSkene.Parameters.
real_pi = np.array([
    [[0.5, 0.3, 0.2], [0.2, 0.6, 0.2], [0.25, 0.3, 0.45]],
    [[0.6, 0.2, 0.2], [0.1, 0.6, 0.3], [0.1, 0.1, 0.8]]
    ])
# The tables above are typed by hand; fail fast on a typo instead of sampling
# from something that is not a probability distribution.
assert np.isclose(real_tau.sum(), 1.0), "real_tau must sum to 1"
assert real_pi.ndim == 3 and real_pi.shape[1] == k, "real_pi needs one row per class"
assert np.allclose(real_pi.sum(axis=-1), 1.0), "every row of real_pi must sum to 1"
print(real_pi)
p = ds.Parameters(tau=real_tau, pi=real_pi)
dgp = ds.DataGenerationParameters(n_tasks=t, n_annotations_per_task=num_annotations_per_task)
# Returns the sampled task labels plus a dict of consensus problems keyed like
# the dict passed in; only the "base" crowd is generated here.
tasks, crowds_dcps = ds.linked_samples(p,{"base":p},dgp)
dcp = crowds_dcps["base"]
print(tasks)
print(crowds_dcps)

[[[0.5  0.3  0.2 ]
  [0.2  0.6  0.2 ]
  [0.25 0.3  0.45]]

 [[0.6  0.2  0.2 ]
  [0.1  0.6  0.3 ]
  [0.1  0.1  0.8 ]]]
[1 0 2 ... 0 1 2]
{'base': DiscreteConsensusProblem(n_tasks=10000, f_T=array([1, 0, 2, ..., 0, 1, 2]), n_workers=2, f_W=None, n_annotations=30000, t_A=array([   0,    0,    0, ..., 9999, 9999, 9999], dtype=int32), w_A=array([1, 0, 1, ..., 1, 0, 0], dtype=int32), f_A=array([[1],
       [1],
       [2],
       ...,
       [2],
       [0],
       [2]], dtype=int32), n_labels=3, classes=[0, 1, 2])}


In [121]:
# Distinct annotation labels with their frequencies, then the problem's class
# list, one per output line.
print(np.unique(dcp.f_A, return_counts=True), dcp.classes, sep="\n")

(array([0, 1, 2], dtype=int32), array([ 6027,  9882, 14091]))
[0, 1, 2]


In [122]:
# Assemble the data dictionary that the next cell writes to JSON (presumably
# the input of the Stan multinomial model, whose indices are 1-based -- hence
# the +1 shifts).
t_A = dcp.t_A + 1  # task index of each annotation, shifted to start at 1
w_A = dcp.w_A + 1  # worker index of each annotation, shifted to start at 1
ann = dcp.f_A + 1  # annotated label, shifted to start at 1
# NOTE(review): dcp.f_A is printed above as a column vector (one row per
# annotation), so `ann.tolist()` is a list of single-element lists. Confirm
# that this is the layout the consumer of the JSON expects.

# The worker count has to cover every index stored in w_A. It used to be
# hard-coded to 1 although the generated problem has two workers, which made
# the exported file self-inconsistent.
w = int(dcp.n_workers)
assert np.max(w_A) <= w, "w_A refers to a worker index larger than w"
a = len(ann)  # number of annotations
json_data = {'w': w,
             't': t,
             'a': a,
             'k': k,
             't_A': t_A.tolist(),
             'w_A': w_A.tolist(),
             'ann': ann.tolist()}

In [123]:
# Report the largest shifted worker index, then persist the data dictionary
# as indented JSON in the working directory.
print("Max w_A:", w_A.max())
with open('multinomial.json', 'w') as outfile:
    outfile.write(json.dumps(json_data, indent=2))

Max w_A: 2


In [124]:
def print_vars(d,v):
    """Print ``name = value`` for every name in ``v`` that is a key of ``d``.

    Names absent from ``d`` are skipped silently; the output order follows
    the order of ``v``. Returns ``None``.
    """
    present = (key for key in v if key in d)
    for key in present:
        print(key, "=", d[key])

In [125]:
def err_rate(t_C, t_C_m):
    """Return the fraction of tasks whose top-scoring class is wrong.

    t_C   -- reference class index of each task (1-D).
    t_C_m -- 2-D array; row i holds the per-class scores of task i.

    The predicted class of a task is the argmax of its row (np.argmax picks
    the first maximum on ties).
    """
    predicted = np.argmax(t_C_m, axis=1)
    hits = np.sum(t_C == predicted)
    return 1 - hits / len(t_C)

def log_score(t_C, t_C_m):
    """Return the summed log of the score each task gives to its reference class.

    t_C   -- reference class index of each task (1-D integer sequence).
    t_C_m -- 2-D array; row i holds the per-class probabilities of task i.

    The number of tasks and classes is taken from the arguments themselves
    rather than from notebook-level variables, so the function works for any
    problem size and does not break when those globals are stale or missing.

    A reference class with probability 0 makes the result -inf (NumPy also
    emits its divide-by-zero RuntimeWarning in that case).
    """
    t_C = np.asarray(t_C)
    t_C_m = np.asarray(t_C_m)
    # Pick, for every row i, the entry in column t_C[i].
    rows = np.arange(t_C.shape[0])
    return np.sum(np.log(t_C_m[rows, t_C]))

In [127]:
# Show the generating parameters, then fit each model on the same consensus
# problem and score its consensus against the sampled task labels.
print("Real parameters")
print("tau=", real_tau)
print("pi=", real_pi)
for model in (mv, smoc, ds, smetaoc, sdsoc, sdsetaoc):
    print("*********", model.name, sep="\n")
    # t_C holds the per-task consensus scores returned by the model; params
    # are the parameters it fitted.
    t_C, params = model.fit_and_compute_consensus(dcp)
    print(params)
    print("error_rate:", err_rate(tasks, t_C))
    print("log_score:", log_score(tasks, t_C))

  if __name__ == '__main__':
INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:compiled model file: /home/cerquide/prj/crowdnalysis/src/crowdnalysis/cmdstan/cmdstan/Multinomial.fit_and_consensus
INFO:crowdnalysis:dict_keys(['w', 't', 'a', 'k', 'l', 'classes', 't_A', 'w_A', 'ann', 'tau_prior', 'pi_prior'])
INFO:crowdnalysis:Type of w is <class 'int'>
INFO:crowdnalysis:Type of t is <class 'int'>
INFO:crowdnalysis:Type of a is <class 'int'>
INFO:crowdnalysis:Type of k is <class 'int'>
INFO:crowdnalysis:Type of l is <class 'int'>
INFO:crowdnalysis:Type of classes is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of t_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of w_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of ann is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of tau_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of pi_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:{'tau': array([0.25311033, 0.28231533, 0.46457433]), 'pi': array([[0.846153

Real parameters
tau= [0.1 0.3 0.6]
pi= [[[0.5  0.3  0.2 ]
  [0.2  0.6  0.2 ]
  [0.25 0.3  0.45]]

 [[0.6  0.2  0.2 ]
  [0.1  0.6  0.3 ]
  [0.1  0.1  0.8 ]]]
*********
MajorityVoting
MajorityVoting.Parameters()
error_rate: 0.32299999999999995
log_score: -inf
*********
StanMultinomialOptimize


INFO:cmdstanpy:finish chain 1


StanMultinomialOptimizeConsensus.Parameters(tau=array([0.13817096, 0.32873752, 0.53309152]), pi=array([[0.49611571, 0.22590417, 0.27798012],
       [0.14426072, 0.60073108, 0.25500819],
       [0.15987577, 0.18893958, 0.65118465]]))
error_rate: 0.2651
log_score: -6357.0651734685725
*********
DawidSkene


INFO:crowdnalysis:DS has converged in 165 iterations
INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:compiled model file: /home/cerquide/prj/crowdnalysis/src/crowdnalysis/cmdstan/cmdstan/MultinomialEta.fit_and_consensus
INFO:crowdnalysis:dict_keys(['w', 't', 'a', 'k', 'l', 'classes', 't_A', 'w_A', 'ann', 'tau_prior', 'eta_alpha_prior', 'eta_beta_prior'])
INFO:crowdnalysis:Type of w is <class 'int'>
INFO:crowdnalysis:Type of t is <class 'int'>
INFO:crowdnalysis:Type of a is <class 'int'>
INFO:crowdnalysis:Type of k is <class 'int'>
INFO:crowdnalysis:Type of l is <class 'int'>
INFO:crowdnalysis:Type of classes is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of t_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of w_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of ann is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of tau_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of eta_alpha_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of eta_beta

DawidSkene.Parameters(tau=array([0.12485629, 0.31163664, 0.56350707]), pi=array([[[0.44577513, 0.27854605, 0.27567881],
        [0.18250221, 0.62372079, 0.193777  ],
        [0.24290022, 0.29958217, 0.45751761]],

       [[0.57584889, 0.1745026 , 0.24964851],
        [0.10696023, 0.59916385, 0.29387592],
        [0.08335818, 0.09213695, 0.82450487]]]))
error_rate: 0.24150000000000005
log_score: -5968.139399556444
*********
StanMultinomialEtaOptimize


INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:compiled model file: /home/cerquide/prj/crowdnalysis/src/crowdnalysis/cmdstan/cmdstan/MultinomialEta.fit_and_consensus
INFO:crowdnalysis:dict_keys(['w', 't', 'a', 'k', 'l', 'classes', 't_A', 'w_A', 'ann', 'tau_prior', 'eta_alpha_prior', 'eta_beta_prior'])
INFO:crowdnalysis:Type of w is <class 'int'>
INFO:crowdnalysis:Type of t is <class 'int'>
INFO:crowdnalysis:Type of a is <class 'int'>
INFO:crowdnalysis:Type of k is <class 'int'>
INFO:crowdnalysis:Type of l is <class 'int'>
INFO:crowdnalysis:Type of classes is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of t_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of w_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of ann is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of tau_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of eta_alpha_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of eta_beta_prior is <class 'numpy

StanMultinomialEtaOptimizeConsensus.Parameters(tau=array([0.12193963, 0.33082948, 0.54723089]), eta=array([[0.84975497, 0.68193728],
       [1.4117387 , 0.85952437],
       [1.37340851, 1.22394311]]), pi=array([[0.51728878, 0.22115113, 0.26156008],
       [0.14619502, 0.59985032, 0.25395467],
       [0.16366611, 0.19005126, 0.64628264]]))
error_rate: 0.24719999999999998
log_score: -6282.021429289687
*********
StanDSOptimize


INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:found newer exe file, not recompiling
INFO:cmdstanpy:compiled model file: /home/cerquide/prj/crowdnalysis/src/crowdnalysis/cmdstan/cmdstan/DS.fit_and_consensus
INFO:crowdnalysis:dict_keys(['w', 't', 'a', 'k', 'l', 'classes', 't_A', 'w_A', 'ann', 'tau_prior', 'pi_prior'])
INFO:crowdnalysis:Type of w is <class 'int'>
INFO:crowdnalysis:Type of t is <class 'int'>
INFO:crowdnalysis:Type of a is <class 'int'>
INFO:crowdnalysis:Type of k is <class 'int'>
INFO:crowdnalysis:Type of l is <class 'int'>
INFO:crowdnalysis:Type of classes is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of t_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of w_A is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of ann is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of tau_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:Type of pi_prior is <class 'numpy.ndarray'>
INFO:crowdnalysis:{'tau': array([0.12193963, 0.33082948, 0.54723089]), 'pi': array([[[0.51728878, 0.2

StanDSOptimizeConsensus.Parameters(tau=array([0.09990521, 0.31776175, 0.58233304]), pi=array([[[0.48240963, 0.25948933, 0.25810103],
        [0.18480674, 0.62232709, 0.19286617],
        [0.24490786, 0.2991577 , 0.45593444]],

       [[0.63849156, 0.15734124, 0.2041672 ],
        [0.11371672, 0.5971929 , 0.28909038],
        [0.09004056, 0.09442421, 0.81553523]]]))
error_rate: 0.2389
log_score: -5912.877714891227
*********
StanDSEtaHOptimize
eta: [[5. 5.]
 [5. 5.]
 [5. 5.]]
old_pi_prior: [[11.  1.  1.]
 [ 1. 11.  1.]
 [ 1.  1. 11.]]
pi_prior: [[0.98670329 0.00664835 0.00664835]
 [0.00664835 0.98670329 0.00664835]
 [0.00664835 0.00664835 0.98670329]]
param: [[[0.98670329 0.00664835 0.00664835]
  [0.00664835 0.98670329 0.00664835]
  [0.00664835 0.00664835 0.98670329]]

 [[0.98670329 0.00664835 0.00664835]
  [0.00664835 0.98670329 0.00664835]
  [0.00664835 0.00664835 0.98670329]]]
tau_init = [0.25311033 0.28231533 0.46457433]


INFO:cmdstanpy:finish chain 1


StanDSOptimizeConsensus.Parameters(tau=array([0.11275946, 0.29960391, 0.58763664]), pi=array([[[0.47619684, 0.26657692, 0.25722624],
        [0.1773302 , 0.63280997, 0.18985983],
        [0.24209421, 0.30148989, 0.45641591]],

       [[0.59080767, 0.18138261, 0.22780972],
        [0.11040813, 0.60872606, 0.2808658 ],
        [0.08981033, 0.09980814, 0.81038152]]]))
error_rate: 0.2388
log_score: -5916.3952891745585
