In [7]:
%pylab inline
import numpy as np
import pymc3 as pm
import pandas as pd
from sklearn.metrics import confusion_matrix

Populating the interactive namespace from numpy and matplotlib


In [2]:
data = np.load(pm.get_data('extrahard_MC_500_5_4.npz.npy'))
z_true = np.load(pm.get_data('extrahard_MC_500_5_4_reference_classes.npy'))

In [3]:
I = data.shape[0]               # number of items
J = data.shape[1]               # number of annotators
K = data.shape[2]               # number of classes
N = I * J

In [4]:
# create data triplets
jj = list()  # annotator IDs
ii = list()  # item IDs
y = list()   # response

# initialize true category with majority votes
z_init = np.zeros( I, dtype=np.int64 )

# create data triplets
for i in range( I ):
    ks = list()
    for j in range( J ):
        dat = data[ i, j, : ]
        k = np.where( dat == 1 )[0][0]
        ks.append( k )
        ii.append( i )
        jj.append( j )
        y.append( k )

    # getting maj vote for work item i (dealing with numpy casts)
    z_init[ i ] = np.bincount( np.array( ks ) ).argmax()

In [5]:
len(z_init),I

(500, 500)

In [6]:
confMat = confusion_matrix( z_true, z_init )
print( "Majority vote estimate of true category:\n" , confMat )

Majority vote estimate of true category:
 [[120   2   1   2]
 [  5 116   4   0]
 [  4   6 113   2]
 [  4   3   3 115]]


In [10]:
# class prevalence (flat prior)
alpha = np.ones( K )

In [11]:
import theano.tensor as tt
model = pm.Model()

with model:
    goodness = pm.Gamma('goodness', mu=1,sd=1,shape=J)
    beta = []
    for i in range(J):
        beta.append(np.ones( (K,K) ) + goodness[i]* np.diag( np.ones(K) ))
    beta = tt.as_tensor_variable(beta)
    beta = beta.reshape((J,K,K))
    pi = pm.Dirichlet( 'pi', a=alpha, shape=K )
    z = pm.Categorical( 'z', p=pi, shape=I, testval=z_init )
    y_obs = pm.Categorical( 'y_obs', p=beta[ jj, z[ ii ] ], observed=y )

  rval = inputs[0].__getitem__(inputs[1:])


In [12]:
with model:
    pm.sample( 100,  progressbar=True, njobs=8 )

Only 100 samples in chain.
  rval = inputs[0].__getitem__(inputs[1:])


ValueError: Unused step method arguments: {'njobs'}

In [None]:
pm.summary(trace)

In [None]:
pm.traceplot( trace, varnames=['pi'] )

In [None]:
z = trace['z'][-1000:,:]

z_hat = np.zeros( I )
for i in range( I ):
    z_hat[ i ] = np.bincount( z[:,i] ).argmax()

In [None]:
confMat = confusion_matrix( z_true, z_hat )
print( "Dawid-Skene estimate of true category:\n", confMat )