# Load libraries

In [1]:
%matplotlib inline

import os
import math

import configparser

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import sys
sys.path.append('../..')

from scripps.utils import read_datasets, norm1d, bayesian_linear_regression
from IPython.core.display import clear_output

In [2]:
#np.random.seed(23) #Not working

# Load all mark2cure citizen scientist annotations

In [3]:
CF = read_datasets.get_configuration()

def read_disease_annotations():
    annotations = pd.DataFrame()
    anno = read_datasets.load_dataset('m2c_citizen_disease', 'files1')
    anno[3].replace(['I-Disease', 'O'], [1, 0], inplace=True)
    annotations = pd.concat([annotations, anno[0].rename('Token')], axis=1)
    annotations = pd.concat([annotations, anno[3].rename('Annotator1')], axis=1)
    for i in range(2, 6):
        anno = read_datasets.load_dataset('m2c_citizen_disease', 'files{}'.format(i))
        anno[3].replace(['I-Disease', 'O'], [1, 0], inplace=True)
        annotations = pd.concat([annotations, anno[3].rename('Annotator{}'.format(i))], axis=1)
    return annotations

def read_phenotype_annotations():
    annotations = pd.DataFrame()
    anno = read_datasets.load_dataset('m2c_citizen_phenotype', 'files1')
    anno[3].replace(['I-Phenotype', 'O'], [1, 0], inplace=True)
    annotations = pd.concat([annotations, anno[0].rename('Token')], axis=1)
    annotations = pd.concat([annotations, anno[3].rename('Annotator1')], axis=1)
    for i in range(2, 6):
        anno = read_datasets.load_dataset('m2c_citizen_phenotype', 'files{}'.format(i))
        anno[3].replace(['I-Phenotype', 'O'], [1, 0], inplace=True)
        annotations = pd.concat([annotations, anno[3].rename('Annotator{}'.format(i))], axis=1)
    return annotations

In [4]:
disease_annotations = read_disease_annotations()
phenotype_annotations = read_phenotype_annotations()

In [5]:
disease_annotations

Unnamed: 0,Token,Annotator1,Annotator2,Annotator3,Annotator4,Annotator5
0,Haematuria,0,0,1,0,1
1,and,0,0,0,0,0
2,abdominal,1,1,1,1,1
3,aortic,1,1,1,1,1
4,aneurysm,1,1,1,1,1
5,.,0,0,0,0,0
6,Haematuria,0,0,1,0,1
7,and,0,0,0,0,0
8,left,0,0,0,0,0
9,loin,0,0,0,0,0


In [6]:
phenotype_annotations

Unnamed: 0,Token,Annotator1,Annotator2,Annotator3,Annotator4,Annotator5
0,Haematuria,1,1,1,1,1
1,and,0,0,0,0,0
2,abdominal,0,0,0,1,0
3,aortic,0,0,0,1,0
4,aneurysm,0,0,0,1,0
5,.,0,0,0,0,0
6,Haematuria,1,1,1,1,1
7,and,0,0,0,0,0
8,left,1,1,0,0,1
9,loin,1,1,0,1,1


In [7]:
num_annotators = 5
num_tokens = disease_annotations.shape[0]

disease_relevant = pd.DataFrame()
phenotype_relevant = pd.DataFrame()

relevant = []
for k in range(num_tokens):
    disease = 0
    phenotype = 0
    
    for j in range(1, num_annotators+1):                
        d = disease_annotations['Annotator{}'.format(j)][k]
        p = phenotype_annotations['Annotator{}'.format(j)][k]
                
        disease += d 
        phenotype += p
        
    if disease == 0 and phenotype == 0:
        continue
    else:
        relevant.append(k)

relevant = np.array(relevant)
print '{} relevant words.'.format(strrelevant.size)
disease_relevant = disease_annotations.iloc[relevant]
phenotype_relevant = phenotype_annotations.iloc[relevant]


(1611, 'relevant words.')


# Initialize latent variables

In [8]:
K = disease_relevant.shape[0]
J = disease_relevant.shape[1] - 1

latent = {}

latent['theta'] = np.zeros(K)
latent['d_alpha'] = np.zeros(J)
latent['d_beta'] = np.zeros(J)
latent['d_z'] = np.zeros((K, J))

latent['theta'] = np.zeros(K)
latent['p_alpha'] = np.zeros(J)
latent['p_beta'] = np.zeros(J)
latent['p_z'] = np.zeros((K, J))

#Should these be separate for disease and phenotype?
latent['b0'] = np.zeros(2)
latent['B0_scale'] = 1
latent['m0'] = np.zeros(K) #TODO: ask HS regarding changing certain values here.
latent['C0_scale'] = 1

latent['K'] = K
latent['J'] = J

# Define functions for resampling using Gibbs sampling

In [47]:
def __get_alpha_beta_z(mode):
    if mode == 'disease':
        alpha = latent['d_alpha']
        beta = latent['d_beta']
        z = latent['d_z']
    else:
        alpha = latent['p_alpha']
        beta = latent['p_beta']
        z = latent['p_z']
    return alpha, beta, z
    

In [48]:
def __resample_z(alpha, beta, z, annotations):
    interval = [-100, 100] #Should this be changed?
    for k in range(K):
        for j in range(J):
            mean = alpha[j] + beta[j]*latent['theta'][k]
            std_dev = 1
            if(annotations['Annotator{}'.format(j+1)][k] == 0):
                z[k, j] = norm1d.truncnormal(mean, std_dev, interval[0], 0)
            elif(annotations['Annotator{}'.format(j+1)][k] == 1):
                z[k, j] = norm1d.truncnormal(mean, std_dev, 0, interval[1])
    
def resample_z(mode):
    alpha, beta, z = __get_alpha_beta_z(mode)
    if mode == 'disease':
        annotations = disease_annotations
    else:
        annotations = phenotype_annotations
    __resample_z(alpha, beta, z, annotations)
    return latent['d_z'] if mode == 'disease' else latent['p_z']

In [49]:
def __resample_alpha_beta(alpha, beta, z):
    for j in range(J):
        Y = z[:, j]
        X = np.vstack((np.ones(K), latent['theta'])).T
        
        WN, VN = bayesian_linear_regression.linreg_post(X, Y, latent['b0'], latent['B0_scale'], 1)
        sample = np.random.multivariate_normal(WN, VN)
        alpha[j] = sample[0]
        beta[j] = sample[1]

def resample_alpha_beta(mode):
    alpha, beta, z = __get_alpha_beta_z(mode)
    __resample_alpha_beta(alpha, beta, z)
    return alpha, beta

In [50]:
def resample_theta():
    for k in range(K):                
        Y = np.concatenate((latent['d_z'][k, :] - latent['d_alpha'], latent['p_z'][k, :] - latent['p_alpha']))
        X = np.concatenate((latent['d_beta'], latent['p_beta']))
        X = np.reshape(X, (-1, 1))
        WN, VN = bayesian_linear_regression.linreg_post(X, Y, latent['m0'][k], latent['C0_scale'], 1)
        latent['theta'][k] = np.random.normal(WN, math.sqrt(VN))    

In [108]:
def save_latent_variables():
    #TODO: save to csv.
    pass


def resample():
    d_z = resample_z('disease')
    p_z = resample_z('phenotype')
    
    resample_theta()
        
    d_alpha, d_beta = resample_alpha_beta('disease')
    p_alpha, p_beta = resample_alpha_beta('phenotype')    
    
    save_latent_variables()
    
    return latent['theta'], d_alpha, d_beta, p_alpha, p_beta, d_z, p_z
    
def has_converged():
    #TODO
    pass

def summarize_param(params, burn_in):
    return np.mean(params[burn_in:], axis=0)

def plot_param(params, param_name, burn_in=0):
    plt.hist(params)
    plt.xlabel(param_name)
    plt.show()

def run_sampling(num_iter, burn_in): 
    zs = []
    thetas = []
    d_alphas = []
    d_betas = []
    p_alphas = []
    p_betas = []
    d_zs = []
    p_zs = []
    
    #TODO: consider first step, when we decide if word is relevant or not.
    for it in xrange(1, num_iter+1):
        theta, d_alpha, d_beta, p_alpha, p_beta, d_z, p_z = resample()
        thetas.append(np.array(theta))
        d_alphas.append(np.array(d_alpha))
        d_betas.append(np.array(d_beta))
        p_alphas.append(np.array(p_alpha))
        p_betas.append(np.array(p_beta))
        d_zs.append(np.array(d_z))
        p_zs.append(np.array(p_z))
        
        if it%5 == 0:
            clear_output()
            print 'Iter {}/{} done.'.format(it, num_iter)
        if has_converged():
            break
    
    all_vars = {}            
    all_vars['thetas'] = thetas
    all_vars['d_alphas'] = d_alphas
    all_vars['d_betas'] = d_betas
    all_vars['p_alphas'] = p_alphas
    all_vars['p_betas'] = p_betas
    all_vars['d_zs'] = d_zs
    all_vars['p_zs'] = p_zs
    
    summary = {}                  
    summary['avg_d_alpha'] = summarize_param(d_alphas, burn_in)
    summary['avg_d_beta'] = summarize_param(d_betas, burn_in)
    summary['avg_p_alpha'] = summarize_param(p_alphas, burn_in)
    summary['avg_p_beta'] = summarize_param(p_betas, burn_in)
    summary['avg_theta'] = summarize_param(thetas, burn_in)    
    summary['avg_d_z'] = summarize_param(d_zs, burn_in)    
    summary['avg_p_z'] = summarize_param(p_zs, burn_in)    
      
    return all_vars, summary

In [109]:
all_vars, summary = run_sampling(2000, 200)

Iter 2000/2000 done.


In [110]:
print 'avg_d_alpha:', summary['avg_d_alpha']
print 'avg_d_beta:', summary['avg_d_beta']
print 'avg_p_alpha:', summary['avg_p_alpha']
print 'avg_p_beta:', summary['avg_p_beta']
print 'avg_theta:', summary['avg_theta']

avg_d_alpha: [-4.37793529 -3.89152899 -4.57312176 -4.56912782 -3.386732  ]
avg_d_beta: [3.05601462 2.66526233 3.09988286 3.02906629 2.69682765]
avg_p_alpha: [-2.18246866 -2.25574923 -2.53428157 -2.72996879 -1.22260312]
avg_p_beta: [0.80637322 1.03366649 1.21275986 1.67242461 0.26719972]
avg_theta: [ 1.51411543 -0.15540282  1.95629708 ... -0.339229   -0.22136739
 -0.36581798]


In [111]:
def get_annotator_bias(alpha, beta):
    # compute xr and xa using above latent vars.
#     xa = (4*alpha - beta*beta)/(4*beta)
#     xr = (4*alpha + beta*beta)/(4*beta)
    xa = (-2*alpha/beta - beta/2)/2
    xr = -2*alpha/beta - xa
    return xa, xr

def get_word_positions(theta, relevant):
    # convert theta locations to an interval [-1, 1]?    
#     x_coords = theta
#     y_coords = np.zeros(theta.size)

#     for i,token in enumerate(tokens):
#         x = x_coords[i]
#         y = y_coords[i]
#         plt.scatter(x, y, marker='x', color='red')
#         plt.text(x+0.3, y+0.3, token, fontsize=9)
#     plt.show()
    positions = pd.DataFrame()
    positions['Tokens'] = disease_annotations['Token']
    positions['Position'] = ''
    positions['Position'].iloc[relevant] = theta    
    for i in range(1, J+1):
        positions['D-A{}'.format(i)] = disease_annotations['Annotator{}'.format(i)]
        positions['P-A{}'.format(i)] = phenotype_annotations['Annotator{}'.format(i)]
    
    positions['Disease'] = np.sum([positions['D-A{}'.format(i)] for i in range(1, J+1)], axis=0)
    positions['Phenotype'] = np.sum([positions['P-A{}'.format(i)] for i in range(1, J+1)], axis=0)
    return positions

def plot_bias():
    # show the annotator's location along with the word's true position.
    pass


In [112]:
def alignment_to_expert_positions():
    pass

In [113]:
xpa, xpr = get_annotator_bias(summary['avg_p_alpha'], summary['avg_p_beta'])

In [114]:
xda, xdr = get_annotator_bias(summary['avg_d_alpha'], summary['avg_d_beta'])

In [115]:
xpa - xpr

array([-0.40318661, -0.51683325, -0.60637993, -0.83621231, -0.13359986])

In [116]:
xda - xdr

array([-1.52800731, -1.33263116, -1.54994143, -1.51453315, -1.34841382])

In [117]:
(xpa + xpr)/2

array([2.70652422, 2.18227954, 2.08968127, 1.63234191, 4.57561527])

In [118]:
(xda + xdr)/2

array([1.4325636 , 1.4600923 , 1.47525631, 1.5084278 , 1.25582071])

In [119]:
print min(summary['avg_theta'])
print max(summary['avg_theta'])

-0.9614429588136572
2.5046971545535563


In [120]:
positions = get_word_positions(summary['avg_theta'], relevant)

In [121]:
positions

Unnamed: 0,Tokens,Position,D-A1,P-A1,D-A2,P-A2,D-A3,P-A3,D-A4,P-A4,D-A5,P-A5,Disease,Phenotype
0,Haematuria,1.51412,0,1,0,1,1,1,0,1,1,1,2,5
1,and,,0,0,0,0,0,0,0,0,0,0,0,0
2,abdominal,-0.155403,1,0,1,0,1,0,1,1,1,0,5,1
3,aortic,1.9563,1,0,1,0,1,0,1,1,1,0,5,1
4,aneurysm,1.94427,1,0,1,0,1,0,1,1,1,0,5,1
5,.,,0,0,0,0,0,0,0,0,0,0,0,0
6,Haematuria,1.92921,0,1,0,1,1,1,0,1,1,1,2,5
7,and,,0,0,0,0,0,0,0,0,0,0,0,0
8,left,-0.228346,0,1,0,1,0,0,0,0,0,1,0,3
9,loin,1.52732,0,1,0,1,0,0,0,1,0,1,0,4


In [124]:
positions.iloc[relevant]

Unnamed: 0,Tokens,Position,D-A1,P-A1,D-A2,P-A2,D-A3,P-A3,D-A4,P-A4,D-A5,P-A5,Disease,Phenotype
0,Haematuria,1.51412,0,1,0,1,1,1,0,1,1,1,2,5
2,abdominal,-0.155403,1,0,1,0,1,0,1,1,1,0,5,1
3,aortic,1.9563,1,0,1,0,1,0,1,1,1,0,5,1
4,aneurysm,1.94427,1,0,1,0,1,0,1,1,1,0,5,1
6,Haematuria,1.92921,0,1,0,1,1,1,0,1,1,1,2,5
8,left,-0.228346,0,1,0,1,0,0,0,0,0,1,0,3
9,loin,1.52732,0,1,0,1,0,0,0,1,0,1,0,4
10,pain,-0.485733,0,1,0,1,1,1,0,1,0,1,1,5
15,abdominal,0.806437,1,0,1,0,1,0,1,1,1,0,5,1
16,aortic,1.05134,1,0,1,0,1,0,1,1,1,0,5,1


In [None]:
plot_param(np.mean(all_vars['thetas'], 'theta'))

In [126]:
all_vars['thetas']

[array([ 1.5692669 , -0.98034708,  2.10286193, ..., -1.03719765,
         0.36262094,  0.55757961]),
 array([ 1.49852791, -1.07235074,  1.9208797 , ..., -0.99040718,
         0.42483143,  0.76478302]),
 array([ 1.46360197, -1.06210672,  2.08726162, ..., -1.11545212,
         0.40487677,  0.80136473]),
 array([ 1.62682711, -1.12500396,  2.03628293, ..., -1.06795075,
         0.23925816,  0.86522117]),
 array([ 1.355473  , -0.96795012,  1.87826565, ..., -1.21894876,
        -0.2530398 ,  0.82549543]),
 array([ 1.50228319, -1.00647175,  1.87228288, ..., -1.2780292 ,
        -0.17749303,  0.60799424]),
 array([ 1.60651037, -1.33180254,  2.06881127, ..., -1.11967061,
        -0.41497537,  0.59939365]),
 array([ 1.58238356, -1.42012232,  1.95105472, ..., -1.05300906,
        -0.39822125,  0.54516381]),
 array([ 1.1892216 , -1.30960384,  1.97679219, ..., -0.80104158,
        -0.42532604,  0.89186758]),
 array([ 1.31101231, -1.43134341,  1.74856364, ..., -0.75082353,
        -0.03661148,  1.16