# Load libraries

In [262]:
%matplotlib inline

import os
import math

import configparser

import numpy as np
import pandas as pd
import matplotlib.pylab as plt

import sys
sys.path.append('../..')

from scripps.utils import read_datasets, norm1d, bayesian_linear_regression

In [263]:
#np.random.seed(23) #Not working

# Load all mark2cure citizen scientist annotations

In [264]:
CF = read_datasets.get_configuration()

def read_disease_annotations():
    annotations = pd.DataFrame()
    anno = read_datasets.load_dataset('m2c_citizen_disease', 'files1')
    anno[3].replace(['I-Disease', 'O'], [1, 0], inplace=True)
    annotations = pd.concat([annotations, anno[0].rename('Token')], axis=1)
    annotations = pd.concat([annotations, anno[3].rename('Annotator1')], axis=1)
    for i in range(2, 6):
        anno = read_datasets.load_dataset('m2c_citizen_disease', 'files{}'.format(i))
        anno[3].replace(['I-Disease', 'O'], [1, 0], inplace=True)
        annotations = pd.concat([annotations, anno[3].rename('Annotator{}'.format(i))], axis=1)
    return annotations

def read_phenotype_annotations():
    annotations = pd.DataFrame()
    anno = read_datasets.load_dataset('m2c_citizen_phenotype', 'files1')
    anno[3].replace(['I-Phenotype', 'O'], [1, 0], inplace=True)
    annotations = pd.concat([annotations, anno[0].rename('Token')], axis=1)
    annotations = pd.concat([annotations, anno[3].rename('Annotator1')], axis=1)
    for i in range(2, 6):
        anno = read_datasets.load_dataset('m2c_citizen_phenotype', 'files{}'.format(i))
        anno[3].replace(['I-Phenotype', 'O'], [1, 0], inplace=True)
        annotations = pd.concat([annotations, anno[3].rename('Annotator{}'.format(i))], axis=1)
    return annotations

In [265]:
disease_annotations = read_disease_annotations()
phenotype_annotations = read_phenotype_annotations()

# Initialize latent variables

In [266]:
K = disease_annotations.shape[0]
J = disease_annotations.shape[1] - 1

latent = {}

latent['theta'] = np.zeros(K)
latent['d_alpha'] = np.zeros(J)
latent['d_beta'] = np.zeros(J)
latent['d_z'] = np.zeros((K, J))

latent['theta'] = np.zeros(K)
latent['p_alpha'] = np.zeros(J)
latent['p_beta'] = np.zeros(J)
latent['p_z'] = np.zeros((K, J))

#Should these be separate for disease and phenotype?
latent['b0'] = np.zeros(2)
latent['B0_scale'] = 1
latent['m0'] = np.zeros(K) #TODO: ask HS regarding changing certain values here.
latent['C0_scale'] = 1

latent['K'] = K
latent['J'] = J

# Define functions for resampling using Gibbs sampling

In [267]:
def __get_alpha_beta_z(mode):
    if mode == 'disease':
        alpha = latent['d_alpha']
        beta = latent['d_beta']
        z = latent['d_z']
    else:
        alpha = latent['p_alpha']
        beta = latent['p_beta']
        z = latent['p_z']
    return alpha, beta, z
    

In [268]:
def __resample_z(alpha, beta, z, annotations):
    interval = [-100, 100] #Should this be changed?
    for k in range(K):
        for j in range(J):
            mean = alpha[j] + beta[j]*latent['theta'][k]
            std_dev = 1
            if(annotations['Annotator{}'.format(j+1)][k] == 0):
                z[k, j] = norm1d.truncnormal(mean, std_dev, interval[0], 0)
            elif(annotations['Annotator{}'.format(j+1)][k] == 1):
                z[k, j] = norm1d.truncnormal(mean, std_dev, 0, interval[1])
    
def resample_z(mode):
    alpha, beta, z = __get_alpha_beta_z(mode)
    if mode == 'disease':
        annotations = disease_annotations
    else:
        annotations = phenotype_annotations
    __resample_z(alpha, beta, z, annotations)

In [269]:
def __resample_alpha_beta(alpha, beta, z):
    for j in range(J):
        Y = z[:, j]
        X = np.vstack((np.ones(K), latent['theta'])).T
        
        WN, VN = bayesian_linear_regression.linreg_post(X, Y, latent['b0'], latent['B0_scale'], 1)
        sample = np.random.multivariate_normal(WN, VN)
        alpha[j] = sample[0]
        beta[j] = sample[1]

def resample_alpha_beta(mode):
    alpha, beta, z = __get_alpha_beta_z(mode)
    __resample_alpha_beta(alpha, beta, z)

In [270]:
def resample_theta():
    for k in range(K):
        Y = latent['d_z'][k, :] - latent['d_alpha'] + latent['p_z'][k, :] - latent['p_alpha']
        X = latent['d_beta'] + latent['p_beta']
        X = np.reshape(X, (-1, 1))
        WN, VN = bayesian_linear_regression.linreg_post(X, Y, latent['m0'][k], latent['C0_scale'], 1)
        latent['theta'][k] = np.random.normal(WN, math.sqrt(VN))

In [271]:
def save_latent_variables():
    #TODO: save to csv.
    pass


def resample():
    resample_z('disease')
    resample_z('phenotype')
    resample_theta()
    resample_alpha_beta('disease')
    resample_alpha_beta('phenotype')
    
    save_latent_variables()
    
def has_converged():
    #TODO
    pass
    
def run_sampling(num_iter):
    #TODO: consider first step, when we decide if word is relevant or not.
    for it in xrange(1, num_iter+1):
        resample()
        if it%5 == 0:
            print 'Iter {}/{} done.'.format(it, num_iter)
        if has_converged():
            break


In [272]:
run_sampling(10)

Iter 5/10 done.
Iter 10/10 done.


In [273]:
latent['d_z'][0]

array([-1.16569645, -0.43630676,  0.36413131, -0.32486116,  2.16551727])

In [274]:
latent['p_z'][0]

array([0.01135435, 0.16328215, 1.15243926, 0.13046897, 1.01885551])

In [242]:
disease_annotations.iloc[0:2]

Unnamed: 0,Token,Annotator1,Annotator2,Annotator3,Annotator4,Annotator5
0,Haematuria,0,0,1,0,1
1,and,0,0,0,0,0


In [243]:
phenotype_annotations.iloc[0:2]

Unnamed: 0,Token,Annotator1,Annotator2,Annotator3,Annotator4,Annotator5
0,Haematuria,1,1,1,1,1
1,and,0,0,0,0,0


In [251]:
def get_annotator_bias(alpha, beta):
    # compute xr and xa using above latent vars.
    xa = (4*alpha - beta*beta)/(4*beta)
    xr = (4*alpha + beta*beta)/(4*beta)
    return xa, xr

def get_word_position():
    # convert theta locations to an interval [-1, 1]?
    pass

def plot_bias():
    # show the annotator's location along with the word's true position.
    pass


In [250]:
def alignment_to_expert_positions():
    pass

In [275]:
xpa, xpr = get_annotator_bias(latent['p_alpha'], latent['p_beta'])

In [276]:
xpa

array([-3.84232111, -4.21868841, -3.68808512, -2.76334705, -4.32570715])

In [277]:
xpr

array([-3.61888025, -4.01714896, -3.4624524 , -2.48564895, -4.17559956])

In [278]:
xda, xdr = get_annotator_bias(latent['d_alpha'], latent['d_beta'])

In [279]:
xda

array([-2.47078282, -2.74021829, -2.85063176, -2.8680804 , -2.19371895])

In [280]:
xdr

array([-2.17309469, -2.4754409 , -2.57508274, -2.59439885, -1.90408841])

In [261]:
min(latent['theta'])

-4.181559044469624