# Generation test of LK-C-Model
### imports

In [29]:
from blocks.model import Model
from conllutil import CoNLLData
from itertools import chain
from network import *
from numpy import array, load, save, zeros
from os import path
from pandas import factorize
from random import randint
from scipy.stats import pearsonr
from theano import function
from theano.tensor.sharedvar import SharedVariable
from theano.tensor import matrix, TensorType
from util import StateComputer

### constants

In [30]:
# Significance level: a correlation is only kept if its p-value is below this
ALPHA = 0.05

# Serialized model parameters and the matching index -> token vocabulary
MODEL_FILE = './models/hdt/hdt-ncs-eos-np-35-7-1.pkl'
IX_2_TOK_FILE = './data/hdt-ncs-eos-np-35-7-1_ix2tok.npy'

# Directory handed to CoNLLData when reading the treebank
HDT_DIR = '../datasets/hdt/hamburg-dependency-treebank-conll/'

# Target folder for the (currently disabled) numpy export of the parameters
NP_FOLDER = './data/np'

### Build model

In [3]:
# The vocabulary file holds a pickled Python object (unwrapped with .item()
# and used as a dict below). Since NumPy 1.16.3 `load` refuses pickled
# payloads unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
ix2tok = load(IX_2_TOK_FILE, allow_pickle=True).item()

# The input dimension of the network equals the vocabulary size
nt = Network(NetworkType.LSTM, input_dim=len(ix2tok))
nt.set_parameters(MODEL_FILE)

### Building generator

In [4]:
# Wrap the generator's sampling graph in a Model so that its parameters
# can be looked up by their path names.
model = Model(nt.generator.generate(n_steps=nt.x.shape[0], batch_size=nt.x.shape[1]))
param_dict = model.get_parameter_dict()

# Shared variables holding the initial hidden state and cells of each layer
init_state_0 = param_dict['/sequencegenerator/with_fake_attention/transition/layer#0.initial_state#0']
init_cells_0 = param_dict['/sequencegenerator/with_fake_attention/transition/layer#0.initial_cells']

init_state_1 = param_dict['/sequencegenerator/with_fake_attention/transition/layer#1.initial_state#1']
init_cells_1 = param_dict['/sequencegenerator/with_fake_attention/transition/layer#1.initial_cells']

init_state_2 = param_dict['/sequencegenerator/with_fake_attention/transition/layer#2.initial_state#2']
init_cells_2 = param_dict['/sequencegenerator/with_fake_attention/transition/layer#2.initial_cells']

# Snapshot of the values at load time, keyed by layer index as (state, cells),
# so the generator can be put back into this condition later.
reset_values = {
    layer: (state_var.get_value(), cells_var.get_value())
    for layer, (state_var, cells_var) in enumerate([
        (init_state_0, init_cells_0),
        (init_state_1, init_cells_1),
        (init_state_2, init_cells_2),
    ])
}

# Compiled sampling function; inputs may be downcast to the graph's dtypes
gen_func = model.get_theano_function(allow_input_downcast=True)

In [5]:
# Inverse vocabulary: token -> index
tok2ix = dict((token, index) for index, token in ix2tok.items())

# Helper used below to read out state activations for index sequences
sc = StateComputer(nt.cost_model, tok2ix)

# Before we continue ... save the model in a lightweight format

In [6]:
#save(path.join(NP_FOLDER, 'param_dict.npy'), array(param_dict))
#for k in param_dict:
#    save(path.join(NP_FOLDER, k.replace('/', '-')), param_dict[k].get_value())

### Generation procedure

In [7]:
def reset_generator():
    """Restore the generator's initial states and cells from ``reset_values``.

    ``reset_values`` maps a layer index to a ``(state, cells)`` pair of values
    captured when the generator was built. The values are written back into
    the shared variables with ``set_value``; merely assigning them to names
    inside this function would only create locals and leave the generator
    untouched.
    """
    shared_by_layer = (
        (init_state_0, init_cells_0),
        (init_state_1, init_cells_1),
        (init_state_2, init_cells_2),
    )
    for layer, (state_var, cells_var) in enumerate(shared_by_layer):
        state_value, cells_value = reset_values[layer]
        state_var.set_value(state_value)
        cells_var.set_value(cells_value)
    
def init_zero():
    """Overwrite every initial state and cell vector with float32 zeros.

    The vector length is taken from the current value of ``init_state_0``
    and reused for all layers.
    """
    # not sure this is always a good idea
    size = init_state_0.get_value().shape[0]
    shared_vars = (
        init_state_0, init_cells_0,
        init_state_1, init_cells_1,
        init_state_2, init_cells_2,
    )
    for shared_var in shared_vars:
        # a fresh array per variable, exactly one set_value call each
        shared_var.set_value(zeros(size, dtype='float32'))
    
def generate_sequence(start, reset_func, max_len=None):
    """Sample a token sequence from the generator, beginning with ``start``.

    One token is generated per call to ``gen_func``. After each step the
    returned states and cells are written into the initial-state shared
    variables, so the next call continues from where the previous one ended.

    Parameters
    ----------
    start : str
        First token of the sequence; must be a key of ``tok2ix``.
    reset_func : callable
        Called without arguments once generation has finished, to put the
        shared variables back into a defined condition. It is also called
        when generation fails or is interrupted, so the generator is never
        left in a half-advanced state.
    max_len : int or None, optional
        Upper bound on the number of tokens (including ``start``). ``None``
        (the default) generates until ``'<EOS>'`` is produced.

    Returns
    -------
    str
        The tokens joined by single spaces, without the closing ``'<EOS>'``.

    Raises
    ------
    KeyError
        If ``start`` is not in ``tok2ix``.
    """
    seq = [start]
    ix = array([[tok2ix[start]]])
    try:
        while seq[-1] != '<EOS>' and (max_len is None or len(seq) < max_len):
            state_0, cells_0, state_1, cells_1, state_2, cells_2, ix, costs = gen_func(ix)
            # carry the recurrent state over to the next single-step call
            init_state_0.set_value(state_0[0][0])
            init_cells_0.set_value(cells_0[0][0])
            init_state_1.set_value(state_1[0][0])
            init_cells_1.set_value(cells_1[0][0])
            init_state_2.set_value(state_2[0][0])
            init_cells_2.set_value(cells_2[0][0])
            seq.append(ix2tok[ix[0][0]])
    finally:
        reset_func()

    # only strip the end marker; a sequence cut off by max_len keeps all tokens
    if seq[-1] == '<EOS>':
        seq = seq[:-1]
    return ' '.join(seq)


In [8]:
#print(generate_sequence('ein', reset_generator))  # good results 500 - 1000

In [26]:
# NOTE(review): `act_seq` is only assigned in the POS-correlation cell further
# down, so this inspection cell fails on a fresh top-to-bottom run. It shows
# row 1 of `act_seq` without its first column.
act_seq[1, 1:]

array([ 0.15378268, -0.19670351, -0.16562188,  0.03126331, -0.06216872,
       -0.40889442, -0.54963213, -0.11671217, -0.12459658, -0.35418853,
       -0.95723981, -0.39901543], dtype=float32)

# Check correlations with POS
## Step 1: Read all sentences from PART_A and store activations

__Comment:__ Only part A is used, since its annotations were created by hand and checked.

In [17]:
# Reader for part A of the treebank; words are lower-cased and only
# sentences within the configured length bounds are requested.
cd = CoNLLData(
    HDT_DIR,
    ['part_A.conll'],
    tok2ix,
    word_transform=str.lower,
    lazy_loading=True,
    min_len=7,
    max_len=35
)

# Every sentence as a list of vocabulary indices, closed by the <EOS> index
eos_ix = tok2ix['<EOS>']
sentences_ix = [[tok2ix[tok] for tok in seq] + [eos_ix] for seq in cd.wordsequences()]

In [18]:
# Name of the state variable whose activations are analysed
cell_name = sc.state_var_names[2]

# For testing purposes only the first sequence is read; try again later with more
first_sentence_states = sc.read_single_sequence(sentences_ix[0])
activations = [first_sentence_states[cell_name]]

### Correlate for each pos tag

In [32]:
# Correlate every activation row with every POS tag.
#
# For a given tag the tag sequence of a sentence is turned into a 0/1
# indicator (1 where the token carries that tag). Correlating against the
# integer codes of `factorize` would treat arbitrary nominal labels as
# numbers and would give the same result for every tag.
#
# Result: pos_correlations[tag][i][r] is the Pearson correlation between
# row r of the i-th activation matrix and the indicator of `tag`, or 0 if
# it is not significant at level ALPHA (or undefined).
#
# Assumes activations[i] was computed from the i-th sentence of `cd`, so
# that it lines up with the i-th POS sequence -- TODO confirm when more
# than the first sentence is read.
pos_sequences = list(cd.possequences())
tagset = set(chain(*pos_sequences))

pos_correlations = {}
for tag in tagset:
    pos_correlations[tag] = []
    for i in range(len(activations)):
        act_seq = activations[i].transpose()
        pos_seq = array([1. if pos == tag else 0. for pos in pos_sequences[i]])

        if pos_seq.all() or not pos_seq.any():
            # Tag absent from (or the only tag in) this sentence: the
            # indicator is constant and the correlation is undefined.
            pos_correlations[tag].append([.0] * act_seq.shape[0])
            continue

        correls = []
        for r in range(act_seq.shape[0]):
            # the first activation column is skipped, as in the original alignment
            crl = pearsonr(act_seq[r, 1:], pos_seq)
            correls.append(crl[0] if crl[1] < ALPHA else .0)
        pos_correlations[tag].append(correls)