In [10]:
import pickle as p
import pandas as pd
import numpy as np
import os
import sys
from time import time

from PSyn import data_operators as dops
from PSyn import brain

In [11]:
source = pd.read_csv('data/task1/train/polish-train-high', sep='\t', names=['source', 'target', 'pos'])
test = pd.read_csv('data/task1/dev/polish-dev', sep='\t', names=['source', 'target', 'pos'])

In [12]:
letters = set()
for s in source['source']:
    for c in s:
        letters.add(c)
for t in source['target']:
    for c in t:
        letters.add(c)
letters.add('_')
index_to_letter = dict(enumerate(letters))
letter_to_index = dict((v, k) for k, v in index_to_letter.items())

In [13]:
source_inflection_dict = {}

for row in source.iterrows():
    source_inflection_dict[row[1]['source']] = [letter_to_index[l] for l in row[1]['target']]
    
test_inflection = {}

for row in test.iterrows():
    test_inflection[row[1]['source']] = [letter_to_index[l] for l in row[1]['target']]

In [14]:
max_s = max([len(s) for s, i in source_inflection_dict.items()])
max_v = max([len(i) for s, i in source_inflection_dict.items()])
for s, i in source_inflection_dict.items():
    if len(s) == max_s or len(i) == max_v:
        print(s)
        print(i)

max_t_s = max([len(s) for s, i in test_inflection.items()])
max_t_v = max([len(i) for s, i in test_inflection.items()])

lampa sygnalizacyjno-ostrzegawcza
[46, 56, 1, 32, 33, 57, 21, 36, 38, 27, 56, 46, 5, 58, 56, 7, 36, 45, 27, 8, 34, 8, 21, 11, 53, 58, 3, 38, 56, 31, 7, 58, 33]
nierodzima użytkowniczka języka
[27, 5, 3, 53, 8, 30, 58, 5, 1, 36, 1, 5, 57, 42, 37, 36, 11, 49, 8, 31, 27, 5, 7, 58, 49, 56, 1, 5, 57, 45, 43, 58, 36, 49, 56]


In [15]:
pairs = np.random.permutation(list(source_inflection_dict.keys()))
test_pairs = np.random.permutation(list(test_inflection.keys()))

input_vec = np.zeros((len(pairs), 33))
labels = np.zeros((len(pairs), 35))

for i, k in enumerate(pairs):
    v = source_inflection_dict[k]
    k += '_' * (33 - len(k))
    v += [0] * (35 - len(v))
    for j, c in enumerate(k):
        input_vec[i][j] = letter_to_index[c]
    for j, n in enumerate(v):
        labels[i][j] = n

input_vec = input_vec.astype(np.int32)
labels = labels.astype(np.int32)

data_train = zip(input_vec[:-1000], labels[:-1000])
data_val = zip(input_vec[-1000:], labels[-1000:])

test_vec = np.zeros((len(test_pairs), 33))
test_labels = np.zeros((len(test_pairs), 35))

for i, k in enumerate(test_pairs):
    v = test_inflection[k]
    k += '_' * (33 - len(k))
    v += [0] * (35 - len(v))

    for j, c in enumerate(k):
        test_vec[i][j] = letter_to_index[c]
    for j, n in enumerate(v):
        test_labels[i][j] = n

test_vec = test_vec.astype(np.int32)
test_labels = test_labels.astype(np.int32)

data_test = zip(test_vec, test_labels)

In [16]:
hyperparams = {}
hyperparams['input_seq_length'] = 33
hyperparams['output_seq_length'] = 35
hyperparams['input_vocab_size'] = len(letters)
hyperparams['output_vocab_size'] = len(letters)

In [17]:
train_iter = dops.DataIterator(data_train, 128)
val_iter = dops.DataIterator(data_val, 128)
test_iter = dops.DataIterator(data_test, 128)

In [27]:
import tensorflow as tf
from tensorflow.python import ops

ops.reset_default_graph()
try:
    sess.close()
except:
    pass
sess = tf.Session()

input_seq_length = hyperparams['input_seq_length']
output_seq_length = hyperparams['output_seq_length']
batch_size = 128

input_vocab_size = hyperparams['input_vocab_size']
output_vocab_size = hyperparams['output_vocab_size']
embedding_dim = 256

encode_input = [tf.placeholder(tf.int32,
                               shape=(None,),
                               name="ei_%i" % i)
                for i in range(input_seq_length)]

labels = [tf.placeholder(tf.int32,
                         shape=(None,),
                         name="l_%i" % i)
          for i in range(output_seq_length)]

decode_input = [tf.zeros_like(encode_input[0],
                              dtype=np.int32, name="START")] + labels[:-1]

# Meat of the model
keep_prob = tf.placeholder("float")
cells = [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(embedding_dim),
                            output_keep_prob=keep_prob)
         for i in range(3)]
stacked_lstm = tf.contrib.rnn.MultiRNNCell(cells)

with tf.variable_scope("decoders") as scope:
    decode_outputs, decode_state = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm,
        input_vocab_size, output_vocab_size)

    scope.reuse_variables()

    decode_outputs_test, decode_state_test = tf.contrib.nn.legacy_seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm,
        input_vocab_size, output_vocab_size,
        feed_previous=True)

loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in labels]
loss = seq2seq.sequence_loss(decode_outputs, labels, loss_weights, output_vocab_size)
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(loss)

sess.run(tf.initialize_all_variables())

for i in range(len(list(data_train))):
    try:
        train_batch(train_iter)
        if i % 500 == 0:
            val_loss, val_predict = eval_batch(val_iter, 16)
            train_loss, train_predict = eval_batch(train_iter, 16)
            print("val loss   : %f, val predict   = %.1f%%" % (val_loss, val_predict * 100))
            print("train loss : %f, train predict = %.1f%%" % (train_loss, train_predict * 100))
            print
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("interrupted by user")
        break

eval_loss, output, X, Y = get_eval_batch_data(test_iter)

AttributeError: module 'tensorflow' has no attribute 'Session'