In [203]:
# standard
from IPython import embed
import pandas as pd
import numpy as np

# frameworks
from frameworks.seq2seq_keras.models import AttentionSeq2Seq
from gensim.models import Word2Vec

# custom
from data_utils import get_train_data
from word2vec import get_word_embedding

# Global Variables

In [204]:
# Mini-batch size. Not referenced by any of the cells shown in this notebook.
_BATCH_SIZE = 64
# Vocabulary size; `_VOCAB_SIZE - 1` is used below as the padding token id.
_VOCAB_SIZE = 6000
# Embedding width. Passed to `get_word_embedding` and used as the model's
# input, hidden and output dimension.
_WORD_DIM = 128
# Passed as `depth` to AttentionSeq2Seq.
_MODEL_DEPTH = 4

# Sequence lengths passed to the model as `input_length` / `output_length`.
_INPUT_LENGTH = 25
_OUTPUT_LENGTH = 10

# Model

In [None]:
# Build the attention seq2seq network. Input, hidden and output widths are all
# set to _WORD_DIM; sequence lengths come from the global configuration.
model = AttentionSeq2Seq(
    input_length=_INPUT_LENGTH,
    output_length=_OUTPUT_LENGTH,
    input_dim=_WORD_DIM,
    hidden_dim=_WORD_DIM,
    output_dim=_WORD_DIM,
    depth=_MODEL_DEPTH,
)
# Mean-squared-error loss with the RMSprop optimizer.
model.compile(optimizer='rmsprop', loss='mse')

# Data

In [48]:
# Obtain the embedding table for _WORD_DIM-wide vectors; later cells index it
# by integer token id (`embedding[x[0]]`).
embedding = get_word_embedding(_WORD_DIM)

In [6]:
# Training records; later cells read the 'keyword' and 'sentence' entries of each.
train_data = get_train_data()
# NOTE(review): `get_vocab` is not imported by the import cell at the top of
# this notebook, so this line would raise NameError on a fresh kernel unless it
# is brought into scope elsewhere. Its source module is not visible here —
# confirm and add the import.
# Only the second return value is kept; it is indexed by character below.
_, ch2int = get_vocab()

In [7]:
# Sanity check: number of training records.
len(train_data)

39956

In [29]:
def pad_to(lst, length, value):
    """Right-pad ``lst`` in place with ``value`` until it holds ``length`` items.

    A list that already has ``length`` items or more is left untouched; it is
    never truncated. Every padding slot refers to the same ``value`` object.

    Returns the very list object that was passed in.
    """
    missing = length - len(lst)
    # A non-positive repeat count yields an empty list, so nothing is appended.
    lst.extend([value] * missing)
    return lst

def clean_train_data(train_data):
    """Turn training records into padded integer-id sequences for the model.

    A record's position inside its group of four consecutive records is
    ``idx % 4`` (NOTE(review): presumably one four-line poem per group —
    confirm against the data loader). For each record:

    * the input is the record's keyword followed by the sentences of the
      earlier records in the same group, mapped through ``ch2int`` and padded
      to ``_INPUT_LENGTH``;
    * the target is the record's own sentence, mapped through ``ch2int`` and
      padded to ``_OUTPUT_LENGTH``.

    Every token id is wrapped in a one-element list and the padding token is
    ``[_VOCAB_SIZE - 1]``. Sequences longer than the target length are NOT
    truncated (``pad_to`` only ever appends).

    Args:
        train_data: indexable sequence of mappings with 'keyword' and
            'sentence' entries.

    Returns:
        ``(X_train, Y_train)``: two lists with one entry per record.

    Raises:
        KeyError: if a character is missing from ``ch2int``.
    """
    pad_id = _VOCAB_SIZE - 1
    X_train = []
    Y_train = []
    for idx, record in enumerate(train_data):
        line_number = idx % 4

        # Sentences of the records that precede this one within its group,
        # oldest first.
        earlier_sentences = ''.join(
            train_data[idx - back]['sentence']
            for back in range(line_number, 0, -1)
        )
        source_text = record['keyword'] + earlier_sentences

        # Pad to the same lengths the model is built with, instead of
        # repeating the literals 25 and 10 here.
        X_train.append(
            pad_to([[ch2int[ch]] for ch in source_text], _INPUT_LENGTH, [pad_id])
        )
        Y_train.append(
            pad_to([[ch2int[ch]] for ch in record['sentence']], _OUTPUT_LENGTH, [pad_id])
        )

    return X_train, Y_train

In [30]:
# Build the padded integer-id inputs and targets for the whole training set.
X_train, Y_train = clean_train_data(train_data)

In [85]:
# Swap every one-element [token_id] wrapper for that token's embedding vector.
X_train_embedded = [[embedding[token[0]] for token in sample] for sample in X_train]

In [86]:
# Swap every one-element [token_id] wrapper for that token's embedding vector.
Y_train_embedded = [[embedding[token[0]] for token in sample] for sample in Y_train]

# Training

In [195]:
# Keras interprets a plain Python list as "one array per model input", so the
# nested lists are converted to single ndarrays first — the same conversion the
# generation cells apply before calling `model.predict`.
# NOTE(review): `_BATCH_SIZE` is defined in the configuration cell but is not
# passed here, so the library's default batch size applies — confirm intent.
model.fit(np.array(X_train_embedded), np.array(Y_train_embedded), epochs=1, verbose=1)

KeyboardInterrupt: 

# Generation

In [105]:
# Seed keyword for generation (unicode literal).
kw = u'山水'

In [113]:
# Encode the keyword the same way as a training input: one [token_id] per
# character, padded with the padding token up to the model's input length
# (the named constant rather than a literal 25, so the two cannot drift apart).
# The outer list makes this a batch of one sample.
kw_pad = [pad_to([[ch2int[ch]] for ch in kw], _INPUT_LENGTH, [_VOCAB_SIZE - 1])]

In [148]:
# Swap every one-element [token_id] wrapper for that token's embedding vector.
kw_embed = [[embedding[token[0]] for token in sample] for sample in kw_pad]

In [151]:
# Stack the nested lists into one ndarray for `model.predict`
# (presumably shaped (1, _INPUT_LENGTH, _WORD_DIM) — TODO confirm).
kw_embed_array = np.array(kw_embed)

In [196]:
# Run the model on the embedded keyword; the bare `pred` on the last line
# displays the raw output vectors.
pred = model.predict(kw_embed_array)
pred

array([[[-0.07901192,  0.05590886, -0.18306582, ..., -0.02780079,
          0.22095318,  0.06291175],
        [-0.08044007,  0.2160262 , -0.211705  , ..., -0.04248908,
          0.20641054,  0.20601243],
        [-0.12439758,  0.15313667, -0.10564294, ...,  0.0318213 ,
          0.1692463 , -0.03528845],
        ..., 
        [-0.03381282, -0.23145901,  0.80336988, ...,  0.77744192,
         -0.29983968, -0.52655691],
        [-0.04145544, -0.23551586,  0.82956284, ...,  0.80246538,
         -0.31093204, -0.54147804],
        [-0.03681917, -0.22557406,  0.83075798, ...,  0.80682409,
         -0.3058888 , -0.53655243]]], dtype=float32)

In [197]:
# Load the saved Word2Vec model; it is used below to map each predicted vector
# back to its most similar vocabulary entry.
w2v_model = Word2Vec.load('data/word2vec.model')

In [198]:
# For each row of the first (only) predicted sample, ask the Word2Vec model for
# the single most similar vocabulary entry.
result = [
    w2v_model.most_similar(positive=[step_vector], topn=1)
    for step_vector in pred[0]
]

In [199]:
# Each entry holds the top match for one output position; print only its first
# field (the matched token), one per line.
for nearest in result:
    print(nearest[0][0])

又
皈
春
花
送
灺
情
透
透
透
