In [1]:
import os
import re
import copy

import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential, load_model

Using TensorFlow backend.


In [2]:
g_model = None

In [3]:
def get_data_from_files(rootdir):
    data = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if ('.capp' in file):
                textfile = subdir+'/'+file
                with open(textfile,'r') as f :
                    lines = f.readlines()
                train = []
                test = []
                for sent in lines :
                    if '*CHI:' in sent :
                        sent = re.sub('\*[A-Z]+: ', '', sent)
                        test.append(sent)
                    else :
                        sent = re.sub('\*[A-Z]+: ', '', sent)
                        train.append(sent)
                data.append((file,train,test))

    return data

In [4]:

def prepare_seq(seq, maxlen):
    # Pads seq and slides windows
    x = []
    y = []
    for i, w in enumerate(seq):
        x_padded = pad_sequences([seq[:i]],
                                 maxlen=maxlen - 1,
                                 padding='pre')[0]  # Pads before each sequence
        x.append(x_padded)
        y.append(w)
    return x, y



def prepare_train_set(seqs) :
    maxlen = max([len(seq) for seq in seqs])
    x = []
    y = []

    # Slide windows over each sentence
    for seq in seqs:
        x_windows, y_windows = prepare_seq(seq, maxlen)
        x += x_windows
        y += y_windows

    x = np.array(x)
    y = np.eye(len(vocab))[(np.array(y) - 1)]  # One hot encoding

    return vocab_size,maxlen,x,y

In [5]:
def train_model(vocab_size,maxlen,x,y, output_size, hidden_size, epochs):
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size+1,  # vocabulary size. Adding an
                                                   # extra element for <PAD> word
                        output_dim = output_size,  # size of embeddings
                        input_length = maxlen-1))  # length of the padded sequences
    model.add(LSTM(hidden_size))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile('rmsprop', 'categorical_crossentropy')

    # Train network
    model.fit(x, y, epochs=epochs)
    return model

In [6]:
def get_seq_prob(word, context, maxlen):
    global g_model
    sub_seq = list(context)
    sub_seq.append(word)
    x, y = prepare_seq(sub_seq, maxlen)
    x = np.array(x)
    y = np.array(y) - 1  # The word <PAD> does not have a class

    p_pred = g_model.predict(x)
    log_p_seq = 0

    for i, prob in enumerate(p_pred):
        prob_word = prob[y[i]]
        log_p_seq += np.log(prob_word)

    return np.exp(log_p_seq)


def eval_production(seq, maxlen):
    result = 0
    vocab = list(seq)
    context = []

    while vocab != [] :
        (next_word, max_prob) = max([(v, get_seq_prob(v, context, maxlen)) for v in vocab], key=lambda prob:prob[1])
        context.append(next_word)
        vocab.remove(next_word)

    if context == seq :
        result = 1

    return result

In [7]:

def get_seq_bylength(seqs) :
    seqs_bylength = dict()
    for seq in seqs :
        seqlen = len(seq)
        if seqlen > 1:
            if seqlen in seqs_bylength:
                seqs_bylength[seqlen].append(seq)
            else :
                seqs_bylength[seqlen] = [seq]
    return seqs_bylength



def get_performance_bylength(seqs_bylength, maxlen) :
    results_bylength = dict()
    for length,seqs in seqs_bylength.items():
        results_bylength[length] = [0, len(seqs)]
        print(str(length))
        for seq in seqs:
            results_bylength[length][0] += eval_production(seq, maxlen)

    return results_bylength

    

In [8]:
rootdir = '../'

data = get_data_from_files(rootdir)



In [10]:
file = data[0][0]
train = data[0][1]
test = data[0][2]

['play checkers .\n',
 'big drum .\n',
 'big drum .\n',
 'big drum .\n',
 'big drum .\n',
 'horse .\n',
 'who that ?\n',
 'two check .\n',
 'rig horn .\n',
 'yeah .\n',
 'play checkers .\n',
 'choochoo train .\n',
 'yep .\n',
 'dere water .\n',
 'big horn .\n',
 'water right dere .\n',
 'water .\n',
 'alright look tv .\n',
 'Jack Jill come .\n',
 'part .\n',
 'part .\n',
 'Jack Jill .\n',
 'part .\n',
 'part .\n',
 'part .\n',
 'take suitcase ?\n',
 'get over Mommy .\n',
 'nickel .\n',
 'here Urler .\n',
 'nickel .\n',
 'nickel .\n',
 'nickel .\n',
 'dont Adam foot .\n',
 'shadow .\n',
 'shadow .\n',
 'shadow .\n',
 'shadow .\n',
 'yeah shadow yeah .\n',
 'shadow funny .\n',
 'put dirt up .\n',
 'I Adam dont sit Adam foot .\n',
 'book .\n',
 'read book .\n',
 'put dirt up .\n',
 'see what bear ?\n',
 'put dirt up .\n',
 'upsadaisy .\n',
 'see marching bear go ?\n',
 'sit dere .\n',
 'sit dere .\n',
 'up upsadaisy .\n',
 'sit dere .\n',
 'where book ?\n',
 'what dat ?\n',
 'read Shadow 

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train+test)

vocab = tokenizer.word_index
vocab_size = len(vocab)

train_seqs = tokenizer.texts_to_sequences(train)
test_seqs = tokenizer.texts_to_sequences(test)

vocab_size,maxlen,x,y = prepare_train_set(train_seqs)
g_model = load_model('../../../../Adam_model.h5')
seqs_bylength = get_seq_bylength(test_seqs)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [14]:
seqs_bylength

{}

In [None]:

results = get_performance_bylength(seqs_bylength, maxlen)

with open(rootdir+'lstm_baseline/prod_results/'+file.split('.capp')[0]+'.prod_result.csv','w') as f :
    f.write("iter,utterance_length,nb_utterances,produced,production_score"+'\n')
    for length in results:
        f.write('1,'+str(length)+','+
                        str(results[length][1])+','+
                        str(results[length][0])+','+
                        str(results[length][0]/results[length][1])+'\n')
del g_model

In [163]:
seqs_bylength = get_seq_bylength(test)

In [164]:
results = get_performance_bylength(seqs_bylength, maxlen)

2
3
6


KeyboardInterrupt: 

In [165]:
seqs_bylength.keys()

dict_keys([2, 3, 6, 4, 7, 5, 8, 12, 9, 11, 10, 14, 13, 25, 15, 20, 18, 17, 19, 16, 44, 22, 21, 27, 23])

In [150]:
results

{2: [6, 7]}

In [166]:
model.save('test_model.h5')

In [None]:
rootdir = '../../data'

data = get_data_from_files(rootdir)
for file,train,test in data:
    vocab_size,maxlen,x,y = prepare_train_set(train)
    model = train_model(vocab_size,maxlen,x,y, output_size=10, epochs=10)
    model.save(str('../../trained_models/eng/'+file+'_model.h5'))
    seqs_bylength = get_seq_bylength(test)
    results = get_performance_bylength(seqs_bylength, maxlen)
    
    outputfile = open(rootdir+'lstm_baseline/prod_results/'+file.split('.capp')[0]+'.prod_result.csv','w')
    outputfile.write("iter,utterance_length,nb_utterances,produced,production_score"+'\n')
            for results in production_scores:
                for length in results:
                    outputfile.write('1,'+str(length)+','+
                                     str(results[length][1])+','+
                                     str(results[length][0])+','+
                                     str(results[length][0]/results[length][1])+'\n')
    del model 
    