In [122]:
import os
import sys
import csv
import time
import json
import datetime
import pickle as pkl
import tensorflow as tf
from tensorflow.contrib import learn
import numpy as np

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import datetime
import time
from keras.models import load_model
from utilities import *
from keras import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, GlobalMaxPooling1D, Embedding
from keras.optimizers import RMSprop

In [133]:
resource_dir = 'data/'
embeddings_dir = "embeddings/"
embedding_filename = 'word2vec_GoogleNews'
model_dir = 'models/'
model_name = "Embeddings Model"

# Load metadata
metadata = load_data(resource_dir + "metadata.pkl")
embeddings_dimension = 300
embeddings = load_data(embeddings_dir + embedding_filename + '_' + str(embeddings_dimension) + 'dim.pkl')

# Load Training and test sets
train_data = load_data(resource_dir + 'train_data.pkl')
train_x, train_y = generate_embeddings(train_data, metadata)
train_y=np.array(np.nonzero(train_y))[1] #####diko m

test_data = load_data(resource_dir + 'test_data.pkl')
test_x, test_y = generate_embeddings(test_data, metadata)
test_y=np.array(np.nonzero(test_y))[1]   #####diko m

val_data = load_data(resource_dir + 'val_data.pkl')
val_x, val_y = generate_embeddings(val_data, metadata)
val_y=np.array(np.nonzero(val_y))[1]  #####diko m

train_lengths = np.array(list(map(len, [sent for sent in train_data['utterances']]))) #####diko m
valid_lengths = np.array(list(map(len, [sent for sent in val_data['utterances']]))) #####diko m

# Parameters
vocabulary_size = metadata['vocabulary_size']
num_labels = metadata['num_labels']
max_utterance_len = metadata['max_utterance_len']
embedding_matrix = embeddings['embedding_matrix']
batch_size = 200
hidden_layer = 128
learning_rate = 0.001
num_epoch = 10
#model_name = model_name + " -" + \
#             " Epochs=" + str(num_epoch) + \
#             " Hidden Layers=" + str(hidden_layer)

print("------------------------------------")
print("Using parameters...")
print("Vocabulary size: ", vocabulary_size)
print("Number of labels: ", num_labels)
print("Embeddings dimension: ", embeddings_dimension)
print("Batch size: ", batch_size)
print("Hidden layer size: ", hidden_layer)
print("learning rate: ", learning_rate)
print("Epochs: ", num_epoch)

# Build the model

Loaded data from file data/metadata.pkl.
Loaded data from file embeddings/word2vec_GoogleNews_300dim.pkl.
Loaded data from file data/train_data.pkl.
Loaded data from file data/test_data.pkl.
Loaded data from file data/val_data.pkl.
Embeddings Model - Epochs=10 Hidden Layers=128
------------------------------------
Using parameters...
Vocabulary size:  23103
Number of labels:  41
Embeddings dimension:  300
Batch size:  200
Hidden layer size:  128
learning rate:  0.001
Epochs:  10


In [124]:
class rnn_clf(object):
    """"
    LSTM and Bi-LSTM classifiers for text classification
    """
    def __init__(self, config):
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.num_layers = config.num_layers
        self.l2_reg_lambda = config.l2_reg_lambda

        # Placeholders
        self.batch_size = tf.placeholder(dtype=tf.int32, shape=[], name='batch_size')
        self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_x')
        self.input_y = tf.placeholder(dtype=tf.int64, shape=[None], name='input_y')
        self.keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
        self.sequence_length = tf.placeholder(dtype=tf.int32, shape=[None], name='sequence_length')

        # L2 loss
        self.l2_loss = tf.constant(0.0)

        # Word embedding
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            embedding = tf.get_variable('embedding',
                                        shape=[self.vocab_size, self.hidden_size],
                                        dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        # Input dropout
        self.inputs = tf.nn.dropout(inputs, keep_prob=self.keep_prob)

        # LSTM
        if config.clf == 'lstm':
            self.final_state = self.normal_lstm()
        else:
            self.final_state = self.bi_lstm()

        # Softmax output layer
        with tf.name_scope('softmax'):
            # softmax_w = tf.get_variable('softmax_w', shape=[self.hidden_size, self.num_classes], dtype=tf.float32)
            if config.clf == 'lstm':
                softmax_w = tf.get_variable('softmax_w', shape=[self.hidden_size, self.num_classes], dtype=tf.float32)
            else:
                softmax_w = tf.get_variable('softmax_w', shape=[2 * self.hidden_size, self.num_classes], dtype=tf.float32)
            softmax_b = tf.get_variable('softmax_b', shape=[self.num_classes], dtype=tf.float32)

            # L2 regularization for output layer
            self.l2_loss += tf.nn.l2_loss(softmax_w)
            self.l2_loss += tf.nn.l2_loss(softmax_b)

            # self.logits = tf.matmul(self.final_state[self.num_layers - 1].h, softmax_w) + softmax_b
            if config.clf == 'lstm':
                self.logits = tf.matmul(self.final_state[self.num_layers - 1].h, softmax_w) + softmax_b
                
            else:
                self.logits = tf.matmul(self.final_state, softmax_w) + softmax_b
            predictions = tf.nn.softmax(self.logits)
            self.predictions = tf.argmax(predictions, 1, name='predictions')

        # Loss
        with tf.name_scope('loss'):
            tvars = tf.trainable_variables()

            # L2 regularization for LSTM weights
            for tv in tvars:
                if 'kernel' in tv.name:
                    self.l2_loss += tf.nn.l2_loss(tv)


                    #i changed from sparse softmax to softmax otherwise i get error
            losses =  tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.input_y,
                                                                    logits=self.logits)
            self.cost = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss

        # Accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, self.input_y)
            self.correct_num = tf.reduce_sum(tf.cast(correct_predictions, tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')

    def normal_lstm(self):
        # LSTM Cell
        cell = tf.contrib.rnn.LSTMCell(self.hidden_size,
                                       forget_bias=1.0,
                                       state_is_tuple=True,
                                       reuse=tf.get_variable_scope().reuse)
        # Add dropout to cell output
        cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

        # Stacked LSTMs
        cell = tf.contrib.rnn.MultiRNNCell([cell] * self.num_layers, state_is_tuple=True)

        self._initial_state = cell.zero_state(self.batch_size, dtype=tf.float32)

        # Dynamic LSTM
        with tf.variable_scope('LSTM'):
            outputs, state = tf.nn.dynamic_rnn(cell,
                                               inputs=self.inputs,
                                               initial_state=self._initial_state,
                                               sequence_length=self.sequence_length)

        final_state = state

        return final_state


In [125]:
def batch_iter(data, labels, lengths, batch_size, num_epochs):
    """
    A mini-batch iterator to generate mini-batches for training neural network
    :param data: a list of sentences. each sentence is a vector of integers
    :param labels: a list of labels
    :param batch_size: the size of mini-batch
    :param num_epochs: number of epochs
    :return: a mini-batch iterator
    """
    assert len(data) == len(labels) == len(lengths)

    data_size = len(data)
    epoch_length = data_size // batch_size

    for _ in range(num_epochs):
        for i in range(epoch_length):
            start_index = i * batch_size
            end_index = start_index + batch_size

            xdata = data[start_index: end_index]
            ydata = labels[start_index: end_index]
            sequence_length = lengths[start_index: end_index]

            yield xdata, ydata, sequence_length

In [126]:
####Delete all flags before declare#####

def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)




#import data_helper



try:
    from sklearn.model_selection import train_test_split
except ImportError as e:
    error = "Please install scikit-learn."
    print(str(e) + ': ' + error)
    sys.exit()

# Show warnings and errors only
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Parameters
# =============================================================================

# Model choices
tf.flags.DEFINE_string('clf', 'lstm', "Type of classifiers. Default: cnn. You have four choices: [cnn, lstm, blstm, clstm]")

# Data parameters
tf.flags.DEFINE_string('data_file', 'train_data.pkl', 'Data file path')
tf.flags.DEFINE_string('stop_word_file', None, 'Stop word file path')
tf.flags.DEFINE_string('language', 'en', "Language of the data file. You have two choices: [ch, en]")
tf.flags.DEFINE_integer('min_frequency',0, 'Minimal word frequency')
tf.flags.DEFINE_integer('num_classes', 41, 'Number of classes')
tf.flags.DEFINE_integer('max_length', 106, 'Max document length')
tf.flags.DEFINE_integer('vocab_size', 23103, 'Vocabulary size')
tf.flags.DEFINE_float('test_size', 0.1, 'Cross validation test size')

# Model hyperparameters
tf.flags.DEFINE_integer('embedding_size', 300, 'Word embedding size. For CNN, C-LSTM.')
tf.flags.DEFINE_string('filter_sizes', '3, 4, 5', 'CNN filter sizes. For CNN, C-LSTM.')
tf.flags.DEFINE_integer('num_filters', 128, 'Number of filters per filter size. For CNN, C-LSTM.')
tf.flags.DEFINE_integer('hidden_size', 128, 'Number of hidden units in the LSTM cell. For LSTM, Bi-LSTM')
tf.flags.DEFINE_integer('num_layers', 2, 'Number of the LSTM cells. For LSTM, Bi-LSTM, C-LSTM')
tf.flags.DEFINE_float('keep_prob', 0.5, 'Dropout keep probability')  # All
tf.flags.DEFINE_float('learning_rate', 1e-3, 'Learning rate')  # All
tf.flags.DEFINE_float('l2_reg_lambda', 0.001, 'L2 regularization lambda')  # All

# Training parameters
tf.flags.DEFINE_integer('batch_size', 200, 'Batch size')
tf.flags.DEFINE_integer('num_epochs', 10, 'Number of epochs')
tf.flags.DEFINE_integer('evaluate_every_steps', 100, 'Evaluate the model on validation set after this many steps')
tf.flags.DEFINE_integer('save_every_steps', 1000, 'Save the model after this many steps')
tf.flags.DEFINE_integer('num_checkpoint', 10, 'Number of models to store')

FLAGS = tf.flags.FLAGS
tf.app.flags.DEFINE_string('f', '', 'kernel')

if FLAGS.clf == 'lstm':
    FLAGS.embedding_size = FLAGS.hidden_size
elif FLAGS.clf == 'clstm':
    FLAGS.hidden_size = len(FLAGS.filter_sizes.split(",")) * FLAGS.num_filters

# Output files directory
timestamp = str(int(time.time()))
outdir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
if not os.path.exists(outdir):
    os.makedirs(outdir)

# Load and save data
# =============================================================================

#data, labels, lengths, vocab_processor = data_helper.load_data(file_path=FLAGS.data_file,
#                                                               sw_path=FLAGS.stop_word_file,
#                                                               min_frequency=FLAGS.min_frequency,
#                                                               max_length=FLAGS.max_length,
#                                                               language=FLAGS.language,
#                                                               shuffle=True)

# Save vocabulary processor
#vocab_processor.save(os.path.join(outdir, 'vocab'))

#FLAGS.vocab_size = len(vocab_processor.vocabulary_._mapping)

#FLAGS.max_length = vocab_processor.max_document_length

params = FLAGS.__flags
# Print parameters
model = params['clf']
if model == 'cnn':
    del params['hidden_size']
    del params['num_layers']
elif model == 'lstm' or model == 'blstm':
    del params['num_filters']
    del params['filter_sizes']
    params['embedding_size'] = params['hidden_size']
elif model == 'clstm':
    params['hidden_size'] = len(list(map(int, params['filter_sizes'].split(",")))) * params['num_filters']

params_dict = sorted(params.items(), key=lambda x: x[0])
print('Parameters:')
for item in params_dict:
    print('{}: {}'.format(item[0], item[1]))
print('')

# Save parameters to file
#params_file = open(os.path.join(outdir, 'params.pkl'), 'wb')
#pkl.dump(params, params_file, True)
#params_file.close()


# Simple Cross validation
# TODO use k-fold cross validation
#x_train, x_valid, y_train, y_valid, train_lengths, valid_lengths = train_test_split(data,
#                                                                                    labels,
#                                                                                    lengths,
#                                                                                    test_size=FLAGS.test_size,
#                                                                                    random_state=22)

# Batch iterator
data_train = batch_iter(train_x, train_y, train_lengths, FLAGS.batch_size, FLAGS.num_epochs)

# Train
# =============================================================================

with tf.Graph().as_default():
    with tf.Session() as sess:
        if FLAGS.clf == 'cnn':
            classifier = cnn_clf(FLAGS)
        elif FLAGS.clf == 'lstm' or FLAGS.clf == 'blstm':
            classifier = rnn_clf(FLAGS)
        elif FLAGS.clf == 'clstm':
            classifier = clstm_clf(FLAGS)
        else:
            raise ValueError('clf should be one of [cnn, lstm, blstm, clstm]')

        # Train procedure
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        grads_and_vars = optimizer.compute_gradients(classifier.cost)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Summaries
        loss_summary = tf.summary.scalar('Loss', classifier.cost)
        accuracy_summary = tf.summary.scalar('Accuracy', classifier.accuracy)

        # Train summary
        train_summary_op = tf.summary.merge_all()
        train_summary_dir = os.path.join(outdir, 'summaries', 'train')
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Validation summary
        valid_summary_op = tf.summary.merge_all()
        valid_summary_dir = os.path.join(outdir, 'summaries', 'valid')
        valid_summary_writer = tf.summary.FileWriter(valid_summary_dir, sess.graph)

        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoint)

        sess.run(tf.global_variables_initializer())


        def run_step(input_data, is_training=True):
            """Run one step of the training process."""
            input_x, input_y, sequence_length = input_data
            #input_y=np.transpose(np.array(np.nonzero(input_y)[1]))
            fetches = {'step': global_step,
                       'cost': classifier.cost,
                       'accuracy': classifier.accuracy}
            feed_dict = {classifier.input_x: input_x,
                         classifier.input_y: input_y}

            if FLAGS.clf != 'cnn':
                fetches['final_state'] = classifier.final_state
                feed_dict[classifier.batch_size] = len(input_x)
                feed_dict[classifier.sequence_length] = sequence_length

            if is_training:
                fetches['train_op'] = train_op
                fetches['summaries'] = train_summary_op
                feed_dict[classifier.keep_prob] = FLAGS.keep_prob
            else:
                fetches['summaries'] = valid_summary_op
                feed_dict[classifier.keep_prob] = 1.0

            vars = sess.run(fetches, feed_dict)
            step = vars['step']
            cost = vars['cost']
            accuracy = vars['accuracy']
            summaries = vars['summaries']

            # Write summaries to file
            if is_training:
                train_summary_writer.add_summary(summaries, step)
            else:
                valid_summary_writer.add_summary(summaries, step)

            time_str = datetime.datetime.now().isoformat()
            print("{}: step: {}, loss: {:g}, accuracy: {:g}".format(time_str, step, cost, accuracy))

            return accuracy


        print('Start training ...')

        for train_input in data_train:
            run_step(train_input, is_training=True)
            current_step = tf.train.global_step(sess, global_step)

            if current_step % FLAGS.evaluate_every_steps == 0:
                print('\nValidation')
                #val_y=np.array(np.nonzero(val_y)[1]) ###diko m
                run_step((val_x, val_y, valid_lengths), is_training=False)
                print('')

            if current_step % FLAGS.save_every_steps == 0:
                save_path = saver.save(sess, os.path.join(outdir, 'model/clf'), current_step)

        print('\nAll the files have been saved to {}\n'.format(outdir))

Parameters:
batch_size: <absl.flags._flag.Flag object at 0x00000240063C9208>
clf: <absl.flags._flag.Flag object at 0x000002400ED57898>
data_file: <absl.flags._flag.Flag object at 0x000002400ED57358>
embedding_size: <absl.flags._flag.Flag object at 0x000002400ED57048>
evaluate_every_steps: <absl.flags._flag.Flag object at 0x000002401FBE3C88>
f: <absl.flags._flag.Flag object at 0x000002401FBE3CC0>
filter_sizes: <absl.flags._flag.Flag object at 0x000002400ED57F28>
hidden_size: <absl.flags._flag.Flag object at 0x000002400ED57080>
keep_prob: <absl.flags._flag.Flag object at 0x000002400ED57550>
l2_reg_lambda: <absl.flags._flag.Flag object at 0x00000240063C95F8>
language: <absl.flags._flag.Flag object at 0x000002400ED57B00>
learning_rate: <absl.flags._flag.Flag object at 0x00000240063C93C8>
max_length: <absl.flags._flag.Flag object at 0x000002400ED573C8>
min_frequency: <absl.flags._flag.Flag object at 0x000002400ED57E80>
num_checkpoint: <absl.flags._flag.Flag object at 0x000002401FBE3390>
num

KeyboardInterrupt: 

In [12]:
val_x, val_y = generate_embeddings(val_data, metadata)

In [64]:
print(train_x.shape)
print(train_y.shape)


(192768, 106)
(192768,)


In [65]:
train_x, train_y = generate_embeddings(train_data, metadata)
train_y=np.array(np.nonzero(train_y))[1] #####diko m

In [77]:

c=0
for train_input in data_train: 
    if c==1:
        break
    input_x, input_y,lent = train_input
  
  #print(train_input)
  #input_x,input_y=train_input
    c=1
    
print(input_x) 
print(input_y)
print(lent)

[[60  0  0 ...  0  0  0]
 [ 4  5  0 ...  0  0  0]
 [ 4  0  6 ...  0  0  0]
 ...
 [61  0 11 ...  0  0  0]
 [72 25 18 ...  0  0  0]
 [20  1  0 ...  0  0  0]]
[ 1  3  2  1  3  0  0  1  0  1  0  0  1  2 15  1  2  1  2  0 20  1  3  3
  0  9  0 21 29  3  2 27  4  4  0  0 14  7  0  0  1  0  1  3  2  0  0  0
  1  0  0  2  2  1  2  4  3  2  4  2  4  3  0  2  2  1  2  2  4 16  4  2
  3  9  3  2  9  3  2  0  2  1  5  2  4  4  3  2  4  2  2  1 17  3  2  1
 22  6  6 28  0  0  0  0  0  0  1  0  0 14  7  3  0  0  3  5  0  0 20  4
  4  3  2  4  2  4  1  3  3  0  0 17  0  0  0  0  0  0  0  0  0  0  0  5
  1  0  0  2  0  0  1  0  0  0  1  2  2  1  2  1  2  1  2  0  0  1  0  0
  1  0  1  0  1  2  1  2  1  2  1  2  1  2  4  4  2  1  1  2  2  2  1  1
  0  0  0  0  0  0  0  1]
[ 1  4 31  2  3  8 13  1 29  2 11 16  2 25 22  2 19  2 22  8  6  2 14  2
  7  8  6  6  2 11 41  3  2  2 27 24  4  2 14 18  2 14  3  5 17  3  3 11
  2  6 14 28  8  2  4  2  5 37  6 47  2 10 11 33 27  2 17 28  2  4  2  3
  2 10  2 16  6