In [2]:
import numpy as np
import pandas as pd
import csv
import copy
import re
from collections import Counter

from sklearn.utils import shuffle
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Flatten, concatenate, multiply
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling2D

Using TensorFlow backend.


In [3]:
class Path:
    """Relative locations of the story CSV files read by this notebook."""

    # Training stories (read by Data and by the exploration cells).
    train_file_path = "../data/train_stories.csv"
    # Validation stories with two candidate endings each (read by Data).
    val_file_path = "../data/val_stories.csv"

In [424]:
class Data(Path):
    """
    Loads the story CSV files, tokenizes every sentence, builds the vocabulary
    from the training stories and converts both datasets to arrays of word ids.

    Attributes set by the constructor:
        train_dataset / test_dataset : nested lists [story][sentence][token]
        y                            : 1 for real endings, 0 for fake endings
        vocab                        : word -> frequency (special tokens included)
        w2i_dict / i2w_dict          : word <-> id lookup tables (<PAD> is id 0)
        train_x                      : id array of the training stories
        test_x, test_e1, test_e2     : id arrays of context and the two endings
        answers                      : 'AnswerRightEnding' column of the test file
    """
    def __init__(self, most_common=2, max_seqlen=30, prepare_dummy=True,
                 train_file_path=None, val_file_path=None):
        """
        most_common     : minimum frequency a word needs to enter the vocabulary
                          (the name is kept for compatibility)
        max_seqlen      : fixed sentence length including <BOS>/<EOS>; should be
                          at least 2. A falsy value disables wrapping and padding.
        prepare_dummy   : when True, augment the training set with fake endings
        train_file_path : training CSV, defaults to Path.train_file_path
        val_file_path   : validation CSV, defaults to Path.val_file_path
        """
        self.most_common   = most_common
        self.max_seqlen    = max_seqlen
        self.prepare_dummy = prepare_dummy

        self.unk = "<UNK>"
        self.bos = "<BOS>"
        self.eos = "<EOS>"
        self.pad = "<PAD>"

        if train_file_path is None:
            train_file_path = Path.train_file_path
        if val_file_path is None:
            val_file_path = Path.val_file_path

        self.train_dataset, self.y = self.load_train_text(train_file_path)
        self.vocab                 = self.create_vocab() # used in test
        self.w2i_dict              = self.create_w2i_dict() # used in test
        self.i2w_dict              = self.create_i2w_dict() # used in test
        self.train_dataset_ids     = self.convert_w2i_dataset(self.train_dataset)
        self.train_x               = self.train_dataset_ids

        self.test_dataset,         self.answers = self.load_test_text(val_file_path)
        self.test_dataset_ids                   = self.convert_w2i_dataset(self.test_dataset)
        self.test_x, self.test_e1, self.test_e2 = self.split_test_dataset(self.test_dataset_ids)

        # report about variables
        print('''
        ==================================================
        words with frequency less than {} is not in vocab.
        maximum sentencelength: {}
        train_x.shape:  {}
        test_x.shape:   {}
        test_e1.shape:  {}
        test_e2.shape:  {}
        len(vocab):     {}
        ==================================================
        '''.format(self.most_common, self.max_seqlen, self.train_x.shape,
                  self.test_x.shape, self.test_e1.shape, self.test_e2.shape, len(self.vocab)))

    def split_test_dataset(self, dataset):
        """
        Split the test id array into (context, ending 1, ending 2) along axis 1.
        """
        # assuming dataset.shape = (datanum, 4 + 1 + 1, 30)
        # TODO: remove hardcoding
        return dataset[:, :4, :], dataset[:, 4:5, :], dataset[:, 5:, :]

    def clean_text(self, string):
        """
        Lower-case `string` and put a space in front of each of ' . , ! ?
        so that a later split() yields them as separate tokens.
        """
        string = string.lower()

        # insert space before special symbols
        string = re.sub("(['.,!?])", r' \g<1>', string)

        return string

    def _tokenize(self, string):
        """
        Turn one raw sentence into its token list.

        With max_seqlen set, the result is <BOS> + tokens + <EOS> + <PAD>... and
        the tokens are truncated so that the list is exactly max_seqlen long;
        without truncation a long sentence would make the id array ragged.
        """
        tokens = self.clean_text(string).split()
        if not self.max_seqlen:
            return tokens

        tokens = tokens[:max(self.max_seqlen - 2, 0)]
        padding = [self.pad] * (self.max_seqlen - len(tokens) - 2)
        return [self.bos] + tokens + [self.eos] + padding

    def load_test_text(self, datapath):
        """
        Read the validation CSV at `datapath`.

        Returns (lines, answers): lines[story] holds six tokenized sentences
        (four context sentences followed by the two candidate endings), answers
        is the 'AnswerRightEnding' column as a list. The story ids are stored
        in self.test_story_ids.
        """
        df = pd.read_csv(datapath)

        stories = (df[['InputSentence1', 'InputSentence2', 'InputSentence3', 'InputSentence4',
                       'RandomFifthSentenceQuiz1', 'RandomFifthSentenceQuiz2']])
        answers = df['AnswerRightEnding'].tolist()

        lines = [[self._tokenize(string) for string in line]
                 for line in stories.values.tolist()]

        self.test_story_ids = df['InputStoryid'].tolist()
        return lines, answers

    def augment_with_fake(self, df):
        """
        Given df, this function copies it and replace 'sentence5' with fake ending
        (picked from each of 'sentence1'-'sentence4' in the same row), appending new df to old df.
        so the dataset will be 5 times bigger.

        Returns (augmented_df, augmented_answers); the answers are 1.0 for the
        original rows and 0.0 for every row with a fake ending.
        """
        frames = [df]
        labels = [np.ones(len(df))]

        for column_name in ['sentence1', 'sentence2', 'sentence3', 'sentence4']:
            # assign() returns a new frame, the caller's df is left untouched
            frames.append(df.assign(sentence5=df[column_name]))
            labels.append(np.zeros(len(df)))

        # one concat instead of repeated DataFrame.append (removed in pandas 2.0)
        augmented_df = pd.concat(frames, ignore_index=True)
        augmented_answers = np.concatenate(labels, axis=0)

        assert len(augmented_df) == len(augmented_answers)

        return augmented_df, augmented_answers

    def load_train_text(self, datapath):
        """
        Read the training CSV at `datapath`.

        Returns (lines, answers): lines[story] holds the five tokenized
        sentences, answers marks real (1) and fake (0) endings. Story ids and
        titles are stored in self.train_story_ids / self.train_story_titles.
        """
        df = pd.read_csv(datapath)
        stories = df[['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']]
        answers = np.ones(len(stories))

        if self.prepare_dummy:
            stories, answers = self.augment_with_fake(stories)

        # `stories` only holds the sentence columns, so every element is used
        lines = [[self._tokenize(string) for string in line]
                 for line in stories.values.tolist()]

        self.train_story_ids = df['storyid'].tolist()
        self.train_story_titles = df['storytitle']
        return lines, answers

    def create_vocab(self):
        """
        Count the tokens of the training set and keep those that occur at least
        `self.most_common` times. The special tokens are always kept so that
        len(vocab) equals the number of ids handed out by create_w2i_dict().
        """
        counts = Counter(word
                         for sentences in self.train_dataset
                         for sentence in sentences
                         for word in sentence)
        vocab = {word: count for word, count in counts.items() if count >= self.most_common}
        for token in (self.pad, self.bos, self.eos):
            vocab.setdefault(token, counts[token])
        vocab[self.unk] = 1
        return vocab

    def create_w2i_dict(self):
        """
        vocab which converts word to id
        """
        w2i_vocab = {self.pad: 0}
        for word in self.vocab:
            if word not in w2i_vocab:
                w2i_vocab[word] = len(w2i_vocab)

        return w2i_vocab

    def create_i2w_dict(self):
        """
        vocab which converts id to word
        """
        return {v: k for k, v in self.w2i_dict.items()}

    def get_id(self, word):
        """
        Return the id of `word`, or the id of <UNK> for out-of-vocabulary words.
        """
        ind = self.w2i_dict.get(word)
        return self.w2i_dict[self.unk] if ind is None else ind

    def convert_w2i_dataset(self, dataset):
        """
        Convert a nested token list [story][sentence][token] to an array of ids.
        """
        array = np.array([[[self.get_id(word) for word in sentence] for sentence in sentences] for sentences in dataset])
        return array

    def convert_i2w_dataset(self, dataset=None):
        """
        Inverse of convert_w2i_dataset(): map an id array (or nested list) back
        to words. Called without a dataset it returns None, as before.
        """
        if dataset is None:
            return None
        return [[[self.i2w_dict[int(ind)] for ind in sentence] for sentence in sentences] for sentences in dataset]

    def prepare_training_data(self):
        """
        Split the training ids into the first four sentences and the remainder.
        """
        train_x, train_y = self.train_dataset_ids[:, :4, :], self.train_dataset_ids[:, 4:, :]
        return train_x, train_y

    @staticmethod
    def depth(l):
        """
        get the depth of the list (unused for now)

        An empty list has depth 1, anything that is not a list has depth 0.
        """
        if isinstance(l, list):
            return 1 + max((Data.depth(item) for item in l), default=0)
        else:
            return 0

    # TODO is test data converted to ids correctly?

In [4]:
# Build the dataset wrapper: the constructor reads both CSV files, builds the
# vocabulary and prints a summary. The model classes below read this global.
data = Data()

NameError: name 'Data' is not defined

In [5]:
# Raw training stories, loaded separately from `data` for the inspection cells below.
df = pd.read_csv(Path.train_file_path)

In [12]:
# Longest fifth sentence, counted in whitespace-separated tokens.
max(map(lambda sentence: len(sentence.split()), df['sentence5'].tolist()))

19

In [13]:
# Whitespace tokens of the first story's opening sentence (punctuation still attached).
df['sentence1'].iloc[0].split()

['Kelly',
 'found',
 'her',
 "grandmother's",
 'pizza',
 'recipe',
 'in',
 'a',
 'shoebox',
 'of',
 'memories.']

In [294]:
class Seq2Seq():
    """
    LSTM encoder-decoder over word ids, trained with teacher forcing.

    ref: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
    """
    def __init__(self):
        self.embed_dim = 128
        self.hidden_dim = 64
        self.batchsize = 64
        self.epochs = 10
        self.model = None  # set by __call__ once the model has been fitted

    def __call__(self, x, y):
        """
        Build, compile and fit the model; the fitted model is stored in
        self.model and returned.

        x: word ids fed to the encoder
        y: word ids of the sequence the decoder has to produce
        Both are converted to arrays and reshaped to (n_samples, -1), so an
        input with extra axes becomes one id sequence per sample.
        NOTE(review): id 0 is treated as padding (mask_zero=True) and the
        vocabulary size is read from the global `data` - confirm with caller.
        """
        # keep the training arrays under their own names: the original code
        # overwrote `x` with the embedding tensor before passing it to fit()
        encoder_ids = np.asarray(x)
        encoder_ids = encoder_ids.reshape(len(encoder_ids), -1)
        decoder_ids = np.asarray(y)
        decoder_ids = decoder_ids.reshape(len(decoder_ids), -1)

        # every id in data.w2i_dict needs a row in the embedding matrix
        vocab_size = len(data.w2i_dict)

        # seq2seq model; sequences of ids with variable length.
        encoder_inputs = Input(shape=(None,))
        decoder_inputs = Input(shape=(None,))

        encoder = LSTM(self.hidden_dim, return_state=True)
        embeddings = Embedding(input_dim=vocab_size, output_dim=self.embed_dim,
                               mask_zero=True)
        # the decoder must emit one output per time step
        decoder = LSTM(self.hidden_dim, return_sequences=True, return_state=True)
        decoder_fc = Dense(vocab_size, activation='softmax')

        # start encoding.
        encoder_embeddings = embeddings(encoder_inputs) # [batchsize, len] -> [batchsize, len, embed_dim]
        _, state_h, state_c = encoder(encoder_embeddings) # state: [batchsize, hidden_dim]
        encoder_states = [state_h, state_c]

        # start decoding, using `encoder_states` as initial state.
        decoder_embeddings = embeddings(decoder_inputs)
        decoder_outputs, _, _ = decoder(decoder_embeddings, initial_state=encoder_states)
        decoder_target = decoder_fc(decoder_outputs)

        model = Model([encoder_inputs, decoder_inputs], decoder_target)

        # targets are integer ids, not one-hot vectors
        model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy')

        # decoder_target must be ahead by 1: feed tokens [0, T-1), predict [1, T).
        decoder_feed = decoder_ids[:, :-1]
        decoder_expected = np.expand_dims(decoder_ids[:, 1:], -1)
        model.fit([encoder_ids, decoder_feed], decoder_expected,
                  batch_size=self.batchsize,
                  epochs=self.epochs,
                  validation_split=0.2)

        self.model = model
        return model

In [441]:
class Classifier():
    """
    Binary classifier that scores a candidate ending against the story context.

    Every sentence goes through a shared embedding and a shared bidirectional
    LSTM; the projected context features are multiplied element-wise with the
    projected ending features and fed to a sigmoid unit.
    """
    def __init__(self):
        self.embed_dim = 128
        self.hidden_dim = 64
        self.feature_dim = 32
        self.model = None
        self.batchsize = 64
        self.epochs = 1

        self.n_stories = 4  # number of context sentences
        self.n_options = 1  # number of candidate endings given to the model

    def build_model(self):
        """
        Build and compile the Keras model and store it in self.model.
        Sequence length and vocabulary size are read from the global `data`.
        """
        story_inputs = [Input(shape=(data.max_seqlen,)) for _ in range(self.n_stories)]
        option_inputs = [Input(shape=(data.max_seqlen,)) for _ in range(self.n_options)]
        # TODO; prepare and input dummy sentence

        inputs = story_inputs + option_inputs
        # every id in data.w2i_dict needs a row in the embedding matrix
        embed_layer = Embedding(input_dim=len(data.w2i_dict), output_dim=self.embed_dim,
                               input_length=data.max_seqlen, mask_zero=True)
        birnn_layer = Bidirectional(LSTM(self.hidden_dim))
        dense_layer = Dense(self.feature_dim, activation='relu')
        ending_dense_layer = Dense(self.n_stories * self.feature_dim, activation='relu')

        embeddings = [embed_layer(_input) for _input in inputs]
        birnn_outputs = [birnn_layer(embedding) for embedding in embeddings]
        fc_outputs = [dense_layer(birnn_output) for birnn_output in birnn_outputs[:self.n_stories]]
        # the first option follows directly after the context sentences
        ending_features = ending_dense_layer(birnn_outputs[self.n_stories])

        story_features = concatenate(fc_outputs)
        story_features = multiply([story_features, ending_features]) # TODO make it more exact like paper do
        fc = Dense(1, activation='sigmoid')(story_features)
        model = Model(inputs=inputs, outputs=fc)
        model.compile(optimizer='sgd', loss = 'binary_crossentropy')
        self.model = model

    def _require_model(self):
        """Raise ValueError when build_model() has not been run yet."""
        if self.model is None:
            raise ValueError("self.model is None. run build_model() first.")

    def train(self, inputs, outputs):
        """
        Fit the model on `inputs` / `outputs` using self.epochs and self.batchsize.

        Raises ValueError when build_model() has not been run.
        """
        self._require_model()
        self.model.fit(inputs, outputs, epochs=self.epochs, batch_size=self.batchsize)

    def test(self, inputs, batchsize=None):
        """
        Return the model's predictions for `inputs`.

        batchsize defaults to self.batchsize when omitted.
        Raises ValueError when build_model() has not been run.
        """
        self._require_model()
        if batchsize is None:
            batchsize = self.batchsize
        # Keras spells the keyword `batch_size`
        prediction = self.model.predict(inputs, batch_size=batchsize)
        return prediction

In [445]:
# Create the classifier wrapper; no Keras model exists until build_model() is called.
classifier = Classifier()

In [446]:
# Build and compile the network; it reads max_seqlen and the vocabulary from the global `data`.
classifier.build_model()

In [None]:
# One id matrix per model input: the context sentences followed by the ending.
# The count is taken from the classifier so it stays in sync with build_model().
n_model_inputs = classifier.n_stories + classifier.n_options
inputs = [data.train_x[:, i, :] for i in range(n_model_inputs)]
answers = data.y
classifier.train(inputs, answers)

Epoch 1/1
 65536/440805 [===>..........................] - ETA: 51:06 - loss: 0.5308  