In [1]:
import numpy as np
import pandas as pd
import csv
import copy
import re
from collections import Counter

from sklearn.utils import shuffle
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Flatten, concatenate, multiply
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling2D

Using TensorFlow backend.


In [3]:
class Path:
    """Relative locations of the CSV files used by this notebook."""
    # Stories used for training.
    train_file_path = "../data/train_stories.csv"
    # Stories used for validation / testing.
    val_file_path = "../data/val_stories.csv"

In [424]:
class Data(Path):
    """
    Loads the training and validation story CSVs and converts them to id arrays.

    On construction the class reads both CSV files, tokenizes every sentence,
    wraps it in <BOS>/<EOS>, pads (or truncates) it to `max_seqlen` tokens,
    builds a vocabulary from the training sentences and maps every token to an
    integer id (0 is reserved for <PAD>).
    """
    def __init__(self):
        self.most_common   = 2     # minimum frequency for a word to enter the vocab
        self.max_seqlen    = 30    # tokens per sentence, including <BOS>/<EOS>/<PAD>
        self.prepare_dummy = True  # augment training data with fake endings

        self.unk = "<UNK>"
        self.bos = "<BOS>"
        self.eos = "<EOS>"
        self.pad = "<PAD>"

        self.train_dataset, self.y = self.load_train_text(Path.train_file_path)
        self.vocab                 = self.create_vocab() # used in test
        self.w2i_dict              = self.create_w2i_dict() # used in test
        self.i2w_dict              = self.create_i2w_dict() # used in test
        self.train_dataset_ids     = self.convert_w2i_dataset(self.train_dataset)
        self.train_x               = self.train_dataset_ids

        self.test_dataset,         self.answers = self.load_test_text(Path.val_file_path)
        self.test_dataset_ids                   = self.convert_w2i_dataset(self.test_dataset)
        self.test_x, self.test_e1, self.test_e2 = self.split_test_dataset(self.test_dataset_ids)

        # report about variables
        print('''
        ==================================================
        words with frequency less than {} is not in vocab.
        maximum sentencelength: {}
        train_x.shape:  {}
        test_x.shape:   {}
        test_e1.shape:  {}
        test_e2.shape:  {}
        len(vocab):     {}
        ==================================================
        '''.format(self.most_common, self.max_seqlen, self.train_x.shape,
                  self.test_x.shape, self.test_e1.shape, self.test_e2.shape, len(self.vocab)))

    def split_test_dataset(self, dataset, n_context=4):
        """
        Split a (datanum, n_context + 2, seqlen) array into context and the two endings.

        Returns (context, ending1, ending2); each ending keeps its sentence axis
        (length 1). `n_context` defaults to 4, the number of context sentences
        in the validation file.
        """
        return (dataset[:, :n_context, :],
                dataset[:, n_context:n_context + 1, :],
                dataset[:, n_context + 1:, :])

    def clean_text(self, string):
        """Lowercase `string` and insert a space before each of ' . , ! ?."""
        string = string.lower()

        # insert space before special symbols
        string = re.sub("(['.,!?])", r' \g<1>', string)

        return string

    def _tokenize_and_pad(self, string):
        """
        Clean and whitespace-tokenize one sentence.

        When `max_seqlen` is set, the tokens are wrapped in <BOS>/<EOS> and
        padded with <PAD> to exactly `max_seqlen` items; sentences that are too
        long are truncated so every row has the same length. When `max_seqlen`
        is falsy the bare token list is returned.
        """
        tokens = self.clean_text(string).split()
        if not self.max_seqlen:
            return tokens
        # Reserve two slots for <BOS> and <EOS>.
        tokens = tokens[:max(self.max_seqlen - 2, 0)]
        padding = [self.pad] * (self.max_seqlen - len(tokens) - 2)
        return [self.bos] + tokens + [self.eos] + padding

    def load_test_text(self, datapath):
        """
        Read the validation CSV at `datapath`.

        Returns (lines, answers): `lines` holds, per story, the four context
        sentences followed by the two candidate endings (tokenized), and
        `answers` is the 'AnswerRightEnding' column as a list. The story ids
        are stored on `self.test_story_ids`.
        """
        df = pd.read_csv(datapath)

        story_ids = df['InputStoryid'].tolist()
        stories = (df[['InputSentence1', 'InputSentence2', 'InputSentence3', 'InputSentence4',
                       'RandomFifthSentenceQuiz1', 'RandomFifthSentenceQuiz2']])
        answers = df['AnswerRightEnding'].tolist()

        lines = [[self._tokenize_and_pad(string) for string in line]
                 for line in stories.values.tolist()]

        self.test_story_ids = story_ids
        return lines, answers

    def augment_with_fake(self, df):
        """
        Given df, this function copies it and replace 'sentence5' with fake ending
        (picked from each of 'sentence1'-'sentence4' in the same row), appending new df to old df.
        so the dataset will be 5 times bigger.

        Returns (augmented_df, augmented_answers); answers are 1.0 for the
        original rows and 0.0 for every fake row.
        """
        frames = [df]
        labels = [np.ones(len(df))]

        for column_name in ['sentence1', 'sentence2', 'sentence3', 'sentence4']:
            fake_df = df.copy()
            fake_df['sentence5'] = df[column_name]
            frames.append(fake_df)
            labels.append(np.zeros(len(fake_df)))

        # Concatenate once; DataFrame.append no longer exists in recent pandas.
        augmented_df = pd.concat(frames, ignore_index=True)
        augmented_answers = np.concatenate(labels, axis=0)

        assert len(augmented_df) == len(augmented_answers)

        return augmented_df, augmented_answers

    def load_train_text(self, datapath):
        """
        Read the training CSV at `datapath`.

        Returns (lines, answers): `lines` holds the five tokenized sentences of
        each story and `answers` the matching labels (all ones unless
        `prepare_dummy` adds fake endings labelled zero). Story ids and titles
        of the original rows are stored on `self.train_story_ids` and
        `self.train_story_titles`.
        """
        df = pd.read_csv(datapath)
        story_ids = df['storyid'].tolist()
        story_titles = df['storytitle']# extract only 'title'
        stories = df[['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']]
        answers = np.ones(len(stories))

        if self.prepare_dummy:
            stories, answers = self.augment_with_fake(stories)

        # `stories` only contains the five sentence columns, so every column is used.
        lines = [[self._tokenize_and_pad(string) for string in line]
                 for line in stories.values.tolist()]

        self.train_story_ids = story_ids
        self.train_story_titles = story_titles
        return lines, answers

    def create_vocab(self):
        """
        Count the tokens of the training set and keep the frequent ones.

        Words seen fewer than `most_common` times are dropped (they map to
        <UNK> later); the special tokens are always kept when they occur.
        Returns a dict word -> count, with <UNK> added.
        """
        counts = Counter(word
                         for sentences in self.train_dataset
                         for sentence in sentences
                         for word in sentence)
        min_count = self.most_common or 0
        specials = {self.bos, self.eos, self.pad}
        vocab = {word: count for word, count in counts.items()
                 if count >= min_count or word in specials}
        vocab[self.unk] = 1
        return vocab

    def create_w2i_dict(self):
        """
        vocab which converts word to id

        <PAD> is always id 0; the remaining words get consecutive ids from 1.
        """
        w2i_vocab = dict()
        w2i_vocab[self.pad] = 0
        i = 1
        for key in self.vocab:
            if key not in w2i_vocab:
                w2i_vocab[key] = i
                i += 1

        return w2i_vocab

    def create_i2w_dict(self):
        """Inverse of `w2i_dict`: id -> word."""
        return {v: k for k, v in self.w2i_dict.items()}

    def get_id(self, word):
        """Return the id of `word`, or the id of <UNK> when it is not in the vocab."""
        return self.w2i_dict.get(word, self.w2i_dict[self.unk])

    def convert_w2i_dataset(self, dataset):
        """Map a nested list stories -> sentences -> words to a numpy array of ids."""
        array = np.array([[[self.get_id(word) for word in sentence] for sentence in sentences] for sentences in dataset])
        return array

    def convert_i2w_dataset(self):
        """Not implemented yet."""
        pass

    def prepare_training_data(self):
        """Split the training ids into (first four sentences, remaining sentences)."""
        train_x, train_y = self.train_dataset_ids[:, :4, :], self.train_dataset_ids[:, 4:, :]
        return train_x, train_y

    @staticmethod
    def depth(l):
        """
        get the depth of the list (unused for now)

        A non-list has depth 0 and an empty list has depth 1.
        """
        if isinstance(l, list):
            return 1 + max((Data.depth(item) for item in l), default=0)
        else:
            return 0

    # TODO is test data converted to ids correctly?

In [4]:
# Build the dataset object; the constructor reads the train/validation CSVs,
# builds the vocabulary and converts both datasets to id arrays.
# NOTE(review): the recorded output is a NameError, i.e. this cell was last run
# before the cell defining `Data` — run the cells in order.
data = Data()

NameError: name 'Data' is not defined

In [5]:
# Load the raw training stories CSV into a DataFrame.
df = pd.read_csv(Path.train_file_path)

In [12]:
# Longest 'sentence5', measured in whitespace-separated tokens.
max(len(ending.split()) for ending in df['sentence5'])

19

In [13]:
# Whitespace tokens of the first row's 'sentence1'.
df['sentence1'].iloc[0].split()

['Kelly',
 'found',
 'her',
 "grandmother's",
 'pizza',
 'recipe',
 'in',
 'a',
 'shoebox',
 'of',
 'memories.']

In [294]:
class Seq2Seq():
    """
    Encoder/decoder LSTM sketch sharing one embedding layer between both sides.

    ref: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
    """
    def __init__(self):
        self.embed_dim = 128
        self.hidden_dim = 64
        self.batchsize = 64
        self.epochs = 10
        self.model = None  # set by __call__ so the fitted model stays reachable

    def __call__(self, x, y):
        """
        Build, compile and fit the model.

        x: encoder input passed to `fit`.
        y: decoder input passed to `fit`; `y[:, 1:, :]` is used as the target.

        Reads the module-level `data` object for the id-table size and
        `max_seqlen`. The built model is stored on `self.model`.
        """
        # Ids are assigned in data.w2i_dict, so its size is max id + 1.
        vocab_size = len(data.w2i_dict)

        # seq2seq model. Each input is one sequence of token ids.
        encoder_inputs = Input(shape=(data.max_seqlen,))
        decoder_inputs = Input(shape=(data.max_seqlen,))

        encoder = LSTM(self.hidden_dim, return_state=True)
        embeddings = Embedding(input_dim=vocab_size, output_dim=self.embed_dim,
                               input_length=data.max_seqlen, mask_zero=True)
        decoder = LSTM(self.hidden_dim, return_state=True)
        decoder_fc = Dense(vocab_size, activation='softmax')

        # start encoding.
        # A separate name keeps the `x` argument intact for `fit` below.
        encoder_embeddings = embeddings(encoder_inputs) # [batchsize, 30] -> [batchsize, 30, output_dim]
        encoder_outputs, state_h, state_c = encoder(encoder_embeddings) # state: [batchsize, hidden_dim]
        encoder_states = [state_h, state_c]

        # start decoding, using `encoder_states` as initial state.
        decoder_embeddings = embeddings(decoder_inputs)
        decoder_outputs, _, _ = decoder(decoder_embeddings, initial_state=encoder_states)
        decoder_target = decoder_fc(decoder_outputs)

        model = Model([encoder_inputs, decoder_inputs], decoder_target)
        self.model = model

        model.compile(optimizer='sgd', loss='categorical_crossentropy')
        # NOTE(review): the decoder returns only its last output while the target
        # below is a per-timestep slice of `y`; the decoder probably needs
        # return_sequences=True and a one-hot target shifted by one — confirm
        # the intended shapes before training.
        model.fit([x, y], y[:, 1:, :],
                  batch_size=self.batchsize,
                  epochs=self.epochs,
                  validation_split=0.2)
        # decoder_target must be ahead by 1.

In [441]:
class Classifier():
    """
    Binary classifier scoring whether a candidate ending fits a four-sentence story.

    Every sentence goes through a shared embedding and bidirectional LSTM; the
    story features are multiplied element-wise with the ending features and fed
    to a sigmoid unit.
    """
    def __init__(self):
        self.embed_dim = 128
        self.hidden_dim = 64
        self.feature_dim = 32
        self.model = None
        self.batchsize = 64
        self.epochs = 1

        self.n_stories = 4  # number of context sentences
        self.n_options = 1  # number of candidate endings fed to the model

    def build_model(self):
        """
        Build and compile the Keras model and store it on `self.model`.

        Reads the module-level `data` object for `max_seqlen` and the size of
        the word-to-id table.
        """
        story_inputs = [Input(shape=(data.max_seqlen,)) for _ in range(self.n_stories)]
        option_inputs = [Input(shape=(data.max_seqlen,)) for _ in range(self.n_options)]
        # TODO; prepare and input dummy sentence

        inputs = story_inputs + option_inputs
        # Ids are assigned in data.w2i_dict (0 = padding), so its size is max id + 1.
        embed_layer = Embedding(input_dim=len(data.w2i_dict), output_dim=self.embed_dim,
                               input_length=data.max_seqlen, mask_zero=True)
        birnn_layer = Bidirectional(LSTM(self.hidden_dim))
        dense_layer = Dense(self.feature_dim, activation='relu')
        ending_dense_layer = Dense(self.n_stories * self.feature_dim, activation='relu')

        embeddings = [embed_layer(_input) for _input in inputs]
        birnn_outputs = [birnn_layer(embedding) for embedding in embeddings]
        fc_outputs = [dense_layer(birnn_output) for birnn_output in birnn_outputs[:self.n_stories]]
        # The first option sits right after the story sentences.
        ending_features = ending_dense_layer(birnn_outputs[self.n_stories])

        story_features = concatenate(fc_outputs)
        story_features = multiply([story_features, ending_features]) # TODO make it more exact like paper do
#         conv = Conv1D(16, kernel_size=3, activation='relu')(story_features)
        fc = Dense(1, activation='sigmoid')(story_features)
        model = Model(inputs=inputs, outputs=fc)
        model.compile(optimizer='sgd', loss = 'binary_crossentropy')
        self.model = model

    def train(self, inputs, outputs):
        """
        Fit the model on `inputs`/`outputs` using `self.epochs` and `self.batchsize`.

        Raises ValueError when `build_model()` has not been called.
        """
        if self.model is None:
            raise ValueError("self.model is None. run build_model() first.")
        self.model.fit(inputs, outputs, epochs=self.epochs, batch_size=self.batchsize)

    def test(self, inputs, batchsize):
        """
        Return the model's predictions for `inputs`, computed in batches of `batchsize`.

        Raises ValueError when `build_model()` has not been called.
        """
        if self.model is None:
            raise ValueError("self.model is None. run build_model() first.")
        # Keras spells the keyword `batch_size`.
        prediction = self.model.predict(inputs, batch_size=batchsize)
        return prediction

In [445]:
# Create the classifier with its default hyperparameters; no Keras model is built yet.
classifier = Classifier()

In [446]:
# Assemble and compile the Keras model (reads the module-level `data` object).
classifier.build_model()

In [None]:
# One id array per model input: the story sentences followed by the candidate
# ending. The count comes from the classifier instead of a hard-coded 5.
n_inputs = classifier.n_stories + classifier.n_options
inputs = [data.train_x[:, i, :] for i in range(n_inputs)]
answers = data.y  # 1 for real endings, 0 for fake ones
classifier.train(inputs, answers)

Epoch 1/1
 65536/440805 [===>..........................] - ETA: 51:06 - loss: 0.5308  

In [3]:
# NOTE(review): the recorded output shows this read failing with a tokenizing
# error (a row has 7 fields where 6 were expected); the CSV needs fixing or a
# tolerant parser setting before this cell can run — confirm.
df = pd.read_csv("../data/storydata_all.csv")

CParserError: Error tokenizing data. C error: Expected 6 fields in line 14, saw 7


In [6]:
# Load the ROCStories spring 2016 CSV into a DataFrame.
df = pd.read_csv('../data/ROCStories__spring2016 - ROCStories_spring2016.csv')

In [12]:
# Show only the first rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,9a51198e-96f1-42c3-b09d-a3e1e067d803,Overweight Kid,Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a c...,They got themselves and Dan on a diet.
1,617e7ada-3878-488d-bd56-40695b91f053,The Bike Accident,Carrie had just learned how to ride a bike.,She didn't have a bike of her own.,Carrie would sneak rides on her sister's bike.,She got nervous on a hill and crashed into a w...,The bike frame bent and Carrie got a deep gash...
2,79b0da1f-e460-4173-ba58-8c9e2553c53a,Beach,Morgan enjoyed long walks on the beach.,She and her boyfriend decided to go for a long...,"After walking for over a mile, something happe...",Morgan decided to propose to her boyfriend.,Her boyfriend was upset he didn't propose to h...
3,d173b7de-4611-4cdf-934c-912834755e41,The bad customer.,Jane was working at a diner.,"Suddenly, a customer barged up to the counter.",He began yelling about how long his food was t...,Jane didn't know how to react.,"Luckily, her coworker intervened and calmed th..."
4,af0fd5a4-de36-47ba-8aa2-e99d10986d7a,Being Patient,I was talking to my crush today.,She continued to complain about guys flirting ...,I decided to agree with what she says and list...,"After I got home, I got a text from her.",She asked if we can hang out tomorrow.
5,5d5e7aeb-332f-4c8b-a3c3-44585501e493,Foolish Frank,Frank had been drinking beer.,"He got a call from his girlfriend, asking wher...",Frank suddenly realized he had a date that night.,"Since Frank was already a bit drunk, he could ...",Frank spent the rest of the night drinking mor...
6,bdceebb0-c6e8-417f-b6ae-741a0a71fcd3,Shark Adventure,Dave was in the Bahamas on vacation.,He decided to go snorkeling on his second day.,"While snorkeling, he saw a cave up ahead.","He went into the cave, and he was terrified wh...","Dave swam away as fast as he could, but the sh..."
7,a4a9aaca-d3d4-46b4-807c-ef75aea68c56,Too sunny for Sunny,Sunny enjoyed going to the beach.,"As she stepped out of her car, she realized sh...",It was quite sunny and she forgot her sunglasses.,Sunny got back into her car and heading toward...,Sunny found some sunglasses and headed back to...
8,8ddc9d26-d253-4eb3-b621-9a4e6d9c57e0,The Merry Widow,Sally was happy when her widowed mom found a n...,She discovered her siblings didn't feel the same.,Sally flew to visit her mom and her mom's new ...,"Although her mom was obviously in love, he was...",Sally went home and wondered about her parents...
9,80ff5d8f-bcb1-437b-9da1-71c52997def2,Golf Cheat,Dan hit his golf ball and watched it go.,The ball bounced on the grass and into the san...,Dan pretended that his ball actually landed on...,His friends were not paying attention so they ...,Dan snuck a ball on the green and made his put...


In [2]:
# NOTE(review): the recorded output shows this read failing with a tokenizing
# error (a row has 7 fields where 6 were expected); the CSV needs fixing or a
# tolerant parser setting before this cell can run — confirm.
df = pd.read_csv('../data/storydata_all.csv')

CParserError: Error tokenizing data. C error: Expected 6 fields in line 63, saw 7


In [14]:
# Show only the first rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2,AnswerRightEnding
0,138d5bfb-05cc-41e3-bf2c-fa85ebad14e2,Rick grew up in a troubled household.,"He never found good support in family, and tur...",It wasn't long before Rick got shot in a robbery.,The incident caused him to turn a new leaf.,He is happy now.,He joined a gang.,1
1,bff9f820-9605-4875-b9af-fe6f14d04256,Laverne needs to prepare something for her fri...,She decides to bake a batch of brownies.,She chooses a recipe and follows it closely.,Laverne tests one of the brownies to make sure...,The brownies are so delicious Laverne eats two...,Laverne doesn't go to her friend's party.,1
2,e8f628d5-9f97-40ed-8611-fc0e774673c4,Sarah had been dreaming of visiting Europe for...,She had finally saved enough for the trip.,She landed in Spain and traveled east across t...,She didn't like how different everything was.,Sarah then decided to move to Europe.,Sarah decided that she preferred her home over...,2
3,f5226bfe-9f26-4377-b05f-3d9568dbdec1,Gina was worried the cookie dough in the tube ...,She was very happy to find she was wrong.,The cookies from the tube were as good as from...,Gina intended to only eat 2 cookies and save t...,Gina liked the cookies so much she ate them al...,Gina gave the cookies away at her church.,1
4,69ac9b05-b956-402f-9fff-1f926ef9176b,It was my final performance in marching band.,I was playing the snare drum in the band.,We played Thriller and Radar Love.,The performance was flawless.,I was very proud of my performance.,I was very ashamed of my performance.,1
5,0f65bab6-8165-4361-980a-117046569fe2,I had been giving this homeless man change eve...,He was on the same corner near my house.,"One day, as I was driving through my neighborh...","Soon enough, I saw the same homeless man emerg...",The next day I gave the man twenty dollars.,I never gave the man money again.,2
6,d80cabdd-7a85-47e3-86be-5ce6591ca51e,Jim found an old disposable camera in the bott...,He began snapping away at everything around him.,The counter clicked down to one final photo.,The gravity of the situation began to dawn on ...,Jim took time to decide what he would take a p...,Jim took 20 more photos.,1
7,58090d3f-8a91-4c89-83ef-2b4994de9d24,Ron started his new job as a landscaper today.,He loves the outdoors and has always enjoyed w...,His boss tells him to re-sod the front yard of...,"Ron is ecstatic, but does a thorough job and f...",Ron is immediately fired for insubordination.,His boss commends him for a job well done.,2
8,e17053ac-2046-48c8-a7a2-7b9509c10e64,John and Billy became very skilled at beer pong.,They entered a contest in college.,They won the contest and advanced to the next ...,The next level sent them to Vegas.,"In Vegas, John and Billy competed against eigh...",John and Billy were disappointed.,1
9,69b26ae4-b778-4cd1-9f13-27d28fd4430e,Caroline was a student in medical school.,Caroline worked very hard to get good grades.,One day Caroline failed a test by one point.,Caroline was very frustrated but she continued...,But she gave up.,"Later, she passed the test.",2
