In [1]:
!svn checkout https://github.com/deepgram/kur/trunk/examples/language-model/books

A    books/pride_and_prejudice.txt
A    books/shakespeare.txt
Checked out revision 371.


In [2]:
%%writefile make_data_func.py
# vocab.py

# length of the context sequence                        # what does context sequence mean? 
                                                        # a sequence of words used to predict the next word
# seq_len = 30

                                                        # to create a list of 26 alphabet letter
lowercase_letters = [                             
    chr(97 + i) for i in range(26)
]

                                                        # define a list of 4 symbols: space, ", \, .
symbols = [' ', '"', '\'', '.']             ############# QUESTION: all other numbers and symbols are ignored, ### 
                                                        # in what situations this is common practice on vocab? ###
                                                        # in what situations should we keep all num and symbols ##

                                                        # define a list of unique characters we allow in our data
vocab = lowercase_letters + symbols

                                                        # create a dictionary: {char:index}
                                                        # give each character an index
char_to_int = dict(
    (c, i) for i, c in enumerate(vocab)
)
                                                        # create a dictionary: {index:char}
                                                        # give each index an character
int_to_char = dict(enumerate(vocab))
                                                        # get the length of vocab
n_vocab = len(vocab)

###############################################################################################

# make_data.py


import numpy as np
# from vocab import *
import json
import os

                                                 # given 30 character in sequence to predict the next character
                                                 # dev = True, is to only use 10% of data
def make_data(seq_len=30, dev=True):

    if not os.path.exists('./data/'):             ### create a data folder if not already available ###
        os.mkdir('./data/')

                                                  ### convert character from indices to one-hot-encoding ###
    def one_hot(v, ndim):                         # v: indices of all the unique characters of a text
        v_one_hot = np.zeros(                     # ndim: num of all unique characters of the text
            (len(v), ndim,)
        )
        for i in range(len(v)):
            v_one_hot[i][v[i]] = 1.0              # for each unique character, make 1 at its index on column
        return v_one_hot

    x = []
    y = []

    all_chars = []

                                                  # for each book
    for book in [
        'pride_and_prejudice.txt',
        'shakespeare.txt'
    ]:                                          
                                                  # for every line of this book
        with open('books/%s' % book, 'r') as infile:
                                                  # split every word into separate characters, a space separate each word
            chars = [
                c for c in ' '.join(infile.read().lower().split())
                if c in set(vocab)
            ]
            all_chars += [' ']
            all_chars += chars
                                                  # put all words of book into a long list (see **example1** below)


                                                  # get rid of the space in the beginning
    all_chars = list(' '.join(''.join(all_chars).split()))
    num_chars = len(all_chars)                    # count num of characters in the book including spaces


                                                  # create an empty file named `cleaned.txt` to write data into
    with open('cleaned.txt', 'w') as outfile:
                                                  # write all the whole list of characters of the books into it
        outfile.write(''.join(all_chars))         # without a space in the beginning


    x, y = [], []                                 # set x, y as empty list


                                                  # define portions for each section: train, validate, evalute, test
    data_portions = [
        ('train', 0.8),
        ('validate', 0.05),
        ('test', 0.05),
        ('evaluate', 0.05),
    ]

#     dev = True                                     # reduce amount of data for training x10 times
    if dev:
                                                   # shrink every section data by x10 times

        for i in range(len(data_portions)):
            data_portions[i] = (
                data_portions[i][0],
                data_portions[i][1] * 0.1
            )


                                                   # sum up num of data points in each section (train, validate...) 
                                                   # max_i = sum_above - seq_len
    max_i = sum([
        int(round(len(all_chars) * fraction))
        for name, fraction in data_portions
    ]) - seq_len                                   # seq_len is defined inside vocab.py as 30


                                                   # for every element of max_i 
    for i in range(max_i):
                                                   # create a short list of length 30, assign to in_char_seq
                                                   # every in_char_seq only has 1 character different to its neighbour
        in_char_seq = all_chars[i: i + seq_len]

                                                   # one hot representation
                                                   # create a matrix of 0s with dim(30, 30), assigned to sample_x
        sample_x = np.zeros((len(in_char_seq), n_vocab,))


        for j, c in enumerate(in_char_seq):        # j as index from 0 to 29, c as character 
            sample_x[j][char_to_int[c]] = 1        # find unique index of the character, and put 1 in the index of column
                                                   # by now, a list of 30 characters turned into a matrix of 0s and 1s 
                                                   # in other words, 30 character in one-hot-encoding format 

        x.append(sample_x)                         # tranform all characters into one-hot-encoding format and save them
                                                   # all in x (x was an empty list)
                                                   # what does x look like? - a list of arrays see **example2** #######


                                                   # create a 1-d array of 0s of length 30 for sample_y 
        sample_y = np.zeros(n_vocab)

                                                   # get all characters from the 30th onward
                                                   # get the unique index of the character
                                                   # make the location of the index from 0 to 1
        sample_y[char_to_int[all_chars[i + seq_len]]] = 1
        y.append(sample_y)                         # now, we transformed the y from character to one-hot-encoding


                                                   # convert x, y from lists to arrays of arrays
    x, y = np.array(x).astype('int32'), np.array(y).astype('int32') 
                                                   # x as 3-d (max_i, 30, 30), y as 2-d (max_i, 30)  
                                                                                            # see **example2** #######


                                                    # set starting index
    start_i = 0     
                                                    # for each section: train, validate, evaluate, test
    for name, fraction in data_portions:
                                                    # get ending index of each section
        end_i = start_i + int(round(len(x) * fraction))    # len(x) == max_i
        print(start_i, end_i)
        x0 = x[start_i: end_i]
        y0 = y[start_i: end_i]
                                                    # print dim of each section's x and y
        print('dims:')
        print(x0.shape)
        print(y0.shape)
                                                    # set current section's ending index as starting index of next section
        start_i = end_i
                                                    # open an empty jsonl file and write to it
        with open('data/%s.jsonl' % name, 'w') as outfile:
            for sample_x, sample_y in zip(x0, y0):  # zip sample_x and sample_y together
                outfile.write(json.dumps({
                    'in_seq': sample_x.tolist(),    # write into file as list of list rather than numpy arrays
                    'out_char': sample_y.tolist()   # QUESTION: it means kur accept lists from jsonl file by default
                }))
                outfile.write('\n')

        del x0, y0

Overwriting make_data_func.py


In [3]:
from make_data_func import *
                                        # we only use a sequence of 2 characters to predict
data = make_data(2)

0 13302
dims:
(13302, 2, 30)
(13302, 30)
13302 14133
dims:
(831, 2, 30)
(831, 30)
14133 14964
dims:
(831, 2, 30)
(831, 30)
14964 15795
dims:
(831, 2, 30)
(831, 30)


In [4]:
%%writefile char_rnn_demo_dlnd_defaults.yaml

---

settings:


  vocab:                                         
    size: 30                          # This cannot be changed, it is fixed with dataset           
                
  

# QUESTION: what cause the following error, given the model build fine. see `build` in the next cell               
# we get the following error meassage: 
# Traceback (most recent call last):
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/model/executor.py", line 224, in train
#     **kwargs
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/model/executor.py", line 648, in wrapped_train
#     raise ValueError('Model loss is NaN.')
# ValueError: Model loss is NaN.
# 
# During handling of the above exception, another exception occurred:
# 
# Traceback (most recent call last):
#   File "/Users/Natsume/miniconda2/envs/dlnd-tf-lab/bin/kur", line 11, in <module>
#     load_entry_point('kur', 'console_scripts', 'kur')()
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/__main__.py", line 382, in main
#     sys.exit(args.func(args) or 0)
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/__main__.py", line 62, in train
#     func(step=args.step)
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/kurfile.py", line 371, in func
#     return trainer.train(**defaults)
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/model/executor.py", line 246, in train
#     info={'Reason' : reason}
#   File "/Users/Natsume/Downloads/kur_road/kur/kur/model/hooks/plot_hook.py", line 123, in notify
#     vbatch = numpy.arange(1, len(vloss)+1)
# TypeError: object of type 'NoneType' has no len()
# CPU times: user 511 ms, sys: 504 ms, total: 1.02 s
# Wall time: 31.6 s"""    

# ANSWER: try remove batch_normalization or change to gru, or change optimizers
# it turns out change from lstm to gru works 

## Another option is to use `lstm` with `hard_sigmoid`, but not workign here 
# see https://github.com/deepgram/kur/issues/7#issuecomment-282916382

  rnn:
    size: 512                                    # num_neurons of a rnn/lstm layer
    depth: 2                                     # num_rnn_layers for this RNN model
        


model:
  - input: in_seq


  - recurrent:
      size: "{{ rnn.size }}"
      type: lstm
      sequence: True                             # yes, meaning return the whole sequence of 30 characters??
      bidirectional: no
  - batch_normalization
  - dropout: "{{drop_neurons}}"
        
  - recurrent:
      size: "{{ rnn.size }}"
      type: lstm                           
      sequence: False                     # no, meaning only return only 1 character of the whole 30 char sequence??
      bidirectional: no
  - batch_normalization
  - dropout: "{{drop_neurons}}"


  - dense: "{{ vocab.size }}"                   # now it is like 30 class-classification problem, 
                                                # that's why we need 30 neurons here, right? 

  - activation: softmax

  - output: out_char                               # make a name of output layer
           

loss:
  - target: out_char
    name: categorical_crossentropy

train:
  data:
    - jsonl: data/train.jsonl
  epochs: "{{ num_epochs|default(5) }}"     
  weights:
    initial: t3_dlnd/best.w.kur
    best: t3_dlnd/best.w.kur
    last: t3_dlnd/last.w.kur
  log: t3_dlnd/log
  optimizer: 
    name: adam
  hooks:                                   
    - plot: t3_dlnd/loss.png
        
                                # QUESTION: by default optimizer is Adam, lr = 0.001
                                # given 'grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)`
                                # I should set clip as follows, right?
  clip:                                              
    norm: "{{grad_clip}}"                            

validate:
  data:
    - jsonl: data/validate.jsonl
  weights: t3_dlnd/best.w.kur


test:
  data:
    - jsonl: data/test.jsonl
  weights: t3_dlnd/best.w.kur


evaluate:
  data:
    - jsonl: data/evaluate.jsonl
  weights: t3_dlnd/best.w.kur

  destination: t3_dlnd/output.pkl

Overwriting char_rnn_demo_dlnd_defaults.yaml


In [5]:
%%writefile char_rrn_demo_dlnd_fluid.yaml

---
settings: 
  num_epochs: 1                    # leave it empty means inf number of epochs
                                 # so to use default value, just comment this line out
  drop_neurons: 0.5
  grad_clip: 5

include: char_rnn_demo_dlnd_defaults.yaml
...

Overwriting char_rrn_demo_dlnd_fluid.yaml


In [6]:
!kur -v train char_rrn_demo_dlnd_fluid.yaml

[1;37m[INFO 2017-03-07 00:31:42,429 kur.kurfile:699][0m Parsing source: char_rrn_demo_dlnd_fluid.yaml, included by top-level.[0m
[1;37m[INFO 2017-03-07 00:31:42,434 kur.kurfile:699][0m Parsing source: char_rnn_demo_dlnd_defaults.yaml, included by char_rrn_demo_dlnd_fluid.yaml.[0m
[1;37m[INFO 2017-03-07 00:31:42,451 kur.kurfile:82][0m Parsing Kurfile...[0m
[1;37m[INFO 2017-03-07 00:31:42,472 kur.loggers.binary_logger:71][0m Loading log data: t3_dlnd/log[0m
[1;37m[INFO 2017-03-07 00:31:43,012 kur.backend.backend:80][0m Creating backend: keras[0m
[1;37m[INFO 2017-03-07 00:31:43,012 kur.backend.backend:83][0m Backend variants: none[0m
[1;37m[INFO 2017-03-07 00:31:43,012 kur.backend.keras_backend:122][0m No particular backend for Keras has been requested.[0m
[1;37m[INFO 2017-03-07 00:31:44,075 kur.backend.keras_backend:195][0m Keras is loaded. The backend is: theano[0m
[1;37m[INFO 2017-03-07 00:31:44,075 kur.model.model:260][0m Enumerating the model containers.[0m