In [1]:
import random
import nltk
import sys
import numpy as np
import pickle

In [2]:
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''

In [3]:
FILENAME = 'shakespeare.txt'

VOCAB_SIZE = 16000

SEQ_LEN = 10

In [4]:
def read_lines(filename):
    '''
    Read the usable lines of a text file.

    A line is skipped when it is empty or whitespace-only, or when its
    stripped text ends with ':' (presumably speaker labels in the play
    text - confirm against the corpus).

    Args:
        filename: path of the text file to read.

    Returns:
        list of str: the kept lines in file order, each without its
        trailing newline. The final line is returned even when the file
        does not end with a newline.
    '''
    kept = []
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if stripped and not stripped.endswith(':'):
                # Remove only the line terminator; any other leading or
                # trailing whitespace is preserved as before.
                kept.append(line.rstrip('\n'))
    return kept

In [5]:
def split_line(line):
    '''
    Cut one line of text at every '.' character.

    Args:
        line: the string to cut.

    Returns:
        list of str: the pieces between the periods, exactly as produced
        by str.split('.') (pieces are not stripped and may be empty).
    '''
    separator = '.'
    pieces = line.split(separator)
    return pieces

In [6]:
def filter_line(line, whitelist):
    '''
    Drop every character of `line` that is not part of `whitelist`.

    Args:
        line: the string to clean.
        whitelist: container of allowed characters (membership is tested
            with `in`).

    Returns:
        str: the allowed characters of `line`, in their original order.
    '''
    allowed = []
    for ch in line:
        if ch in whitelist:
            allowed.append(ch)
    return ''.join(allowed)

In [7]:
def index_(tokenized_sentences, vocab_size):
    '''
    Build index<->word lookup tables from a sequence of tokens.

    Args:
        tokenized_sentences: the tokens handed to nltk.FreqDist for counting.
        vocab_size: upper bound on how many of the most common tokens are
            kept in the vocabulary.

    Returns:
        tuple (index2word, word2index, freq_dist):
            index2word - list of the kept tokens, in the order returned by
                         freq_dist.most_common(vocab_size)
            word2index - dict mapping each kept token to its position in
                         index2word
            freq_dist  - the nltk.FreqDist built from all input tokens
    '''
    # count occurrences of every token
    freq_dist = nltk.FreqDist(tokenized_sentences)
    # most_common yields (token, count) pairs; only the token is kept
    index2word = [token for token, _count in freq_dist.most_common(vocab_size)]
    # invert the list into a token -> position mapping
    word2index = {token: position for position, token in enumerate(index2word)}
    return index2word, word2index, freq_dist


def to_array(tokenized, seqlen, w2idx):
    '''
    Turn a flat token list into input/target index arrays.

    Row i of X holds the indices of tokens [i*seqlen, (i+1)*seqlen);
    row i of Y holds the same window shifted forward by one token.

    Args:
        tokenized: sequence of tokens; every token must be a key of w2idx.
        seqlen: number of tokens per row.
        w2idx: mapping from token to integer index.

    Returns:
        tuple (X, Y) of int32 ndarrays, each of shape (data_len, seqlen),
        where data_len = (len(tokenized) - 1) // seqlen (0 for empty input).

    Raises:
        KeyError: if a token is missing from w2idx.
    '''
    num_words = len(tokenized)
    # Each Y row reads one token past its X window, so one token must be
    # left over after the last full window. Using num_words // seqlen here
    # made the last Y row one element short (ValueError) whenever
    # num_words was an exact multiple of seqlen.
    data_len = max(num_words - 1, 0) // seqlen
    # allocate directly as int32 instead of float64 followed by a cast
    X = np.zeros([data_len, seqlen], dtype=np.int32)
    Y = np.zeros([data_len, seqlen], dtype=np.int32)
    for i in range(data_len):
        start = i * seqlen
        stop = start + seqlen
        X[i] = [w2idx[w] for w in tokenized[start:stop]]
        Y[i] = [w2idx[w] for w in tokenized[start + 1:stop + 1]]
    return X, Y

In [8]:
# Load the corpus named by FILENAME into `lines` (a list of strings);
# read_lines skips blank lines and lines whose stripped text ends with ':'.
print('\n>> Read lines from file')
lines = read_lines(filename=FILENAME)


>> Read lines from file


In [10]:
lines[:5]

['Before we proceed any further, hear me speak.',
 'Speak, speak.',
 'You are all resolved rather to die than to famish?',
 'Resolved. resolved.',
 'First, you know Caius Marcius is chief enemy to the people.']

In [12]:
# change to lower case, so that tokens differing only in case are merged;
# this also matters because EN_WHITELIST contains lowercase letters only
lines = [ line.lower() for line in lines ]
# display the first three lines to confirm the conversion
lines[:3]

['before we proceed any further, hear me speak.',
 'speak, speak.',
 'you are all resolved rather to die than to famish?']

In [13]:
# show a small slice of the lowercased, still unfiltered lines
print('\n:: Sample from read(p) lines\n')
print(lines[121:125])


:: Sample from read(p) lines

["'that i receive the general food at first,", 'which you do live upon; and fit it is,', 'because i am the store-house and the shop', 'of the whole body: but, if you do remember,']


In [14]:
 # filter out unnecessary characters
# keep only the characters listed in EN_WHITELIST (digits, lowercase ascii
# letters and the space); everything else is removed, not replaced
print('\n>> Filter lines\n')
lines = [ filter_line(line, EN_WHITELIST) for line in lines ]
# print lines 121-124 after filtering
print(lines[121:125])


>> Filter lines

['that i receive the general food at first', 'which you do live upon and fit it is', 'because i am the storehouse and the shop', 'of the whole body but if you do remember']


In [15]:
# convert list of [lines of text] into one flat list of words
# (the words of all lines are concatenated; there is no per-line nesting)
print('\n>> Segment lines into words\n')
# NOTE(review): split(' ') returns '' for an empty line and between
# consecutive spaces, so empty-string tokens may end up in `tokenized`;
# str.split() without an argument would avoid that - confirm before
# changing, since it would alter the counts computed from this list.
tokenized = [ w for wordlist in lines for w in wordlist.split(' ') ]
print('\n:: Sample from segmented list of words\n')
print(tokenized[60:70])


>> Segment lines into words


:: Sample from segmented list of words

['away', 'away', 'one', 'word', 'good', 'citizens', 'we', 'are', 'accounted', 'poor']


In [16]:
# indexing -> idx2w (list: index -> word), w2idx (dict: word -> index),
# freq_dist (frequency distribution over all tokens); the vocabulary is
# capped at the VOCAB_SIZE most common words
print('\n >> Index words')
idx2w, w2idx, freq_dist = index_( tokenized, vocab_size=VOCAB_SIZE)


 >> Index words


In [19]:
len(idx2w)

12506

In [22]:
len(w2idx)

12506

In [24]:
# remove unknowns: drop every token that is not part of the vocabulary.
# Membership is tested against the w2idx dict (constant-time lookup);
# idx2w is a list, so `w in idx2w` would scan it once per token. Both
# hold exactly the same words, so the result is unchanged.
tokenized = [ w for w in tokenized if w in w2idx ]

In [34]:
len(tokenized)

180875

In [27]:
# convert to ndarray: each row of X holds SEQ_LEN consecutive word indices,
# the matching row of Y holds the same window shifted forward by one word
X, Y = to_array(tokenized, SEQ_LEN, w2idx)

In [31]:
X[0]

array([129,  32, 953, 124, 582, 110,  15, 101, 101, 101])

In [36]:
Y[0]

array([ 32, 953, 124, 582, 110,  15, 101, 101, 101,   5])

In [51]:
" ".join([idx2w[i] for i in X[2]])

'resolved resolved first you know caius marcius is chief enemy'

In [52]:
" ".join([idx2w[i] for i in Y[2]])

'resolved first you know caius marcius is chief enemy to'

In [41]:
print('\n >> Save numpy arrays to disk')
# save them to the current working directory; load_data reads these two
# file names back
np.save('idx_x.npy', X)
np.save('idx_y.npy', Y)


 >> Save numpy arrays to disk


In [42]:
# let us now save the necessary dictionaries
# (everything besides the arrays that is needed to interpret them)
metadata = {
            'w2idx' : w2idx,          # dict: word -> index
            'idx2w' : idx2w,          # list: index -> word
            'seqlen' : SEQ_LEN,       # number of tokens per row of X / Y
            'freq_dist' : freq_dist   # frequency distribution returned by index_
            }

In [43]:
 # write to disk : data control dictionaries
# pickle the metadata dict into the current working directory; binary mode
# ('wb') is required by pickle
with open('metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

In [44]:
def load_data(PATH=''):
    '''
    Load the preprocessed arrays and lookup tables from disk.

    Args:
        PATH: directory containing 'metadata.pkl', 'idx_x.npy' and
            'idx_y.npy'. The default '' means the current working
            directory. A trailing path separator is optional.

    Returns:
        tuple (idx_x, idx_y, idx2w, w2idx, seqlen): the two arrays read
        from the .npy files, followed by the 'idx2w', 'w2idx' and 'seqlen'
        entries of the pickled metadata dict.
    '''
    # local import: the notebook's import cell does not bring in os
    import os

    # read data control dictionaries
    # NOTE: pickle.load can execute arbitrary code - only point this at
    # files you created yourself or otherwise trust.
    with open(os.path.join(PATH, 'metadata.pkl'), 'rb') as f:
        metadata = pickle.load(f)
    # read numpy arrays; os.path.join inserts the separator when PATH
    # lacks one (plain concatenation produced e.g. 'dataidx_x.npy')
    idx_x = np.load(os.path.join(PATH, 'idx_x.npy'))
    idx_y = np.load(os.path.join(PATH, 'idx_y.npy'))
    return idx_x, idx_y, metadata['idx2w'], metadata['w2idx'], metadata['seqlen']

In [45]:
# round-trip check: reload everything from the files written earlier
# (default PATH='' -> current directory); this rebinds X, Y, idx2w and w2idx
X, Y, idx2w, w2idx, seqlen = load_data()

In [46]:
X.shape

(18087, 10)

In [47]:
seqlen

10

In [48]:
Y.shape

(18087, 10)

In [53]:
idx2w[:10]

['the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in']

In [55]:
len(w2idx)

12506