In [103]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from six.moves import cPickle

In [71]:
#Let's define some parameters for the program
#Directories
log_dir = 'logs'
save_dir = 'save'
#input
input = 'input.txt'
#model parameters
rnn_size = 256
num_layers = 2
model = 'lstm'
train_size = 0.9
test_size = 0.1
batch_size = 30
seq_length = 300
num_epochs = 25
save_every = 100000
grad_clip = 5.
learning_rate = 0.2 / 100
decay_rate = 0.97
gpu_mem = 0.666
init_from = None

In [47]:
data_file = open(input, 'r').read()
tensor_file = os.path.join("data.npy")
chars = list(set(data_file))
data_size, vocab_size = len(data_file), len(chars)

print("Count of Monte Cristo has %d characters, with %d unique characters." % (data_size, vocab_size))

Count of Monte Cristo has 2617267 characters, with 99 unique characters.


In [109]:
#Helps to convert our characters to integers in order to format our data into a vector format.
char_to_integer = { ch: i for i, ch in enumerate(chars)}
integer_to_char = { i: ch for i, ch in enumerate(chars)}

#Append no character to dictionary
char_to_integer[''] = len(char_to_integer)
integer_to_char[len(integer_to_char)] = ''
#This is our mapping from unique characters to integers
print(char_to_integer)
print(integer_to_char)

{'ï': 0, 'i': 1, 'C': 2, 'x': 3, 'T': 4, '1': 5, 'J': 6, '”': 7, ' ': 8, 'n': 9, '8': 10, '3': 11, 'é': 12, 'M': 13, 'y': 14, 'w': 15, 'g': 16, 'Z': 17, 'h': 18, 'î': 19, 'X': 20, 'E': 21, '6': 22, 'Q': 23, 'f': 24, '?': 25, '†': 26, '7': 27, '2': 28, 'P': 29, '5': 30, 'a': 31, 'è': 32, 'A': 33, 'k': 34, 'í': 35, '-': 36, 'æ': 37, 'S': 38, 'ô': 39, 'â': 40, '&': 41, 'ç': 42, '!': 43, 'U': 44, 'm': 45, '‘': 46, ';': 47, 'D': 48, ':': 49, 'ë': 50, 'c': 51, 'b': 52, '0': 53, 'F': 54, 'r': 55, 'ü': 56, 'u': 57, 't': 58, 'V': 59, 'B': 60, 'K': 61, 'œ': 62, ']': 63, 'd': 64, '(': 65, ')': 66, 'G': 67, 'É': 68, ',': 69, 'L': 70, 'Æ': 71, '.': 72, '“': 73, 'H': 74, 'Œ': 75, 'I': 76, 's': 77, 'q': 78, 'ê': 79, 'N': 80, 'p': 81, 'Y': 82, '4': 83, 'R': 84, '\n': 85, 'v': 86, 'O': 87, 'l': 88, 'j': 89, '9': 90, 'o': 91, 'e': 92, 'à': 93, '[': 94, '’': 95, '—': 96, 'z': 97, 'W': 98, '': 99}
{0: 'ï', 1: 'i', 2: 'C', 3: 'x', 4: 'T', 5: '1', 6: 'J', 7: '”', 8: ' ', 9: 'n', 10: '8', 11: '3', 12: 'é', 1

In [93]:
tensor = np.array(list(map(char_to_integer.get, data_file)))
char_tensor = np.array(list(data_file))
np.save(tensor_file, tensor)

print('Tensor is: ', tensor)
print('Shape of mapped tensor: ', np.shape(tensor))
print('Shape of char tensor: ', np.shape(char_tensor))

Tensor is:  [59 87 70 ... 43 95  7]
Shape:  (2617267,)
Shape:  (2617267,)


In [74]:
num_batches = int((tensor.size * train_size) / (batch_size * seq_length))
print("Number of batches:", num_batches)

Number of batches: 261


In [94]:
tensor = tensor[:num_batches * batch_size * seq_length]
char_tensor = char_tensor[:num_batches * batch_size * seq_length]
print("Shape of new tensor is: ", np.shape(tensor))

Shape of new tensor is:  (2349000,)


In [106]:
xdata = tensor
ydata = tensor[char_tensor]


#Create batches
x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)

#Pointer for batches
pointer = 0

#Save to file
with open (os.path.join(save_dir, 'char_dict.pkl'), 'wb') as f:
    cPickle.dump((integer_to_char), f)