-
Notifications
You must be signed in to change notification settings - Fork 1
/
parseText.py
executable file
·73 lines (47 loc) · 1.31 KB
/
parseText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import numpy
from keras.utils import to_categorical
import re
def getVocab(args, to_set=True):
    """Tokenise the text file at ``args.path`` into letters or words.

    Args:
        args: Object exposing ``path`` (the file to read) and
            ``predict_type`` (``"letter"`` or ``"word"``).
        to_set: When true (the default), collapse the token sequence into a
            list of unique tokens.  When false, return every token in the
            order it occurs in the file.

    Returns:
        A list of tokens.  In ``"letter"`` mode every character is a token.
        In ``"word"`` mode the text is split on each non-word character and
        the separators are kept as tokens too; two adjacent separators yield
        an empty-string token between them.  For any other ``predict_type``
        with ``to_set`` false, ``None`` is returned.

    Raises:
        TypeError: If ``predict_type`` is neither ``"letter"`` nor ``"word"``
            and ``to_set`` is true (there is nothing to build a set from).
    """
    # Close the handle deterministically instead of relying on the garbage
    # collector to do it.
    with open(args.path, "r") as handle:
        text = handle.read()

    vocab = None
    if args.predict_type == "letter":
        vocab = list(text)
    elif args.predict_type == "word":
        # Raw string: "\W" inside a plain literal is an invalid escape
        # sequence.  The capture group makes re.split keep the separators.
        vocab = re.split(r"(\W)", text)
    del text  # the raw text can be large; drop it before building the set

    if to_set:
        # NOTE: set iteration order is arbitrary, so the order of the
        # returned unique tokens is not guaranteed.
        vocab = list(set(vocab))
    return vocab
def getInfo(path, args):
    """Return the unique-token vocabulary and both lookup tables for it.

    Args:
        path: Accepted but not used here; the vocabulary is obtained from
            ``getVocab(args)``.
        args: Passed straight through to ``getVocab``.

    Returns:
        A tuple ``(vocab, char_to_num, num_to_char)`` where ``vocab`` is the
        list returned by ``getVocab(args)``, ``char_to_num`` maps each token
        to its position in that list, and ``num_to_char`` maps each position
        back to its token.
    """
    tokens = getVocab(args)

    index_of = {}
    token_at = {}
    # Build both directions of the mapping in a single pass.
    for position, token in enumerate(tokens):
        index_of[token] = position
        token_at[position] = token

    return tokens, index_of, token_at
def createDataset(path, char_to_int, seq_length, args):
    """Build sliding-window training samples from the text at ``args.path``.

    Each run of ``seq_length`` consecutive tokens becomes one input sample,
    and the token immediately after that run is its target.

    Args:
        path: Not used; the file that is read is ``args.path``.  Kept in the
            signature so existing callers keep working.
        char_to_int: Mapping from token to integer index.
        seq_length: Number of tokens in each input window.
        args: Object exposing ``path``, ``predict_type`` and
            ``normalize_inputs``.

    Returns:
        A tuple ``(x, y)``.  ``x`` is an array of shape
        ``(samples, seq_length, 1)`` holding token indices (rescaled when
        ``args.normalize_inputs`` is true).  ``y`` is the result of
        ``to_categorical`` applied to the target indices.

    Raises:
        KeyError: If the text contains a token missing from ``char_to_int``.
    """
    if args.predict_type == "word":
        # Word mode needs the full token sequence, not the unique set.
        tokens = getVocab(args, False)
    else:
        # Any other mode indexes the raw text character by character.
        with open(args.path, "r") as handle:
            tokens = handle.read()

    x = []
    y = []
    for start in range(len(tokens) - seq_length):
        stop = start + seq_length
        x.append([char_to_int[tok] for tok in tokens[start:stop]])
        y.append(char_to_int[tokens[stop]])
    del tokens  # free the source text before allocating the arrays

    x = numpy.reshape(x, (len(x), seq_length, 1))
    if args.normalize_inputs:
        # numpy has no ``avg`` function; ``numpy.mean`` is the arithmetic
        # mean over all elements.
        mean = numpy.mean(x)
        std = numpy.std(x)
        # NOTE(review): this is the formula as originally written; it is not
        # the usual z-score ``(x - mean) / std``.  Confirm which is intended
        # and keep it in sync with whatever rescales inputs at predict time.
        x = x / mean - std  # x / float(len(char_to_int))

    y = to_categorical(y)
    # Single pre-formatted argument so the output is the same whether
    # ``print`` is a statement or a function.
    print("%s %s" % (x.shape, y.shape))
    return x, y