In [1]:
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [2]:
# load data
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook can run elsewhere.
with open("C:\\Users\\GOWTHAM-PC\\textgen\\tamil.txt") as corpus_file:
    file = corpus_file.read()

In [3]:
# tokenization
#standardization
def tokenize_words(input):
    """Lower-case the text, drop English stop words and concatenate the rest.

    Args:
        input: Raw text to normalise.

    Returns:
        One string made of the remaining word tokens joined with no
        separator, so word boundaries are not preserved in the result.
    """
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Fetch the stop-word list once and keep it in a set: the lookup then
    # costs O(1) per token instead of re-reading the list for every token.
    english_stopwords = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in english_stopwords)
    return "".join(filtered)

# Normalised corpus string; the cells below derive the character vocabulary
# and the training windows from it.
processed_inputs = tokenize_words(file)


In [4]:
# Character vocabulary (sorted, unique) and a lookup from each character
# to its integer index in that vocabulary.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [5]:
# Sanity check: report the corpus length and the vocabulary size.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total no of characters : ", input_len)
print("Total vocab : ", vocab_len)

Total no of characters :  990
Total vocab :  32


In [6]:
# Number of consecutive characters that make up one input window.
seq_length = 100
# Containers for the encoded input windows and their next-character targets.
x_data, y_data = [], []

In [7]:
# loop through the sequence
# Rebuild both lists from scratch rather than appending to lists created in
# an earlier cell, so re-running this cell cannot duplicate every pattern.
window_starts = range(0, input_len - seq_length)
# Each input is seq_length consecutive characters encoded as integers ...
x_data = [
    [char_to_num[char] for char in processed_inputs[i:i + seq_length]]
    for i in window_starts
]
# ... and its target is the single character that follows that window.
y_data = [char_to_num[processed_inputs[i + seq_length]] for i in window_starts]

n_patterns = len(x_data)
print("Total patterns : ", n_patterns)

Total patterns :  890


In [8]:
# Arrange the encoded windows as (patterns, sequence length, 1) and divide
# the integer codes by the vocabulary size.
x = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [9]:
#one-hot encoding
# Pin the width to the full vocabulary. Left to its default, to_categorical
# sizes the matrix from the largest label present, which would shrink the
# output layer if the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Stacked network: three LSTM layers, each followed by 20% dropout, ending
# in a softmax layer as wide as the one-hot targets.
model = Sequential([
    LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])


In [11]:
# Configure training: Adam optimizer with categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [12]:
# Checkpoint callback: monitors the training loss in 'min' mode and, with
# save_best_only, keeps only the best weights in the file below.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [13]:
# Train for 12 epochs in batches of 256 with the checkpoint callback attached.
model.fit(
    x,
    y,
    batch_size=256,
    epochs=12,
    callbacks=desired_callbacks,
)

Epoch 1/12
Epoch 00001: loss improved from inf to 3.35320, saving model to model_weights_saved.hdf5
Epoch 2/12
Epoch 00002: loss improved from 3.35320 to 3.02735, saving model to model_weights_saved.hdf5
Epoch 3/12
Epoch 00003: loss improved from 3.02735 to 2.99470, saving model to model_weights_saved.hdf5
Epoch 4/12
Epoch 00004: loss improved from 2.99470 to 2.97164, saving model to model_weights_saved.hdf5
Epoch 5/12
Epoch 00005: loss did not improve from 2.97164
Epoch 6/12
Epoch 00006: loss improved from 2.97164 to 2.95645, saving model to model_weights_saved.hdf5
Epoch 7/12
Epoch 00007: loss improved from 2.95645 to 2.95641, saving model to model_weights_saved.hdf5
Epoch 8/12
Epoch 00008: loss did not improve from 2.95641
Epoch 9/12
Epoch 00009: loss improved from 2.95641 to 2.94481, saving model to model_weights_saved.hdf5
Epoch 10/12
Epoch 00010: loss did not improve from 2.94481
Epoch 11/12
Epoch 00011: loss did not improve from 2.94481
Epoch 12/12
Epoch 00012: loss improved fro

<tensorflow.python.keras.callbacks.History at 0x68b082e850>

In [14]:
# Restore the checkpointed weights from disk, then compile the model again
# with the same loss and optimizer before using it for generation.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [15]:
# Inverse lookup: integer index back to the character it encodes.
num_to_char = dict(enumerate(chars))

In [50]:
# random seed to help generate
# randint's upper bound is exclusive, so pass len(x_data) itself; the former
# `len(x_data) - 1` could never select the last pattern and raised when only
# one pattern existed.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that extending the window during generation cannot
# modify the stored training pattern inside x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed:
" linguisticlinesstatehomenumberhistoricbuildingsmultireligiouspilgrimagesiteshillstationsthreeworldhe "


In [51]:
#generate the text
# Greedy decoding: at each step feed the current window to the model, emit
# the most probable next character, then slide the window forward by one.
for i in range(25):
    # Use a dedicated name for the single-window input so the training
    # tensor `x` is not overwritten by this cell.
    model_input = numpy.reshape(pattern, (1, len(pattern), 1)) / float(vocab_len)
    prediction = model.predict(model_input, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place: `pattern` may still be
    # the very list object stored in x_data, which must stay untouched.
    pattern = pattern[1:] + [index]

aaaaaaaaaaaaaaaaaaaaaaaaa