In [1]:
import string
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

# Load data
df = pd.read_csv("stories.csv", encoding='utf-8')
story_list = df.stories.to_list()

# The [0:1] slice keeps at most the first story, so the corpus is a single story.
texts = ''.join(story_list[0:1])

# Drop every character that is not in string.printable, in one pass.
# A set makes each membership check a hash lookup instead of a scan of the string.
printable_chars = set(string.printable)
text = ''.join(char for char in texts if char in printable_chars)
print(f'Length of corpus: {len(text)}')

Length of corpus: 33778


In [2]:
# Creating character / word mappings

# Word mappings: the vocabulary is every distinct whitespace-separated token,
# sorted so that each word gets a stable integer id.
words = sorted(set(text.split()))
print(len(words))

# id -> word, and the inverse word -> id lookup built from it.
n_to_word = dict(enumerate(words))
word_to_n = {word: n for n, word in n_to_word.items()}

1976


In [3]:
# Data Preprocessing
# Build sliding windows over the corpus: each sample in X is `seq_length`
# consecutive word ids, and the matching entry in Y is the id of the word
# that immediately follows that window.
X = []
Y = []

# Tokenise once. The split is loop-invariant; redoing it on every iteration
# re-scanned the whole corpus twice per window.
tokens = text.split()
length = len(tokens)
seq_length = 100

for i in range(length - seq_length):
    sequence = tokens[i:i + seq_length]
    label = tokens[i + seq_length]
    X.append([word_to_n[word] for word in sequence])
    Y.append(word_to_n[label])

In [4]:
# Shape the windows as (samples, seq_length, 1) and scale the word ids by the
# vocabulary size so that every input value lies in [0, 1).
X_modified = np.asarray(X).reshape(len(X), seq_length, 1) / float(len(words))

# One-hot encode the next-word labels.
Y_modified = np_utils.to_categorical(Y)

In [14]:
# Baseline Model
# Two stacked LSTM layers, each followed by dropout, feeding a softmax layer
# with one output per one-hot label column.
model = Sequential([
    # return_sequences=True hands the full sequence to the second LSTM.
    LSTM(100, input_shape=X_modified.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.2),
    Dense(Y_modified.shape[1], activation='softmax'),
])
model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 100, 100)          40800     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100, 100)          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1976)              199576    
Total params: 320,776
Trainable params: 320,776
Non-trainable params: 0
_________________________________________________________________


In [15]:
from keras import backend as K

# Fit Model
# NOTE(review): the file name mentions 700 while the LSTM layers are built
# with 100 units -- confirm the name still describes this configuration.
filename = '700_0.2_700_0.2e10_b100.h5'

# Train, then save only the learned weights (not the architecture).
model.fit(x=X_modified, y=Y_modified, batch_size=100, epochs=10)
model.save_weights(filename)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Generate texts with last saved fit
model.load_weights(filename)

# Seed the generator with one training window. Take a copy: appending to
# X[99] itself would permanently lengthen that training row and break any
# later re-run that reshapes X.
string_mapped = list(X[99])
seed_length = len(string_mapped)
full_string = [n_to_word[value] for value in string_mapped]

# Generation
for i in range(100):
    # Same shaping and scaling as the training inputs.
    x = np.reshape(string_mapped, (1, len(string_mapped), 1))
    x = x / float(len(words))

    # Greedy decoding: take the most probable next word.
    pred_index = int(np.argmax(model.predict(x, verbose=0)))
    full_string.append(n_to_word[pred_index])

    # Slide the window: drop the oldest word id and append the prediction.
    string_mapped = string_mapped[1:] + [pred_index]

# Combine generated text and output
# Skip the seed words and print only what was generated. Each word is
# prefixed with a space, so the output keeps its leading space.
txt = ''.join(' ' + word for word in full_string[seed_length:])
print(txt)