In [93]:
import string
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

# Load data
df = pd.read_csv("stories.csv", encoding='utf-8')
story_list = df.stories.to_list()
printable = set(string.printable)
texts = ''.join(story_list[0:10])
texts = texts.lower()
print(f'Corpus Length: {len(texts)}')

Corpus Length: 326206


In [94]:
# Creating character / word mappings

# characters = sorted(list(set(texts)))
# n_to_char = {n:char for n, char in enumerate(characters)}
# char_to_n = {char:n for n, char in enumerate(characters)}

# Trying word mappings
words = sorted(list(set(texts.split())))
print(len(words))

n_to_word = {n:word for n, word in enumerate(words)}
word_to_n = {word:n for n, word in enumerate(words)}

9405


In [95]:
# Data Preprocessing
X = []
Y = []

length = len(texts.split())
seq_length = 100

for i in range(0, length-seq_length, 1):
    sequence = texts.split()[i:i + seq_length]
    label = texts.split()[i + seq_length]
    X.append([word_to_n[word] for word in sequence])
    Y.append(word_to_n[label])

In [96]:
X_modified = np.reshape(X, (len(X), seq_length, 1))
X_modified = X_modified/float(len(words))
Y_modified = np_utils.to_categorical(Y)

In [97]:
# Baseline Model
model = Sequential()
model.add(LSTM(100, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(Y_modified.shape[1], activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 100, 100)          40800     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 9405)              949905    
Total params: 1,071,105
Trainable params: 1,071,105
Non-trainable params: 0
_________________________________________________________________


In [98]:
# Fit Model
model.fit(X_modified, Y_modified, epochs=10, batch_size=100)

filename = '100_0.2_100_0.2_e10_b100.h5'
model.save_weights(filename)

Epoch 1/10

KeyboardInterrupt: 

In [90]:
# Generate texts with last saved fit
model.load_weights(filename)

string_mapped = X[99]
full_string = [n_to_word[value] for value in string_mapped]

# Generation
for i in range(100):
    x = np.reshape(string_mapped, (1, len(string_mapped), 1))
    x = x / float(len(words))
    
    pred_index = np.argmax(model.predict(x, verbose = 0))
    seq = [n_to_word[value] for value in string_mapped]
    full_string.append(n_to_word[pred_index])
    
    string_mapped.append(pred_index)
    string_mapped = string_mapped[1:len(string_mapped)]
    
# Combine generated text and output
txt = ""
for word in full_string[100:]:
    txt = txt+' '+word
print(txt)

 the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
