In [1]:
# imports

from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding, SimpleRNN
import numpy as np

In [2]:
# data preparation

lines = open("data100.csv").readlines()
data = []

# tokenize strings
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=None,
    filters=""" .,?!#$%&()*+-<=>@[\\]^_`{|}~\t\n'"/ """,
    lower=False,
    split=" ",
)
tokenizer.fit_on_texts(lines)
for line in lines:
    line = line.split(" ")
    data.append(tokenizer.texts_to_sequences(line))


# split into groups of 50 words
features = []  # the 48 words that the ai is trained on
labels = []  # the next word that they ai should guess
for text in data:
    for i in range(50, len(text), 50):
        words = []
        for i in text[i - 50 : i]:
            try:
                words.append((i[0]-1)/len(tokenizer.word_index))
            except IndexError:
                words.append(0)

        features.append(words[:48])
        labels.append(words[49])
train_features = np.array(features[: int(len(features) * 0.8)])
train_labels = np.array(labels[: int(len(features) * 0.8)])
test_features = np.array(features[int(len(features) * 0.8) :])
test_labels = np.array(labels[int(len(features) * 0.8) :])

# one-hot encoding
# train_labels = keras.utils.to_categorical(train_labels)
# test_labels = keras.utils.to_categorical(test_labels)

print(f"Number of data points: {len(labels)}")

Number of data points: 8009


In [3]:
# model

model = Sequential()

# input layer
model.add(
    Embedding(
        input_dim=1,
        input_length=48,
        output_dim=100,
        trainable=False,
        mask_zero=True,
    )
)

# processing layers
model.add(LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)) # 
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1, activation="softmax"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 48, 100)           100       
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 46565 (181.89 KB)
Trainable params: 46465 (181.50 KB)
Non-trainable params: 100 (400.00 Byte)
_________________________________________________________________


In [4]:
# train model

model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

model.fit(
    train_features,
    train_labels,
    validation_data=(test_features, test_labels),
    epochs=150,
    verbose=2,
    shuffle=True,
)

scores = model.evaluate(test_features, test_labels,verbose=0) 
print('Test accuracy:', scores[1]) 

Epoch 1/150
201/201 - 8s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 8s/epoch - 41ms/step
Epoch 2/150
201/201 - 7s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 7s/epoch - 33ms/step
Epoch 3/150
201/201 - 7s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 7s/epoch - 35ms/step
Epoch 4/150
201/201 - 9s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 9s/epoch - 45ms/step
Epoch 5/150
201/201 - 9s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 9s/epoch - 46ms/step
Epoch 6/150
201/201 - 6s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 6s/epoch - 30ms/step
Epoch 7/150
201/201 - 6s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0.8722 - val_accuracy: 0.0000e+00 - 6s/epoch - 30ms/step
Epoch 8/150
201/201 - 6s - loss: 0.8770 - accuracy: 0.0000e+00 - val_loss: 0

In [125]:
# save model

model.save("rnn.keras")