In [1]:
import random
import pickle
import os
import csv

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.saving import load_model

2023-08-14 12:02:55.699863: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-14 12:02:55.701168: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-14 12:02:55.727613: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-14 12:02:55.728181: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Global variables
# Number of consecutive tokens in each model input window; it fixes the
# time-step dimension of the training tensor and of the first LSTM layer.
inputSentenceLength = 10
# Upper bound used when slicing the joined corpus string. Because the slice is
# taken on a string, this counts characters, not words or sentences.
trainingSetSize = 10000

In [3]:
# The corpus is looked up relative to the current working directory, so the
# notebook must be started from the folder that contains TrainingData/.
path = os.path.join(os.getcwd(), "TrainingData/eng_sentences.tsv")
# Tab-separated file without a header row. QUOTE_NONE makes the parser treat
# quote characters as ordinary text instead of field delimiters.
text_raw = pd.read_csv(path,sep='\t',header=None, quoting=csv.QUOTE_NONE)
# Concatenate every value of column 2 into one space-separated string
# (presumably the sentence text column - confirm against the file layout).
joined_text = " ".join(text_raw[2])

In [4]:
# Work on a prefix of the corpus only. The slice is character-based, so the
# final token of the prefix may be a truncated word.
partial_text = joined_text[:trainingSetSize]

# Token pattern, tried in order: a run of word characters, a dollar amount,
# or any other run of non-whitespace characters.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

# Case is deliberately preserved: no lower-casing is applied to the tokens.
allWords = tokenizer.tokenize(partial_text)

# Sorted array of the distinct tokens; a token's position in this array is
# its vocabulary index.
uniqueWords = np.unique(allWords)

# Reverse lookup table: token -> position of that token in uniqueWords.
uniqueWordsIndex = dict(zip(uniqueWords, range(len(uniqueWords))))

In [5]:
# Training Data Population.

# Slide a fixed-size window over the token stream: every run of
# inputSentenceLength consecutive tokens is one training example, and the
# token that immediately follows the run is its label.
window_starts = range(len(allWords) - inputSentenceLength)
input_sentences = [
    allWords[start:start + inputSentenceLength] for start in window_starts
]
next_words = [allWords[start + inputSentenceLength] for start in window_starts]

# One-hot tensors.
#   modelInput[example, position, vocab_index]  -> token at that window position
#   modelOutput[example, vocab_index]           -> token that follows the window
modelInput = np.zeros(
    shape=(len(input_sentences), inputSentenceLength, len(uniqueWords)),
    dtype=bool,
)
modelOutput = np.zeros(shape=(len(next_words), len(uniqueWords)), dtype=bool)

for row, (window, target) in enumerate(zip(input_sentences, next_words)):
    for position, token in enumerate(window):
        modelInput[row, position, uniqueWordsIndex[token]] = 1
    modelOutput[row, uniqueWordsIndex[target]] = 1

#TODO change this to predict between questions and answers later

In [6]:
# Model creation
# Two stacked LSTM layers followed by a softmax over the vocabulary. The first
# LSTM returns its full output sequence so the second LSTM receives one vector
# per time step; the second returns only its final state.
model = Sequential([
    LSTM(
        128,
        input_shape=(inputSentenceLength, len(uniqueWords)),
        return_sequences=True,
    ),
    LSTM(128),
    Dense(len(uniqueWords)),
    Activation("softmax"),
])

In [7]:
# Model Variables
# Number of training examples per gradient update (passed to model.fit).
modelBatchSize = 128
# Number of full passes over the training tensors (passed to model.fit).
modelEpochs = 10

In [8]:
# Model execution
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Kept as the last expression of the cell so the returned History object is
# displayed as the cell result.
model.fit(
    modelInput,
    modelOutput,
    epochs=modelEpochs,
    batch_size=modelBatchSize,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ff51c562fb0>

In [9]:
# Persist the trained model; the relative path resolves against the current
# working directory.
model.save("mymodel.keras")

In [10]:
# Rebind `model` to the copy read back from disk, so the cells below run
# against the saved artefact rather than the in-memory training result.
model = load_model("mymodel.keras")

In [33]:
def predict_next_word(input_text, n_best):
    """Return the vocabulary indices of the ``n_best`` most probable next words.

    Parameters
    ----------
    input_text : str
        Whitespace-separated context words. Word ``i`` is one-hot encoded at
        time step ``i`` of the input window; unused trailing time steps stay
        all-zero.
    n_best : int
        Number of candidates to return. Values larger than the vocabulary
        size are clamped to the vocabulary size.

    Returns
    -------
    numpy.ndarray
        Indices into ``uniqueWords`` of the highest-scoring words, in no
        particular order.

    Raises
    ------
    ValueError
        If ``n_best`` is smaller than 1.
    KeyError
        If a context word is not a key of ``uniqueWordsIndex``.
    IndexError
        If ``input_text`` has more than ``inputSentenceLength`` words.
    """
    if n_best < 1:
        raise ValueError("n_best must be a positive integer")

    predictionInput = np.zeros(shape=(1, inputSentenceLength, len(uniqueWords)))
    for i, word in enumerate(input_text.split()):
        predictionInput[0, i, uniqueWordsIndex[word]] = 1

    # The model emits one probability per vocabulary entry for the single
    # input row; [0] drops the batch dimension.
    predictions = model.predict(predictionInput, verbose=0)[0]

    # Partition around the n_best-th element counted from the END so that the
    # trailing n_best slots hold the largest probabilities. Partitioning at
    # +n_best (as before) only fixes the n_best-th smallest element, leaving
    # the tail slice an arbitrary subset of the larger values.
    n_best = min(n_best, len(predictions))
    return np.argpartition(predictions, -n_best)[-n_best:]

In [36]:
def generate_text(input_text, text_length, creativity=3):
    """Extend ``input_text`` by ``text_length`` generated words.

    Parameters
    ----------
    input_text : str
        Seed text; it is split into tokens with the notebook's ``tokenizer``.
    text_length : int
        Number of words to append to the seed.
    creativity : int, optional
        Size of the candidate pool requested from ``predict_next_word``; the
        next word is drawn uniformly at random from that pool.

    Returns
    -------
    str
        The seed tokens followed by the generated words, joined by spaces.
    """
    word_sequence = tokenizer.tokenize(input_text)
    # The underscore just means we don't care about the loop variable.
    for _ in range(text_length):
        # Always condition on the most recent words, up to the window size.
        # A window anchored at an advancing start offset never grows past the
        # seed length for short seeds and lags behind the end for long ones.
        sub_sequence = " ".join(word_sequence[-inputSentenceLength:])
        try:
            candidates = predict_next_word(sub_sequence, creativity)
            next_word = uniqueWords[random.choice(candidates)]
        except (KeyError, IndexError, ValueError):
            # Best-effort fallback when the context cannot be encoded (e.g. a
            # word outside the vocabulary) or no candidate can be selected:
            # pick any vocabulary word. Other errors are allowed to surface.
            next_word = random.choice(uniqueWords)
        word_sequence.append(next_word)
    return " ".join(word_sequence)

In [37]:
# Optional debugging snippet (disabled): inspect the raw candidate words for a
# single context instead of generating a whole passage.
# attempt = predict_next_word("This is a try",5)
# print ([uniqueWords[idx] for idx in attempt])
# Generate 100 words from the seed, sampling each word from 5 candidates.
generate_text("This is a", 100, 5)


'This is a "A you your !" ! you ! !" !" ! you ! "I "I !" !" ! your your you you "A ! "I "I "I "A your you !" "A you ! "I !" ! !" ! !" your !" your "A ! you "I ! your ! !" you you !" "A !" ! your your "A your your you "A ! you ! !" !" you your !" you ! ! your you you you your you !" "A "A ! ! ! ! !" your "A "A "A you ! your !" you you your "A'