# Imports

In [35]:
import re # text parsing and clean up
import nltk # data resources and cleaning tools
from nltk.corpus import stopwords
from nltk.corpus import brown
import pandas as pd # tbd tabular data manipulation
from collections import Counter # counting word frequency
import os # interacting with directories and file names
import numpy as np
import random
import pickle
from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop
nltk.download('stopwords')
nltk.download('brown')
stop_words = set(stopwords.words('english'))
brown_corpus = brown.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\p0pp1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\p0pp1\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


# ETL

In [None]:
txts = os.listdir('texts/') 
# gets all files and subdirectories in the text directory,
#if there were subdirectories or files that can't be interperted as str we would error

In [None]:
# load text
words = []
punctuation_edges = re.compile(r'[^\w\s]\B|\B[^\w\s]\b')
inbetween = re.compile(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK[\S\s]*?\*\*\* S.*')
start_marker = re.compile(r'The Project Gutenberg eBook of[\S\s]*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*')
end_marker = re.compile(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK[\s\S]*?eBooks\.')
all_text = ''
for txt in txts: # loop through all files in our texts directory
    with open(f'texts/{txt}','r', encoding='utf-8') as file:
        # clean the data
        all_text += ' ' + file.read()
all_text = inbetween.sub('', all_text)
all_text = start_marker.sub('', all_text, count=1) # finds and removes the meta data from the text
all_text = end_marker.sub('', all_text, count=1)
all_text = all_text.lower() # all lowercase for uniformity and to count Bee and bee as the same word.
all_text = punctuation_edges.sub('', all_text) # Removes punctuation on the edges of a word
words = all_text.split() # makes the string into a list of words

In [None]:
filtered_words = [w for w in words if w not in stop_words and len(w)>2]

In [None]:
selected_texts_length = len(filtered_words)
selected_texts_length

In [None]:
selected_texts_counter = Counter(filtered_words)
selected_texts_counter

In [26]:
brown_corpus_joined = ' '.join(brown_corpus)
brown_corpus_joined = brown_corpus_joined.lower()
brown_corpus = brown_corpus_joined.split()
filtered_brown_corpus = [word for word in brown_corpus if word not in stop_words and len(word)>2]

# Descriptive

In [27]:
brown_counter = Counter(filtered_brown_corpus)
brown_length = len(filtered_brown_corpus)
brown_length

528829

In [None]:
brown_df = pd.DataFrame({'word':brown_counter.keys(),'count':brown_counter.values()})

In [None]:
selected_texts_df = pd.DataFrame({'word':selected_texts_counter.keys(),'count':selected_texts_counter.values()})

In [13]:
brown_df['percent'] = brown_df['count']/brown_length * 100
brown_df

Unnamed: 0,word,count,percent
0,fulton,17,0.003215
1,county,155,0.029310
2,grand,48,0.009077
3,jury,67,0.012670
4,said,1961,0.370819
...,...,...,...
49330,aviary,1,0.000189
49331,olive-flushed,1,0.000189
49332,coral-colored,1,0.000189
49333,boucle,1,0.000189


# ETL

In [63]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(brown_corpus_joined[:100000])

In [64]:
unique_tokens = np.unique(tokens)
unique_token_index = {token: idx for idx, token in enumerate(unique_tokens)}

# Prediction

In [65]:
n_words = 10
input_words = []
next_words = []
for i in range(len(tokens)-n_words):
    input_words.append(tokens[i:i+n_words])
    next_words.append(tokens[i+n_words])

In [66]:
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

In [67]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_words[i]]] = 1

In [68]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation('softmax'))

In [69]:
model.compile(loss="categorical_crossentropy", optimizer=RMSprop(learning_rate=0.01), metrics=["accuracy"])
model.fit(X, y, batch_size=128, epochs=30, shuffle=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x2d26a5c9a50>

In [70]:
model.save('mymodel.h5')

  saving_api.save_model(


In [71]:
model = load_model("mymodel.h5")

In [72]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i, unique_token_index[word]] = 1
    predictions = model.predict(X)[0]
    return np.argpartition(predictions, n_best)[-n_best:]

In [83]:
possible = predict_next_word("The details of john early life, as frankly set down in 'Up from Slavery'", 5)

KeyError: 'life,'

In [78]:
print([unique_tokens[idx] for idx in possible])

['1920', '15', 'youth', '13th', 'karns']


In [84]:
import tensorflow as tf
print('Num GPUs Available: ', len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0
