<a href="https://colab.research.google.com/github/AAdewunmi/Next-Word-Prediction-Project/blob/main/Predict_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Words!

In [48]:
# Import libraries

import string
import nltk
import re
from nltk.corpus import stopwords
import pkg_resources
import pickle
import json
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize

In [49]:
# Import libraries

import numpy
from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [50]:
# Add utility functions for document handling and sample data directory detection

from pathlib import Path
from typing import List
import string

def _detect_sample_dir() -> Path:
    """
    Return a usable /sample_data/ directory for the current environment.

    Priority:
      1) /sample_data (if it exists)
      2) /content/sample_data (Colab default)
      3) Create /sample_data if neither exists (for local/Jupyter)
    """
    candidates = [Path("/sample_data"), Path("/content/sample_data")]
    for d in candidates:
        if d.exists():
            return d
    # Create a local /sample_data if nothing exists
    d = Path("/sample_data")
    d.mkdir(parents=True, exist_ok=True)
    return d

SAMPLE_DIR = _detect_sample_dir()

def load_doc(filename: str) -> str:
    """
    Read the entire contents of a text file.

    Args:
        filename: Path to the text file (absolute or relative).

    Returns:
        File contents as a single string.
    """
    path = Path(filename)
    with path.open("r", encoding="utf-8") as f:
        return f.read()

def clean_doc(doc: str) -> List[str]:
    """
    Convert raw document text into cleaned, lowercased, alphabetic tokens.

    Steps:
      1) Replace double hyphens with a space.
      2) Split on whitespace.
      3) Remove ASCII punctuation from each token.
      4) Keep only purely alphabetic tokens (isalpha()).
      5) Lowercase all tokens.

    Args:
        doc: Raw document text.

    Returns:
        List of cleaned tokens.
    """
    doc = doc.replace("--", " ")
    table = str.maketrans("", "", string.punctuation)
    tokens = [w.translate(table) for w in doc.split()]
    tokens = [w.lower() for w in tokens if w.isalpha()]
    return tokens

def save_doc(lines: List[str], filename: str) -> None:
    """
    Save a list of strings to disk, one per line.

    Args:
        lines: Strings to write (e.g., tokens).
        filename: Output file path to write.
    """
    path = Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        f.write("\n".join(lines))



In [51]:
# Add script to tokenize text file from sample_data and save tokens

# Point to files in /sample_data (or /content/sample_data if that's what exists)
INPUT_FILE = (SAMPLE_DIR / "republic.txt").as_posix()
OUTPUT_FILE = (SAMPLE_DIR / "republic-tokenised.txt").as_posix()

# Execute the pipeline
text = load_doc(INPUT_FILE)
tokens = clean_doc(text)
save_doc(tokens, OUTPUT_FILE)

print(f"Read from: {INPUT_FILE}")
print(f"Wrote  to: {OUTPUT_FILE}")
print(f"Sample tokens: {tokens[:25]}")
print(f"Total tokens: {len(tokens):,}")



Read from: /content/sample_data/republic.txt
Wrote  to: /content/sample_data/republic-tokenised.txt
Sample tokens: ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other']
Total tokens: 209,695


In [52]:
# Add sanity checks for document processing pipeline

# Minimal assertions to catch common regressions
assert isinstance(text, str) and len(text) > 0, "Input text is empty."
assert all(isinstance(t, str) for t in tokens), "Tokens must be strings."
assert all(t.isalpha() for t in tokens), "Non-alphabetic tokens slipped through."
assert Path(OUTPUT_FILE).exists(), "Output file was not written."

# Round-trip check on saver/loader behavior
tmp_out = (SAMPLE_DIR / "_tmp_tokens.txt").as_posix()
save_doc(["A", "b", "c"], tmp_out)
reloaded = load_doc(tmp_out).splitlines()
assert reloaded == ["A", "b", "c"], "save_doc/load_doc round-trip failed."
Path(tmp_out).unlink(missing_ok=True)

print("Sanity checks passed.")


Sanity checks passed.


In [53]:
# Text I/O and Cleaning Utilities
# ========================================

# Defines reusable helpers for:
  # • File I/O for plain-text corpora (`load_doc`, `save_doc`)
  # • Document tokenization for large files (`clean_doc`)
  # • Social-text normalization for short messages (tweets/posts) via
    # explicit, testable steps (`strip_html`, `strip_urls`, `strip_emails`,
    # `keep_letters_only`, `remove_roman_numerals`, `normalize_whitespace`,
    # high-level `clean_social_text`, corpus-level `clean_social_corpus`,
    # and a simple whitespace tokenizer `tokenize_simple`)
  # • Environment detection for a writable sample-data directory (`SAMPLE_DIR`)
    # supporting both local Jupyter and Google Colab patterns.

# --- Social text cleaning helpers (fits alongside load_doc / clean_doc / save_doc) ---
import re
from typing import List, Iterable

# Optional progress bar; falls back to a no-op if tqdm isn't available
try:
    from tqdm.auto import tqdm  # type: ignore
except Exception:  # pragma: no cover
    def tqdm(x):  # type: ignore
        return x

# Pre-compile patterns once
_HTML_TAGS_RE   = re.compile(r"<.*?>")
_URL_RE         = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
_EMAIL_RE = re.compile(
    r'\b(?:mailto:)?(?:at\s+)?[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    flags=re.IGNORECASE,
)
_NON_LETTERS_RE = re.compile(r"[^A-Za-z]+")   # ASCII letters only; see note below
_ROMAN_RE       = re.compile(r"\b[MDCLXVI]+\b\.?", flags=re.IGNORECASE)
_WW_RE          = re.compile(r"ww+", flags=re.IGNORECASE)  # catch stray 'www' fragments
_WS_RE          = re.compile(r"\s+")

def strip_html(text: str) -> str:
    """
    Remove HTML tags from text.
    """
    return _HTML_TAGS_RE.sub("", text)

def strip_urls(text: str) -> str:
    """
    Remove URLs (http[s]:// and bare www.*) from text.
    """
    return _URL_RE.sub("", text)

def strip_emails(text: str) -> str:
    """
    Remove email addresses and a preceding 'at ' if present.

    Examples:
        'Email me at jane@x.com'  -> 'Email me'
        'Contact jane@x.com now'  -> 'Contact now'
    """
    return _EMAIL_RE.sub("", text)


def remove_roman_numerals(text: str) -> str:
    """
    Remove standalone Roman numerals (I, IV, XIV, etc.), optionally with trailing period.
    """
    return _ROMAN_RE.sub("", text)


def keep_letters_only(text: str) -> str:
    """
    Replace any non-letter character with a space (A–Z only).
    Note: this strips digits, punctuation, emojis, and diacritics.
    """
    return _NON_LETTERS_RE.sub(" ", text)

def normalize_whitespace(text: str) -> str:
    """
    Collapse multiple spaces/newlines to a single space and trim edges.
    """
    return _WS_RE.sub(" ", text).strip()

def clean_social_text(text: str, *, letters_only: bool = True, lowercase: bool = True) -> str:
    """
    Clean a single social post/message.

    Pipeline:
      1) Strip HTML tags
      2) Remove URLs
      3) Remove emails
      4) (Optional) keep only letters (A-Z), replacing others with spaces
      5) Lowercase
      6) Remove 'www' fragments and standalone Roman numerals
      7) Normalize whitespace

    Args:
        text: Raw input text.
        letters_only: If True, drop non-letters (digits, punctuation, emojis).
        lowercase: If True, lowercase the text.

    Returns:
        Cleaned text as a single string.
    """
    if text is None:
        return ""

    x = strip_html(text)
    x = strip_urls(x)
    x = strip_emails(x)
    if letters_only:
        x = keep_letters_only(x)
    if lowercase:
        x = x.lower()
    # Misc cleanups mirroring your original intent
    x = _WW_RE.sub("", x)           # remove leftover www/ww fragments
    x = remove_roman_numerals(x)    # drop roman numerals like 'XIV'
    x = normalize_whitespace(x)
    return x

def tokenize_simple(text: str) -> List[str]:
    """
    Basic whitespace tokenizer for already-cleaned text.
    """
    if not text:
        return []
    return text.split()

def clean_social_corpus(
    texts: Iterable[str],
    *,
    to_tokens: bool = False,
    show_progress: bool = True,
    letters_only: bool = True,
    lowercase: bool = True,
) -> List[List[str]] | List[str]:
    """
    Clean a collection of social texts and optionally tokenize.

    Args:
        texts: Iterable of raw texts (e.g., tweets, comments).
        to_tokens: If True, return List[List[str]] (tokens per text). If False, return cleaned strings.
        show_progress: If True, show a progress bar when tqdm is available.
        letters_only: Keep only letters (A-Z) before tokenization.
        lowercase: Lowercase text before tokenization.

    Returns:
        If to_tokens is False: List[str] of cleaned strings.
        If to_tokens is True:  List[List[str]] of tokenized strings per input text.
    """
    it = tqdm(texts) if show_progress else texts
    if to_tokens:
        return [tokenize_simple(clean_social_text(t, letters_only=letters_only, lowercase=lowercase)) for t in it]
    else:
        return [clean_social_text(t, letters_only=letters_only, lowercase=lowercase) for t in it]



In [54]:
# Example corpus (replace with your own list of tweets/messages)

raw_texts = [
    "<p>Check this out: https://example.com GREAT DEAL!!!</p>",
    "Email me at John.Doe@example.org or visit www.mysite.org",
    "We met on XIV. It was fun :)",
    "Hello—World! New\nline\tand\ttabs.",
]

cleaned = clean_social_corpus(raw_texts, to_tokens=False, show_progress=False)
tokenized = clean_social_corpus(raw_texts, to_tokens=True, show_progress=False)

print("Cleaned strings:")
for s in cleaned:
    print("  ", s)

print("\nTokenized (per text):")
for toks in tokenized:
    print("  ", toks)


Cleaned strings:
   check this out great deal
   email me or visit
   we met on it was fun
   hello world new line and tabs

Tokenized (per text):
   ['check', 'this', 'out', 'great', 'deal']
   ['email', 'me', 'or', 'visit']
   ['we', 'met', 'on', 'it', 'was', 'fun']
   ['hello', 'world', 'new', 'line', 'and', 'tabs']


In [55]:
# Sanity tests to catch regressions quickly

def _assert_equal(a, b, msg=""):
    assert a == b, f"{msg}\nExpected: {b}\nActual:   {a}"

# 1) URL & HTML stripping
sample1 = "<b>Deal</b> at https://x.y/z and www.foo.com"
out1 = clean_social_text(sample1)
_assert_equal(out1, "deal at and", "URL/HTML removal failed")

# 2) Email removal
sample2 = "Contact a@b.co now! or A.B-c_d@domain.io later."
out2 = clean_social_text(sample2)
_assert_equal(out2, "contact now or later", "Email removal failed")

# 3) Roman numerals dropping
sample3 = "This is Chapter XIV. And Section vi."
out3 = clean_social_text(sample3)
_assert_equal(out3, "this is chapter and section", "Roman numeral removal failed")

# 4) Letters-only + whitespace normalization
sample4 = "Hello—World! New\nline\tand\ttabs. #hashtag 123"
out4 = clean_social_text(sample4)
_assert_equal(out4, "hello world new line and tabs hashtag", "Letters-only/whitespace failed")

# 5) Corpus path (clean strings)
raws = ["Email me: joe@x.com", "Visit <i>www.example.com</i> TODAY!!"]
cleaned = clean_social_corpus(raws, to_tokens=False, show_progress=False)
_assert_equal(cleaned, ["email me", "visit today"], "Corpus cleaning failed")

# 6) Corpus path (tokens)
tokenized = clean_social_corpus(raws, to_tokens=True, show_progress=False)
_assert_equal(tokenized, [["email", "me"], ["visit", "today"]], "Corpus tokenization failed")

print("All social-text cleaning tests passed.")


All social-text cleaning tests passed.


In [56]:
# Implement littleCleaning function to filter sentences by length

def littleCleaning(sentences):
    print("Starting cleaning Process")
    ret_list = []
    for sentence in sentences:
      words = sentence.split(" ")
      if len(words) > 5:
        ret_list.append(sentence)
      else:
        continue
    return ret_list

In [57]:
# Download necessary NLTK data files (wordnet, punkt)

nltk.download('wordnet')

nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [58]:
# Load and preprocess 'republic.txt' corpus

path = '/content/sample_data/republic.txt'
text = open(path).read().lower()
print('length of the corpus is: :', len(text))

length of the corpus is: : 1174387


In [59]:
# Converting the data into lists

data_list = text.split(".")
data_list[:20]

['the project gutenberg ebook of the republic, by plato\n\nthis ebook is for the use of anyone anywhere in the united states and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever',
 ' you may copy it, give it away or re-use it under the terms\nof the project gutenberg license included with this ebook or online at\nwww',
 'gutenberg',
 'org',
 ' if you are not located in the united states, you\nwill have to check the laws of the country where you are located before\nusing this ebook',
 '\n\ntitle: the republic\n\nauthor: plato\n\ntranslator: b',
 ' jowett\n\nrelease date: october, 1998 [ebook #1497]\n[most recently updated: september 11, 2021]\n\nlanguage: english\n\n\nproduced by: sue asscher and david widger\n\n*** start of the project gutenberg ebook the republic ***\n\n\n\n\nthe republic\n\nby plato\n\ntranslated by benjamin jowett\n\nnote: see also “the republic” by plato, jowett, ebook #150\n\n\ncontents\n\n introduction and analysis',
 '\n the 

In [60]:
# --- Normalization pipeline that uses social-text utilities ---

from typing import Callable, Iterable, List, Union

def normalization_pipeline(
    texts: Iterable[str],
    *,
    to_tokens: bool = False,
    postprocess: Callable[[List[str]], List[str]] | None = None,
    show_progress: bool = True,
    letters_only: bool = True,
    lowercase: bool = True,
) -> Union[List[str], List[List[str]]]:
    """
    Normalize a collection of short texts using Cell 1 social-text utilities.

    - Uses `clean_social_corpus` for HTML/URL/email stripping, letters-only, lowercasing,
      roman-numeral removal, and whitespace normalization.
    - Returns strings by default (`to_tokens=False`) or tokens (`to_tokens=True`).
    - Will only apply `postprocess` if you pass it explicitly.
    """
    print("Starting Normalization Process")
    cleaned_or_tokens = clean_social_corpus(
        texts,
        to_tokens=to_tokens,
        show_progress=show_progress,
        letters_only=letters_only,
        lowercase=lowercase,
    )

    # Only apply postprocess if explicitly provided
    if callable(postprocess):
        if to_tokens:
            # If your postprocess expects strings, join first.
            try:
                joined = [" ".join(toks) for toks in cleaned_or_tokens]  # type: ignore[arg-type]
                maybe = postprocess(joined)
                cleaned_or_tokens = maybe if maybe is not None else joined  # type: ignore[assignment]
            except Exception as e:
                raise TypeError(
                    "Postprocess failed on tokenized data. "
                    "Provide a postprocess that accepts List[List[str]] or join tokens yourself."
                ) from e
        else:
            maybe = postprocess(cleaned_or_tokens)  # type: ignore[arg-type]
            # Guard against in-place functions that return None
            if maybe is not None:
                cleaned_or_tokens = maybe  # type: ignore[assignment]

    print("Normalization Process Finished")
    return cleaned_or_tokens



In [61]:
# pro_sentences: list of cleaned strings (default, matches your previous pipeline)

pro_sentences = normalization_pipeline(
    data_list,         # your existing list of raw texts
    to_tokens=False,   # keep strings to stay compatible with littleCleaning
    show_progress=False
)

pro_sentences[:5]

Starting Normalization Process
Normalization Process Finished


['the project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever',
 'you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at',
 'gutenberg',
 'org',
 'if you are not located in the united states you will have to check the laws of the country where you are located before using this ebook']

In [62]:
# Tokenize and preprocess data list

pro_tokens = normalization_pipeline(
    data_list,
    to_tokens=True,    # returns List[List[str]]
    show_progress=False
)
pro_tokens[:2]

Starting Normalization Process
Normalization Process Finished


[['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'the',
  'republic',
  'by',
  'plato',
  'this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'in',
  'the',
  'united',
  'states',
  'and',
  'most',
  'other',
  'parts',
  'of',
  'the',
  'world',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions',
  'whatsoever'],
 ['you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or',
  're',
  'use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included',
  'with',
  'this',
  'ebook',
  'or',
  'online',
  'at']]

In [63]:
# Add unit tests for normalization_pipeline

def _assert_equal(a, b, msg=""):
    assert a == b, f"{msg}\nExpected: {b}\nActual:   {a}"

_demo = [
    "<b>Deal</b> at https://x.y/z and www.foo.com #promo",
    "Email me at Jane.Doe@example.org ASAP — thanks!",
]

# Strings out
out = normalization_pipeline(_demo, to_tokens=False, show_progress=False)
_assert_equal(out, ["deal at and promo", "email me asap thanks"], "String normalization failed")

# Tokens out
out_tok = normalization_pipeline(_demo, to_tokens=True, show_progress=False)
_assert_equal(out_tok, [["deal", "at", "and", "promo"], ["email", "me", "asap", "thanks"]], "Token normalization failed")

print("Normalization pipeline tests passed.")





Starting Normalization Process
Normalization Process Finished
Starting Normalization Process
Normalization Process Finished
Normalization pipeline tests passed.


In [64]:
# Check processed sentence count

len(pro_sentences)

7012

In [65]:
# Structuring the text into a paragraph

dataText = "".join(pro_sentences[: 700])
dataText[: 200]

'the project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions what'

In [66]:
# turn a doc into clean tokens

def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [67]:
# Tokenize and analyze corpus statistics

tokens = clean_doc(dataText)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoeveryou', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'atgutenbergorgif', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'you', 'will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebooktitle', 'the', 'republic', 'author', 'plato', 'translator', 'bjowett', 'release', 'date', 'october', 'ebook', 'most', 'recently', 'updated', 'september', 'language', 'english', 'produced', 'by', 'sue', 'asscher', 'and', 'david', 'widger', 'start', 'of'

In [68]:
# Implement sequence creation for language modeling

length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 18332


In [69]:
# Implement utility function save_doc for writing sequences to file

def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
out_filename = '/content/sample_data/republic_sequences.txt'
save_doc(sequences, out_filename)

In [70]:
# Implement data preparation and tokenization pipeline.

# Consolidated Imports

import numpy
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential # Use tensorflow namespace
from tensorflow.keras.layers import Dense, LSTM, Embedding # Use tensorflow namespace

in_filename = '/content/sample_data/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [71]:
# Define and train Recurrent Neural Network (RNN) using Long Short-Term Memory (LSTM)

# Consolidated Imports

import numpy
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential # Use tensorflow namespace
from tensorflow.keras.layers import Dense, LSTM, Embedding # Use tensorflow namespace

# Define model
model = Sequential()

model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

# The input_shape is (batch_size, sequence_length), where None allows any batch size.
model.build(input_shape=(None, seq_length))

# This will now display the parameter counts.
print(model.summary())

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit model
batch_size=128
epochs=50
model.fit(X, y, batch_size=batch_size, epochs=epochs)

None
Epoch 1/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 143ms/step - accuracy: 0.0439 - loss: 7.3374
Epoch 2/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 108ms/step - accuracy: 0.0856 - loss: 6.2013
Epoch 3/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 107ms/step - accuracy: 0.0858 - loss: 6.0859
Epoch 4/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 108ms/step - accuracy: 0.1142 - loss: 5.9900
Epoch 5/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 108ms/step - accuracy: 0.1210 - loss: 5.7933
Epoch 6/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 118ms/step - accuracy: 0.1282 - loss: 5.6929
Epoch 7/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 109ms/step - accuracy: 0.1261 - loss: 5.6631
Epoch 8/50
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 109ms/step - accuracy: 0.1374 - loss: 5.5486
Epoch 9/50


<keras.src.callbacks.history.History at 0x7a059d5592e0>

In [72]:
# Add functionality to persist next word prediction model and tokenizer

# save the model to file

model.save('/content/sample_data/models/nexWordPredict/nextWord.keras')

# save the tokenizer

dump(tokenizer, open('/content/sample_data/models/tokenizer.pkl', 'wb'))

In [73]:
# Implement text generation function using a trained Keras model

import numpy as np

def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        # yhat = model.predict_classes(encoded, verbose=0)
        predict_x=model.predict(encoded)
        yhat=np.argmax(predict_x,axis=1)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)