<a href="https://colab.research.google.com/github/AAdewunmi/Next-Word-Prediction-Project/blob/main/Predict_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Words!

In [None]:
# Import libraries

import string
import nltk
import re
from nltk.corpus import stopwords
import pkg_resources
import pickle
import json
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize

In [None]:
# Install tensorflow

!pip install tensorflow



In [None]:
# Import libraries

from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Add utility functions for document handling and sample data directory detection

from pathlib import Path
from typing import List
import string

def _detect_sample_dir() -> Path:
    """
    Return a usable sample_data directory for the current environment.

    Priority:
      1) /sample_data (if it exists as a directory)
      2) /content/sample_data (Colab default)
      3) Create /sample_data if neither exists
      4) Create ./sample_data under the current working directory when
         /sample_data cannot be created (e.g. no write access to the
         filesystem root on a local/Jupyter machine)

    Returns:
        Path to an existing directory.

    Raises:
        OSError: If the working-directory fallback cannot be created either.
    """
    candidates = [Path("/sample_data"), Path("/content/sample_data")]
    for d in candidates:
        # is_dir() rather than exists(): a regular file with this name
        # cannot hold the input/output files.
        if d.is_dir():
            return d

    d = Path("/sample_data")
    try:
        d.mkdir(parents=True, exist_ok=True)
    except OSError:
        # Unprivileged users usually cannot write to "/", so fall back to
        # a directory next to the notebook instead of failing the cell.
        d = Path.cwd() / "sample_data"
        d.mkdir(parents=True, exist_ok=True)
    return d


SAMPLE_DIR = _detect_sample_dir()

def load_doc(filename: str) -> str:
    """
    Load a UTF-8 text file and return everything in it.

    Args:
        filename: Location of the file to read (absolute or relative).

    Returns:
        The complete file contents as one string.
    """
    # read_text opens the file in text mode, reads it fully and closes it.
    return Path(filename).read_text(encoding="utf-8")

def clean_doc(doc: str) -> List[str]:
    """
    Turn raw text into a list of lowercase, letters-only tokens.

    Processing order:
      1) Every "--" becomes a single space.
      2) The text is split on whitespace.
      3) ASCII punctuation characters are deleted from each piece.
      4) Pieces that are not purely alphabetic (isalpha()) are discarded.
      5) Surviving pieces are lowercased.

    Args:
        doc: Raw document text.

    Returns:
        The cleaned tokens, in document order.
    """
    strip_punct = str.maketrans("", "", string.punctuation)
    cleaned: List[str] = []
    for raw in doc.replace("--", " ").split():
        word = raw.translate(strip_punct)
        # Rejects digits, mixed alphanumerics and pieces that became empty.
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

def save_doc(lines: List[str], filename: str) -> None:
    """
    Write strings to a UTF-8 text file, separated by newlines.

    Missing parent directories are created first. No trailing newline is
    written after the last string.

    Args:
        lines: Strings to write (e.g., tokens).
        filename: Destination file path.
    """
    target = Path(filename)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("\n".join(lines), encoding="utf-8")



In [None]:
# Add script to tokenize text file from sample_data and save tokens

# Input and output both live in the detected sample directory
INPUT_FILE = SAMPLE_DIR.joinpath("republic.txt").as_posix()
OUTPUT_FILE = SAMPLE_DIR.joinpath("republic-tokenised.txt").as_posix()

# Load -> clean -> save
text = load_doc(INPUT_FILE)
tokens = clean_doc(text)
save_doc(tokens, OUTPUT_FILE)

# Short report: paths used, a preview of the tokens, and the total count
for report_line in (
    f"Read from: {INPUT_FILE}",
    f"Wrote  to: {OUTPUT_FILE}",
    f"Sample tokens: {tokens[:25]}",
    f"Total tokens: {len(tokens):,}",
):
    print(report_line)



Read from: /content/sample_data/republic.txt
Wrote  to: /content/sample_data/republic-tokenised.txt
Sample tokens: ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other']
Total tokens: 209,695


In [None]:
# Add sanity checks for document processing pipeline

# Minimal assertions to catch common regressions in the cells above
assert isinstance(text, str) and len(text) > 0, "Input text is empty."
assert all(isinstance(t, str) for t in tokens), "Tokens must be strings."
assert all(t.isalpha() for t in tokens), "Non-alphabetic tokens slipped through."
assert Path(OUTPUT_FILE).exists(), "Output file was not written."

# Round-trip check on saver/loader behavior, using a scratch file
tmp_out = (SAMPLE_DIR / "_tmp_tokens.txt").as_posix()
try:
    save_doc(["A", "b", "c"], tmp_out)
    reloaded = load_doc(tmp_out).splitlines()
    assert reloaded == ["A", "b", "c"], "save_doc/load_doc round-trip failed."
finally:
    # Remove the scratch file even when the check fails, so a failed run
    # does not leave _tmp_tokens.txt behind in the sample directory.
    Path(tmp_out).unlink(missing_ok=True)

print("Sanity checks passed.")


Sanity checks passed.


In [None]:
# Text cleaning and preprocessing function for Twitter data

def tokenize_twitter(sentences):
    """
    Clean each sentence and collect the cleaned results.

    Each sentence is passed through cleanhtml, _replace_urls and
    remove_email; then every character outside a-z/A-Z is replaced by a
    space, the text is lowercased, and misc is applied last.

    Note: despite the name, no word-level splitting happens here. Each
    output entry is whatever misc returns for that sentence (presumably
    the cleaned string; confirm against the helper definitions).

    Args:
        sentences: Iterable of strings

    Returns:
        List with one cleaned entry per input sentence, in input order
    """
    print("Starting Cleaning Process")
    non_letters = re.compile(r'[^a-zA-Z]')
    cleaned_sentences = []
    for raw in tqdm(sentences):
        # Markup, URL and e-mail handling run on the original text,
        # before the letters-only filter changes it.
        stripped = remove_email(_replace_urls(cleanhtml(raw)))
        letters_only = non_letters.sub(' ', stripped).lower()
        cleaned_sentences.append(misc(letters_only))

    return cleaned_sentences