# Model Training

## Imports & Setup

In [None]:
# When started from inside the notebook's "utils" folder, move up one level so
# that later relative lookups start from the repo directory
import os

current_dir = os.getcwd()
if current_dir.endswith("utils"):
    os.chdir(os.pardir)

In [None]:
import glob
import json
import multiprocess
import random
import re
import string
import time

import editdistance
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
from nnsplit import NNSplit
import numpy as np

import tensorflow.keras as keras
from tensorflow.keras import callbacks
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
### Helper functions/variables setup ###

# Locate training.ipynb below the working directory and use its folder as the
# anchor for every other path; fall back to the current folder if no match.
# NOTE(review): iglob is called without recursive=True, so "**" behaves like a
# single "*" (exactly one directory level) — confirm that is intended.
notebook_glob = os.path.join(os.getcwd(), "**", "training.ipynb")
notebook_file = next(glob.iglob(notebook_glob), "./training.ipynb")
notebook_path = os.path.dirname(os.path.realpath(notebook_file))

# Sibling folders of the notebook folder; created on first use
corpus_path = os.path.join(notebook_path, "..", "corpus")
data_path = os.path.join(notebook_path, "..", "data")
for required_dir in (corpus_path, data_path):
    os.makedirs(required_dir, exist_ok=True)

In [None]:
# Load the reference dictionary: one entry per line, stored lower-cased.
# Blank lines are skipped so the empty string never becomes a dictionary
# "word"; it would sort first in dictionary_list and could then be picked as
# the closest match for very short unknown tokens.
with open(os.path.join(data_path, "dictionary.txt")) as f:
    dictionary = {
        entry for entry in (line.strip().lower() for line in f) if entry
    }
# Sorted list form gives nearest-word lookups a deterministic iteration order
dictionary_list = sorted(dictionary)

def clean_word(word):
    """Return the entry of `dictionary_list` closest to `word` by edit distance.

    `word` is lower-cased before comparison, and its original capitalization
    is not restored on the result. When several entries are equally close,
    the one that appears first in `dictionary_list` is returned.
    """
    target = word.lower()

    def distance_to_target(candidate):
        return editdistance.eval(candidate, target)

    return min(dictionary_list, key=distance_to_target)

def clean_worker(queue, words):
    """Clean every word in `words` and report each result on `queue`.

    For each input word, one single-entry dict `{word: cleaned_word}` is put
    on the queue, in input order.
    """
    for original in words:
        cleaned = clean_word(original)
        queue.put({original: cleaned})

def clean_text(text):
    """Replace every out-of-dictionary word in `text` with its closest dictionary entry.

    The text is lower-cased and tokenized with NLTK. Each distinct token that
    contains at least one letter or digit and is not in `dictionary` is mapped
    to a replacement by `clean_worker` processes running in parallel. The
    tokens are then detokenized back into a single string.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # Local import: multiprocess queues signal a timed-out get() with the
    # standard-library queue.Empty exception
    from queue import Empty

    tokens = nltk.word_tokenize(text.lower())

    # Only tokens with a letter or digit are candidates for cleaning. Pure
    # punctuation tokens (including the tokenizer's `` and '' quote tokens,
    # which a substring test against string.punctuation lets through) are
    # left untouched.
    words = {word for word in tokens if any(ch.isalnum() for ch in word)}
    words.difference_update(dictionary)

    words_list = list(words)
    word_cleaned = {}

    if words_list:
        # Get word mappings on multiple cores to speed up the process
        total = len(words_list)
        n_procs = max(1, min(multiprocess.cpu_count() - 1, total))
        results_queue = multiprocess.Queue()
        procs = []
        for i in range(n_procs):
            chunk = words_list[i * total // n_procs : (i + 1) * total // n_procs]
            proc = multiprocess.Process(target=clean_worker, args=(results_queue, chunk))
            procs.append(proc)
            proc.start()

        # Every word produces exactly one queue item, so keep reading until all
        # of them have arrived. The queue must be emptied BEFORE join(): a
        # worker that still has unread items buffered may never terminate.
        pending = total
        while pending > 0:
            try:
                word_cleaned.update(results_queue.get(timeout=1))
                pending -= 1
            except Empty:
                if any(proc.is_alive() for proc in procs):
                    continue
                # All workers are gone but items are missing (a worker must
                # have died early). Collect whatever was flushed at the last
                # moment and stop waiting instead of blocking forever.
                while True:
                    try:
                        word_cleaned.update(results_queue.get_nowait())
                    except Empty:
                        break
                break

        for proc in procs:
            proc.join()

    # Now replace the words that aren't present in the dictionary
    new_tokens = [word_cleaned.get(word, word) for word in tokens]
    return TreebankWordDetokenizer().detokenize(new_tokens)

In [None]:
# Base name of the corpus to train on. The training cells open
# "<corpus_name>.txt" inside `corpus_path`, so give the name without any
# directory or extension.
corpus_name = "sonicsez"

## Text Model Training

**Note**: before running this section, set `corpus_name` in the cell just above this heading to the filename of the corpus you want to train on (without the path and extension).

In [None]:
# Read the raw corpus, trimming surrounding whitespace from every line
corpus_file = os.path.join(corpus_path, f"{corpus_name}.txt")
with open(corpus_file) as f:
    contents = [raw_line.strip() for raw_line in f]

# Replace out-of-dictionary words in the whole corpus
text = clean_text("\n".join(contents))

# Drop stray apostrophes: one at the very start or very end of the text, or
# one with no letter directly on either side. Apostrophes touching a letter
# (contractions, possessives) are kept.
stray_apostrophe = r"(?:^'|(?<![a-z])'(?![a-z])|'$)"
text = re.sub(stray_apostrophe, "", text, flags=re.IGNORECASE)

In [None]:
# Optional: cache the cleaned text next to the corpus, since cleaning is an
# intensive process that is worth not repeating
cleaned_corpus_file = os.path.join(corpus_path, f"{corpus_name}.cleaned.txt")
with open(cleaned_corpus_file, "w") as f:
    f.write(text)

In [None]:
# Optional: on a later rerun, reload the cached cleaned text instead of
# cleaning the corpus again
cleaned_corpus_file = os.path.join(corpus_path, f"{corpus_name}.cleaned.txt")
with open(cleaned_corpus_file) as f:
    text = f.read()

In [None]:
# One corpus entry per line of the text, trimmed of surrounding whitespace
corpus = list(map(str.strip, text.split("\n")))

In [None]:
# Fit the tokenizer on the corpus to build the word <-> index mappings.
# NOTE(review): Tokenizer() is created with its default arguments, so whether
# punctuation ends up in the vocabulary depends on the default `filters`
# setting — confirm against the Keras Tokenizer documentation.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

print(f"Size of vocab: {len(tokenizer.word_index)}")

In [None]:
# Turn every corpus entry into a list of word indices
extracted_sequences = tokenizer.texts_to_sequences(corpus)
max_seq_len = 20

# Concatenate all entries into one long token stream
flattened_sequences = []
for seq in extracted_sequences:
    flattened_sequences.extend(seq)
corpus_len = len(flattened_sequences)

# Stride between window starts: grows with the corpus so that about 50000
# windows are produced for large corpora, and is never smaller than 1
step_size = round(max(1, corpus_len / 50000))

# Slide a window of up to max_seq_len tokens over the stream, keeping only
# windows that hold more than 2 tokens
windows = (
    flattened_sequences[start : start + max_seq_len]
    for start in range(0, corpus_len, step_size)
)
sequences = [window for window in windows if len(window) > 2]

# Number of output classes: one more than the number of known words
# (presumably index 0 is left free for padding — confirm)
vocab_len = len(tokenizer.word_index) + 1
idx_word = tokenizer.index_word

In [None]:
# Left-pad every sequence to max_seq_len so they all share one length and can
# be fed to the RNN as a single array
padded_sequences = np.array(
    pad_sequences(sequences, maxlen=max_seq_len, padding="pre")
)

# The last token of each row is the word to predict; everything before it is
# the model input
predictors = padded_sequences[:, :-1]
labels = padded_sequences[:, -1]

# One-hot encode the target words over the whole vocabulary
labels = utils.to_categorical(labels, num_classes=vocab_len)

In [None]:
# Build the next-word model: embedding -> LSTM -> dropout -> softmax over the vocabulary
seq_len = max_seq_len - 1
hidden_nodes = (seq_len + 1) * 2

model = models.Sequential()
model.add(layers.Embedding(vocab_len, 20, input_length=seq_len))
model.add(layers.LSTM(hidden_nodes))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(vocab_len, activation="softmax"))
model.compile(loss="categorical_crossentropy", metrics=["acc"], optimizer="adam")

# Checkpoint the model to disk whenever the training loss reaches a new minimum
model_path = os.path.join(notebook_path, f"{corpus_name}.model.h5")
checkpoint = callbacks.ModelCheckpoint(
    model_path,
    monitor="loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
model_callbacks = [checkpoint]

# Store the index -> word mapping next to the model so that predictions can be
# decoded after reloading
wordmap_path = os.path.join(notebook_path, f"{corpus_name}.wordmap.json")
with open(wordmap_path, "w") as f:
    json.dump(tokenizer.index_word, f)

In [None]:
# Train the model
def sample(prediction, temperature = 1.0):
    """Draw one class index from a temperature-adjusted probability distribution.

    Args:
        prediction: Array-like of probabilities that is indexed as `[0, :]`,
            e.g. the output of `model.predict` for a single input row.
        temperature: Divisor applied to the log-probabilities before they are
            re-normalized. Values below 1 sharpen the distribution towards its
            most likely entries; values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).
    """
    prediction = np.asarray(prediction, dtype=np.float64)
    # A probability of exactly 0 maps to -inf here and back to 0 after exp(),
    # which is the desired result, so silence NumPy's divide-by-zero warning
    with np.errstate(divide="ignore"):
        scaled = np.log(prediction) / temperature
    # Shift so the largest value is 0 before exponentiating. The normalized
    # result is mathematically unchanged, but at low temperatures this keeps
    # every exp() from underflowing to 0, which would make the division below
    # 0/0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_prediction = np.exp(scaled)
    prediction = exp_prediction / np.sum(exp_prediction)
    # One multinomial trial yields a one-hot row; argmax recovers its index
    probabilities = np.random.multinomial(1, prediction[0,:], 1)
    return np.argmax(probabilities)

EPOCHS = 150
BATCH_SIZE = 256

# Fit the model, holding back 10% of the samples for validation
model.fit(
    predictors,
    labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=model_callbacks,
    validation_split=0.1,
)

print("\nTesting output from training:")

# Seed generation with a randomly chosen training input
seed_idx = random.randrange(len(predictors))
pattern = predictors[seed_idx].tolist()
current_sentence = " ".join(idx_word.get(idx, "") for idx in pattern)

for temperature in (0.2, 0.5, 1.0, 1.2):
    generated_words = []
    current_pattern = list(pattern)

    for _ in range(25):
        # Predict the next word from the current window and sample its index
        input_sequence = np.reshape(current_pattern, (1, len(current_pattern)))
        prediction = model.predict(input_sequence, verbose=0)
        idx = sample(prediction, temperature)
        generated_words.append(idx_word.get(idx, ""))

        # Slide the window: drop the oldest index, append the new one
        current_pattern = current_pattern[1:] + [idx]

    # Every generated word is followed by a single space
    generated_text = "".join(word + " " for word in generated_words)

    print(f"- For temperature {temperature}:")
    print(f"--- Input seed: {current_sentence}")
    print(f"---  Generated: {generated_text}")

print()

In [None]:
# Test different temperatures with sentence segmentation

# Reload the checkpointed model for this corpus
model_path = os.path.join(notebook_path, f"{corpus_name}.model.h5")
model = models.load_model(model_path)

# Reload the index -> word mapping. JSON object keys are always strings, so
# they are converted back to ints here.
wordmap_path = os.path.join(notebook_path, f"{corpus_name}.wordmap.json")
with open(wordmap_path) as f:
    raw_wordmap = json.load(f)
idx_word = {}
for key, val in raw_wordmap.items():
    idx_word[int(key)] = val

# Per-part-of-speech punctuation weights, plus the English sentence splitter
punc_path = os.path.join(notebook_path, "punctuation.probabilities.json")
with open(punc_path) as f:
    punc_probabilities = json.load(f)
sent_splitter = NNSplit.load("en")

def sample(prediction, temperature = 1.0):
    """Draw one class index from a temperature-adjusted probability distribution.

    Args:
        prediction: Array-like of probabilities that is indexed as `[0, :]`,
            e.g. the output of `model.predict` for a single input row.
        temperature: Divisor applied to the log-probabilities before they are
            re-normalized. Values below 1 sharpen the distribution towards its
            most likely entries; values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).
    """
    prediction = np.asarray(prediction, dtype=np.float64)
    # A probability of exactly 0 maps to -inf here and back to 0 after exp(),
    # which is the desired result, so silence NumPy's divide-by-zero warning
    with np.errstate(divide="ignore"):
        scaled = np.log(prediction) / temperature
    # Shift so the largest value is 0 before exponentiating. The normalized
    # result is mathematically unchanged, but at low temperatures this keeps
    # every exp() from underflowing to 0, which would make the division below
    # 0/0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_prediction = np.exp(scaled)
    prediction = exp_prediction / np.sum(exp_prediction)
    # One multinomial trial yields a one-hot row; argmax recovers its index
    probabilities = np.random.multinomial(1, prediction[0,:], 1)
    return np.argmax(probabilities)

# Seed generation with 19 random known word indices; the length has to match
# the model's input length (max_seq_len - 1 = 19 in the training cells)
max_word_index = max(idx_word)
pattern = [random.randint(1, max_word_index) for _ in range(19)]
current_sentence = " ".join(idx_word.get(idx, "") for idx in pattern)
for temperature in np.arange(0.2, 1.3, 0.1):
    # arange accumulates floating-point error (values such as
    # 0.30000000000000004); snap each step back to one decimal place
    temperature = round(float(temperature), 1)
    generated_text = ""
    current_pattern = pattern.copy()

    for _ in range(50):
        # Run prediction through the model and get the index of the prediction
        input_sequence = np.reshape(current_pattern, (1, len(current_pattern)))
        prediction = model.predict(input_sequence, verbose=0)
        idx = sample(prediction, temperature)
        result = idx_word.get(idx, "")
        generated_text += result + " "

        # Slide the window: append the new index and drop the oldest one
        current_pattern.append(idx)
        current_pattern = current_pattern[1:]

    print(f"- For temperature {temperature}:")
    print(f"--- Input seed: {current_sentence}")
    print(f"---  Generated: {generated_text}")
    print( "---  Sentences:")
    for sentence in sent_splitter.split([generated_text])[0]:
        sentence = str(sentence).strip()
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence)) if sentence else []
        if not tagged:
            # A whitespace-only segment has no last word to tag and no first
            # letter to capitalize; skip it instead of raising IndexError
            continue

        # Pick the closing mark from the weights stored for the part of speech
        # of the sentence's last token; no mark if that tag has no entry
        last_part_of_speech = tagged[-1][1]
        punc_marks = punc_probabilities.get(last_part_of_speech, {})
        if punc_marks:
            punc_mark = random.choices(
                list(punc_marks), weights=list(punc_marks.values()), k=1
            )[0]
        else:
            punc_mark = ""
        # Rudimentary sentence constructor, will change
        sentence = sentence[0].upper() + sentence[1:] + punc_mark
        print(f"----- {sentence}")

    print()

In [None]:
# Manually save the latest version of the model. Not needed if using checkpoints above
# model.save(model_path)

# Re-create the model with the below code:
# model = models.load_model(...)

## Punctuation Restoration Training

**Note**: before running this section, set `corpus_name` (defined in the cell just before the Text Model Training section) to the filename of the corpus you want to train on (without the path and extension).

Right now, this is very basic: for each part of speech, it only calculates how likely each punctuation mark is to follow it, counting just the occurrences where a punctuation mark does follow. After sentence segmentation, these probabilities will be used to punctuate sentences.

In [None]:
# Read the raw corpus, trimming surrounding whitespace from every line
corpus_file = os.path.join(corpus_path, f"{corpus_name}.txt")
with open(corpus_file) as f:
    contents = [raw_line.strip() for raw_line in f]

# Replace out-of-dictionary words in the whole corpus
text = clean_text("\n".join(contents))

# Drop stray apostrophes: one at the very start or very end of the text, or
# one with no letter directly on either side
stray_apostrophe = r"(?:^'|(?<![a-z])'(?![a-z])|'$)"
text = re.sub(stray_apostrophe, "", text, flags=re.IGNORECASE)

# Delete every punctuation character except the marks we care about
kept_marks = '!"\',.:;?'
removable_marks = "".join(
    mark for mark in string.punctuation if mark not in kept_marks
)
remove_regex = "[" + re.escape(removable_marks) + "]"
text = re.sub(remove_regex, "", text)

In [None]:
# Tag every token of the text with its part of speech
speech_parts = nltk.pos_tag(nltk.word_tokenize(text))

# For each part-of-speech tag that is not itself a punctuation tag, count
# which punctuation marks directly follow a token carrying that tag
punc_instances = {}
for (_, part1), (text2, part2) in zip(speech_parts, speech_parts[1:]):
    if part1 in '.,:':
        # First token of the pair is punctuation itself; nothing to record
        continue
    followers = punc_instances.setdefault(part1, {})
    if part2 in '.,:':
        # text2 is the actual punctuation mark that followed
        followers[text2] = followers.get(text2, 0) + 1

# Turn the counts into per-tag relative frequencies, leaving out tags that
# were never followed by punctuation
punc_probabilities = {}
for part, followers in punc_instances.items():
    if not followers:
        continue
    sum_instances = sum(followers.values())
    punc_probabilities[part] = {
        punc: round(count / sum_instances, 10)
        for punc, count in followers.items()
    }

with open(os.path.join(notebook_path, "punctuation.probabilities.json"), "w") as f:
    json.dump(punc_probabilities, f)

## Convert h5 to onnx Models

In [None]:
# Imports and path setup for the conversion cells (repeated here, presumably
# so this section can be run without executing the training cells first)
import os
# Change working directory to the repo directory if we are in the notebook
# directory; this has to happen before notebook_path is resolved below
if os.getcwd().endswith("utils"):
    os.chdir("..")
import glob
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import models
import onnx
import tf2onnx

# Folder containing training.ipynb (found one level below the working
# directory), or the current folder if no such file is found
notebook_path = os.path.dirname(os.path.realpath(next(
    glob.iglob(os.path.join(os.getcwd(), "**", "training.ipynb")), "./training.ipynb"
)))

In [None]:
# Convert every .h5 model stored next to the notebook into an .onnx file with
# the same base name
h5_pattern = os.path.join(notebook_path, "*.h5")
for fname in glob.glob(h5_pattern):
    model_name, _ext = os.path.splitext(os.path.basename(fname))
    onnx_path = os.path.join(notebook_path, f"{model_name}.onnx")

    model = models.load_model(fname)
    # Fixed input signature: a single row of 19 values named "input"
    input_signature = [tf.TensorSpec([1, 19], tf.float32, name="input")]
    onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature)
    onnx.save(onnx_model, onnx_path)