# CPSC 393 Machine Learning - Homework 4
By Matthew Favela

In [None]:
import keras as kb

import numpy as np
import string
from random import randint
from pickle import load
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import os

: 

In [None]:
# Runtime configuration: GPU memory growth, CPU threading, and mixed precision.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    # Memory growth has to be configured the same way on every visible GPU;
    # enabling it on only the first device fails on multi-GPU machines.
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print("GPU acceleration enabled")
else:
    print("GPU not available, using CPU")

# Enable parallel CPU execution
tf.config.threading.set_intra_op_parallelism_threads(16)
tf.config.threading.set_inter_op_parallelism_threads(16)

# Use mixed precision for faster computation
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Number of input words per training sequence (also reused below as the
# preview length and as the embedding width).
token_length = 50

In [None]:
def load_doc(filename):
    """Read ``filename`` and return its entire contents as one string.

    The file is decoded as UTF-8 first. If that raises
    ``UnicodeDecodeError`` the file is read again as latin-1.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            return handle.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the latin-1 read below.
        pass

    with open(filename, 'r', encoding='latin-1') as handle:
        return handle.read()

def clean_doc(doc):
    """Split ``doc`` into a list of lowercase, purely alphabetic words.

    Double hyphens are treated as word separators, every character in
    ``string.punctuation`` is stripped from each whitespace-delimited token,
    and tokens that are not entirely alphabetic after stripping are dropped.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    words = []
    for raw in doc.replace('--', ' ').split():
        stripped = raw.translate(strip_punct)
        # Drops empty strings and anything containing digits or other symbols.
        if stripped.isalpha():
            words.append(stripped.lower())
    return words

def save_doc(lines, filename):
    """Write ``lines`` to ``filename`` as UTF-8, one entry per line.

    Entries are joined with a newline; no trailing newline is written.
    """
    payload = '\n'.join(lines)
    with open(filename, 'w', encoding='utf-8') as sink:
        sink.write(payload)

# Load the raw corpus and preview its first `token_length` characters.
in_filename = "alices_adventure_in_wonderland.txt"
doc = load_doc(in_filename)
print(doc[:token_length])

# Tokenize and report corpus statistics.
tokens = clean_doc(doc)
print(tokens[:token_length])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# Every training line holds `token_length` input words plus one target word.
length = token_length + 1
# The range stops at len(tokens) + 1 so the window that ends on the final
# token is included; stopping at len(tokens) silently drops it.
sequences = [
    ' '.join(tokens[end - length:end])
    for end in range(length, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

out_filename = '{}_sequence.txt'.format(in_filename)
save_doc(sequences, out_filename)

# From here on `in_filename` refers to the sequence file just written.
in_filename = out_filename
doc = load_doc(in_filename)
lines = doc.split('\n')

In [16]:
# Fit a word-level tokenizer on the sequence lines and integer-encode them.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(lines)

# +1 because the tokenizer's word indices start at 1.
vocab_size = len(tokenizer.word_index) + 1

sequences = np.array(tokenizer.texts_to_sequences(lines))

# All but the last column are the inputs; the last column is the target
# word, which is one-hot encoded over the vocabulary.
X = sequences[:, :-1]
y = kb.utils.to_categorical(sequences[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]

In [None]:
# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, token_length, input_shape=(seq_length,)))
model.add(tf.keras.layers.LSTM(100))
# The global policy is 'mixed_float16'; the output layer is pinned to float32
# so the softmax probabilities and the loss are computed in full precision.
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax', dtype='float32'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=1)

model.save('model.tf')

In [None]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words that continue ``seed_text``.

    At each step the running text is integer-encoded with ``tokenizer``,
    padded/truncated at the front to ``seq_length`` tokens, and fed to
    ``model``; the highest-scoring index is mapped back to a word and
    appended to the running text before the next step.

    Args:
        model: Object whose ``predict(encoded, verbose=0)`` returns one row
            of scores per input row.
        tokenizer: Object exposing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        seq_length: Number of tokens fed to the model at each step.
        seed_text: Text used to start generation.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not
        included). An index with no entry in ``word_index`` yields an empty
        string for that position.
    """
    # Build the index -> word map once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = []
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = kb.preprocessing.sequence.pad_sequences(
            [encoded], maxlen=seq_length, truncating='pre')
        scores = model.predict(encoded, verbose=0)
        # A single sequence is predicted, so take the first (only) row's
        # argmax as a plain int rather than comparing against an array.
        predicted_index = int(np.argmax(scores, axis=-1)[0])
        out_word = index_to_word.get(predicted_index, '')
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# Reload the saved sequences and derive the model input length from them
# (each line is the input words followed by one target word).
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# randint is inclusive on both ends, so the upper bound must be
# len(lines) - 1 to stay inside the list.
seed_text = lines[randint(0, len(lines) - 1)]
print("SEED TEXT:", seed_text)

generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print("GENERATED TEXT:", generated + '\n\n')