# Project 4
## Students:
 - Bryson Gullett
 - Robert Schaffer
 - Matthew Dixson
 

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow import optimizers
from tensorflow.keras import layers
import numpy as np
import re
import random as rand

2023-05-06 15:11:46.925550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print(tf.__version__)# you may want to upgrade to 2.10.0

2.10.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [3]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=80, rate=0.1, lr=.0001):
        #initailize variables
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate
        self.lr = lr

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, use the Keras functional API (https://keras.io/guides/functional_api/)
        #MultiHeadAttention layer, specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        #LayerNormalization layer, specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        #Use the rate variable for the dropout layers and remember to use two dense layers
        #See assignment and its figures for more details.
        multihead_atten = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.num_heads, dropout=self.rate)(inputs, inputs, use_causal_mask=True)
        add1 = layers.Add()([multihead_atten, inputs])
        # layer_norm1 = layers.LayerNormalization(epsilon=1e-6)(add1)
        layer_norm1 = layers.LayerNormalization()(add1)
        dense1 = layers.Dense(self.ff_dim, activation='relu')(layer_norm1)
        dense2 = layers.Dense(self.ff_dim, activation='relu')(dense1)
        dropout1 = layers.Dropout(self.rate)(dense2)
        add2 = layers.Add()([dropout1, layer_norm1])
        layer_norm2 = layers.LayerNormalization(epsilon=1e-6)(add2)

        return layer_norm2
    
    def EmbeddingLayer(self, inputs):
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to enocde positions
        #add (1) and (2) and return the layer
        positions = layers.Lambda(lambda x: tf.multiply(tf.ones_like(x, dtype='int32'), tf.range(self.maxlen)))(inputs)
        token_embeddings = layers.Embedding(self.vocab_size, self.embed_dim, input_length=self.maxlen)(inputs)
        position_embeddings = layers.Embedding(self.maxlen, self.embed_dim, input_length=self.maxlen)(positions)

        add = layers.Add()([token_embeddings, position_embeddings])
        return add
    
    def create_model(self):
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, use the Keras functional API (https://keras.io/guides/functional_api/)
        #use the SparseCategoricalCrossentropy loss function (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)
        inputs = keras.Input(shape=(self.maxlen,))
        embedding_layer = self.EmbeddingLayer(inputs)
        prev_layer = embedding_layer
        for _ in range(self.num_blocks):
            transformer_block = self.TransformerBlock(prev_layer)
            prev_layer = transformer_block
        final_dense = layers.Dense(self.vocab_size, activation='softmax')(prev_layer)
        model = keras.Model(inputs=inputs, outputs=final_dense)
        self.opt = optimizers.Adam(learning_rate=self.lr)
        self.loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        model.compile(loss=keras.losses.SparseCategoricalCrossentropy, optimizer=self.opt)
        return model
    
my_model = TransformerModel(1000)
print(my_model.create_model().summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 80)]         0           []                               
                                                                                                  
 lambda (Lambda)                (None, 80)           0           ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, 80, 256)      256000      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 80, 256)      20480       ['lambda[0][0]']                 
                                                                                              

2023-05-06 15:11:54.953768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Task 2

In [4]:
class DataSet():
    def __init__(self, filename, len):
        #load the text from the file
        self.text = ''
        self.len = len
        with open(filename) as fin:
            for line in fin:
                self.text += line

    def prep_text(self):
        #remove all punctuation, set to lowercase, remove duplicate spaces and other whitespace (keep newlines)
        self.text = re.sub(r'[^\w\s]', ' ', self.text)
        self.text = re.sub(r' +', ' ', self.text)
        self.text = re.sub(r'â', '', self.text)
        
        
        
        
    def tokenize_text(self):
        #seperate into words, create a vocab and convert the text to a list of numbers using the vocab such that each unique word is represented by its own number number
        self.text = self.text.split()
        self.vocab = np.unique(self.text)
        self.vocab = np.append(self.vocab, ['PAD'])
        self.vocab_nums = [i for i in range(len(self.vocab))]
        self.vocab_dict = dict(zip(self.vocab, self.vocab_nums))
        

    def create_dataset(self):
        #split the tokenized data into sequences of length len, return the sequences and vocab
        self.prep_text()
        self.tokenize_text()
        X = []
        Y = []
        for i in range(int(len(self.text)/self.len)):
            sequence_X = []
            sequence_Y = []
            for j in range(self.len):
                if i*self.len+j < len(self.text):
                    sequence_X.append(self.vocab_dict[self.text[i*self.len+j]])
                else:
                    pass
                    #sequence_X.append(self.vocab_dict['PAD'])
                if i*self.len+j+1 < len(self.text):
                    #y = np.zeros(len(self.vocab))
                    #y[self.vocab_dict[self.text[i*self.len+j+1]]] = 1
                    #y = [0 if self.vocab_dict[self.text[i*self.len+j+1]] != k else 1 for k in range(len(self.vocab))]
                    sequence_Y.append(self.vocab_dict[self.text[i*self.len+j+1]])
                else:
                    #y = np.zeros(len(self.vocab))
                    #y[self.vocab_dict['PAD']] = 1
                    #y = [0 if self.vocab_dict['PAD'] != k else 1 for k in range(len(self.vocab))]
                    #sequence_Y.append([self.vocab_dict['PAD']])
                    pass
            X.append(sequence_X)
            Y.append(sequence_Y)
        return X, Y, self.vocab

## Task 3

In [5]:
class GenerateText():
    def __init__(self, model, vocab, vocab_dict):
        self.model = model
        self.vocab = vocab
        self.vocab_dict = vocab_dict
    
    def generate_text(self, start_string, num_generate=100, seq_len=80):
        #generate text using the model and vocab, start with the start_string and generate num_generate words
        model_input = [self.vocab_dict['PAD'] for _ in range(seq_len)]
        
        #Text is the output of generate_text()
        text = start_string
        #Tokenize string to encode into integers
        start_string = start_string.split()
        #Get starting index for first word generated
        next_index = len(start_string)
        #Generate model input
        for i in range(len(start_string)):
            model_input[i] = self.vocab_dict[start_string[i]]
        
        #Generate text
        for i in range(num_generate):
            prediction = self.model.predict(np.array([model_input]), verbose=0)
            # print(len(prediction))
            # print(len(prediction[0]))
            # print(len(prediction[0][0]))
            word_index = np.argmax(prediction[0][next_index])
            next_index += 1
            if word_index >= len(self.vocab):
                print('Invalid word index: ', word_index, "Valid max word index = ", len(self.vocab))
                word_index = self.vocab_dict['PAD']
            if next_index < seq_len:
                model_input[next_index] = word_index
            else:
                model_input.pop(0)
                model_input.append(word_index)
                next_index -= 1
          
            next_word = self.vocab[word_index]
            text += ' ' + next_word
            # print(model_input)
        return text
    
    def generate_random_text(self, num_generate=100):
        text = ''
        for _ in range(num_generate):
            word = self.vocab[rand.randrange(len(self.vocab))]
            text += word + ' '
            if word == 'PAD':
                break
        return text   

In [6]:
data = DataSet('beatles.txt', 80)
_, _, vocab = data.create_dataset()
transformer = TransformerModel(len(vocab))
model = transformer.create_model()
text = GenerateText(model, vocab, data.vocab_dict)
out = text.generate_text("i read a crowd of people", num_generate=5)
print(out)
out = text.generate_random_text(num_generate=5)
print(out)
# print(len(out.split()))

i read a crowd of people sergeant skip ya nurse oa
door match seat rings is 


## Task 4: Model Traning and Testing

In [6]:
#Train the model while periodically generating text to show progress
# Used Keras' example on how to write a training loop
# https://keras.io/guides/writing_a_training_loop_from_scratch
def train_model(model, vocab, X, Y, epochs=50):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    for i in range(epochs):
        
        # Open a GradientTape to record the operations run
        # during the forward pass, which enables auto-differentiation.
        with tf.GradientTape() as tape:

            # Run the forward pass of the layer.
            # The operations that the layer applies
            # to its inputs are going to be recorded
            # on the GradientTape.
            logits = model(X, training=True)  # Logits for this minibatch

            # Compute the loss value for this minibatch.
            loss_value = loss_fn(Y, logits)

        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.trainable_weights)
       
        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        model.optimizer.apply_gradients(zip(grads, model.trainable_weights))



        # model.fit(x,y)
        text = GenerateText(model, vocab, data.vocab_dict)
        out = text.generate_text('hello world', 50)
        # out = text.generate_random_text(num_generate=10)
        print(f"Epoch {i}\n\tloss: {loss_value}\n\tText: {out}")

    return model

## Setup input data for training

In [7]:
data = DataSet('beatles.txt', 80)
x, y, vocab = data.create_dataset()
X = []
for e in x:
    X.append(np.asarray(e).astype(np.int32))
X = np.asarray(X)

# Some elements of y are lists of size 1, convert to int
for i in range(451):
    for j in range(80):
        if isinstance(y[i][j], list):
            y[i][j] = y[i][j][0]

Y = np.asarray(y).astype(np.int32)


## Train models

In [9]:
#1
transformer = TransformerModel(len(vocab),rate=.5)
model1 = transformer.create_model()
train_model(model1, vocab, X, Y, epochs=1)

Epoch 0
	loss: 7.91572380065918
	Text: hello world life white risk dan nicht n crying tangerine birds stinking gas filled becomes single 1 spaniel taxman stone baby acquainted deeper chuck rainy where so drink fore each understands wearing melting thank follow saw marshmellow open proud cheat boots hideaway birds relax whigwam after shoulders sounds policemen magazine risk penetrate


<keras.engine.functional.Functional at 0x1caefcc9790>

In [10]:
#50
transformer = TransformerModel(len(vocab), lr=.001)
model50 = transformer.create_model()
train_model(model50, vocab, X, Y, epochs=50)

Epoch 0
	loss: 7.8974761962890625
	Text: hello world dessert match banks wie lasted disease allan wait gives here i appear sexy rice hair see place pulled though alerted stupid warum fed complainin herself baby country pleasin gave loner looking calls charity shall yeh hearted tchaikovsky you appear filled sincere p before fbi carve sleeps bad ease stared never
Epoch 1
	loss: 7.693174839019775
	Text: hello world dessert match banks wie lasted every t you i happinness i appear you put doctor table you end i questo bad the same roll you are than pleasin you you i the send shall yeh never tchaikovsky you you leave the sympathize before fbi the snores heading fore caressing pilchard
Epoch 2
	loss: 7.480116367340088
	Text: hello world dessert the phone glow the i the broken i happinness i the i wanna you i the end i i the i the you you i the walrus you you i the of long yeh you you you you i i the door i the word i can pride o
Epoch 3
	loss: 7.247936725616455
	Text: hello world dessert the 

<keras.engine.functional.Functional at 0x1caf19e8190>

In [11]:
#100
transformer = TransformerModel(len(vocab), lr=.001, rate=.001)
model100 = transformer.create_model()
train_model(model100, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.911420822143555
	Text: hello world been al whole chance knickers invitation round silently eat evermore evermore jude gently resting a ah ooh daddy sunshine five magil woman stretches boots places sitting perfectly mao tonight tumble pleas books sour born queen lit van beat show misunderstanding telephone heaven lay foolish shake but killer inquire fingertips lorry
Epoch 1
	loss: 7.706535816192627
	Text: hello world been in a fool think thought honey same this last belle up you honey losing honey she wives since soon went turn breakfast own old bout i i shook want honey she saved aunt diamonds still i have whigwam bright washed joker to you test silver true yes that you
Epoch 2
	loss: 7.49732780456543
	Text: hello world been in a fool think thought honey same you you after tour calls honey a you you dizzy to be wanna rest weeks charity you tell you i i want i i i m never do i have whigwam use you test to you you ve done movies tonight you
Epoch 3
	loss: 7.2736740112304

<keras.engine.functional.Functional at 0x1caf5d78ee0>

In [12]:
transformer = TransformerModel(len(vocab), lr=.001, rate=.001)
model400 = transformer.create_model()
train_model(model400, vocab, X, Y, epochs=400)

Epoch 0
	loss: 7.9063286781311035
	Text: hello world bring banker havin mundo lips flew diamant donated cut pretty musst used rybody prrr honey boac golden burning teen oh i colder summernight tres under drove rybody wing head customer nun lying said arms phone martha dance sound rybody flowers fragrant multicoloured hoe friends change rights truffle windy shine tasting
Epoch 1
	loss: 7.700497150421143
	Text: hello world bring banker havin someday obscene rent you i strong meeting get ay ooh tax maxwell scarlet it huh cia opaque cry keep diamant fil read warst nasty mambo you i maid desmond the word shook walked get cheat i percetly mckenzie late wondering i not fed someone the word won
Epoch 2
	loss: 7.487282752990723
	Text: hello world you i want a man teen you i m she just won i m you i it i m she party the word you i ve you i every works you i said i m you i m i percetly your man i m ballad been they the word won
Epoch 3
	loss: 7.255715847015381
	Text: hello world you i want you i 

<keras.engine.functional.Functional at 0x1caf8823520>

In [8]:
# 4 heads
transformer = TransformerModel(len(vocab), lr=.001, rate=.001, num_heads=4)
modelHead4 = transformer.create_model()
train_model(modelHead4, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.908016681671143
	Text: hello world resting soon ready space holes meant holland assured guy gibraltar look expert clubs mustard out move sho textpert lot chocolate mich new smiling returned sweet gee playin lets aunt would incredibly head misunderstood became abrigado pay mountain gain drive number decide love hide will ends an six sorrow shouts niar
Epoch 1
	loss: 7.705815315246582
	Text: hello world resting i crash if fly refuse more thumb unkind you know getter like you if the desert we were if loser place writing hoping stirring risk clue deeper you diamant tight nose caught these bath breaks clear gone sexy you know day me ve grade beep you know word you
Epoch 2
	loss: 7.494471549987793
	Text: hello world someone i crash i coming that quarter there you re you re out i out i believe i believe i believe i believe stepping stirring risk clue rything you you know you re the s morning i will you know word her leave leave leave leave leave i turn to
Epoch 3
	loss: 7.264

<keras.engine.functional.Functional at 0x7fed00ba9a80>

In [9]:
# 8 heads
transformer = TransformerModel(len(vocab), lr=.001, rate=.001, num_heads=8)
modelHead8 = transformer.create_model()
train_model(modelHead8, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.905703544616699
	Text: hello world see come suprise six start moan chords 909 pray taste scratch carve newspapers pleasure someone drive asked sooner million back we maid joa at town stairs mending testimonial sad parasol majoring clock dragged choking are warnin shout hour walrus town duty tracks lighten risk dessert station inviting patiently rest dear
Epoch 1
	loss: 7.696584701538086
	Text: hello world see come except on heba attractively hay and i riding belong avoid travels i admit danced could you corporation helping we be there life i asked you heart brown ways around home such duchess are warnin m paperback child michelle again you you well please s live time watch hide
Epoch 2
	loss: 7.482611656188965
	Text: hello world see come on the light pictures come to i riding belong avoid done i m comfort be you you you the showes i at i want ah like said they made i in need last thru only hour you i again you you you corporation last you time watch hide
Epoch 3
	loss:

<keras.engine.functional.Functional at 0x7fed0d444100>

In [10]:
# 16 heads
transformer = TransformerModel(len(vocab), lr=.001, rate=.001, num_heads=16)
modelHead16 = transformer.create_model()
train_model(modelHead16, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.918808937072754
	Text: hello world aaahhh mending man way break hammer once talked begin soooo feels kansas hurting she pum cut comb sitting peace turns clowns an cha dancin bird have don ahead marry papers skirts downstairs wondering band gehen tried thought spinning knew don grabbed thumb mary dose baby raleigh together worth tricks performs
Epoch 1
	loss: 7.705054759979248
	Text: hello world marry four into stood worse has hello comfort so sing coming ob darling i man way night t lovin shuop been so skirts far i don gee i i admit into my regret you destruction you love to glaubst cares watching cent you caressing you darling yes you sent ob
Epoch 2
	loss: 7.483435153961182
	Text: hello world marry way you can you you can you i m arizona i i i i d you you can you i i you hammer came day i i i admit point you left i m you love to i don you i you fears rocky you good more heart i
Epoch 3
	loss: 7.2330732345581055
	Text: hello world i i you i you you you you i i i i i i

<keras.engine.functional.Functional at 0x7fecf18b9870>

## Overtrain

Train for a 1000 epochs to overtrain

In [18]:
transformer = TransformerModel(len(vocab), lr=.001, rate=.001)
modelOverTrain = transformer.create_model()
train_model(modelOverTrain, vocab, X, Y, epochs=600)

Epoch 0
	loss: 7.915494918823242
	Text: hello world gurus playroom someday rybody pretend gimme words broken fireside space wedding steal 50 and who scene with loud bullet shy care zapped from rock appointment taxis tree ma thirty stinking while wants forever ja books bloody n paperback sad thumb same precisely work silly freely twist within clue green obladi
Epoch 1
	loss: 7.895442485809326
	Text: hello world gurus playroom someday rybody pretend gimme words broken fireside space wedding steal 50 and who scene with loud bullet shy care zapped from rock appointment taxis tree ma thirty stinking while wants forever ja books bloody n paperback sad thumb same precisely work silly freely twist within clue green sunny
Epoch 2
	loss: 7.875162601470947
	Text: hello world gurus playroom someday rybody pretend gimme words broken fireside space wedding steal 50 and who scene with loud bullet shy care zapped from rock appointment taxis tree ma thirty stinking while wants forever ja books bloody n

<keras.engine.functional.Functional at 0x1ccc6e19070>

Train for another 600 epochs if needed. (We ran this once- 1200 epochs total)

In [20]:
train_model(modelOverTrain, vocab, X, Y, epochs=600)

Epoch 0
	loss: 2.6404521465301514
	Text: hello world he got a week eight days a week eleanor rigby ah the world yes i m gonna have some fun tonight we re going to be let me mine i m gonna be alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright
Epoch 1
	loss: 2.638044595718384
	Text: hello world he got a week eight days a week eleanor rigby ah the world yes i m gonna have some fun tonight we re going to be let me mine i m gonna be alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright
Epoch 2
	loss: 2.6358284950256348
	Text: hello world he got a week eight days a week eleanor rigby ah the world yes i m gonna have some fun tonight we re going to be let me mine i m gonna be alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright alright
Epoch 3
	loss: 2.633

<keras.engine.functional.Functional at 0x1ccc6e19070>

## Test with different starting phrases

In [19]:

phrases = [
    "a day in the ",
    "hello world",
    "k party hendersons broken",
    "junior tear dizzy bother strawberry would",
    "fly"
]

for phrase in phrases:
    text1 = GenerateText(model1, vocab, data.vocab_dict)
    out1 = text1.generate_text(phrase,num_generate=30)
    
    text50 = GenerateText(model50, vocab, data.vocab_dict)
    out50 = text50.generate_text(phrase,num_generate=30)
    
    text100 = GenerateText(model100, vocab, data.vocab_dict)
    out100 = text100.generate_text(phrase, num_generate=30)

    text400 = GenerateText(model400, vocab, data.vocab_dict)
    out400= text400.generate_text(phrase, num_generate=30)  

    textOver = GenerateText(modelOverTrain, vocab, data.vocab_dict)
    outOver= textOver.generate_text(phrase, num_generate=30)      
    print(f"Phrase: {phrase}")
    print(f"\t1 Epoch: {out1}")
    print(f"\t50 Epochs: {out50}")
    print(f"\t100 Epochs: {out100}")
    print(f"\t400 Epochs: {out400}")
    print(f"\tOvertrained: {outOver}")


Phrase: a day in the 
	1 Epoch: a day in the  parted hold side hurry changes wonders birds stinking gas filled becomes single 1 spaniel taxman stone baby acquainted deeper chuck rainy where so drink fore each understands wearing melting thank
	50 Epochs: a day in the  i m down and i m in the night before when i m so i m so i m so i m so i m down don t be let
	100 Epochs: a day in the  the end through the mystery tour roll up for the tears cried for me all the mystery tour is all the things that s got a ginger sling with you
	400 Epochs: a day in the  to try our best to hold on the things that was wrong is having we didn t know it down ev rybody pulled their socks up the haze but every
	Overtrained: a day in the  the end a rockin in the word is a lucky you re going to be a little girl you can t be your life i m in the game i
Phrase: hello world
	1 Epoch: hello world life white risk dan nicht n crying tangerine birds stinking gas filled becomes single 1 spaniel taxman stone baby acquaint


# Report

## Introduction

## Results

## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.