# Project 4
## Students:
 - Bryson Gullett
 - Robert Schaffer
 - Matthew Dixson
 

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow import optimizers
from tensorflow.keras import layers
import numpy as np
import re
import random as rand

2023-05-06 15:11:46.925550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
print(tf.__version__)# you may want to upgrade to 2.10.0

2.10.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [5]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=80, rate=0.1, lr=.0001):
        #initailize variables
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate
        self.lr = lr

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, use the Keras functional API (https://keras.io/guides/functional_api/)
        #MultiHeadAttention layer, specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        #LayerNormalization layer, specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        #Use the rate variable for the dropout layers and remember to use two dense layers
        #See assignment and its figures for more details.
        multihead_atten = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.num_heads, dropout=self.rate)(inputs, inputs, use_causal_mask=True)
        add1 = layers.Add()([multihead_atten, inputs])
        # layer_norm1 = layers.LayerNormalization(epsilon=1e-6)(add1)
        layer_norm1 = layers.LayerNormalization()(add1)
        dense1 = layers.Dense(self.ff_dim, activation='relu')(layer_norm1)
        dense2 = layers.Dense(self.ff_dim, activation='relu')(dense1)
        dropout1 = layers.Dropout(self.rate)(dense2)
        add2 = layers.Add()([dropout1, layer_norm1])
        layer_norm2 = layers.LayerNormalization(epsilon=1e-6)(add2)

        return layer_norm2
    
    def EmbeddingLayer(self, inputs):
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to enocde positions
        #add (1) and (2) and return the layer
        positions = layers.Lambda(lambda x: tf.multiply(tf.ones_like(x, dtype='int32'), tf.range(self.maxlen)))(inputs)
        token_embeddings = layers.Embedding(self.vocab_size, self.embed_dim, input_length=self.maxlen)(inputs)
        position_embeddings = layers.Embedding(self.maxlen, self.embed_dim, input_length=self.maxlen)(positions)

        add = layers.Add()([token_embeddings, position_embeddings])
        return add
    
    def create_model(self):
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, use the Keras functional API (https://keras.io/guides/functional_api/)
        #use the SparseCategoricalCrossentropy loss function (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)
        inputs = keras.Input(shape=(self.maxlen,))
        embedding_layer = self.EmbeddingLayer(inputs)
        prev_layer = embedding_layer
        for _ in range(self.num_blocks):
            transformer_block = self.TransformerBlock(prev_layer)
            prev_layer = transformer_block
        final_dense = layers.Dense(self.vocab_size, activation='softmax')(prev_layer)
        model = keras.Model(inputs=inputs, outputs=final_dense)
        self.opt = optimizers.Adam(learning_rate=self.lr)
        self.loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        model.compile(loss=keras.losses.SparseCategoricalCrossentropy, optimizer=self.opt)
        return model
    
my_model = TransformerModel(1000)
print(my_model.create_model().summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 80)]         0           []                               
                                                                                                  
 lambda (Lambda)                (None, 80)           0           ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, 80, 256)      256000      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 80, 256)      20480       ['lambda[0][0]']                 
                                                                                              

2023-05-06 15:11:54.953768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Task 2

In [6]:
class DataSet():
    def __init__(self, filename, len):
        #load the text from the file
        self.text = ''
        self.len = len
        with open(filename) as fin:
            for line in fin:
                self.text += line

    def prep_text(self):
        #remove all punctuation, set to lowercase, remove duplicate spaces and other whitespace (keep newlines)
        self.text = re.sub(r'[^\w\s]', ' ', self.text)
        self.text = re.sub(r' +', ' ', self.text)
        self.text = re.sub(r'â', '', self.text)
        
        
        
        
    def tokenize_text(self):
        #seperate into words, create a vocab and convert the text to a list of numbers using the vocab such that each unique word is represented by its own number number
        self.text = self.text.split()
        self.vocab = np.unique(self.text)
        self.vocab = np.append(self.vocab, ['PAD'])
        self.vocab_nums = [i for i in range(len(self.vocab))]
        self.vocab_dict = dict(zip(self.vocab, self.vocab_nums))
        

    def create_dataset(self):
        #split the tokenized data into sequences of length len, return the sequences and vocab
        self.prep_text()
        self.tokenize_text()
        X = []
        Y = []
        for i in range(int(len(self.text)/self.len)):
            sequence_X = []
            sequence_Y = []
            for j in range(self.len):
                if i*self.len+j < len(self.text):
                    sequence_X.append(self.vocab_dict[self.text[i*self.len+j]])
                else:
                    pass
                    #sequence_X.append(self.vocab_dict['PAD'])
                if i*self.len+j+1 < len(self.text):
                    #y = np.zeros(len(self.vocab))
                    #y[self.vocab_dict[self.text[i*self.len+j+1]]] = 1
                    #y = [0 if self.vocab_dict[self.text[i*self.len+j+1]] != k else 1 for k in range(len(self.vocab))]
                    sequence_Y.append(self.vocab_dict[self.text[i*self.len+j+1]])
                else:
                    #y = np.zeros(len(self.vocab))
                    #y[self.vocab_dict['PAD']] = 1
                    #y = [0 if self.vocab_dict['PAD'] != k else 1 for k in range(len(self.vocab))]
                    #sequence_Y.append([self.vocab_dict['PAD']])
                    pass
            X.append(sequence_X)
            Y.append(sequence_Y)
        return X, Y, self.vocab

## Task 3

In [7]:
class GenerateText():
    def __init__(self, model, vocab, vocab_dict):
        self.model = model
        self.vocab = vocab
        self.vocab_dict = vocab_dict
    
    def generate_text(self, start_string, num_generate=100, seq_len=80):
        #generate text using the model and vocab, start with the start_string and generate num_generate words
        model_input = [self.vocab_dict['PAD'] for _ in range(seq_len)]
        
        #Text is the output of generate_text()
        text = start_string
        #Tokenize string to encode into integers
        start_string = start_string.split()
        #Get starting index for first word generated
        next_index = len(start_string)
        #Generate model input
        for i in range(len(start_string)):
            model_input[i] = self.vocab_dict[start_string[i]]
        
        #Generate text
        for i in range(num_generate):
            prediction = self.model.predict(np.array([model_input]), verbose=0)
            # print(len(prediction))
            # print(len(prediction[0]))
            # print(len(prediction[0][0]))
            word_index = np.argmax(prediction[0][next_index])
            next_index += 1
            if word_index >= len(self.vocab):
                print('Invalid word index: ', word_index, "Valid max word index = ", len(self.vocab))
                word_index = self.vocab_dict['PAD']
            if next_index < seq_len:
                model_input[next_index] = word_index
            else:
                model_input.pop(0)
                model_input.append(word_index)
                next_index -= 1
          
            next_word = self.vocab[word_index]
            text += ' ' + next_word
            # print(model_input)
        return text
    
    def generate_random_text(self, num_generate=100):
        text = ''
        for _ in range(num_generate):
            word = self.vocab[rand.randrange(len(self.vocab))]
            text += word + ' '
            if word == 'PAD':
                break
        return text   

In [8]:
data = DataSet('beatles.txt', 80)
_, _, vocab = data.create_dataset()
transformer = TransformerModel(len(vocab))
model = transformer.create_model()
text = GenerateText(model, vocab, data.vocab_dict)
out = text.generate_text("i read a crowd of people", num_generate=5)
print(out)
out = text.generate_random_text(num_generate=5)
print(out)
# print(len(out.split()))

i read a crowd of people tan taken silver feelings knees
lifetime flat rything dragged waters 


## Task 4: Model Traning and Testing

In [9]:
#Train the model while periodically generating text to show progress
# Used Keras' example on how to write a training loop
# https://keras.io/guides/writing_a_training_loop_from_scratch
def train_model(model, vocab, X, Y, epochs=50):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    for i in range(epochs):
        
        # Open a GradientTape to record the operations run
        # during the forward pass, which enables auto-differentiation.
        with tf.GradientTape() as tape:

            # Run the forward pass of the layer.
            # The operations that the layer applies
            # to its inputs are going to be recorded
            # on the GradientTape.
            logits = model(X, training=True)  # Logits for this minibatch

            # Compute the loss value for this minibatch.
            loss_value = loss_fn(Y, logits)

        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.trainable_weights)
       
        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        model.optimizer.apply_gradients(zip(grads, model.trainable_weights))



        # model.fit(x,y)
        text = GenerateText(model, vocab, data.vocab_dict)
        out = text.generate_text('hello world', 50)
        # out = text.generate_random_text(num_generate=10)
        print(f"Epoch {i}\n\tloss: {loss_value}\n\tText: {out}")

    return model

## Setup input data for training

In [10]:
data = DataSet('beatles.txt', 80)
x, y, vocab = data.create_dataset()
X = []
for e in x:
    X.append(np.asarray(e).astype(np.int32))
X = np.asarray(X)

# Some elements of y are lists of size 1, convert to int
for i in range(451):
    for j in range(80):
        if isinstance(y[i][j], list):
            y[i][j] = y[i][j][0]

Y = np.asarray(y).astype(np.int32)


## Train models

In [11]:
#1
transformer = TransformerModel(len(vocab),rate=.5)
model1 = transformer.create_model()
train_model(model1, vocab, X, Y, epochs=1)

Epoch 0
	loss: 7.911502361297607
	Text: hello world crazy split einmal shoulders dark save watch weight anytime busy mystery drove northern deep help wusste weather eyes raccoon quarter never licks gear den peter flew match from true fifty speed feeling we top summer clear dir wine them syndicate glimmering lying moments brew stay help bop nature marmalade turning


<keras.engine.functional.Functional at 0x23ee98cc340>

In [12]:
#50
transformer = TransformerModel(len(vocab), lr=.001)
model50 = transformer.create_model()
train_model(model50, vocab, X, Y, epochs=50)

Epoch 0
	loss: 7.911893844604492
	Text: hello world ringing ay health asked tide true lorry boyfriend babe own asleep spoke sdaeh some beep poppies slip realize log cos trampoline let textpert starts dock misunderstood steady poppies space air and shoulder tenderly first dressed another row guitar wiping tragic please agree shore long guest devil glow slip dry drehtest
Epoch 1
	loss: 7.714039325714111
	Text: hello world i opaque joint boat congo wiping music peter oscar come yellow spoke sdaeh some beep poppies slip face summer searching here difference delay would balalaika mac bist poppies space beethoven and from to saw jobber another row guitar wiping tragic please agree shore long i d la spoil turnstyle now
Epoch 2
	loss: 7.514174461364746
	Text: hello world i you i d you i d parted love you i m in here you i week la please thrill you because black would you i sea dan washed hi and from i saw hand i d forget love you don ooh i ll i d you from i you
Epoch 3
	loss: 7.301247119903564

<keras.engine.functional.Functional at 0x23ee9965220>

In [13]:
#100
transformer = TransformerModel(len(vocab), lr=.001, rate=.001)
model100 = transformer.create_model()
train_model(model100, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.920411109924316
	Text: hello world hide plasticine bra stirring dancer number room thoughtlessly jar done say coconut leads filter war runs nothin must is choose girls rushes cloud style um yesterday parlour certain dinner queue own slither anytime fast allein di fog diamond another nun drop sdaeh backing self gideon listen crossed take toe giving
Epoch 1
	loss: 7.714327335357666
	Text: hello world leave a your grabbed well you don you i a you i i you bin ob soap wisdom could r pick letters stands questo taxis street change einmal fifty whoa its drop love town policeman einer andern lot remain doctor cracks thoughts eat you don barrow allein cold lot dead
Epoch 2
	loss: 7.502594947814941
	Text: hello world like a your i you you don you i a you i i you don you you don is i to t realise chances you near you don you by letters stands time i i to you i i to you vanish you by letters tour is gives lady to
Epoch 3
	loss: 7.275322437286377
	Text: hello world like i to the you

<keras.engine.functional.Functional at 0x23eebd2f160>

In [14]:
transformer = TransformerModel(len(vocab), lr=.001, rate=.001)
model400 = transformer.create_model()
train_model(model400, vocab, X, Y, epochs=400)

Epoch 0
	loss: 7.9159255027771
	Text: hello world in beatin flowers lady resign sigh met showes begging affection meaning woos port us business write problems correct beds dresses glow feed sound town we deine crime tripper mornin unfair glow maybe bbbbb your octopus pepper coming jubilee fine check good age cry then lancashire holy you stood dancin roll
Epoch 1
	loss: 7.712531089782715
	Text: hello world in the middle dose make l a song montelimat school say goodbye be let a madonna she s one think know protected your town we deine crime tripper you headed daily admit river lift cries wouldn spinal speed here majesty colourful go that machine speed flight river ticket another uh
Epoch 2
	loss: 7.50670051574707
	Text: hello world in the middle dose make i have you wives on little log i i a little you i left i think know this holy you you i go i you sun affection i think nearly man any by now true i for anytime you you you i you dancin roll
Epoch 3
	loss: 7.2863898277282715
	Text: hello

<keras.engine.functional.Functional at 0x23ef01ff370>

In [8]:
# 4 heads
transformer = TransformerModel(len(vocab), lr=.001, rate=.001, num_heads=4)
modelHead4 = transformer.create_model()
train_model(modelHead4, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.908016681671143
	Text: hello world resting soon ready space holes meant holland assured guy gibraltar look expert clubs mustard out move sho textpert lot chocolate mich new smiling returned sweet gee playin lets aunt would incredibly head misunderstood became abrigado pay mountain gain drive number decide love hide will ends an six sorrow shouts niar
Epoch 1
	loss: 7.705815315246582
	Text: hello world resting i crash if fly refuse more thumb unkind you know getter like you if the desert we were if loser place writing hoping stirring risk clue deeper you diamant tight nose caught these bath breaks clear gone sexy you know day me ve grade beep you know word you
Epoch 2
	loss: 7.494471549987793
	Text: hello world someone i crash i coming that quarter there you re you re out i out i believe i believe i believe i believe stepping stirring risk clue rything you you know you re the s morning i will you know word her leave leave leave leave leave i turn to
Epoch 3
	loss: 7.264

<keras.engine.functional.Functional at 0x7fed00ba9a80>

In [9]:
# 8 heads
transformer = TransformerModel(len(vocab), lr=.001, rate=.001, num_heads=8)
modelHead8 = transformer.create_model()
train_model(modelHead8, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.905703544616699
	Text: hello world see come suprise six start moan chords 909 pray taste scratch carve newspapers pleasure someone drive asked sooner million back we maid joa at town stairs mending testimonial sad parasol majoring clock dragged choking are warnin shout hour walrus town duty tracks lighten risk dessert station inviting patiently rest dear
Epoch 1
	loss: 7.696584701538086
	Text: hello world see come except on heba attractively hay and i riding belong avoid travels i admit danced could you corporation helping we be there life i asked you heart brown ways around home such duchess are warnin m paperback child michelle again you you well please s live time watch hide
Epoch 2
	loss: 7.482611656188965
	Text: hello world see come on the light pictures come to i riding belong avoid done i m comfort be you you you the showes i at i want ah like said they made i in need last thru only hour you i again you you you corporation last you time watch hide
Epoch 3
	loss:

<keras.engine.functional.Functional at 0x7fed0d444100>

In [10]:
# 16 heads
transformer = TransformerModel(len(vocab), lr=.001, rate=.001, num_heads=16)
modelHead16 = transformer.create_model()
train_model(modelHead16, vocab, X, Y, epochs=100)

Epoch 0
	loss: 7.918808937072754
	Text: hello world aaahhh mending man way break hammer once talked begin soooo feels kansas hurting she pum cut comb sitting peace turns clowns an cha dancin bird have don ahead marry papers skirts downstairs wondering band gehen tried thought spinning knew don grabbed thumb mary dose baby raleigh together worth tricks performs
Epoch 1
	loss: 7.705054759979248
	Text: hello world marry four into stood worse has hello comfort so sing coming ob darling i man way night t lovin shuop been so skirts far i don gee i i admit into my regret you destruction you love to glaubst cares watching cent you caressing you darling yes you sent ob
Epoch 2
	loss: 7.483435153961182
	Text: hello world marry way you can you you can you i m arizona i i i i d you you can you i i you hammer came day i i i admit point you left i m you love to i don you i you fears rocky you good more heart i
Epoch 3
	loss: 7.2330732345581055
	Text: hello world i i you i you you you you i i i i i i

<keras.engine.functional.Functional at 0x7fecf18b9870>

## Overtrain

Train for a 1000 epochs to overtrain

In [15]:
transformer = TransformerModel(len(vocab), lr=.001, rate=.001)
modelOverTrain = transformer.create_model()
train_model(modelOverTrain, vocab, X, Y, epochs=600)

Epoch 0
	loss: 7.931509971618652
	Text: hello world longer appreciate fishwife ho living universe travels ay ja quietly set they collapsed whisper jar over here year money carousel allein places speak chasing gum anybody nice goodbyes savoy trouble marmalade mustard albert misses cracker loving sending per parted evermore ould woldn going anything canary postman heads mad carousel as
Epoch 1
	loss: 7.732109069824219
	Text: hello world you everyone you walk cruel past you worked known hast living say whatever door going cos saved waits you bout gets semoc customer point more twelve reprise seven tacks will her pocket ask re cracker loving sending per parted ve places frantic going anything canary postman heads mad carousel as
Epoch 2
	loss: 7.530437469482422
	Text: hello world you know diamant taking you didididi you can gone quietly she on an told you can poor small a northern local farm know long to red past you test a song givin the head truth ba sit around child you believe you say 

<keras.engine.functional.Functional at 0x23ee8a75d60>

Train for another 600 epochs if needed. (We ran this once- 1200 epochs total)

In [16]:
train_model(modelOverTrain, vocab, X, Y, epochs=600)

Epoch 0
	loss: 0.07429875433444977
	Text: hello world woman my love don t give me presents i m so why tell me why did you cried and i say goodbye hello hello hello goodbye i don t say you in love c mon please please me right now when i don t know when she s come home
Epoch 1
	loss: 0.07321019470691681
	Text: hello world woman my love don t give me presents i m so won t really stand it s not treat me right love has a nasty habit of you honey pie within you dead little girl i will plainly see another man you better keep your head little girl of i
Epoch 2
	loss: 0.0731772929430008
	Text: hello world woman my love don t give me presents i m so won t really stand it s not treat me right love has a nasty habit of you honey pie within you dead little girl i will plainly see another man you better keep your head little girl of i
Epoch 3
	loss: 0.07326163351535797
	Text: hello world woman my love don t give me presents i m so won t really stand it s not treat me right love has a nasty habit of 

<keras.engine.functional.Functional at 0x23ee8a75d60>

## Test with different starting phrases

In [18]:

phrases = [
    "a day in the ",
    "hello world",
    "k party hendersons broken",
    "junior tear dizzy bother strawberry would",
    "fly"
]

for phrase in phrases:
    text1 = GenerateText(model1, vocab, data.vocab_dict)
    out1 = text1.generate_text(phrase,num_generate=150)
    
    text50 = GenerateText(model50, vocab, data.vocab_dict)
    out50 = text50.generate_text(phrase,num_generate=150)
    
    text100 = GenerateText(model100, vocab, data.vocab_dict)
    out100 = text100.generate_text(phrase, num_generate=150)

    text400 = GenerateText(model400, vocab, data.vocab_dict)
    out400= text400.generate_text(phrase, num_generate=150)  

    textOver = GenerateText(modelOverTrain, vocab, data.vocab_dict)
    outOver= textOver.generate_text(phrase, num_generate=150)      
    print(f"Phrase: {phrase}")
    print(f"\t1 Epoch: {out1}")
    print(f"\t50 Epochs: {out50}")
    print(f"\t100 Epochs: {out100}")
    print(f"\t400 Epochs: {out400}")
    print(f"\tOvertrained: {outOver}")


Phrase: a day in the 
	1 Epoch: a day in the  babe penguin disappear bla lights husband girl this lane board nicht happen finally running round most drove writing once case folks sergeant cos clock husband images aches there rectify winging sein tube although investigation wouldn like reprise cos views gotta would semolina no cool rug wave clouds lizard hurting coral best send against deep sending shoe cause sh hankerchief singer turned nurse maybe givin cicce ye 50 shoe henry assured rights imagine writer lose lend may showed any edison hurry skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip skip
	50 Epochs: a day in the  i m in the sun shines down i m in the world is all the sun here it s all t


# Report

## Introduction
We created transformer models to generate Beatles lyrics. We trained our models for 1, 50, 100, 400, and 1200 epochs. Each model used a learning rate of .001, a dropout rate of .001, 2 attention heads, and one transformer block. We used 2 attention heads because increasing this number did not seem to drastically increase the quality of the model during preliminary testing. We used only one transformer blocks because using more transformer blocks greatly increased training time.

The 1200 epoch model is the model we attempted to overtrain.

## Results
### Time
\# attention heads = 2
| Epochs | Time (sec)|
|--------|------|
| 1 | 8.7|
| 50| 392.0|
| 100| 792.8|
| 400| 2387.3|
| 1200| 6149.6|

These times are expected because the only variable changed was epochs. Thus, the amount of time is dependent on the number of epochs. 50 epochs took about 50 times the time as 1 epoch. 100 epochs took about a hundred times as long, etc. 

### Loss
| Attention heads | Epochs | Loss |
|-----------------|--------|------|
|2|1|7.9115|
|2|50|3.2347|
|2|100|2.3614|
|2|400|0.2441|
|2|1200|0.0243|

Loss of the models generally decreased as the number of epochs increased. In the case of the 1200 epoch model, the loss got to below .03 around epoch 1000, but we continued to train until the loss stayed around .025.

### Example lyrics (Starting phrase in quotes)
* 1 epoch, 2 attention heads: "a day in the " babe penguin disappear bla lights husband girl this lane board 

* 50 epochs, 2 attention heads: "hello world" i m in the sun sun here it s a little girl you know i m gonna be long time i m so i m so i m in the sun here comes the sun here comes the sun

* 100 epochs, 2 attention heads: "hello world" in the queen was dead b c thirty one more than wine her standin there s gonna have some fun tonight we re going to be free

* 400 epochs, 2 attention heads: "junior tear dizzy bother strawberry would" out work it baby now shake it up baby two of love all come on my own it won t be better some have gone baby ooh now girl take a good look around you re going to be alright

* 1200 epochs, 2 attention headds: "fly" be whisper words their ve tried but still they said baby s gone why would she said so can t see her time but now these daya are gone i will never die and i m happy just to you any time at all any time at all any time at all all you and all i ll never be the lonely one

The generated text quality generally increased with the number of training epochs. Training with 1 epoch results in seemingly random text. 50 epochs' quality is still fairly poor. It repeats several words and phrases such as "i m in the sun". This is occurs because the model has not be trained for a long enough period of time. Training for 100 and 400 epochs results in generate text that has structure and could reasonably be used as a song lyric. Examples include "We[']re going to be free tonight" (100 epoch) and "take a good look around you['] going to be alright". Our attempt at overtraining the model using 1200 epochs resulted in some good text, but also some repeated text before returning to generating other text. In the above exmple, "all any time" or some combination of these three words begins to repeat. However, the model does generate "never be the lonely one" afterwards breaking this loop.

It is important to note that all models end up repeating words/phrases at some point. This is most noticable if 300 words are generated. The previously mentioned repeating phrase in the overtrained model is different from this because it generated eventually generated different text.





## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.