<a href="https://colab.research.google.com/github/BogdanIvchenko/308Group4Production/blob/main/Tutorial2ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from numpy import array
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
  in_text, result = seed_text, seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    encoded = array(encoded)
    # predict a word in the vocabulary
    yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
    # map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
    # append   to input
    in_text, result = out_word, result + ' ' + out_word
  return result


# define the model
def define_model(vocab_size):
  model = Sequential()
  model.add(Embedding(vocab_size, 10, input_length=1))
  model.add(LSTM(50))
  model.add(Dense(vocab_size, activation= 'softmax' ))


  # compile network
  model.compile(loss= 'categorical_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
  # summarize defined model
  model.summary()
  plot_model(model, to_file= 'model4.png' , show_shapes=True)
  return model
  
  
# source text
data = """ Theres dirt beneath my nails\n
  That vomes from my travails\n
  And though at supper I must clean\n
  The evidence of where Ive been\n """


# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print( 'Vocabulary Size: %d' % vocab_size)
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
  sequence = encoded[i-1:i+1]
  sequences.append(sequence)
  print( 'Total Sequences: %d' % len(sequences))

# split into X and y elements
sequences = array(sequences)
X, y = sequences[:,0],sequences[:,1]
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)
# define model
model = define_model(vocab_size)
# fit network
model.fit(X, y, epochs=500, verbose=2)
# evaluate
print(generate_seq(model, tokenizer, 'Theres' , 6))


Vocabulary Size: 23
Total Sequences: 1
Total Sequences: 2
Total Sequences: 3
Total Sequences: 4
Total Sequences: 5
Total Sequences: 6
Total Sequences: 7
Total Sequences: 8
Total Sequences: 9
Total Sequences: 10
Total Sequences: 11
Total Sequences: 12
Total Sequences: 13
Total Sequences: 14
Total Sequences: 15
Total Sequences: 16
Total Sequences: 17
Total Sequences: 18
Total Sequences: 19
Total Sequences: 20
Total Sequences: 21
Total Sequences: 22
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 1, 10)             230       
                                                                 
 lstm_12 (LSTM)              (None, 50)                12200     
                                                                 
 dense_12 (Dense)            (None, 23)                1173      
                                                                

In [21]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
  in_text = seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # pre-pad sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=max_length, padding= 'pre' )
    # predict probabilities for each word
    yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
    # map predicted word index to word
    out_word = ' '
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
    
    # append to input
    in_text += ' '+ out_word
  return in_text


# define the model
def define_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 10, input_length=max_length-1))
  model.add(LSTM(50))
  model.add(Dense(vocab_size, activation= 'softmax' ))

  # compile network
  model.compile(loss= 'categorical_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])

  # summarize defined model
  model.summary()
  plot_model(model, to_file= 'model3.png' , show_shapes=True)
  return model

# source text
data = """ Theres dirt beneath my nails\n
  That vomes from my travails\n
  And though at supper I must clean\n
  The evidence of where Ive been\n """

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# retrieve vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print( 'Vocabulary Size: %d' % vocab_size)
# encode 2 words -> 1 word
sequences = list()
for i in range(2, len(encoded)):
  sequence = encoded[i-2:i+1]
  sequences.append(sequence)
print( 'Total Sequences: %d' % len(sequences))
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding= 'pre' )
print( 'Max Sequence Length: %d' % max_length)
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
# define model
model = define_model(vocab_size, max_length)
# fit network
model.fit(X, y, epochs=500, verbose=2)
# evaluate model
print(generate_seq(model, tokenizer, max_length-1, 'Theres dirt', 3))
print(generate_seq(model, tokenizer, max_length-1, 'The evidence', 4))
print(generate_seq(model, tokenizer, max_length-1, 'though at', 5))
print(generate_seq(model, tokenizer, max_length-1, 'I must', 5))

Vocabulary Size: 23
Total Sequences: 21
Max Sequence Length: 3
Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 2, 10)             230       
                                                                 
 lstm_13 (LSTM)              (None, 50)                12200     
                                                                 
 dense_13 (Dense)            (None, 23)                1173      
                                                                 
Total params: 13,603
Trainable params: 13,603
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 3s - loss: 3.1352 - accuracy: 0.0000e+00 - 3s/epoch - 3s/step
Epoch 2/500
1/1 - 0s - loss: 3.1341 - accuracy: 0.0476 - 12ms/epoch - 12ms/step
Epoch 3/500
1/1 - 0s - loss: 3.1331 - accuracy: 0.0476 - 10ms/epoch - 10ms/step
Epoch 4/50

In [22]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
  in_text = seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # pre-pad sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=max_length, padding= 'pre' )
    # predict probabilities for each word
    yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
    # map predicted word index to word
    out_word = ' '
    for word, index in tokenizer.word_index.items():

      if index == yhat:
        out_word = word
        break
    # append to input
    in_text += ' ' + out_word
  return in_text


# define the model
def define_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 10, input_length=max_length-1))
  model.add(LSTM(50))
  model.add(Dense(vocab_size, activation= 'softmax' ))
  # compile network
  model.compile(loss= 'categorical_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
  # summarize defined model
  model.summary()
  plot_model(model, to_file= 'model2.png' , show_shapes=True)
  return model


# source text
# data = """ Jack and Jill went up the hill\n
#   To fetch a pail of water\n
#   Jack fell down and broke his crown\n
#   And Jill came tumbling after\n """

# source text
data = """ Theres dirt beneath my nails\n
  That vomes from my travails\n
  And though at supper I must clean\n
  The evidence of where Ive been\n """


# prepare the tokenizer on the source text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])


# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print( 'Vocabulary Size: %d' % vocab_size)

# create line-based sequences
sequences = list()
for line in data.split( '\n' ):
  encoded = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(encoded)):
    sequence = encoded[:i+1]
    sequences.append(sequence)

print( 'Total Sequences: %d' % len(sequences))

# pad input sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding= 'pre' )
print( 'Max Sequence Length: %d' % max_length)


# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
# define model
model = define_model(vocab_size, max_length)
# fit network
model.fit(X, y, epochs=500, verbose=2)
# evaluate model
print(generate_seq(model, tokenizer, max_length-1, 'Theres' , 4))
print(generate_seq(model, tokenizer, max_length-1, 'beneath' , 4))

Vocabulary Size: 23
Total Sequences: 19
Max Sequence Length: 7
Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 6, 10)             230       
                                                                 
 lstm_14 (LSTM)              (None, 50)                12200     
                                                                 
 dense_14 (Dense)            (None, 23)                1173      
                                                                 
Total params: 13,603
Trainable params: 13,603
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 2s - loss: 3.1362 - accuracy: 0.0000e+00 - 2s/epoch - 2s/step
Epoch 2/500
1/1 - 0s - loss: 3.1344 - accuracy: 0.0526 - 11ms/epoch - 11ms/step
Epoch 3/500
1/1 - 0s - loss: 3.1326 - accuracy: 0.1053 - 11ms/epoch - 11ms/step
Epoch 4/50