# ShLiDaNa TEXT GENERATOR v2

Create a model that will predict the next word in a text sequence, implementing and training using a corpus of Different datasets, while also creating some helper functions to pre-process the data.

OUR TEAM:

| Name | NIM |
|---|---|
|Shahran Kurnia Ramadhan|21/476650/PA/20592|
|Muhammad Linggar Ryanidha|21/475209/PA/20548|
|Daniel Ardi Chandra|21/479046/PA/20780|
|I Gusti Agung Premananda |21/473829/PA/20432|

In [1]:
import numpy as np
import re
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [2]:
import re

# Define path for file with datasets
dataset_path = '..\Datasets\VGCoST_VideoGameDialogue_Corpus\ENG\Portal_merged.txt'

# Read the data with the appropriate encoding
with open(dataset_path, encoding='ISO-8859-1') as f:
    data = f.read()

# Remove unwanted characters using regex
data = re.sub(r"[\"']", "", data)

# Convert to lower case and save as a list
corpus = data.lower().split("\n")

print(f"There are {len(corpus)} lines\n")
print(f"The first 5 lines look like this:\n")
for i in range(5):
    print(corpus[i])


There are 5591 lines

The first 5 lines look like this:

@[cameras]
@[glados says this if you place a portal on the wall under a camera.]
to ensure the safe performance of all authorized activities, do not destroy 
vital testing apparatus.
for your own safety, do not destroy vital testing apparatus.


## Tokenizing the text

Now fit the Tokenizer to the corpus and save the total number of words.

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [4]:
corpus[0]

'@[cameras]'

If you pass this text directly into the `texts_to_sequences` method you will get an unexpected result:

In [5]:
tokenizer.texts_to_sequences([corpus[0]])

[[1012]]

In [6]:
tokenizer.texts_to_sequences([corpus[0]])[0]

[1012]

## Generating n_grams

This function receives the fitted tokenizer and the corpus (which is a list of strings) and should return a list containing the `n_gram` sequences for each line in the corpus:

In [7]:
# GRADED FUNCTION: n_gram_seqs
def n_gram_seqs(corpus, tokenizer):
	input_sequences = []

	### START CODE HERE

	for line in corpus:
		token_list = tokenizer.texts_to_sequences([line])[0]

		for i in range(1, len(token_list)):
			# Generate subphrase
			n_gram_sequence = token_list[:i+1]
			# Append subphrase to input_sequences list
			input_sequences.append(n_gram_sequence)

	### END CODE HERE

	return input_sequences

In [8]:
# Test your function with one example
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)

print("n_gram sequences for first example look like this:\n")
first_example_sequence

n_gram sequences for first example look like this:



[]

In [9]:
# Test your function with a bigger corpus
next_3_examples_sequence = n_gram_seqs(corpus[1:4], tokenizer)

print("n_gram sequences for next 3 examples look like this:\n")
next_3_examples_sequence

n_gram sequences for next 3 examples look like this:



[[4, 377],
 [4, 377, 13],
 [4, 377, 13, 31],
 [4, 377, 13, 31, 2],
 [4, 377, 13, 31, 2, 187],
 [4, 377, 13, 31, 2, 187, 5],
 [4, 377, 13, 31, 2, 187, 5, 70],
 [4, 377, 13, 31, 2, 187, 5, 70, 17],
 [4, 377, 13, 31, 2, 187, 5, 70, 17, 1],
 [4, 377, 13, 31, 2, 187, 5, 70, 17, 1, 330],
 [4, 377, 13, 31, 2, 187, 5, 70, 17, 1, 330, 442],
 [4, 377, 13, 31, 2, 187, 5, 70, 17, 1, 330, 442, 5],
 [4, 377, 13, 31, 2, 187, 5, 70, 17, 1, 330, 442, 5, 1013],
 [3, 1014],
 [3, 1014, 1],
 [3, 1014, 1, 591],
 [3, 1014, 1, 591, 473],
 [3, 1014, 1, 591, 473, 6],
 [3, 1014, 1, 591, 473, 6, 34],
 [3, 1014, 1, 591, 473, 6, 34, 1886],
 [3, 1014, 1, 591, 473, 6, 34, 1886, 1485],
 [3, 1014, 1, 591, 473, 6, 34, 1886, 1485, 30],
 [3, 1014, 1, 591, 473, 6, 34, 1886, 1485, 30, 21],
 [3, 1014, 1, 591, 473, 6, 34, 1886, 1485, 30, 21, 555],
 [514, 71],
 [514, 71, 710]]

Apply the `n_gram_seqs` transformation to the whole corpus and save the maximum sequence length to use it later:

In [10]:
# Apply the n_gram_seqs transformation to the whole corpus
input_sequences = n_gram_seqs(corpus, tokenizer)

# Save max length
max_sequence_len = max([len(x) for x in input_sequences])

print(f"n_grams of input_sequences have length: {len(input_sequences)}")
print(f"maximum length of sequences is: {max_sequence_len}")

n_grams of input_sequences have length: 46418
maximum length of sequences is: 21


## Add padding to the sequences

Now code the `pad_seqs` function which will pad any given sequences to the desired maximum length. Notice that this function receives a list of sequences and should return a numpy array with the padded sequences:

In [11]:
# GRADED FUNCTION: pad_seqs
def pad_seqs(input_sequences, maxlen):
    ### START CODE HERE
    padded_sequences = pad_sequences(input_sequences, maxlen=maxlen, padding='pre')

    return padded_sequences
    ### END CODE HERE

In [12]:
# Test your function with the n_grams_seq of the first example
first_padded_seq = pad_seqs(first_example_sequence, len(first_example_sequence))
first_padded_seq

array([], shape=(0, 0), dtype=int32)

In [13]:
# Test your function with the n_grams_seq of the next 3 examples
next_3_padded_seq = pad_seqs(next_3_examples_sequence, max([len(s) for s in next_3_examples_sequence]))
next_3_padded_seq

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    4,  377],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           4,  377,   13],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    4,
         377,   13,   31],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    4,  377,
          13,   31,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    4,  377,   13,
          31,    2,  187],
       [   0,    0,    0,    0,    0,    0,    0,    4,  377,   13,   31,
           2,  187,    5],
       [   0,    0,    0,    0,    0,    0,    4,  377,   13,   31,    2,
         187,    5,   70],
       [   0,    0,    0,    0,    0,    4,  377,   13,   31,    2,  187,
           5,   70,   17],
       [   0,    0,    0,    0,    4,  377,   13,   31,    2,  187,    5,
          70,   17,    1],
       [   0,    0,    0,    4,  377,   13,   31,    2,  187,    5,   70,
          17,    

In [14]:
# Pad the whole corpus
input_sequences = pad_seqs(input_sequences, max_sequence_len)

print(f"padded corpus has shape: {input_sequences.shape}")

padded corpus has shape: (46418, 21)


## Split the data into features and labels

Before feeding the data into the neural network you should split it into features and labels. In this case the features will be the padded n_gram sequences with the last word removed from them and the labels will be the removed word.

In [15]:
# GRADED FUNCTION: features_and_labels
def features_and_labels(input_sequences, total_words):
    ### START CODE HERE
    features = input_sequences[:,:-1]
    labels = input_sequences[:,-1]
    one_hot_labels = to_categorical(labels, num_classes=total_words)
    ### END CODE HERE

    return features, one_hot_labels

In [16]:
# Test your function with the padded n_grams_seq of the first example
first_features, first_labels = features_and_labels(first_padded_seq, total_words)

print(f"labels have shape: {first_labels.shape}")
print("\nfeatures look like this:\n")
first_features

IndexError: index -1 is out of bounds for axis 1 with size 0

In [None]:
# Split the whole corpus
features, labels = features_and_labels(input_sequences, total_words)

print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")

## Create the model

- Should implement Transformer encoder

In [None]:
# GRADED FUNCTION: create_model
def create_model(total_words, max_sequence_len):

    model = Sequential()
    ### START CODE HERE
    model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(150)))
    model.add(Dense(total_words, activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    ### END CODE HERE

    return model

In [None]:
# Get the untrained model
model = create_model(total_words, max_sequence_len)

# Train the model
history = model.fit(features, labels, epochs=50, verbose=1)

In [None]:
# Take a look at the training curves of your model

acc = history.history['accuracy']
loss = history.history['loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.title('Training accuracy')

plt.figure()

plt.plot(epochs, loss, 'b', label='Training Loss')
plt.title('Training loss')
plt.legend()

plt.show()

Download the `history.pkl` file which contains the information of the training history of your model and will be used to compute your grade. You can download this file by running the cell below:

In [None]:
def download_history():
  import pickle
  from google.colab import files

  with open('history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

  files.download('history.pkl')

download_history()

## See our model in action

After all our work it is finally time to see our model generating text.

Run the cell below to generate the next 100 words of a seed text.

In [None]:
seed_text = "what is that"
next_words = 10

for _ in range(next_words):
	# Convert the text into sequences
	token_list = tokenizer.texts_to_sequences([seed_text])[0]
	# Pad the sequences
	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	# Get the probabilities of predicting a word
	predicted = model.predict(token_list, verbose=0)
	# Choose the next word based on the maximum probability
	predicted = np.argmax(predicted, axis=-1).item()
	# Get the actual word from the word index
	output_word = tokenizer.index_word[predicted]
	# Append to the current text
	seed_text += " " + output_word

print(seed_text)