<a href="https://colab.research.google.com/github/Aerospace87/ML-projects/blob/main/tensorflow/Text_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount your Google Drive to the local /content/drive directory
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
from os import listdir, path

local_folder = "/content/drive/My Drive/"

with open(path.join(local_folder ,"dataset/shakespeare.txt")) as f:
  shakespeare_dataset = f.read()

## Cleaning the dataset and spliting into training examples and lables

In [44]:
shakespeare_dataset = shakespeare_dataset.split('\n\n')
shakespeare_dataset = shakespeare_dataset[2:-1]

In [45]:
sos_token = "<SOS>"
eos_token = "<EOS>"
space = " "

dataset = [[sos_token + space + document ] for document in shakespeare_dataset if document != ""]
labels = [[document + space + eos_token] for document in shakespeare_dataset if document != ""]

number_documents = len(dataset)
print(number_documents)

141


In [46]:
max_doc_length = 0

for doc in dataset:
  if len(doc[0].split()) > max_doc_length:

    max_doc_length = len(doc[0].split())

print(max_doc_length)

238


In [47]:
from tensorflow.keras import layers
text_vectorization = layers.TextVectorization(
    standardize='lower',
    split='whitespace',
    output_mode='int'
)


## Text Vectorization

In [48]:
text_vectorization.adapt(dataset)

In [49]:
dataset = text_vectorization(dataset)
vocabulary_size = text_vectorization.vocabulary_size()
vocabulary = text_vectorization.get_vocabulary()

In [50]:
# Dictionay to map back from integer index to word
from_int_to_word ={}
from_word_to_int = {}

for idx,word in enumerate(vocabulary):
    from_int_to_word[idx] = word
    from_word_to_int[word] = idx

In [51]:
labels = text_vectorization(labels)

## Creation of a tensorflow dataset using training examples and labels

In [52]:
from tensorflow.data import Dataset
# Creation of the dataset to feed to the model
dataset = Dataset.from_tensor_slices(
    (dataset, labels)
)

# shuffle and split into batches of batch size BATCH_SIZE
BATCH_SIZE = 64

# Buffer size of the buffer to shuffle the dataset
# Otherfwise TF will have the full dataset in memory
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

## Model Creation

In [53]:
word_embedding_vector_dimension = 4

word_embedding = layers.Embedding(
    input_dim = vocabulary_size,
    output_dim=word_embedding_vector_dimension
)

In [58]:
lstm_layer_1 = layers.LSTM(128, input_shape = (1 ,number_documents, word_embedding_vector_dimension), return_sequences = True)
lstm_layer_2 = layers.LSTM(128, return_sequences = True)

In [59]:
from tensorflow.keras.activations import softmax
softmax_layer = layers.Dense(units=vocabulary_size, activation=softmax)

In [60]:
from tensorflow.keras import Sequential

steps = [word_embedding, lstm_layer_1, lstm_layer_2, softmax_layer]
model = Sequential(steps)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 4)           17180     
                                                                 
 lstm_6 (LSTM)               (None, None, 128)         68096     
                                                                 
 lstm_7 (LSTM)               (None, None, 128)         131584    
                                                                 
 dense_3 (Dense)             (None, None, 4295)        554055    
                                                                 
Total params: 770915 (2.94 MB)
Trainable params: 770915 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [61]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
loss = SparseCategoricalCrossentropy(from_logits=True)

In [62]:
from tensorflow.keras.optimizers import Adam
adam_optimizer = Adam(learning_rate=1e-3)

In [63]:
model.compile(optimizer = adam_optimizer, loss = loss)

## Testing the model works

In [64]:
first_batch = dataset.take(1)

In [65]:
for input_data, labels in first_batch:
  predictions = model(input_data)
  print(predictions.shape)

(64, 238, 4295)


## Configuring checkpoints

In [67]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [68]:
checkpoint_folder = path.join(local_folder,
                              "tf_checkpoints/Text Generation using RNN"
)

checkpoint_files_path = path.join(checkpoint_folder, "ckpt_{epoch}")

EPOCHS = 100

checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_files_path,
    save_weights_only=True
)