## V1 - SEQ2SEQ

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# Load the text data
with open(path_to_file, 'r') as f:
    text = f.read()



Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [2]:
# Tokenize the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts([text])

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences([text])

In [3]:
# Create input-output pairs, incorporating context
def create_input_output_pairs(sequences, context_window=3):
    input_sequences = []
    output_sequences = []
    for i in range(context_window, len(sequences)):
        input_sequence = sequences[i-context_window:i]
        output_sequence = sequences[i]
        input_sequences.append(input_sequence)
        output_sequences.append(output_sequence)
    return input_sequences, output_sequences

input_sequences, output_sequences = create_input_output_pairs(sequences)

# Pad sequences to a fixed length
max_len = 100
input_sequences = pad_sequences(input_sequences, maxlen=max_len)
output_sequences = pad_sequences(output_sequences, maxlen=max_len)

In [4]:
# Get the vocabulary (word_index in older versions of Keras)
vocab = tokenizer.word_index
idx_enemy=vocab.get('enemy')
# Add 1 to account for the 0 index (reserved for padding)
oov_index = vocab.get('[UNK]', 1)  # Get the index of '[UNK]' or 1 if not found
print(f"OOV token index: {idx_enemy}")

# Get the word corresponding to the index 186
word = list(vocab.keys())[list(vocab.values()).index(88)]
print(word)

text_test="hello world you are the first enemy"
test_sequences = tokenizer.texts_to_sequences([text_test])  # Convert to numerical sequences

input_sequences, output_sequences = create_input_output_pairs(test_sequences[0])
print(input_sequences)
print(output_sequences)

print(len(vocab))

OOV token index: 580
first
[[186, 6, 40], [6, 40, 1], [40, 1, 88]]
[1, 88, 580]
12632


In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

def create_seq2seq_model(vocab_size, max_len):
    # Encoder
    encoder_inputs = Input(shape=(max_len,))
    encoder_emb = Embedding(vocab_size, 100)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(256, return_sequences=True, return_state=True)(encoder_emb)

    # Decoder
    decoder_inputs = Input(shape=(1,))  # One-step decoder
    decoder_emb = Embedding(vocab_size, 100)(decoder_inputs)
    decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

    # Output Layer
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

In [6]:
model = create_seq2seq_model(len(vocab), max_len)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [7]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


In [8]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

# ... (tokenization and vocabulary creation)


from sklearn.model_selection import train_test_split

# Assuming you have your input_sequences and output_sequences
X_train, X_val, y_train, y_val = train_test_split(input_sequences, output_sequences, test_size=0.2, random_state=42)

y_train = [[item] for item in y_train]
y_val = [[item] for item in y_val]


with tf.device("/GPU:0"):
  # Tokenize and pad sequences for training and validation sets
  encoder_input_data = pad_sequences(X_train, maxlen=max_len, padding='post')
  decoder_input_data = pad_sequences(y_train, maxlen=max_len, padding='post')
  decoder_target_data = pad_sequences(y_train, maxlen=max_len, padding='post')

  val_encoder_input_data = pad_sequences(X_val, maxlen=max_len, padding='post')
  val_decoder_input_data = pad_sequences(y_val, maxlen=max_len, padding='post')
  val_decoder_target_data = pad_sequences(y_val, maxlen=max_len, padding='post')

  model.fit([encoder_input_data, decoder_input_data], decoder_target_data, epochs=10, batch_size=64, validation_data=([val_encoder_input_data, val_decoder_input_data], val_decoder_target_data), callbacks=[early_stop, checkpoint])



Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - accuracy: 0.0000e+00 - loss: 9.4453 - val_accuracy: 0.9900 - val_loss: 9.4155
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 542ms/step - accuracy: 0.9900 - loss: 9.4154 - val_accuracy: 0.9900 - val_loss: 9.3760
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 515ms/step - accuracy: 0.9900 - loss: 9.3759 - val_accuracy: 0.9900 - val_loss: 9.3020
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 626ms/step - accuracy: 0.9900 - loss: 9.3017 - val_accuracy: 0.9900 - val_loss: 9.1220
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 618ms/step - accuracy: 0.9900 - loss: 9.1214 - val_accuracy: 0.9900 - val_loss: 8.6604
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 529ms/step - accuracy: 0.9900 - loss: 8.6593 - val_accuracy: 0.9900 - val_loss: 8.0570
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━

In [9]:
from tensorflow.keras.models import load_model

# Load the model
model = load_model('./sample_data/best_model.keras')

ValueError: File not found: filepath=./sample_data/best_model.keras. Please ensure the file is an accessible `.keras` zip file.

In [10]:
import numpy as np

def generate_text(model, tokenizer, seed_text, max_length, temperature=1.0):
    input_seq = tokenizer.texts_to_sequences([seed_text])[0]
    input_seq = pad_sequences([input_seq], maxlen=max_len)

    # Get the initial decoder state
    # This assumes your decoder is an LSTM or GRU layer
    # Adjust accordingly for your specific model architecture
    decoder_layer = next((layer for layer in model.layers if isinstance(layer, (tf.keras.layers.LSTM, tf.keras.layers.GRU))), None)
    if decoder_layer:
        decoder_state = [np.zeros((1, decoder_layer.units))]  # Initialize state for the decoder layer
    else:
        # If no recurrent layer is found, skip state management
        decoder_state = None

    # Generate text
    output_text = []
    current_word = seed_text
    for _ in range(max_length):
        input_seq = tokenizer.texts_to_sequences([current_word])[0]
        input_seq = pad_sequences([input_seq], maxlen=1)

        # Predict the next word
        if decoder_state:
            predicted_probs, *new_decoder_state = model.predict([input_seq, decoder_state[0]], verbose=0)
            if new_decoder_state:
                decoder_state[0] = new_decoder_state[0]
        else:
            predicted_probs = model.predict([input_seq], verbose=0)

        # Apply temperature sampling
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs) / temperature
        exp_probs = np.exp(predicted_probs)
        predicted_probs = exp_probs / np.sum(exp_probs)
        probas = np.random.multinomial(1, predicted_probs[0], 1)
        predicted_index = np.argmax(probas)

        predicted_word = tokenizer.index_word.get(predicted_index, "<UNK>")


        output_text.append(predicted_word)
        current_word += ' ' + predicted_word

    return ' '.join(output_text)

In [11]:
# Start chatbot
context = ""
while True:
    question = input('You: ')
    answer = generate_text(model, tokenizer, question, max_len, temperature=0.7)
    print('Chatbot:', answer)
    # Update context for the next turn
    context += f"You: {question}\nChatbot: {answer}\n"

You: ji
Chatbot: eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids eyelids


KeyboardInterrupt: Interrupted by user

## V2 - TRANSFORMER BASED

In [37]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, PositionalEncoding, TransformerEncoder, TransformerDecoder, Dense

# ... (other imports and data preparation)

def create_transformer_model(vocab_size, max_len):
    # Input layers
    encoder_inputs = Input(shape=(max_len,))
    decoder_inputs = Input(shape=(max_len,))

    # Embedding layers
    encoder_embeddings = Embedding(vocab_size, embedding_dim)(encoder_inputs)
    decoder_embeddings = Embedding(vocab_size, embedding_dim)(decoder_inputs)

    # Positional Encoding
    positional_encoding = PositionalEncoding(max_len, embedding_dim)
    encoder_inputs = positional_encoding(encoder_embeddings)
    decoder_inputs = positional_encoding(decoder_embeddings)

    # Encoder
    encoder_outputs = TransformerEncoder(num_layers=6, num_heads=8, d_model=embedding_dim, dropout=0.1)(encoder_inputs)

    # Decoder
    decoder_outputs = TransformerDecoder(num_layers=6, num_heads=8, d_model=embedding_dim, dropout=0.1)(decoder_inputs, encoder_outputs)

    # Output layer
    outputs = Dense(vocab_size, activation='softmax')(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], outputs)
    return model

ImportError: cannot import name 'PositionalEncoding' from 'tensorflow.keras.layers' (/usr/local/lib/python3.10/dist-packages/keras/_tf_keras/keras/layers/__init__.py)

In [18]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
import tensorflow as tf

# Load pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Prepare your input and target sequences
# Assuming 'sequences' contains pairs of input and target sequences
input_sequences = [seq[0] for seq in sequences]
output_sequences = [seq[1] for seq in sequences]

# Convert numerical sequences to text sequences using the tokenizer
input_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in input_sequences]  # Decode numerical sequences to text
output_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_sequences]  # Decode numerical sequences to text


# Tokenize input and target sequences
input_ids = tokenizer(input_sequences, return_tensors="tf", padding=True, truncation=True)["input_ids"]
decoder_input_ids = tokenizer(output_sequences, return_tensors="tf", padding=True, truncation=True)["input_ids"]

# Alternatively, if you have target text instead of sequences:
# decoder_input_ids = tokenizer(target_text, return_tensors="tf", padding=True, truncation=True)["input_ids"]

# Prepare labels by shifting decoder_input_ids and replacing the last token with -100
labels = tf.concat([decoder_input_ids[:, 1:], tf.fill([decoder_input_ids.shape[0], 1], -100)], axis=1)


# Train the model
# Provide both input_ids and decoder_input_ids to the fit method
# model.compile(optimizer='adam', loss='categorical_crossentropy') # Changed loss function to 'categorical_crossentropy' as the output is likely categorical
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

# Include labels in the input dictionary
model.fit(
    {"input_ids": input_ids, "decoder_input_ids": decoder_input_ids, "labels": labels},
    epochs=10,
    batch_size=32
)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7c8094141630>

## V3 - PRE-TRAINED

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

def generate_text_hugg(model, tokenizer, seed_text, max_length, temperature=1.0):
    """Generates text using the provided model and tokenizer.

    Args:
        model: The trained language model.
        tokenizer: The tokenizer used for encoding and decoding text.
        seed_text: The initial text to start generation from.
        max_length: The maximum length of the generated text.
        temperature: The temperature for sampling.

    Returns:
        The generated text.
    """
    # Encode the seed text
    input_ids = tokenizer(seed_text, return_tensors="tf")["input_ids"]

    # Get the initial decoder state (if applicable)
    decoder_layer = next((layer for layer in model.layers if isinstance(layer, (tf.keras.layers.LSTM, tf.keras.layers.GRU))), None)
    if decoder_layer:
        decoder_state = [np.zeros((1, decoder_layer.units))]
    else:
        decoder_state = None

    # Generate text
    output_text = []
    for _ in range(max_length):
        # Predict the next token
        if decoder_state:
            predicted_probs, *new_decoder_state = model.predict([input_ids, decoder_state[0]], verbose=0)
            if new_decoder_state:
                decoder_state[0] = new_decoder_state[0]
        else:
            predicted_probs = model.predict([input_ids], verbose=0)

        # Apply temperature sampling
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs) / temperature
        exp_probs = np.exp(predicted_probs)
        predicted_probs = exp_probs / np.sum(exp_probs)
        probas = np.random.multinomial(1, predicted_probs[0], 1)
        predicted_index = np.argmax(probas)

        # Decode the predicted token
        predicted_word = tokenizer.decode(predicted_index, skip_special_tokens=True)

        output_text.append(predicted_word)

        # Update the input for the next step
        input_ids = tf.constant([[predicted_index]]) # Use the predicted token as input for the next step


    return ' '.join(output_text)

In [None]:
# Start chatbot
context = ""
while True:
    question = input('You: ')
    answer = generate_text_hugg(model, tokenizer, question, max_len, temperature=0.7)
    print('Chatbot:', answer)
    # Update context for the next turn
    context += f"You: {question}\nChatbot: {answer}\n"