In [57]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM,Dense,Input,Embedding,Dropout, BatchNormalization, TimeDistributed, Bidirectional, Concatenate

In [3]:
data = pd.read_csv('/content/Cleaned_Data.csv')
data.head()

Unnamed: 0,INPUT,RESPONSE
0,Hi there!,Hello!
1,How are you today?,"I'm doing well, thank you. How about yourself?"
2,"I'm good too, thanks. Just a bit tired.",I understand. Long day?
3,"Yeah, it was. Lots of meetings.","Oh, I hate those."
4,"Me too. So, what are you up to today?",Just relaxing. Any plans for you?


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3581 entries, 0 to 3580
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   INPUT     3581 non-null   object
 1   RESPONSE  3581 non-null   object
dtypes: object(2)
memory usage: 56.1+ KB


In [5]:
data_input = data['INPUT']
data_input = [str(item) for item in data_input]
data_input[:4]

['Hi there!',
 'How are you today?',
 "I'm good too, thanks. Just a bit tired.",
 'Yeah, it was. Lots of meetings.']

In [6]:
data_output = data['RESPONSE']
data_output = [str(item) for item in data_output]
data_output[:4]

['Hello!',
 "I'm doing well, thank you. How about yourself?",
 'I understand. Long day?',
 'Oh, I hate those.']

In [7]:
len(data_output)

3581

In [41]:
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(data_input)

output_tokenizer = Tokenizer()
output_tokenizer.fit_on_texts(data_output)

In [42]:
input_sequences = input_tokenizer.texts_to_sequences(data_input)
output_sequences = output_tokenizer.texts_to_sequences(data_output)

In [45]:
input_data = pad_sequences(input_sequences,maxlen=25,padding='post')
output_data = pad_sequences(output_sequences,maxlen=25,padding='post')

In [44]:
tf.config.run_functions_eagerly(True)

In [47]:
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(len(input_tokenizer.word_index)+1, 256, mask_zero=True)(encoder_inputs)

encoder_lstm = Bidirectional(LSTM(512, return_state=True, return_sequences=False))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

In [48]:
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(len(output_tokenizer.word_index)+1, 256, mask_zero=True)(decoder_inputs)

decoder_lstm = LSTM(1024, return_sequences=True, return_state=True, dropout=0.1, recurrent_dropout=0.1)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

decoder_bn = BatchNormalization()(decoder_lstm_outputs)
decoder_dropout = Dropout(0.4)(decoder_bn)
decoder_dense = TimeDistributed(Dense(len(output_tokenizer.word_index)+1, activation='softmax'))
decoder_outputs = decoder_dense(decoder_dropout)

In [51]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
optimizer = Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999, decay=1e-6)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [37]:
model.summary()

In [52]:
um_classes = len(output_tokenizer.word_index) + 1
output_data_cat = np.expand_dims(output_data[:, 1:], -1)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [54]:
model.fit([input_data, output_data[:, :-1]],
          output_data_cat,
          batch_size=64,
          epochs=10,
          validation_split=0.2,
          callbacks=[early_stopping])

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 7s/step - accuracy: 0.7954 - loss: 4.6299 - val_accuracy: 0.7318 - val_loss: 7.1671
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 7s/step - accuracy: 0.8629 - loss: 2.5689 - val_accuracy: 0.7558 - val_loss: 7.0186
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 7s/step - accuracy: 0.9060 - loss: 1.7747 - val_accuracy: 0.7565 - val_loss: 6.8693
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 7s/step - accuracy: 0.9029 - loss: 1.4593 - val_accuracy: 0.7591 - val_loss: 6.7150
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 7s/step - accuracy: 0.9077 - loss: 1.2469 - val_accuracy: 0.7619 - val_loss: 6.5056
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 7s/step - accuracy: 0.9250 - loss: 1.0189 - val_accuracy: 0.7634 - val_loss: 6.3265
Epoch 7/10
[1m45/45[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7823886d06d0>

In [90]:
encoder_model = Model(encoder_inputs, encoder_states)

In [96]:
decoder_state_input_h = Input(shape=(1024,))
decoder_state_input_c = Input(shape=(1024,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_emb_inference = Embedding(len(output_tokenizer.word_index)+1, 256, mask_zero=True)(decoder_inputs)
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_emb_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [97]:
reverse_output_index = {i: word for word, i in output_tokenizer.word_index.items()}
reverse_output_index[0] = ''

In [98]:
def generate_response(input_text, tokenizer, max_len=25):
    # Tokenize and pad input text
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # Get the encoder states
    states_value = encoder_model.predict(input_seq)

    # Initialize the target sequence with the start token (assuming index 1 is <START>)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 1

    stop_condition = False
    decoded_sentence = ""

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the token with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '<END>' or len(decoded_sentence.split()) > max_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update target_seq and states_value
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

In [99]:
response = generate_response("Hello, how are you?", input_tokenizer)
print("Bot:", response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m