In [8]:
!pip install pandas scikit-learn nltk gtts requests Keras tensorflow pydub -q


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import os
import sys

# Define the package names
package_names = ['pandas', 'scikit-learn', 'nltk', 'gtts', 'requests', 'keras', 'tensorflow']

# Get the paths of the packages
package_paths = []
for package_name in package_names:
    path = !pip show {package_name} | grep Location | cut -d " " -f 2
    package_paths.append(path[0])

# Add the package paths to sys.path
for package_path in package_paths:
    if package_path not in sys.path:
        sys.path.append(package_path)

# Print the updated sys.path
# print("Updated sys.path:")
# for p in sys.path:
#    print(p)

In [10]:
import os
import pandas as pd
import nltk as nltk
from nltk import corpus, stem
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import keras
from keras.layers import Input, LSTM, Dense, Embedding
from keras.models import Model
import numpy as np
from gtts import gTTS
from pydub import AudioSegment
from IPython.display import Audio

In [11]:
# import the qna_chitchat_friendly data
url = "https://filedn.com/lJpzjOtA91quQEpwdrgCvcy/Business%20Data%20Mining%20and%20Knowledge%20Discovery/RNoteBook/qna_chitchat_friendly.csv"
data = pd.read_csv(url)
# size of the dataset
len(data)

9783

In [13]:
# download the punkt_tab resource,
nltk.download('punkt_tab')

# Select the "Question" column in the text document
corpus_Q = data["Question"]

# Preprocess the text
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
stop_words = set(stopwords.words('english'))
stemmer = stem.SnowballStemmer('english')

def preprocess_text(text):
    # Convert to lower case
    text = text.lower()
    # Remove numbers
    text = ''.join([i for i in text if not i.isdigit()])
    # Remove punctuations
    text = ''.join([i for i in text if i.isalpha() or i.isspace()])
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [i for i in tokens if not i in stop_words]
    # Stem the tokens
    tokens = [stemmer.stem(i) for i in tokens]
    # Join the tokens into a string
    text = " ".join(tokens)
    return text

corpus_Q = corpus_Q.apply(preprocess_text);

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gentl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [14]:
# Tokenize the questions
tokenizer_q = Tokenizer()
tokenizer_q.fit_on_texts(corpus_Q)
vocab_size = len(tokenizer_q.word_index) + 1

# Convert question to sequences of tokens
input_sequences = tokenizer_q.texts_to_sequences(corpus_Q)

# Pad the sequences to have equal length
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

In [15]:
# Prepare the answer sequences
# Add start and end tokens to the answers
y = data["Answer"].apply(lambda x: 'start ' + x + ' end')

# Tokenize the answers
tokenizer_a = Tokenizer(filters='')
tokenizer_a.fit_on_texts(y)
num_classes = len(tokenizer_a.word_index) + 1

# Convert answers to sequences of tokens
y_encoded = tokenizer_a.texts_to_sequences(y)

# Pad the sequences to have equal length
max_ans_length = max([len(seq) for seq in y_encoded])
y_padded = pad_sequences(y_encoded, maxlen=max_ans_length, padding='post')

In [16]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(input_sequences, y_padded, test_size=0.2, random_state=42)

In [17]:
# create a seq2seq model with encoder and decoder LSTMs
embedding_dim = 128
lstm_units = 256

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

encoder_model = Model(encoder_inputs, encoder_states)

# Decoder (Training)
decoder_inputs = Input(shape=(None,))
decoder_embedding1 = Embedding(num_classes, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding1, initial_state=encoder_states)
decoder_dense = Dense(num_classes, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder (Inference)
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs2 = Input(shape=(None,))
decoder_embedding2 = Embedding(num_classes, embedding_dim)(decoder_inputs2)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embedding2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs2] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare the decoder input data with the same length as the output data, but shifted one timestep forward
decoder_input_data = np.zeros_like(y_padded)
decoder_input_data[:, 1:] = y_padded[:, :-1]

In [18]:
# train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit([X_train, decoder_input_data], np.expand_dims(y_train, -1), epochs=50, batch_size=64, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/50
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 133ms/step - accuracy: 0.6982 - loss: 2.4025 - val_accuracy: 0.7599 - val_loss: 1.3491
Epoch 2/50
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 128ms/step - accuracy: 0.7570 - loss: 1.2976 - val_accuracy: 0.7618 - val_loss: 1.2042
Epoch 3/50
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 133ms/step - accuracy: 0.7594 - loss: 1.1415 - val_accuracy: 0.7676 - val_loss: 1.0636
Epoch 4/50
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 127ms/step - accuracy: 0.7660 - loss: 1.0124 - val_accuracy: 0.7744 - val_loss: 0.9676
Epoch 5/50
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 125ms/step - accuracy: 0.7753 - loss: 0.9147 - val_accuracy: 0.7795 - val_loss: 0.9163
Epoch 6/50
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 127ms/step - accuracy: 0.7814 - loss: 0.8535 - val_accuracy: 0.7834 - val_loss: 0.8839
Epoch 7/50

Observe the improvement of the model's loss and accuracy on the training data fro iteration to iteration.

In [19]:
# Prepare decoder_input_data_test
decoder_input_data_test = np.zeros((len(y_test), max_sequence_length), dtype='int32')
for i, target_sequence in enumerate(y_test):
    for t, token in enumerate(target_sequence):
        if t < max_sequence_length - 1:
            decoder_input_data_test[i, t] = token

# Reshape y_test
y_test = pad_sequences(y_test, maxlen=max_sequence_length, padding='post')

loss, accuracy = model.evaluate([X_test, decoder_input_data_test], np.expand_dims(y_test, -1), verbose = 0)
print(f"Test set accuracy: {accuracy:.2f}")

Test set accuracy: 0.21


The poor performance of the model on the test data illustrates the problem of model overfitting, which is a common issue with network-based models as we discussed, especially when the size of the training dataset is small, as in the case of this study.

In [21]:
def predict_answer_seq2seq(question, play_audio=True):
    def decode_sequence(input_seq):
        states_value = encoder_model.predict(input_seq, verbose = 0)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = tokenizer_a.word_index['start']

        stop_condition = False
        decoded_sentence = []
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose = 0)

            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = tokenizer_a.index_word.get(sampled_token_index, '')

            if sampled_word == 'end' or len(decoded_sentence) > max_sequence_length:
                stop_condition = True
            else:
                decoded_sentence.append(sampled_word)

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            states_value = [h, c]

        return " ".join(decoded_sentence[1:])

    input_question = preprocess_text(question)
    input_seq = tokenizer_q.texts_to_sequences([input_question])
    input_seq = pad_sequences(input_seq, maxlen=max_sequence_length, padding='post')

    answer = decode_sequence(input_seq)

    # Synthesize speech
    tts = gTTS(answer)
    tts.save('answer.wav')

    # Play audio using ffplay
    if play_audio:
        os.system('ffplay -nodisp -autoexit answer.wav')

    return answer

In [31]:
# Example usage:
question = "What is your name?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

oh, i don't have a name.


In [32]:
# Example usage:
question = "where are you from?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

i'm digital, to chat and to here.


In [33]:
# Example usage:
question = "how old are you?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

i don't know you, but i enjoy


In [34]:
# Example usage:
question = "I really like you!"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

you're you're pretty 


In [35]:
# Example usage:
question = "where should we go for dinner?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

i don't you should follow your heart.


In [36]:
# Example usage:
question = "who will win 2028 presidental election?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

i having a hard time imagining how


In [37]:
# Example usage:
question = "why Maine the best state in the US?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

i only thing i'm committed to is


In [38]:
# Example usage:
question = "will AI change the world?"
answer = predict_answer_seq2seq(question)
print(f"{answer}")

i haven't of any other bots, but
