In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('data/chat.txt'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
#ignore warning
import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('data/chat.txt' , sep='\t' , names=['Question' , 'Answer'])

df

Unnamed: 0,Question,Answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [11]:
# Check for null values
null_question = df['Question'].isnull().sum()
null_answer = df['Answer'].isnull().sum()

if null_question > 0:
    print("There are", null_question, "null values in the 'Question' column.")
else:
    print("There are no null values in the 'Question' column.")

if null_answer > 0:
    print("There are", null_answer, "null values in the 'Answer' column.")
else:
    print("There are no null values in the 'Answer' column.")

# Check for whitespace values
whitespace_question = df['Question'].apply(lambda x: x.isspace()).sum()
whitespace_answer = df['Answer'].apply(lambda x: x.isspace()).sum()

if whitespace_question > 0:
    print("There are", whitespace_question, "whitespace values in the 'Question' column.")
else:
    print("There are no whitespace values in the 'Question' column.")

if whitespace_answer > 0:
    print("There are", whitespace_answer, "whitespace values in the 'Answer' column.")
else:
    print("There are no whitespace values in the 'Answer' column.")

There are no null values in the 'Question' column.
There are no null values in the 'Answer' column.
There are no whitespace values in the 'Question' column.
There are no whitespace values in the 'Answer' column.


In [12]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # Replace all digits with spaces
    text = re.sub(r'([^\w\s])', r' \1 ', text)  # Add a space before and after each punctuation character
    text = re.sub(r'\s+', ' ', text)  # Replace all consecutive spaces with a single space
    text = text.strip()  # Remove leading and trailing spaces
    return text

df['Encoder Inputs']=df['Question'].apply(clean_text)
df['Decoder Inputs']="<sos> " + df['Answer'].apply(clean_text) + ' <eos>'
df["Decoder Targets"] = df['Answer'].apply(clean_text) + ' <eos>'

df.head()

Unnamed: 0,Question,Answer,Encoder Inputs,Decoder Inputs,Decoder Targets
0,"hi, how are you doing?",i'm fine. how about yourself?,"hi , how are you doing ?",<sos> i ' m fine . how about yourself ? <eos>,i ' m fine . how about yourself ? <eos>
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,i ' m fine . how about yourself ?,<sos> i ' m pretty good . thanks for asking . ...,i ' m pretty good . thanks for asking . <eos>
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,i ' m pretty good . thanks for asking .,<sos> no problem . so how have you been ? <eos>,no problem . so how have you been ? <eos>
3,no problem. so how have you been?,i've been great. what about you?,no problem . so how have you been ?,<sos> i ' ve been great . what about you ? <eos>,i ' ve been great . what about you ? <eos>
4,i've been great. what about you?,i've been good. i'm in school right now.,i ' ve been great . what about you ?,<sos> i ' ve been good . i ' m in school right...,i ' ve been good . i ' m in school right now ....


In [13]:
df['Question Length'] = df['Encoder Inputs'].apply(lambda x: len(x))
df['Answer Length'] = df['Decoder Inputs'].apply(lambda x: len(x))

df.head()

Unnamed: 0,Question,Answer,Encoder Inputs,Decoder Inputs,Decoder Targets,Question Length,Answer Length
0,"hi, how are you doing?",i'm fine. how about yourself?,"hi , how are you doing ?",<sos> i ' m fine . how about yourself ? <eos>,i ' m fine . how about yourself ? <eos>,24,45
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,i ' m fine . how about yourself ?,<sos> i ' m pretty good . thanks for asking . ...,i ' m pretty good . thanks for asking . <eos>,33,51
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,i ' m pretty good . thanks for asking .,<sos> no problem . so how have you been ? <eos>,no problem . so how have you been ? <eos>,39,47
3,no problem. so how have you been?,i've been great. what about you?,no problem . so how have you been ?,<sos> i ' ve been great . what about you ? <eos>,i ' ve been great . what about you ? <eos>,35,48
4,i've been great. what about you?,i've been good. i'm in school right now.,i ' ve been great . what about you ?,<sos> i ' ve been good . i ' m in school right...,i ' ve been good . i ' m in school right now ....,36,58


In [14]:
import plotly.express as px

fig1 = px.histogram(df, x='Question Length', nbins=50, opacity=0.7)
fig2 = px.histogram(df, x='Answer Length', nbins=50, opacity=0.7)

print("Maximum Question Length:", df['Question Length'].max())
print("Maximum Answer Length:", df['Answer Length'].max())

fig1.show()
fig2.show()

Maximum Question Length: 101
Maximum Answer Length: 113


In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum number of words to keep based on word frequency
num_words = 10000

# Define the maximum sequence length
max_seq_length = 10

# Create a tokenizer and fit it on the 'Encoder Inputs' and 'Decoder Inputs' columns of the DataFrame
tokenizer = Tokenizer(num_words=num_words, oov_token='<unk>')
tokenizer.fit_on_texts(df['Encoder Inputs'].tolist() + df['Decoder Inputs'].tolist())

# Convert the text data to sequences of integers using the tokenizer
encoder_inputs = tokenizer.texts_to_sequences(df['Encoder Inputs'].tolist())
decoder_inputs = tokenizer.texts_to_sequences(df['Decoder Inputs'].tolist())
decoder_targets = tokenizer.texts_to_sequences(df['Decoder Targets'].tolist())

# Pad the sequences to ensure they all have the same length
encoder_inputs = pad_sequences(encoder_inputs, maxlen=max_seq_length, padding='post', truncating='post')
decoder_inputs = pad_sequences(decoder_inputs, maxlen=max_seq_length, padding='post', truncating='post')
decoder_targets = pad_sequences(decoder_targets, maxlen=max_seq_length, padding='post', truncating='post')

In [16]:
decoder_targets[1:3]

array([[  5,   4,  35, 161,  49, 245,  30, 481,   3,   0],
       [ 34, 173,  26,  42,  19,   6, 102,   3,   0,   0]])

In [17]:
df['Decoder Targets'][1:3]

1    i ' m pretty good . thanks for asking . <eos>
2        no problem . so how have you been ? <eos>
Name: Decoder Targets, dtype: object

In [18]:
# Get the vocabulary size of the tokenizer
vocab_size = len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 2422


In [19]:
print(encoder_inputs.shape , "\n" , decoder_inputs.shape , "\n" , decoder_targets.shape)

(3725, 10) 
 (3725, 10) 
 (3725, 10)


In [20]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
encoder_inputs_train, encoder_inputs_test, decoder_inputs_train, decoder_inputs_test, decoder_targets_train, decoder_targets_test = train_test_split(encoder_inputs, decoder_inputs, decoder_targets, test_size=0.2, random_state=42)

# Print the shapes of the train and test sets
print("Train set shapes:", encoder_inputs_train.shape, decoder_inputs_train.shape, decoder_targets_train.shape)
print("Test set shapes:", encoder_inputs_test.shape, decoder_inputs_test.shape, decoder_targets_test.shape)

Train set shapes: (2980, 10) (2980, 10) (2980, 10)
Test set shapes: (745, 10) (745, 10) (745, 10)


In [21]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model

num_encoder_tokens = len(tokenizer.word_index) + 1
num_decoder_tokens = len(tokenizer.word_index) + 1
latent_dim = 32
embedding_dim = 50

# Define the input sequence
encoder_inputs = Input(shape=(max_seq_length,))

#_________________Embedding________________________

encoder_embedding = Embedding(num_encoder_tokens, embedding_dim , mask_zero=True)
encoder_inputs_embedded = encoder_embedding(encoder_inputs)


#_________________Encoder________________________

# Encoder - LSTM1
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(encoder_inputs_embedded)

# Encoder - LSTM2
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder - LSTM2
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)
#______________________________________________

# Discard the encoder outputs and only keep the states
encoder_states = [state_h, state_c]

# Define the decoder input sequence
decoder_inputs = Input(shape=(max_seq_length,))

# Add an embedding layer
decoder_embedding = Embedding(num_decoder_tokens, embedding_dim , mask_zero=True)
decoder_inputs_embedded = decoder_embedding(decoder_inputs)

#_________________Decoder________________________

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Get the decoder outputs and states
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_embedded, initial_state=encoder_states)

# Define the decoder output layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

# Get the decoder outputs
decoder_outputs = decoder_dense(decoder_outputs)

# Define the Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 10)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 10, 50)               121150    ['input_1[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 10, 32),             10624     ['embedding[0][0]']           
                              (None, 32),                                                         
                              (None, 32)]                                                         
                                                                                             

In [22]:
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)


batch_size = 32
epochs = 30

# One-hot encode the decoder targets
decoder_targets_train = to_categorical(decoder_targets_train, num_decoder_tokens)
decoder_targets_test = to_categorical(decoder_targets_test, num_decoder_tokens)

# Define the Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'] , sample_weight_mode='temporal')

# Train the model
model.fit([encoder_inputs_train, decoder_inputs_train], decoder_targets_train,
          validation_data=([encoder_inputs_test, decoder_inputs_test], decoder_targets_test),
          batch_size=batch_size, epochs=epochs , callbacks=[early_stopping])


Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x26b3ec3ea10>

In [23]:
from tensorflow.keras.models import Model

# Define encoder model to get encoder states
encoder_model = Model(encoder_inputs, encoder_states)

# Define decoder model with encoder states as initial state
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_embedded = decoder_embedding(decoder_inputs_single)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs_single_embedded, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs_single] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Helper function to generate a response given an input sequence
def generate_response(input_seq):
    # Encode the input sequence to get the initial decoder states
    states_value = encoder_model.predict(input_seq)

    # Initialize the target sequence with a start token
    target_seq = np.array([[tokenizer.word_index['sos']]])

    stop_condition = False
    response = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token from the output distribution
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        
        # If the predicted word index is 0, use a period instead
        if sampled_token_index == 0:
            sampled_token = '.'
        else:
            sampled_token = tokenizer.index_word[sampled_token_index]
        
        response.append(sampled_token)

        # Exit condition: either hit max length or find stop token
        if sampled_token == 'eos' or len(response) > max_seq_length:
            stop_condition = True

        # Update the target sequence with the sampled token
        target_seq = np.array([[sampled_token_index]])

        # Update the decoder states
        states_value = [h, c]

    return ' '.join(response)

In [26]:
# Test the response generation
input_sequence = tokenizer.texts_to_sequences(["HI"])
input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post', truncating='post')
response = generate_response(input_sequence)
print("Input:", f'{input_sequence}')
print("Response:", response)


Response: i ' ll be to be to be to be to
