In [2]:
!pip install tensorflow pandas numpy

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

print("✅ Libraries loaded successfully")

✅ Libraries loaded successfully


In [3]:
df = pd.read_csv("dialogs.txt", sep="\t", header=None, names=["input", "target"])
df

Unnamed: 0,input,target
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [22]:
# Droping missing rows, strip text
df = df.dropna().astype(str)
df["input"] = df["input"].str.strip()
df["target"] = df["target"].str.strip()

In [23]:
# Add start/end tokens
start_token = "\t"
end_token = "\n"
df["decoder_target"] = start_token + df["target"] + end_token
print(df.head())
print(f"Total pairs: {len(df)}")

                                 input  \
0               hi, how are you doing?   
1        i'm fine. how about yourself?   
2  i'm pretty good. thanks for asking.   
3    no problem. so how have you been?   
4     i've been great. what about you?   

                                     target  \
0             i'm fine. how about yourself?   
1       i'm pretty good. thanks for asking.   
2         no problem. so how have you been?   
3          i've been great. what about you?   
4  i've been good. i'm in school right now.   

                                 decoder_target  
0             \ti'm fine. how about yourself?\n  
1       \ti'm pretty good. thanks for asking.\n  
2         \tno problem. so how have you been?\n  
3          \ti've been great. what about you?\n  
4  \ti've been good. i'm in school right now.\n  
Total pairs: 3725


In [24]:
#Preprocessing Data
num_words = 8000
enc_tokenizer = Tokenizer(num_words=num_words, filters='')
dec_tokenizer = Tokenizer(num_words=num_words, filters='')

In [25]:
enc_tokenizer.fit_on_texts(df["input"])
dec_tokenizer.fit_on_texts(df["decoder_target"])

In [26]:
encoder_sequences = enc_tokenizer.texts_to_sequences(df["input"])
decoder_sequences = dec_tokenizer.texts_to_sequences(df["decoder_target"])

max_enc_len = max(len(s) for s in encoder_sequences)
max_dec_len = max(len(s) for s in decoder_sequences)

In [27]:
encoder_input_data = pad_sequences(encoder_sequences, maxlen=max_enc_len, padding="post")
decoder_input_data = pad_sequences([s[:-1] for s in decoder_sequences], maxlen=max_dec_len-1, padding="post")
decoder_target_data = pad_sequences([s[1:] for s in decoder_sequences], maxlen=max_dec_len-1, padding="post")
decoder_target_data = np.expand_dims(decoder_target_data, -1)

In [28]:
#Creating Model
embedding_dim = 100
latent_dim = 256

enc_vocab_size = len(enc_tokenizer.word_index) + 1
dec_vocab_size = len(dec_tokenizer.word_index) + 1

In [29]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(enc_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
_, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
enc_states = [state_h, state_c]

In [30]:
# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(dec_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
dec_outputs, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=enc_states)
dec_dense = Dense(dec_vocab_size, activation="softmax")
dec_outputs = dec_dense(dec_outputs)

In [31]:
model = Model([encoder_inputs, decoder_inputs], dec_outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [14]:
#Training Model
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=20,
    validation_split=0.1)

Epoch 1/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 950ms/step - accuracy: 0.0082 - loss: 8.1932 - val_accuracy: 0.0119 - val_loss: 6.9823
Epoch 2/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 936ms/step - accuracy: 0.0121 - loss: 6.7302 - val_accuracy: 0.0118 - val_loss: 6.8972
Epoch 3/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 928ms/step - accuracy: 0.0134 - loss: 6.6009 - val_accuracy: 0.0128 - val_loss: 6.8978
Epoch 4/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 895ms/step - accuracy: 0.0138 - loss: 6.5336 - val_accuracy: 0.0121 - val_loss: 6.8943
Epoch 5/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 930ms/step - accuracy: 0.0138 - loss: 6.5172 - val_accuracy: 0.0136 - val_loss: 6.9071
Epoch 6/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 939ms/step - accuracy: 0.0140 - loss: 6.4920 - val_accuracy: 0.0136 - val_loss: 6.8999
Epoch 7/20
[1m53/53[

In [32]:
# ================================
# Step 6: Build Inference Models
# ================================
encoder_model = Model(encoder_inputs, enc_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_inf = Input(shape=(None,))
dec_emb2 = Embedding(dec_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs_inf) # reuse embedding
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states = [state_h2, state_c2]
decoder_dense = Dense(dec_vocab_size, activation="softmax")
dec_outputs2 = decoder_dense(dec_outputs2)

decoder_model = Model([decoder_inputs_inf] + decoder_states_inputs,
                      [dec_outputs2] + decoder_states)

reverse_dec_index = {i: w for w, i in dec_tokenizer.word_index.items()}
reverse_dec_index[0] = ""

In [33]:
def reply(text):
    seq = enc_tokenizer.texts_to_sequences([text])
    seq = pad_sequences(seq, maxlen=max_enc_len, padding="post")
    return decode_sequence(seq)

print("✅ Chatbot is ready!")

✅ Chatbot is ready!
