In [2]:
#Setup
import re, os, io, zipfile, requests, pickle, numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense,
    Concatenate, TimeDistributed, AdditiveAttention
)
from tensorflow.keras.models import Model

print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.19.0


In [3]:
# Download the Cornell Movie Dialogs Corpus
url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("Datasets")

DATA_DIR = "Datasets/cornell movie-dialogs corpus"
LINE_FILE = os.path.join(DATA_DIR, "movie_lines.txt")
CONV_FILE = os.path.join(DATA_DIR, "movie_conversations.txt")
print("Dataset ready")

Dataset ready


In [4]:
#Load and clean data
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text.strip()

# Build id2line and conversations
id2line = {}
with open(LINE_FILE, encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.split(" +++$+++ ")
        if len(parts) == 5:
            id2line[parts[0]] = parts[4].strip()

conversations = []
with open(CONV_FILE, encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.split(" +++$+++ ")
        if len(parts) == 4:
            conv_ids = parts[3].strip()[1:-1].replace("'", "").replace(" ", "").split(',')
            conversations.append(conv_ids)

# Form question-answer pairs
questions, answers = [], []
for conv in conversations:
    for i in range(len(conv) - 1):
        q = id2line.get(conv[i])
        a = id2line.get(conv[i + 1])
        if q and a:
            questions.append(clean_text(q))
            answers.append(clean_text(a))

print("Total pairs before filtering:", len(questions))

# Keep only short sentences
MIN_LEN, MAX_LEN = 2, 20
filtered_qs, filtered_as = [], []
for q, a in zip(questions, answers):
    if MIN_LEN <= len(q.split()) <= MAX_LEN and MIN_LEN <= len(a.split()) <= MAX_LEN:
        filtered_qs.append(q)
        filtered_as.append(a)

questions, answers = filtered_qs, filtered_as
answers = ["<sos> " + a + " <eos>" for a in answers]
print("Pairs after filtering:", len(questions))


Total pairs before filtering: 221282
Pairs after filtering: 138164


In [5]:
#Tokenize and pad sequences
NUM_SAMPLES = 30000
questions = questions[:NUM_SAMPLES]
answers = answers[:NUM_SAMPLES]

tokenizer_enc = Tokenizer(filters='', oov_token="<out>")
tokenizer_enc.fit_on_texts(questions)
tokenizer_dec = Tokenizer(filters='', oov_token="<out>")
tokenizer_dec.fit_on_texts(answers)

encoder_seq = tokenizer_enc.texts_to_sequences(questions)
decoder_seq = tokenizer_dec.texts_to_sequences(answers)

max_enc_len = max(len(seq) for seq in encoder_seq)
max_dec_len = max(len(seq) for seq in decoder_seq)
print("Max lengths:", max_enc_len, max_dec_len)

encoder_input = pad_sequences(encoder_seq, maxlen=max_enc_len, padding='post')
decoder_input = pad_sequences([seq[:-1] for seq in decoder_seq], maxlen=max_dec_len-1, padding='post')
decoder_target = pad_sequences([seq[1:] for seq in decoder_seq], maxlen=max_dec_len-1, padding='post')

enc_vocab = len(tokenizer_enc.word_index) + 1
dec_vocab = len(tokenizer_dec.word_index) + 1
print("Vocab sizes:", enc_vocab, dec_vocab)


Max lengths: 20 22
Vocab sizes: 14104 14127


In [6]:
#Build RNN (Seq2Seq + Attention)
EMBED_DIM = 128
UNITS = 128

# Encoder
enc_inputs = Input(shape=(None,))
enc_emb = Embedding(enc_vocab, EMBED_DIM)(enc_inputs) # Removed mask_zero=True
enc_lstm = LSTM(UNITS, return_sequences=True, return_state=True)
enc_outs, enc_h, enc_c = enc_lstm(enc_emb)

# Decoder
dec_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(dec_vocab, EMBED_DIM) # Removed mask_zero=True
dec_emb = dec_emb_layer(dec_inputs)
dec_lstm = LSTM(UNITS, return_sequences=True, return_state=True)
dec_outs, _, _ = dec_lstm(dec_emb, initial_state=[enc_h, enc_c])

# Attention
attn = AdditiveAttention()
context = attn([dec_outs, enc_outs])
concat = Concatenate(axis=-1)([dec_outs, context])
dense = TimeDistributed(Dense(dec_vocab, activation='softmax'))
dec_pred = dense(concat)

model = Model([enc_inputs, dec_inputs], dec_pred)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

In [14]:
#Train the model
BATCH = 64
EPOCHS = 30

decoder_target_expanded = np.expand_dims(decoder_target, -1)

history = model.fit(
    [encoder_input, decoder_input],
    decoder_target_expanded,
    batch_size=BATCH,
    epochs=EPOCHS,
    validation_split=0.1
)

model.save("chatbot_rnn_tf2.h5")
print("Model saved")


Epoch 1/30
[1m154/422[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m11s[0m 41ms/step - loss: 0.5680

KeyboardInterrupt: 

In [12]:
#Build inference (chat) models
# Encoder inference model
enc_model = Model(enc_inputs, [enc_outs, enc_h, enc_c])

# Decoder inference
dec_state_in_h = Input(shape=(UNITS,))
dec_state_in_c = Input(shape=(UNITS,))
enc_outs_in = Input(shape=(None, UNITS))
dec_in_single = Input(shape=(1,))

dec_emb2 = dec_emb_layer(dec_in_single)
dec_out2, state_h2, state_c2 = dec_lstm(dec_emb2, initial_state=[dec_state_in_h, dec_state_in_c])
context2 = attn([dec_out2, enc_outs_in])
concat2 = Concatenate(axis=-1)([dec_out2, context2])
dec_pred2 = dense(concat2)
dec_model = Model([dec_in_single, enc_outs_in, dec_state_in_h, dec_state_in_c],
                  [dec_pred2, state_h2, state_c2])

# Index maps
index2word_dec = {v: k for k, v in tokenizer_dec.word_index.items()}
index2word_dec[0] = ""

sos_id = tokenizer_dec.word_index['<sos>']
eos_id = tokenizer_dec.word_index['<eos>']
print("Ready to chat")

Ready to chat


In [13]:
#Chat with the bot!
def clean_input(sentence):
    sentence = clean_text(sentence)
    seq = tokenizer_enc.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_enc_len, padding='post')
    return seq

def reply(sentence):
    seq = clean_input(sentence)
    enc_out, h, c = enc_model.predict(seq)
    target_seq = np.array([[sos_id]])
    decoded = []
    for _ in range(max_dec_len):
        pred, h, c = dec_model.predict([target_seq, enc_out, h, c])
        token = np.argmax(pred[0, -1, :])
        word = index2word_dec.get(token, '')
        if word == '' or word == '<eos>':
            break
        decoded.append(word)
        target_seq = np.array([[token]])
    return " ".join(decoded)

print("Start chatting! Type 'quit' to stop.")
while True:
    user = input("You: ")
    if user.lower() == "quit":
        print("Chatbot: Bye 👋")
        break
    print("Chatbot:", reply(user))


Start chatting! Type 'quit' to stop.
You: hi! how are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Chatbot: later where are you doing
You: how are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━