#Chatbot

###Importing Modules

In [33]:
import pandas as pd
import re
import nltk
import numpy as np
import tensorflow as tf
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, LayerNormalization, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

###Loading resources

In [19]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

lines = open("movie_lines.txt", encoding="utf-8", errors="ignore").read().split("\n")
conversations = open("movie_conversations.txt", encoding="utf-8", errors="ignore").read().split("\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
# Extracting dialogues
id2line = {}
for line in lines:
    parts = line.split(" +++$+++ ")
    if len(parts) == 5:
        id2line[parts[0]] = parts[4]

# Creating conversation pairs
conversations_data = []
for conv in conversations:
    parts = conv.split(" +++$+++ ")
    if len(parts) >= 4:
        line_ids = eval(parts[3])
        for i in range(len(line_ids) - 1):
            conversations_data.append((id2line.get(line_ids[i], ""), id2line.get(line_ids[i+1], "")))

df = pd.DataFrame(conversations_data, columns=["User_Input", "Bot_Response"])
stop_words = set(stopwords.words('english'))

In [21]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["Processed_Input"] = df["User_Input"].apply(preprocess_text)
df["Processed_Output"] = df["Bot_Response"].apply(preprocess_text)

In [22]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["Processed_Input"])

X_sequences = tokenizer.texts_to_sequences(df["Processed_Input"])
y_sequences = tokenizer.texts_to_sequences(df["Processed_Output"])

X_padded = pad_sequences(X_sequences, maxlen=20, padding="post")
y_padded = pad_sequences(y_sequences, maxlen=20, padding="post")

X_padded = np.array(X_padded)
y_padded = np.array(y_padded)

y_padded = np.expand_dims(y_padded, -1)

###Model Definition

In [38]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


embed_dim = 128
num_heads = 4
ff_dim = 128

inputs = Input(shape=(20,))
embedding_layer = Embedding(input_dim=5000, output_dim=embed_dim)(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)(embedding_layer)
output_layer = Dense(5000, activation="softmax")(transformer_block)

transformer_model = Model(inputs=inputs, outputs=output_layer)
transformer_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
transformer_model.summary()

###Model Training

In [39]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_padded, test_size=0.2, random_state=42)


In [40]:
# Train the model with the train data and validate it on the test data
transformer_model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
[1m 1402/11081[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m28:44[0m 178ms/step - accuracy: 0.7555 - loss: 2.4012

KeyboardInterrupt: 

In [None]:
test_loss, test_accuracy = transformer_model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


###Running Chatbot in Console

In [25]:
transformer_model = tf.keras.models.load_model("chatbot_transformer.h5", custom_objects={"TransformerBlock": TransformerBlock})
tokenizer = joblib.load("tokenizer.pkl")

def get_response(user_input):
    sequence = tokenizer.texts_to_sequences([user_input])
    padded = pad_sequences(sequence, maxlen=20, padding="post")
    predicted_sequence = transformer_model.predict(padded)
    predicted_index = np.argmax(predicted_sequence[0])
    response = tokenizer.index_word.get(predicted_index, "I don't understand.")
    return response

print("Chatbot is ready! Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    response = get_response(user_input)
    print("Chatbot:", response)




Chatbot is ready! Type 'exit' to quit.
You: hi
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step
Chatbot: require
You: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Chatbot: require
You: i love you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Chatbot: I don't understand.
You: give me few movie names
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Chatbot: I don't understand.
You: i love the smiths
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Chatbot: I don't understand.
You: exit
Chatbot: Goodbye!
