In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
import tensorflow as tf
import nltk
import re 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge




In [2]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the data
with open('../Final_Intents.json', 'r') as file:
    data = json.load(file)

In [4]:
# Function to pair each question with its corresponding response
def pair_questions_responses(data):
    paired_data = []
    for item in data:
        tag = item.get('tag', 'Unknown')
        questions = item.get('questions', [])
        responses = item.get('responses', [])

        for question, response in zip(questions, responses):
            paired_data.append({'tag': tag, 'question': question, 'response': response})

    return paired_data

In [5]:
paired_data = pair_questions_responses(data)
df = pd.DataFrame(paired_data)
df.head ()


Unnamed: 0,tag,question,response
0,cloud_computing,What is the fundamental starting point for und...,The fundamental starting point for understandi...
1,cloud_computing,How does Moringa School introduce the concept ...,Moringa School introduces the concept of Cloud...
2,cloud_computing,Why is Cloud Computing considered essential in...,Cloud Computing is considered essential in the...
3,cloud_computing,Can you provide an overview of the Cloud Compu...,The Cloud Computing program at Moringa School ...
4,cloud_computing,What are the eligibility requirements for the ...,To be eligible for the Cloud Computing program...


In [6]:
# Data Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Data Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Apply preprocessing to questions and responses
df['question'] = df['question'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

In [7]:
# Split data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['question'].tolist() + train_data['response'].tolist())


In [9]:
# Convert texts to sequences
train_questions_seq = tokenizer.texts_to_sequences(train_data['question'].tolist())
train_responses_seq = tokenizer.texts_to_sequences(train_data['response'].tolist())

In [10]:
# Finding the maximum sequence length
max_seq_length = max(max(len(seq) for seq in train_questions_seq), max(len(seq) for seq in train_responses_seq))

In [11]:
# Padding sequences
train_questions_padded = pad_sequences(train_questions_seq, maxlen=max_seq_length, padding='post')
train_responses_padded = pad_sequences(train_responses_seq, maxlen=max_seq_length, padding='post')


In [12]:
# Preparing test data
test_questions_seq = tokenizer.texts_to_sequences(test_data['question'].tolist())
test_responses_seq = tokenizer.texts_to_sequences(test_data['response'].tolist())
test_questions_padded = pad_sequences(test_questions_seq, maxlen=max_seq_length, padding='post')
test_responses_padded = pad_sequences(test_responses_seq, maxlen=max_seq_length, padding='post')

In [13]:
# One-hot encode the responses
vocab_size = len(tokenizer.word_index) + 1
train_responses_one_hot = np.zeros((len(train_responses_padded), max_seq_length, vocab_size))

for i, sequence in enumerate(train_responses_padded):
    for j, word_index in enumerate(sequence):
        train_responses_one_hot[i, j, word_index] = 1

In [14]:
from tensorflow.keras.layers import Layer
import tensorflow as tf

class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [15]:
# Updated Model Architecture
embedding_dim = 128  # Embedding dimensionality
latent_dim = 256 

# Encoder
encoder_inputs = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Bahdanau Attention
attention_layer = BahdanauAttention(latent_dim)
context_vector, attention_weights = attention_layer(state_h, encoder_outputs)

# Decoder with attention
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
context_vector_with_time_axis = tf.expand_dims(context_vector, 1)

# Broadcast the context vector to have the same shape as the decoder embedding
sequence_length = tf.shape(decoder_embedding)[1]
context_vector_broadcasted = tf.broadcast_to(context_vector_with_time_axis, [tf.shape(context_vector_with_time_axis)[0], sequence_length, tf.shape(context_vector_with_time_axis)[-1]])
decoder_input_combined = tf.concat([context_vector_broadcasted, decoder_embedding], axis=-1)

# Continue with the decoder LSTM, etc.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_input_combined, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)




In [16]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model with Adam optimizer and use categorical_crossentropy as loss
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 148)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 148, 128)             189696    ['input_1[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 256),                394240    ['embedding[0][0]']           
                              (None, 256),                                                        
                              (None, 256)]                                                        
                                                                                              

In [18]:
# Training the model
history = model.fit([train_questions_padded, train_responses_padded], train_responses_one_hot,
                    batch_size=64,
                    epochs=20,
                    validation_split=0.2)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Evaluate the model on test data
test_responses_one_hot = np.zeros((len(test_responses_padded), max_seq_length, vocab_size))

for i, sequence in enumerate(test_responses_padded):
    for j, word_index in enumerate(sequence):
        test_responses_one_hot[i, j, word_index] = 1

test_loss, test_accuracy = model.evaluate([test_questions_padded, test_responses_padded], test_responses_one_hot)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.6787163615226746
Test Accuracy: 0.9083333611488342


In [20]:
train_loss, train_accuracy = model.evaluate([train_questions_padded, train_responses_padded], train_responses_one_hot)
print("Train Loss:", train_loss)
print("Train Accuracy:", train_accuracy)

Train Loss: 0.7295042276382446
Train Accuracy: 0.9007927775382996


In [21]:
def seq_to_text(sequence, tokenizer):
    # Convert a sequence of indices back to text
    return " ".join([word for word, index in tokenizer.word_index.items() if index in sequence])