<a href="https://colab.research.google.com/github/CodingYodha/QnA-bot-using-GRU/blob/main/QnA_with_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!git clone https://github.com/CodingYodha/QnA-bot-using-GRU.git

fatal: destination path 'QnA-bot-using-GRU' already exists and is not an empty directory.


Downloading the dataset using ConvoKit

In [19]:
# from convokit import Corpus, download
# corpus = Corpus(filename=download("movie-corpus"))

In [20]:
# corpus.print_summary_stats()

In [21]:
# corpus

Loading the corpus and extracting every utterance into a list of records (a format that can be passed directly to `pd.DataFrame`)


In [22]:
import pandas as pd
from convokit import Corpus, download

# Fetch the movie-dialogue corpus through convokit, load it, and report its size
corpus = Corpus(filename=download("movie-corpus"))
corpus.print_summary_stats()

# Flatten every utterance into a plain dict record
data = [
    {
        "conversation_id": utterance.conversation_id,
        "utterance_id": utterance.id,
        "speaker": utterance.speaker.id,
        "text": utterance.text,
    }
    for utterance in corpus.iter_utterances()
]




Downloading movie-corpus to /root/.convokit/saved-corpora/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


`data` is now a list of dictionaries (JSON-like records). We need to convert it into question–answer pairs that the model can be trained on.


In [24]:
# Bucket utterances per conversation, preserving the order in which they occur in `data`
from collections import defaultdict

conversations = defaultdict(list)
for record in data:
    conversations[record['conversation_id']].append(
        (record['utterance_id'], record['speaker'], record['text'])
    )

# Every pair of adjacent utterances in a conversation becomes one
# (question, answer) example: utterance i is the question, utterance i + 1 the answer.
# NOTE(review): this relies on the corpus yielding utterances in chronological
# order within each conversation; if it does not, the pairs are reversed — verify.
questions = []
answers = []

for turns in conversations.values():
    texts = [turn[2] for turn in turns]  # index 2 holds the utterance text
    questions.extend(texts[:-1])
    answers.extend(texts[1:])

# Spot-check one pair
print("Question:", questions[100])
print("Answer:", answers[100])

Question: What do you think?
Answer: Oh, I thought you might have a date  I don't know why I'm bothering to ask, but are you going to Bogey Lowenstein's party Saturday night?


Cleaning the text<br>
Remove unnecessary characters and normalize the text.

In [25]:
import re

In [26]:
def clean_text(text):
    """Normalise a raw utterance before tokenisation.

    The text is lower-cased, a handful of common English contractions are
    expanded, every run of characters other than letters and ``? . !`` is
    collapsed into a single space, and surrounding whitespace is stripped.

    Args:
        text: The raw utterance string.

    Returns:
        The cleaned, lower-case string.
    """
    # Lower-case FIRST: the contraction patterns below are case-sensitive, so
    # capitalised forms such as "I'm" or "YOU'RE" would otherwise be missed
    # and end up as "i m" / "you re".
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    # Replace anything that is not a letter or sentence punctuation with one space
    text = re.sub(r"[^a-zA-Z?.!]+", " ", text)
    return text.strip()

# Normalise both sides of every pair with clean_text
questions = list(map(clean_text, questions))

# Clean each answer and wrap it in <start> ... <end> markers for the decoder.
# NOTE(review): this cell overwrites its own inputs, so running it twice
# re-cleans the already-wrapped answers (clean_text strips '<' and '>').
answers = ['<start> ' + clean_text(a) + ' <end>' for a in answers]

Tokenization & Padding<br> Tokenization converts text into sequences of integers; padding makes all of those sequences the same length.

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Tokenization: converting text to a numerical representation.
<br>
`oov_token="<OOV>"` ensures that words which are not in the tokenizer's fitted vocabulary <br> are not silently dropped but are replaced with the `<OOV>` token.

In [28]:
# Word-level tokenizer: filters="" keeps every character (so the <start>/<end>
# markers survive), and words unseen at fit time map to the <OOV> token.
tokenizer = Tokenizer(oov_token="<OOV>", filters="")
# Build one shared vocabulary from questions and answers together
tokenizer.fit_on_texts([*questions, *answers])

tokenizer.word_index: A dictionary where keys are words and values are their respective integer indices.

len(tokenizer.word_index): Counts how many unique words are in the vocabulary.

+1: Word indices start at 1 (the `<OOV>` token is already part of `word_index`), and index 0 is reserved for padding, so the embedding and output layers need one extra slot.

In [29]:
# Number of distinct token ids: every entry of word_index plus one extra slot
VOCAB_SIZE = 1 + len(tokenizer.word_index)

In [30]:
# Map each cleaned sentence to its list of integer token ids
question_sequences, answer_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (questions, answers)
)

texts_to_sequences: Converts each text into a sequence of integers based on the tokenizer's word index.

If a word exists in the vocabulary, it's replaced by its index.

If a word does not exist, it is replaced with the `<OOV>` token's index.

For example:

Input: "How are you?"

Output: [12, 4, 7] (depending on the assigned indices).

In [31]:
# Bring every sequence to exactly MAX_LENGTH tokens
MAX_LENGTH = 19  # Adjust based on your data

# Shared options: pad and truncate at the END of each sequence
_pad_options = {"maxlen": MAX_LENGTH, "padding": 'post', "truncating": 'post'}

question_padded = pad_sequences(question_sequences, **_pad_options)
answer_padded = pad_sequences(answer_sequences, **_pad_options)

pad_sequences: Ensures all sequences have the same length by adding padding or truncating.

maxlen=MAX_LENGTH: Specifies the maximum length of the sequences (here, 19). Longer sequences are truncated, and shorter ones are padded.

padding='post': Adds padding (e.g., zeros) to the end of the sequence.

truncating='post': Truncates longer sequences from the end.

## Building GRU Model (Seq2Seq with Attention)

In [32]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU, Dense, Embedding, Input, Attention
from tensorflow.keras.models import Model

In [33]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Attention, Concatenate
from tensorflow.keras.models import Model

# ----- Hyper-parameters -----
VOCAB_SIZE = len(tokenizer.word_index) + 1
MAX_LENGTH = 19    # Adjust as needed
EMBEDDING_DIM = 256
GRU_UNITS = 256

# NOTE(review): neither Embedding sets mask_zero, so padded positions appear to
# contribute to the loss and to the reported accuracy — confirm this is intended.

# ----- Encoder -----
encoder_inputs = Input(shape=(MAX_LENGTH,), name='encoder_inputs')
enc_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM, name='encoder_embedding')(encoder_inputs)
# return_sequences=True exposes the per-step outputs (a 3D tensor) that the
# attention layer needs; return_state=True exposes the final state used to
# initialise the decoder.
encoder_gru = GRU(GRU_UNITS, return_sequences=True, return_state=True, name='encoder_gru')
encoder_outputs, encoder_state = encoder_gru(enc_emb)

# ----- Decoder -----
# One step shorter than the encoder input: it receives the answer without its
# last position.
decoder_inputs = Input(shape=(MAX_LENGTH - 1,), name='decoder_inputs')
dec_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM, name='decoder_embedding')(decoder_inputs)
decoder_gru = GRU(GRU_UNITS, return_sequences=True, return_state=True, name='decoder_gru')
decoder_outputs, _ = decoder_gru(dec_emb, initial_state=encoder_state)

# ----- Attention -----
# Query = decoder outputs, value = the full sequence of encoder outputs
attention_layer = Attention(name='attention_layer')
attention_output = attention_layer([decoder_outputs, encoder_outputs])

# Join each decoder step with its attention context along the feature axis
decoder_concat = Concatenate(axis=-1, name='decoder_concat')(
    [decoder_outputs, attention_output]
)

# ----- Output projection -----
# Per-step probability distribution over the vocabulary
decoder_dense = Dense(VOCAB_SIZE, activation='softmax', name='decoder_dense')
output = decoder_dense(decoder_concat)

# ----- Assemble and compile -----
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Inputs expected from earlier cells ---
# question_padded: shape (num_samples, MAX_LENGTH)
# answer_padded:   shape (num_samples, MAX_LENGTH)
# model:           the compiled seq2seq model
# np:              NumPy, imported in the model-building section above

# Teacher forcing: the decoder is fed the answer without its last position and
# is trained to predict the same answer shifted one step to the left.
decoder_input_data = answer_padded[:, :-1]   # shape (num_samples, MAX_LENGTH-1)
decoder_output_data = answer_padded[:, 1:]   # shape (num_samples, MAX_LENGTH-1)

# Training configuration
BATCH_SIZE = 32
EPOCHS = 30
TRAIN_FRACTION = 0.8     # set to 1.0 to train on every sample without validation
SPLIT_SEED = 42          # makes the train/validation split reproducible
SHUFFLE_BUFFER = 10000

num_samples = answer_padded.shape[0]
train_size = int(TRAIN_FRACTION * num_samples)

# Split at the SAMPLE level, on the NumPy arrays, before any tf.data pipeline
# is built. The previous version first trained on the full dataset and then
# "validated" on samples the model had already seen, and it tried to rebuild
# datasets from a Python list of ragged batch tuples (which also forced the
# whole dataset into memory).
_permutation = np.random.default_rng(SPLIT_SEED).permutation(num_samples)
_train_idx = _permutation[:train_size]
_val_idx = _permutation[train_size:]


def _make_dataset(indices, shuffle):
    """Build a batched, prefetched dataset of ((question, decoder_in), decoder_out).

    Args:
        indices: 1-D integer array selecting which samples to include.
        shuffle: Whether to reshuffle the samples on every epoch.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` samples.
    """
    ds = tf.data.Dataset.from_tensor_slices(
        (
            (question_padded[indices], decoder_input_data[indices]),
            decoder_output_data[indices],
        )
    )
    if shuffle:
        ds = ds.shuffle(buffer_size=SHUFFLE_BUFFER)
    return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Full dataset, kept under its original name for any later cell that uses it
dataset = _make_dataset(_permutation, shuffle=True)

train_dataset = _make_dataset(_train_idx, shuffle=True)
# No validation set when every sample is used for training
val_dataset = _make_dataset(_val_idx, shuffle=False) if len(_val_idx) else None

# Train exactly once; validation metrics are computed on held-out samples only
model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS)


Epoch 1/30
