In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [23]:
# Read the question/answer pairs (the first CSV column is used as the index)
# and wrap every answer, stringified, in the <start>/<end> boundary markers.
df = (
    pd.read_csv('datasets/other_data/conversation.csv', index_col=0)
    .assign(answer=lambda frame: frame['answer'].map(lambda reply: f'<start> {reply} <end>'))
)
df

Unnamed: 0,question,answer
0,"hi, how are you doing?",<start> i'm fine. how about yourself? <end>
1,i'm fine. how about yourself?,<start> i'm pretty good. thanks for asking. <end>
2,i'm pretty good. thanks for asking.,<start> no problem. so how have you been? <end>
3,no problem. so how have you been?,<start> i've been great. what about you? <end>
4,i've been great. what about you?,<start> i've been good. i'm in school right no...
...,...,...
3720,that's a good question. maybe it's not old age.,<start> are you right-handed? <end>
3721,are you right-handed?,<start> yes. all my life. <end>
3722,yes. all my life.,<start> you're wearing out your right hand. st...
3723,you're wearing out your right hand. stop using...,<start> but i do all my writing with my right ...


In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_data(input_texts, target_texts):
    """Tokenize question/answer texts and build padded seq2seq training arrays.

    One ``Tokenizer`` is fitted on both collections, so encoder and decoder
    share a single vocabulary. ``filters=''`` disables punctuation stripping,
    which keeps marker tokens such as ``<start>`` and ``<end>`` intact.

    Args:
        input_texts: Iterable of encoder-side strings (questions).
        target_texts: Iterable of decoder-side strings (answers), expected to
            already carry their start/end markers.

    Returns:
        Tuple ``(encoder_input_data, decoder_input_data, decoder_target_data,
        tokenizer, max_sequence_length)`` where the three arrays are 2-D
        arrays of token ids, zero-padded at the end to ``max_sequence_length``
        columns, and ``decoder_target_data`` is ``decoder_input_data`` shifted
        one step to the left (sparse integer labels, not one-hot).

    Raises:
        ValueError: If either collection is empty.
    """
    # Materialise as lists: `+` on arrays/Series would add element-wise
    # instead of concatenating the two corpora.
    input_texts = list(input_texts)
    target_texts = list(target_texts)
    if not input_texts or not target_texts:
        raise ValueError('input_texts and target_texts must both be non-empty')

    # Create a tokenizer and fit on the input and target texts
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(input_texts + target_texts)

    # Convert input and target texts to sequences of integers
    encoder_input_sequences = tokenizer.texts_to_sequences(input_texts)
    decoder_input_sequences = tokenizer.texts_to_sequences(target_texts)

    # Both sides are padded to the longest sequence found on either side
    max_sequence_length = max(
        max(len(seq) for seq in encoder_input_sequences),
        max(len(seq) for seq in decoder_input_sequences),
    )
    print('max sequence length:', max_sequence_length)

    # Pad sequences (with 0, at the end) to have the same length
    encoder_input_data = pad_sequences(encoder_input_sequences, maxlen=max_sequence_length, padding='post')
    decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_sequence_length, padding='post')

    # Targets are the decoder inputs shifted one step to the left, kept as
    # integer ids. The final column stays 0 (padding): the shift has already
    # moved each row's real end marker into place, so writing another end
    # marker into the last step would label padding as a real token.
    decoder_target_data = np.zeros_like(decoder_input_data)
    decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

    # Return preprocessed data and tokenizer
    return encoder_input_data, decoder_input_data, decoder_target_data, tokenizer, max_sequence_length

In [26]:
# Extract both text columns from the frame as plain lists of strings.
input_texts, target_texts = (
    df[column].astype(str).tolist() for column in ('question', 'answer')
)

# Tokenize and pad the pairs; keep the fitted tokenizer and padded length for later cells.
(encoder_input_data,
 decoder_input_data,
 decoder_target_data,
 tokenizer,
 max_sequence_length) = preprocess_data(input_texts, target_texts)

max sequence length: 21


In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input, Embedding, Bidirectional, LSTM, Dense, Attention, Concatenate, Dot, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def create_model(vocab_size, embedding_dim, hidden_dim, max_sequence_length):
    """Build, compile and summarise an encoder-decoder model with dot-product attention.

    The encoder is a bidirectional LSTM over embedded input tokens; its final
    forward/backward states are concatenated and used to initialise a
    unidirectional decoder LSTM of width ``2 * hidden_dim``. Decoder outputs
    attend over the encoder outputs, and the attended context is concatenated
    with the decoder outputs before a softmax over the vocabulary.

    Args:
        vocab_size: Number of token ids (embedding rows and output classes).
        embedding_dim: Width of both embedding layers.
        hidden_dim: Units per direction of the encoder LSTM.
        max_sequence_length: Fixed length of both input sequences.

    Returns:
        The compiled Keras ``Model`` taking ``[encoder_tokens, decoder_tokens]``.
        The model summary is printed as a side effect.
    """
    # --- Encoder -----------------------------------------------------------
    enc_tokens = Input(shape=(max_sequence_length,))
    enc_embedded = Embedding(vocab_size, embedding_dim, mask_zero=True)(enc_tokens)
    bi_lstm = Bidirectional(LSTM(hidden_dim, return_sequences=True, return_state=True))
    enc_sequence, fwd_h, fwd_c, bwd_h, bwd_c = bi_lstm(enc_embedded)
    # Join both directions so the state width matches the decoder LSTM below.
    init_h = Concatenate()([fwd_h, bwd_h])
    init_c = Concatenate()([fwd_c, bwd_c])

    # --- Decoder -----------------------------------------------------------
    dec_tokens = Input(shape=(max_sequence_length,))
    dec_embedded = Embedding(vocab_size, embedding_dim, mask_zero=True)(dec_tokens)
    dec_lstm = LSTM(hidden_dim*2, return_sequences=True, return_state=True)
    dec_results = dec_lstm(dec_embedded, initial_state=[init_h, init_c])
    # Only the per-step outputs are needed; the final states are dropped.
    dec_sequence = Dropout(0.2)(dec_results[0])  # dropout for regularization

    # --- Attention ---------------------------------------------------------
    # Scores: dot product over the feature axis -> (batch, dec_steps, enc_steps)
    score_layer = Dot(axes=[2, 2])
    scores = score_layer([dec_sequence, enc_sequence])
    # Normalise over the last axis, i.e. across encoder steps
    weights = Activation('softmax')(scores)
    # Weighted sum of encoder outputs for every decoder step
    context = Dot(axes=[2, 1])([weights, enc_sequence])

    # Attended context and decoder outputs side by side on the feature axis
    combined = Concatenate(axis=-1)([context, dec_sequence])

    # Per-step distribution over the vocabulary
    output_layer = Dense(vocab_size, activation='softmax')
    token_probs = output_layer(combined)

    # Assemble and compile; integer targets are used with the sparse loss
    seq2seq = Model([enc_tokens, dec_tokens], token_probs)
    seq2seq.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    seq2seq.summary()

    return seq2seq

def train_model(model, encoder_input_data, decoder_input_data, decoder_target_data, batch_size, epochs, validation_split):
    """Fit the seq2seq model with early stopping and return the training history.

    Args:
        model: Compiled Keras model taking ``[encoder_input, decoder_input]``.
        encoder_input_data: Padded encoder token ids.
        decoder_input_data: Padded decoder token ids.
        decoder_target_data: Decoder targets (integer ids).
        batch_size: Samples per gradient update.
        epochs: Maximum number of epochs.
        validation_split: Fraction of the data held out for validation;
            passed straight to ``model.fit``.

    Returns:
        The object returned by ``model.fit`` (Keras ``History``), so callers
        can inspect or plot the loss/accuracy curves.
    """
    # Without a validation split there is no `val_loss` to monitor, so early
    # stopping would never trigger; fall back to the training loss instead.
    monitored_metric = 'val_loss' if validation_split else 'loss'

    # Stop after 5 epochs without improvement and roll back to the best weights
    early_stopping = EarlyStopping(monitor=monitored_metric, patience=5, restore_best_weights=True)

    # Train the model and hand the history back to the caller
    return model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_split=validation_split,
                     callbacks=[early_stopping])

In [28]:
# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling
# are repeatable from one Restart & Run All to the next (as far as the
# backend allows).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Vocabulary size: one more than the largest token id, leaving id 0 for padding
vocab_size = len(tokenizer.word_index) + 1
print('Vocab size: ', vocab_size)

# Hyperparameters
embedding_dim = 256
hidden_units = 256      # units per direction of the encoder LSTM
batch_size = 32
epochs = 100            # upper bound; early stopping usually ends training sooner
validation_split = 0.2

# Create the model
model = create_model(vocab_size, embedding_dim, hidden_units, max_sequence_length)

# Train the model
train_model(model, encoder_input_data, decoder_input_data, decoder_target_data, batch_size, epochs, validation_split)

Vocab size:  4042
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 21)]                 0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 21, 256)              1034752   ['input_4[0][0]']             
                                                                                                  
 input_5 (InputLayer)        [(None, 21)]                 0         []                            
                                                                                                  
 bidirectional_2 (Bidirecti  [(None, 21, 512),            1050624   ['embedding_3[0][0]']         
 onal)                        (None, 256),                                

In [30]:
import pickle
from pathlib import Path

# Create the output directory up front: without it both saves below fail,
# and the result of a long training run would be lost.
Path("models").mkdir(parents=True, exist_ok=True)

# Save the trained model
model.save("models/Conv_Bidirectional_model.h5")

# Save the fitted tokenizer next to the model; inference needs the same vocabulary.
# NOTE: only unpickle this file from a trusted source, because pickle.load can
# execute arbitrary code.
with open("models/Conv_Bidirectional_tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)