In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the patient/doctor dialogue pairs from disk.
df = pd.read_csv('datasets/patient-doctor.csv')

# Wrap every doctor reply in explicit sequence markers so the decoder can
# learn where a response begins and where it ends.
df['Doctor'] = df['Doctor'].map(lambda reply: '<start> ' + reply + ' <end>')

# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,Patient,Doctor
0,hello good morning doctor,<start> good morning how are you feeling today...
1,ive been feeling quite anxious lately its been...,<start> i see can you tell me more about what ...
2,i think its mainly related to my job and the p...,<start> stress at work can definitely take a t...
3,i work in a highly demanding environment and i...,<start> that sounds tough do you have any supp...
4,i try to talk to my friends but they dont alwa...,<start> having a strong support system is impo...
...,...,...
1503,i find it difficult to cope and the grief ofte...,<start> coping with grief can be emotionally e...
1504,i havent been very open about my struggles as ...,<start> its common to feel hesitant about shar...
1505,lately i havent been actively practicing selfc...,<start> practicing selfcompassion and engaging...
1506,i havent sought professional help yet im unsur...,<start> seeking professional help such as ther...


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_data(input_texts, target_texts):
    """Tokenize and pad dialogue pairs for teacher-forced seq2seq training.

    A single ``Tokenizer`` (created with ``filters=''`` so markers such as
    ``<start>`` / ``<end>`` are not stripped) is fitted on both sides of the
    conversation, every text is converted to integer ids, and all sequences
    are post-padded with 0 to one common length.

    Args:
        input_texts: Iterable of encoder-side strings.
        target_texts: Iterable of decoder-side strings, one per input text.

    Returns:
        Tuple ``(encoder_input_data, decoder_input_data, decoder_target_data,
        tokenizer, max_sequence_length)``. ``decoder_target_data`` holds
        integer ids (not one-hot vectors): it is ``decoder_input_data``
        shifted one step to the left, with 0 in the freed last column.

    Raises:
        ValueError: If either collection is empty or their lengths differ.
    """
    # Materialise as lists first: `+` on pandas Series / NumPy arrays would
    # combine the texts element-wise instead of concatenating the corpora.
    input_texts = list(input_texts)
    target_texts = list(target_texts)
    if not input_texts or not target_texts:
        raise ValueError("input_texts and target_texts must not be empty")
    if len(input_texts) != len(target_texts):
        raise ValueError(
            "input_texts and target_texts must have the same length "
            f"(got {len(input_texts)} and {len(target_texts)})"
        )

    # Create a tokenizer and fit it on the input and target texts
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(input_texts + target_texts)

    # Convert input and target texts to sequences of integers
    encoder_input_sequences = tokenizer.texts_to_sequences(input_texts)
    decoder_input_sequences = tokenizer.texts_to_sequences(target_texts)

    # Longest sequence on either side defines the common padded length
    max_sequence_length = max(
        len(seq) for seq in encoder_input_sequences + decoder_input_sequences
    )

    # Pad sequences (at the end) so they all have the same length
    encoder_input_data = pad_sequences(encoder_input_sequences, maxlen=max_sequence_length, padding='post')
    decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_sequence_length, padding='post')

    # Targets are the decoder inputs shifted left by one time step. The last
    # column must stay 0 (padding): each row's real `<end>` id is already moved
    # into place by the shift, so writing `<end>` into the final column would
    # add a second, spurious label after the padding.
    decoder_target_data = np.zeros_like(decoder_input_data)
    decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

    # Return preprocessed data and tokenizer
    return encoder_input_data, decoder_input_data, decoder_target_data, tokenizer, max_sequence_length

In [None]:
# Pull both dialogue columns out of the frame as plain Python lists of strings
input_texts, target_texts = (
    df[column].astype(str).tolist() for column in ('Patient', 'Doctor')
)

# Tokenize and pad the pairs; keep the fitted tokenizer and the padded length
# because the model definition and the decoding cells need them later.
(encoder_input_data,
 decoder_input_data,
 decoder_target_data,
 tokenizer,
 max_sequence_length) = preprocess_data(input_texts, target_texts)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Input, Embedding, Bidirectional, LSTM, Dense, Attention, Concatenate, Dot, Dropout
from tensorflow.keras.callbacks import EarlyStopping


def create_model(vocab_size, embedding_dim, hidden_units, max_sequence_length):
    """Build and compile an LSTM encoder-decoder with dot-product attention.

    The encoder LSTM's final hidden/cell states initialise the decoder LSTM.
    Each decoder step attends over all encoder outputs; the resulting context
    vector is concatenated with the decoder output and projected onto the
    vocabulary with a softmax.

    Args:
        vocab_size: Number of token ids, used for both embeddings and the
            output layer.
        embedding_dim: Size of the token embeddings.
        hidden_units: Width of the encoder and decoder LSTMs.
        max_sequence_length: Fixed length of the encoder and decoder inputs.

    Returns:
        A compiled Keras ``Model`` that maps ``[encoder_inputs, decoder_inputs]``
        to per-step token probabilities (Adam optimizer,
        ``sparse_categorical_crossentropy`` loss, accuracy metric).
    """
    # Encoder
    encoder_inputs = Input(shape=(max_sequence_length,))
    encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
    encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

    # Decoder, started from the encoder's final states
    decoder_inputs = Input(shape=(max_sequence_length,))
    decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
    decoder_outputs = Dropout(0.2)(decoder_outputs)  # Dropout for regularization

    # Attention mechanism: dot-product attention with the decoder outputs as
    # query and the encoder outputs as value. The Keras Attention layer does
    # the same score -> softmax -> weighted-sum computation as two Dot layers
    # around a softmax, but (per the Keras documentation) also applies the
    # masks propagated from the `mask_zero=True` embeddings, so padded encoder
    # positions no longer receive attention weight.
    context_vector = Attention()([decoder_outputs, encoder_outputs])

    # Concatenate the context vector and decoder outputs
    decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

    # Dense layer for generating the final output
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_combined_context)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


def create_model_2(vocab_size, embedding_dim, hidden_dim, max_sequence_length):
    """Build and compile a bidirectional-encoder seq2seq model with attention.

    The encoder is a bidirectional LSTM; its forward and backward final states
    are concatenated and used to initialise a decoder LSTM of width
    ``hidden_dim * 2`` (so the state sizes match). Each decoder step attends
    over all encoder outputs; the context vector is concatenated with the
    decoder output and projected onto the vocabulary with a softmax.

    Args:
        vocab_size: Number of token ids, used for both embeddings and the
            output layer.
        embedding_dim: Size of the token embeddings.
        hidden_dim: Width of each encoder direction; the decoder uses twice
            this width.
        max_sequence_length: Fixed length of the encoder and decoder inputs.

    Returns:
        A compiled Keras ``Model`` that maps ``[encoder_inputs, decoder_inputs]``
        to per-step token probabilities (Adam optimizer,
        ``sparse_categorical_crossentropy`` loss, accuracy metric).
    """
    # Encoder
    encoder_inputs = Input(shape=(max_sequence_length,))
    encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
    encoder_lstm = Bidirectional(LSTM(hidden_dim, return_sequences=True, return_state=True))
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
    # Merge both directions so the states fit the (2 * hidden_dim)-wide decoder
    encoder_state_h = Concatenate()([forward_h, backward_h])
    encoder_state_c = Concatenate()([forward_c, backward_c])

    # Decoder, started from the merged encoder states
    decoder_inputs = Input(shape=(max_sequence_length,))
    decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(hidden_dim*2, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])
    decoder_outputs = Dropout(0.2)(decoder_outputs)  # Dropout for regularization

    # Attention mechanism: dot-product attention with the decoder outputs as
    # query and the encoder outputs as value. The Keras Attention layer does
    # the same score -> softmax -> weighted-sum computation as two Dot layers
    # around a softmax, but (per the Keras documentation) also applies the
    # masks propagated from the `mask_zero=True` embeddings, so padded encoder
    # positions no longer receive attention weight.
    context_vector = Attention()([decoder_outputs, encoder_outputs])

    # Concatenate the context vector and decoder outputs
    decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

    # Dense layer for generating the final output
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_combined_context)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

def train_model(model, encoder_input_data, decoder_input_data, decoder_target_data, batch_size, epochs, validation_split):
    """Fit the seq2seq model with early stopping and return the training history.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        encoder_input_data: Padded encoder token ids.
        decoder_input_data: Padded decoder token ids (teacher-forcing inputs).
        decoder_target_data: Decoder targets, i.e. the decoder inputs shifted
            by one step.
        batch_size: Samples per gradient update.
        epochs: Upper bound on training epochs; early stopping may end sooner.
        validation_split: Fraction of the data held out for validation.

    Returns:
        Whatever ``model.fit`` returns (a Keras ``History`` for Keras models),
        so callers can inspect or plot the learning curves.
    """
    # `val_loss` only exists when part of the data is held out; fall back to
    # the training loss so early stopping still has a metric to watch.
    monitored_metric = 'val_loss' if validation_split else 'loss'

    # Stop after 5 epochs without improvement and roll back to the best weights
    early_stopping = EarlyStopping(monitor=monitored_metric, patience=5, restore_best_weights=True)

    # Train the model
    return model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
        callbacks=[early_stopping],
    )

In [None]:
# Vocabulary size: one slot per indexed word plus one extra, because id 0 is
# the padding value that the embeddings mask out (`mask_zero=True`).
vocab_size = 1 + len(tokenizer.word_index)
print('Vocab size: ', vocab_size)

# Hyperparameters
embedding_dim, hidden_units = 256, 256
batch_size, epochs = 32, 100
validation_split = 0.2

# Build the bidirectional-encoder variant of the model
model = create_model_2(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_units,
    max_sequence_length=max_sequence_length,
)

# Train the model
train_model(
    model,
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
)

In [None]:
def beam_search_decoder(model, input_sequence, tokenizer, beam_width, max_len):
    """Decode a reply for ``input_sequence`` using beam search.

    Every hypothesis starts from the ``<start>`` token, mirroring the decoder
    inputs used during training. At each step the decoder input is post-padded
    (again as in training) and the distribution at the position of the last
    real token is used to extend the hypothesis. Hypotheses that emit
    ``<end>`` are kept as finished and are not extended further; the search
    stops once every hypothesis in the beam is finished or after ``max_len``
    steps.

    Args:
        model: Object whose ``predict([encoder_batch, decoder_batch], verbose=0)``
            returns next-token probabilities indexed as
            ``[batch, time step, token id]``.
        input_sequence: Encoder batch holding the single padded input sequence.
        tokenizer: Object providing ``word_index`` (containing ``<start>`` and
            ``<end>``) and ``sequences_to_texts``.
        beam_width: Number of hypotheses kept after each step (>= 1).
        max_len: Padded decoder length; also the maximum number of decoding steps.

    Returns:
        The highest-scoring hypothesis as text, without the ``<start>`` /
        ``<end>`` markers.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
        KeyError: If the tokenizer does not know ``<start>`` or ``<end>``.
    """
    if beam_width < 1:
        # `[-0:]` would silently select the whole vocabulary instead of nothing.
        raise ValueError("beam_width must be at least 1")

    start_token_index = tokenizer.word_index['<start>']
    end_token_index = tokenizer.word_index['<end>']

    # Each hypothesis is (token ids incl. <start>, cumulative log-prob, finished?)
    beams = [([start_token_index], 0.0, False)]

    for _ in range(max_len):
        if all(finished for _, _, finished in beams):
            break

        all_candidates = []
        for seq, score, finished in beams:
            if finished:
                # Carry completed hypotheses forward unchanged so they keep
                # competing with the ones that are still growing.
                all_candidates.append((seq, score, True))
                continue

            decoder_input = pad_sequences([seq], maxlen=max_len, padding='post')
            # verbose=0 silences the per-call progress bar. With post-padding,
            # the next-token distribution sits at the last real position.
            step_probs = model.predict([input_sequence, decoder_input], verbose=0)[0][len(seq) - 1]

            for index in np.argsort(step_probs)[-beam_width:]:
                index = int(index)
                # Clamp so a zero probability does not turn into log(0) = -inf
                log_prob = float(np.log(max(float(step_probs[index]), 1e-12)))
                all_candidates.append((seq + [index], score + log_prob, index == end_token_index))

        # Keep the `beam_width` best hypotheses, highest score first
        beams = sorted(all_candidates, key=lambda cand: cand[1], reverse=True)[:beam_width]

    # Get the sequence with the highest score
    best_sequence = beams[0][0]

    # Drop the control markers before converting token ids back to text
    markers = (start_token_index, end_token_index)
    reply_tokens = [token for token in best_sequence if token not in markers]
    decoded_sequence = tokenizer.sequences_to_texts([reply_tokens])[0]

    return decoded_sequence

In [None]:
# Generate a response
def generate_response(input_text, model, tokenizer, max_len=None, beam_width=3):
    """Turn one user utterance into a decoded reply from the model.

    Args:
        input_text: Raw text typed by the user.
        model: Trained encoder-decoder model, passed on to ``beam_search_decoder``.
        tokenizer: Tokenizer fitted on the training data.
        max_len: Padded sequence length the model expects. When ``None`` it is
            read from the model's first input; if that is not available the
            previous default of 50 is used.
        beam_width: Number of hypotheses kept during beam search (default 3).

    Returns:
        The decoded reply as returned by ``beam_search_decoder``.
    """
    import re  # local import: only this helper needs it

    if max_len is None:
        # The padded length must equal the model's fixed input length, so
        # prefer the value stored on the model over a hard-coded constant.
        try:
            max_len = int(model.inputs[0].shape[1])
        except (AttributeError, IndexError, TypeError, ValueError):
            max_len = 50

    # The tokenizer is built with filters='' and has no OOV token, so a word
    # with attached punctuation ("escalate.") is unknown and silently dropped.
    # NOTE(review): the training rows displayed in the notebook are lower-case
    # with punctuation removed ("ive", "dont") - confirm this matches how the
    # CSV was cleaned.
    cleaned_text = re.sub(r"[^\w\s]", "", input_text.lower())

    input_seq = tokenizer.texts_to_sequences([cleaned_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')
    decoded_sequence = beam_search_decoder(model, input_seq, tokenizer, beam_width=beam_width, max_len=max_len)
    return decoded_sequence

# Simple interactive chat: answer a canned opening message first, then keep
# answering whatever the user types until they submit an empty line.
print("Bot: Hi, I am a learning psychiatrist. ask me anything.")
input_text = "I've noticed that my anxiety tends to escalate."
while input_text:
    response = generate_response(input_text, model, tokenizer)
    print("Bot:",  response)
    try:
        # Strip so that a whitespace-only line also ends the conversation
        input_text = input('User: ').strip()
    except (EOFError, KeyboardInterrupt):
        # No stdin available (non-interactive run) or the cell was interrupted:
        # end the chat cleanly instead of failing the cell.
        break

In [None]:
import os
import pickle

# Make sure the output folder exists; saving fails if it is missing
os.makedirs("models", exist_ok=True)

# Save the trained model
model.save("models/PD_Attention_trained_model.h5")

# Save the tokenizer
# NOTE: pickle files can execute arbitrary code when loaded - only unpickle
# this file from a trusted location.
with open("models/PD_Attention_tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)