In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer

# Load and Preprocess Data
filename = "wizard_of_us.txt"
with open(filename, "r", encoding="utf-8", errors="ignore") as file:
    raw_text = file.read().lower()
import re

# Remove unnecessary characters and normalize text
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip().lower()  # Convert to lowercase
    return text

raw_text = clean_text(raw_text)

# Tokenization
##
tokenizer = Tokenizer(num_words=5000)  # Limit vocab size to 5000
tokenizer.fit_on_texts([raw_text])
##
sequences = tokenizer.texts_to_sequences([raw_text])[0]

vocab_size = len(tokenizer.word_index) + 1

##
seq_length = 150

# Create Input-Output Pairs
X = []
y = []
for i in range(0, len(sequences) - seq_length):
    X.append(sequences[i:i + seq_length])
    y.append(sequences[i + seq_length])

X = np.array(X)
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# Reshape for LSTM Input
X = np.reshape(X, (X.shape[0], X.shape[1]))

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to ensure uniform length
X = pad_sequences(X, maxlen=seq_length, padding='pre')

# Model Definition
##
model = Sequential([
    Embedding(vocab_size, 256, input_length=seq_length),
    Bidirectional(LSTM(512, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),
    Bidirectional(LSTM(512, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),
    Bidirectional(LSTM(256)),
    Dropout(0.4),
    Dense(vocab_size, activation="softmax")
])

def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler

model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])


In [7]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 150, 256)          1103616   
                                                                 
 bidirectional_5 (Bidirectio  (None, 150, 1024)        3149824   
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 150, 1024)         0         
                                                                 
 batch_normalization_3 (Batc  (None, 150, 1024)        4096      
 hNormalization)                                                 
                                                                 
 bidirectional_6 (Bidirectio  (None, 150, 1024)        6295552   
 nal)                                                            
                                                      

In [3]:
checkpoint = ModelCheckpoint("weights-best2.hdf5", monitor="loss", save_best_only=True, verbose=1)
early_stopping = EarlyStopping(monitor="loss", patience=5, restore_best_weights=True)
callbacks = [checkpoint, early_stopping]

# Train the model
model.fit(X, y, epochs=100, batch_size=128, callbacks=callbacks)

Epoch 1/100
Epoch 1: loss improved from inf to 6.29183, saving model to weights-best2.hdf5
Epoch 2/100
Epoch 2: loss improved from 6.29183 to 5.84772, saving model to weights-best2.hdf5
Epoch 3/100
Epoch 3: loss improved from 5.84772 to 5.61391, saving model to weights-best2.hdf5
Epoch 4/100
Epoch 4: loss improved from 5.61391 to 5.41750, saving model to weights-best2.hdf5
Epoch 5/100
Epoch 5: loss improved from 5.41750 to 5.27632, saving model to weights-best2.hdf5
Epoch 6/100
Epoch 6: loss improved from 5.27632 to 5.09218, saving model to weights-best2.hdf5
Epoch 7/100
Epoch 7: loss improved from 5.09218 to 4.93106, saving model to weights-best2.hdf5
Epoch 8/100
Epoch 8: loss improved from 4.93106 to 4.77773, saving model to weights-best2.hdf5
Epoch 9/100
Epoch 9: loss improved from 4.77773 to 4.61735, saving model to weights-best2.hdf5
Epoch 10/100
Epoch 10: loss improved from 4.61735 to 4.46305, saving model to weights-best2.hdf5
Epoch 11/100
Epoch 11: loss improved from 4.46305 to

<keras.callbacks.History at 0x27d429dfee0>

In [4]:
model.load_weights("weights-best2.hdf5")

# Generate Text
seed_idx = np.random.randint(0, len(X) - 1)
seed_sequence = X[seed_idx]

output = []
for _ in range(1000):  # Generate 1000 characters
    pred_input = np.reshape(seed_sequence, (1, len(seed_sequence)))
    pred_probs = model.predict(pred_input, verbose=0)
    next_idx = np.argmax(pred_probs)
    output.append(tokenizer.index_word[next_idx])

    # Update seed sequence
    seed_sequence = np.append(seed_sequence[1:], next_idx)

print("Generated Text:")
print(" ".join(output))

Generated Text:
eat have no personal experience in such matters but i remember that our great poet once said to eat is sweet when hungers seat demands a treat of savory meat take this into consideration friends of the jury and you will readily decide that the kitten is wrongfully accused and should be set at liberty when the tin woodman sat down no one applauded him for his arguments had not been very convincing and few believed that he had proved eurekas innocence as for the jury the members whispered to each other for a few minutes and then they appointed the hungry tiger their spokesman the huge beast slowly arose and said kittens have no consciences so they eat whatever pleases them the jury believes the white kitten known as eureka is guilty of having eaten the piglet owned by princess ozma and recommends that she be put to death in punishment of the crime the judgment of the jury was received with great applause although dorothy was sobbing miserably at the fate of her pet the pr

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained model
model.load_weights("weights-best.hdf5")

# Define the knowledge base
with open('mental_h.txt', 'r', encoding='utf-8') as file:
    knowledge_base = file.read()

# Extract knowledge base sections
def find_relevant_info(user_input, knowledge_text):
    vectorizer = TfidfVectorizer().fit_transform([knowledge_text, user_input])
    similarity = vectorizer.toarray().dot(vectorizer.toarray().T)[0, 1]
    if similarity > 0.1:
        # Extract sentences with relevance
        return '\n'.join([sentence for sentence in knowledge_text.splitlines() if user_input.lower() in sentence.lower()])
    return "I don't have specific information about that. Let's explore general advice."

# Generate a creative response
def generate_response(user_input, tokenizer, model, max_sequence_length, output_length=100):
    input_sequence = tokenizer.texts_to_sequences([user_input])
    input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='pre')

    output = []
    for _ in range(output_length):
        pred_probs = model.predict(input_sequence, verbose=0)
        next_idx = np.argmax(pred_probs)
        output.append(tokenizer.index_word.get(next_idx, ""))

        input_sequence = np.append(input_sequence[0][1:], next_idx).reshape(1, max_sequence_length)

    return " ".join(output)

# Example usage
user_input = input("You: ")
relevant_info = find_relevant_info(user_input, knowledge_base)
creative_response = generate_response(user_input, tokenizer, model, max_sequence_length=100, output_length=100)

response = f"Here is some information related to your query:\n{relevant_info}\n\nChatbot: {creative_response}"
print(response)


You:  hi


Here is some information related to your query:
I don't have specific information about that. Let's explore general advice.

Chatbot: and worry in the same risk of mental health conditions are not not have a mental health condition is not not have a mental health condition is not not have a mental illness is not not not have a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person has a person


In [7]:
model.load_weights("weights-best.hdf5")

# Generate Text
seed_idx = np.random.randint(0, len(X) - 1)
seed_sequence = X[seed_idx]

output = []
temperature = 1.0  # Lower values make text more deterministic; higher values make it more diverse.

for i in range(2000):  # Generate 1000 characters
    if i<1500:
        pred_input = np.reshape(seed_sequence, (1, len(seed_sequence)))
        pred_probs = model.predict(pred_input, verbose=0)
        next_idx = np.argmax(pred_probs)
        output.append(tokenizer.index_word[next_idx])

    # Update seed sequence
        seed_sequence = np.append(seed_sequence[1:], next_idx)
        
    else:
        if seed_idx == ".":
            break
        else:
            pred_input = np.reshape(seed_sequence, (1, len(seed_sequence)))
            pred_probs = model.predict(pred_input, verbose=0)
            next_idx = np.argmax(pred_probs)
            output.append(tokenizer.index_word[next_idx])

            # Update seed sequence
            seed_sequence = np.append(seed_sequence[1:], next_idx)
            
        

print("Generated Text:")
print("".join(output)+".")

KeyboardInterrupt: 