In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer

# Load and Preprocess Data
# Read the entire corpus in one go (undecodable bytes are ignored), then
# lowercase it as a separate step.
filename = "mental_H.txt"
with open(filename, mode="r", encoding="utf-8", errors="ignore") as file:
    raw_text = file.read()
raw_text = raw_text.lower()
import re

# Remove unnecessary characters and normalize text
def clean_text(text):
    """Strip punctuation, collapse whitespace runs, trim, and lowercase.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    substitutions = (
        (r'[^\w\s]', ''),  # drop everything that is neither a word character nor whitespace
        (r'\s+', ' '),     # squeeze each whitespace run into a single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

# Normalize the corpus before fitting the tokenizer on it.
raw_text = clean_text(raw_text)

# Tokenization
# The whole corpus is handled as a single document.
documents = [raw_text]
tokenizer = Tokenizer(num_words=5000)  # Limit vocab size to 5000
tokenizer.fit_on_texts(documents)

# Integer-encode the corpus and keep the single resulting id sequence.
encoded_documents = tokenizer.texts_to_sequences(documents)
sequences = encoded_documents[0]

# Number of distinct words seen while fitting, plus one.
vocab_size = 1 + len(tokenizer.word_index)

# Number of consecutive token ids the model sees before predicting the next one.
seq_length = 150

# Create Input-Output Pairs
# Each sample is a sliding window of `seq_length` token ids; its target is the
# id of the token that immediately follows the window.
n_samples = len(sequences) - seq_length
if n_samples <= 0:
    raise ValueError(
        f"Corpus has only {len(sequences)} tokens; more than "
        f"seq_length={seq_length} are needed to build a training sample."
    )

# Shape (n_samples, seq_length) -- already what the Embedding layer expects,
# so no reshape is needed.
X = np.array([sequences[start:start + seq_length] for start in range(n_samples)])

# Targets stay as integer class ids. Combined with the sparse loss below this
# is equivalent to one-hot targets with categorical cross-entropy, but avoids
# materialising a dense (n_samples, vocab_size) matrix.
y = np.array(sequences[seq_length:])

# Model Definition
# Embedding -> two stacked bidirectional LSTMs -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 256, input_length=seq_length),
    Bidirectional(LSTM(256, return_sequences=True)),  # keep per-step outputs for the next LSTM
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(256)),
    Dropout(0.3),
    Dense(vocab_size, activation="softmax")
])


model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


In [13]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 150, 256)          1294592   
                                                                 
 bidirectional_8 (Bidirectio  (None, 150, 512)         1050624   
 nal)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 150, 512)          0         
                                                                 
 batch_normalization_4 (Batc  (None, 150, 512)         2048      
 hNormalization)                                                 
                                                                 
 bidirectional_9 (Bidirectio  (None, 512)              1574912   
 nal)                                                            
                                                      

In [14]:
# Save the model to disk every time the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath="weights-best1.hdf5",
    monitor="loss",
    verbose=1,
    save_best_only=True,
)
callbacks = [checkpoint]

# Train the Model
model.fit(x=X, y=y, batch_size=64, epochs=100, callbacks=callbacks)

Epoch 1/100
Epoch 1: loss improved from inf to 6.67208, saving model to weights-best1.hdf5
Epoch 2/100
Epoch 2: loss improved from 6.67208 to 6.20436, saving model to weights-best1.hdf5
Epoch 3/100
Epoch 3: loss improved from 6.20436 to 5.91173, saving model to weights-best1.hdf5
Epoch 4/100
Epoch 4: loss improved from 5.91173 to 5.65807, saving model to weights-best1.hdf5
Epoch 5/100
Epoch 5: loss improved from 5.65807 to 5.38665, saving model to weights-best1.hdf5
Epoch 6/100
Epoch 6: loss improved from 5.38665 to 5.10505, saving model to weights-best1.hdf5
Epoch 7/100
Epoch 7: loss improved from 5.10505 to 4.80767, saving model to weights-best1.hdf5
Epoch 8/100
Epoch 8: loss improved from 4.80767 to 4.49799, saving model to weights-best1.hdf5
Epoch 9/100
Epoch 9: loss improved from 4.49799 to 4.23794, saving model to weights-best1.hdf5
Epoch 10/100
Epoch 10: loss improved from 4.23794 to 3.90370, saving model to weights-best1.hdf5
Epoch 11/100
Epoch 11: loss improved from 3.90370 to

<keras.callbacks.History at 0x1688a3c4c40>

In [17]:
# Restore the best checkpoint written during training.
model.load_weights("weights-best1.hdf5")

# Generate Text
# Start from a randomly chosen training window. `randint`'s upper bound is
# exclusive, so passing `len(X)` keeps every window eligible.
seed_idx = np.random.randint(0, len(X))
seed_sequence = X[seed_idx]

output = []
for _ in range(1000):  # Generate 1000 words (the model predicts word ids, not characters)
    pred_input = np.reshape(seed_sequence, (1, len(seed_sequence)))
    pred_probs = model.predict(pred_input, verbose=0)
    next_idx = int(np.argmax(pred_probs))  # greedy decoding: always take the most likely word

    # Ids with no vocabulary entry (such as 0, which the Tokenizer never
    # assigns to a word) are skipped instead of raising KeyError.
    word = tokenizer.index_word.get(next_idx)
    if word is not None:
        output.append(word)

    # Update seed sequence: drop the oldest id and append the prediction.
    seed_sequence = np.append(seed_sequence[1:], next_idx)

print("Generated Text:")
print(" ".join(output))

Generated Text:
tends to run in families strongly suggests that the disease has a genetic component children who have adhd usually have at least one close relative who also has the disorder24 one group of researchers found that a child whose identical twin has adhd is 11 to 18 times more likely to develop the disorder than a nontwin sibling investigations of particular genes involved in adhd have focused on a dopamine receptor gene drd on chromosome 11 and the dopamine transporter gene dat1 on chromosome 544 ongoing studies continue to examine these genes and others as factors in adhd most likely a combination of several genes and environmental factors determines whether a person has adhd imaging studies have shown differences in the brains of boys with adhd compared with boys who do not have adhd researchers found that certain parts of the brain are on average smaller in boys with adhd8 other studies found that the total brain volume is smaller in girls who have adhd than in control s

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained model
# "weights-best1.hdf5" is the file the ModelCheckpoint callback writes.
model.load_weights("weights-best1.hdf5")

# Define the knowledge base
# Same file name (case-sensitive on some filesystems) and decoding options as
# the training corpus load, so bytes that are not valid UTF-8 do not raise.
with open("mental_H.txt", "r", encoding="utf-8", errors="ignore") as file:
    knowledge_base = file.read()

# Extract knowledge base sections
def find_relevant_info(user_input, knowledge_text):
    """Return the knowledge-base lines that mention the user's query.

    The query is first compared with the whole knowledge text via TF-IDF
    similarity. Only when the similarity exceeds 0.1 are the individual lines
    searched for the query as a case-insensitive substring.

    Args:
        user_input: The user's query string. Surrounding whitespace is ignored.
        knowledge_text: The full knowledge-base text, one entry per line.

    Returns:
        The matching lines joined by newlines, or a fixed fallback message
        when the query is blank, the knowledge text is blank, the similarity
        is too low, or no line contains the query.
    """
    fallback = "I don't have specific information about that. Let's explore general advice."

    query = user_input.strip().lower()
    # Nothing to search for / in; also avoids fitting a vectorizer on empty text.
    if not query or not knowledge_text.strip():
        return fallback

    # Rows are L2-normalised by default, so the dot product of the two rows is
    # their cosine similarity. The product is taken on the sparse matrix so
    # the vocabulary-sized rows are never densified.
    tfidf = TfidfVectorizer().fit_transform([knowledge_text, user_input])
    similarity = (tfidf @ tfidf.T)[0, 1]
    if similarity <= 0.1:
        return fallback

    # Extract sentences with relevance
    matches = [line for line in knowledge_text.splitlines() if query in line.lower()]
    return '\n'.join(matches) if matches else fallback

# Generate a creative response
def generate_response(user_input, tokenizer, model, max_sequence_length, output_length=100):
    """Greedily continue `user_input` with words predicted by `model`.

    Args:
        user_input: Prompt text; it is encoded with `tokenizer` and left-padded
            (or truncated) to `max_sequence_length` ids.
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and `index_word`.
        model: Model whose `predict` returns next-word probabilities for a
            `(1, max_sequence_length)` id array.
        max_sequence_length: Length of the id window fed to the model; it
            should match the input length the model was built with.
        output_length: Number of prediction steps to run.

    Returns:
        The predicted words separated by single spaces. Steps whose predicted
        id has no vocabulary entry contribute no word, so the result may hold
        fewer than `output_length` words.
    """
    input_sequence = tokenizer.texts_to_sequences([user_input])
    input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='pre')

    words = []
    for _ in range(output_length):
        pred_probs = model.predict(input_sequence, verbose=0)
        next_idx = int(np.argmax(pred_probs))  # greedy decoding

        # Skip ids that map to no word rather than emitting empty strings.
        word = tokenizer.index_word.get(next_idx)
        if word:
            words.append(word)

        # Slide the window: drop the oldest id, append the prediction.
        input_sequence = np.append(input_sequence[0][1:], next_idx).reshape(1, max_sequence_length)

    # The model predicts whole words, so they must be space-separated.
    return " ".join(words)

# Example usage
user_input = input("You: ")
relevant_info = find_relevant_info(user_input, knowledge_base)
# The model was built with `input_length=seq_length`, so the prompt window
# must use that same length rather than a separate hard-coded value.
creative_response = generate_response(user_input, tokenizer, model, max_sequence_length=seq_length, output_length=100)

response = f"Here is some information related to your query:\n{relevant_info}\n\nChatbot: {creative_response}"
print(response)


In [None]:
# Restore the checkpoint file that the ModelCheckpoint callback writes.
model.load_weights("weights-best1.hdf5")

# Generate Text
# Start from a randomly chosen training window. `randint`'s upper bound is
# exclusive, so passing `len(X)` keeps every window eligible.
seed_idx = np.random.randint(0, len(X))
seed_sequence = X[seed_idx]

output = []
# NOTE(review): decoding below is greedy (argmax), so `temperature` is not applied yet.
temperature = 1.0  # Lower values make text more deterministic; higher values make it more diverse.

# Always emits 2000 words. `clean_text` strips punctuation from the corpus, so
# there is no "." token on which generation could stop at a sentence boundary.
for _ in range(2000):  # Generate 2000 words
    pred_input = np.reshape(seed_sequence, (1, len(seed_sequence)))
    pred_probs = model.predict(pred_input, verbose=0)
    next_idx = int(np.argmax(pred_probs))

    # Ids with no vocabulary entry are skipped instead of raising KeyError.
    word = tokenizer.index_word.get(next_idx)
    if word is not None:
        output.append(word)

    # Update seed sequence: drop the oldest id and append the prediction.
    seed_sequence = np.append(seed_sequence[1:], next_idx)

print("Generated Text:")
# Words are separate tokens, so join them with spaces before the closing period.
print(" ".join(output) + ".")