In [1]:
import numpy as np
import spacy
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Load spaCy's small English pipeline once; preprocess_text() calls it to tokenize.
# The model package has to be installed in the kernel's environment beforehand.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

ValueError: Unable to compare versions for numpy>=1.17: need=1.17 found=None. This is unusual. Consider reinstalling numpy.

In [None]:
# Sample dataset, written as (question, tag) pairs so each question sits next to its tag
labeled_examples = [
    ("How to install nodejs", "js"),
    ("What are the benefits of meditation", "health"),
    ("How to make a chocolate cake", "cooking"),
    # Add more questions here
]

# Split the pairs into the two parallel lists the rest of the notebook works with
questions = [question for question, _ in labeled_examples]
tags = [tag for _, tag in labeled_examples]

# Text preprocessing: tokenize with spaCy, drop stop words taken from NLTK
_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Lower-case `text`, tokenize it and strip punctuation, whitespace and stop words.

    Args:
        text: The raw question string.

    Returns:
        The surviving token texts joined by single spaces (an empty string
        when nothing survives).
    """
    # The stop-word set is cached so repeated calls do not rebuild it from
    # the NLTK corpus each time.
    stop_words = _english_stop_words()
    kept_tokens = [
        token.text
        for token in nlp(text.lower())
        if not (token.is_punct or token.is_space) and token.text not in stop_words
    ]
    return " ".join(kept_tokens)

# Clean every question with preprocess_text; the cleaned strings replace the raw ones
questions = list(map(preprocess_text, questions))

# Fit the vocabulary on the cleaned questions, then encode each one as a list of word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
sequences = tokenizer.texts_to_sequences(questions)

# Pad every sequence up to the length of the longest one so they all share one length
max_sequence_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert tags to numerical labels: each tag maps to its (last) position in `tags`
tag_to_label = {tag: i for i, tag in enumerate(tags)}
# Model.fit() cannot consume a plain Python list of ints as targets alongside a
# NumPy `x`, so the labels are stored as a NumPy integer array.
labels = np.array([tag_to_label[tag] for tag in tags])

# Define the neural network architecture: embedding -> flatten -> dense -> softmax over the tags
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(tags)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# Compile the model; the sparse loss takes integer class labels rather than one-hot vectors
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early Stopping callback: watch val_loss with a patience of 3 epochs and
# restore the best weights when training stops
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the model with Early Stopping, one sample per batch, 20% of the data held out for validation
training_options = {
    "epochs": 100,
    "batch_size": 1,
    "validation_split": 0.2,
    "callbacks": [early_stopping],
}
model.fit(sequences, labels, **training_options)

# Use the trained model to predict the tag of a new question.
# The question goes through the same cleaning, encoding and padding as the training data.
new_question = "How to debug JavaScript code"
new_question = preprocess_text(new_question)
new_sequence = tokenizer.texts_to_sequences([new_question])
new_padded_sequence = pad_sequences(new_sequence, maxlen=max_sequence_length)

# Sequential.predict_classes() was removed in TensorFlow 2.6. The equivalent is
# the index of the largest softmax probability in each row of predict().
predicted_probabilities = model.predict(new_padded_sequence)
predicted_label = predicted_probabilities.argmax(axis=-1)
predicted_tag = tags[predicted_label[0]]

print("Predicted tag:", predicted_tag)
