# Comprehensive Analysis of Mental Health Dataset

## About Dataset

The Dataset for Mental Health is an invaluable asset in the realm of natural language processing (NLP), providing a structured foundation for the development of sophisticated mental health assistants. This dataset exemplifies the variety of user inputs associated with different mental health conditions. Each category within the dataset—'greeting', 'morning', 'afternoon', 'evening', 'night', 'goodbye', 'thanks', 'no-response', 'neutral-response', 'about', 'skill', 'creation', 'name', 'help', 'sad', 'stressed', 'worthless', 'depressed', 'happy', 'casual', 'anxious', 'not-talking', 'sleep', 'scared', 'death', 'understand', 'done', 'suicide', 'hate-you', 'hate-me', 'default', 'jokes', 'repeat', 'wrong', 'stupid', 'location', 'something-else', 'friends', 'ask', 'problem', 'no-approach', 'learn-more', 'user-agree', 'meditation', 'user-meditation', 'pandora-useful', 'user-advice', 'learn-mental-health', 'mental-health-fact', and 'fact-1', 'fact-2', 'fact-3' through 'fact-32'—has been carefully curated to encompass a diverse range of expressions, ensuring that the resulting models are versatile and robust. This scientific approach not only enhances the dataset's diversity, so that user inputs and the responses to them are recorded accurately, but also contributes to the broader field of AI-driven health technologies, pushing the boundaries of what conversational health assistants can achieve.

## Importing Required Libraries

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical

# Download the NLTK resources named 'stopwords' and 'wordnet'; the notebook
# later uses stopwords.words('english') and WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# Notebook shell command (IPython "!" syntax, not plain Python): extracts
# wordnet.zip into the NLTK corpora directory.
# NOTE(review): presumably a workaround for the lemmatizer not finding the
# zipped corpus on this Kaggle image - confirm it is still required.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

##  Dataset Loading 

In [None]:
# Load the intents knowledge base. The encoding is stated explicitly so the
# read does not depend on the platform's default locale encoding.
with open('/kaggle/input/mental-health-dataset/KB.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

tags = []
patterns = []
responses = []

# Flatten the nested intents into one row per (tag, pattern) pair.
for intent in data['intents']:
    # Only the first response of an intent is kept. A missing 'responses' key
    # and an empty 'responses' list both yield None instead of raising.
    intent_responses = intent.get('responses')
    first_response = intent_responses[0] if intent_responses else None
    for pattern in intent['patterns']:
        tags.append(intent['tag'])
        patterns.append(pattern)
        responses.append(first_response)

df = pd.DataFrame({'tag': tags, 'pattern': patterns, 'response': responses})
df.head()

##  Data Exploration

In [None]:
# df.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Number of patterns per tag, computed once and reused for the plot below.
tag_counts = df['tag'].value_counts()
print(tag_counts)

# Keep only tags with at least `min_count` patterns so the chart stays legible.
min_count = 10
filtered_tags = tag_counts[tag_counts >= min_count].index

plt.figure(figsize=(10, 8))
# `filtered_tags` preserves the order of value_counts(), so it is used
# directly as the bar order.
sns.countplot(y=df.loc[df['tag'].isin(filtered_tags), 'tag'], order=filtered_tags)
plt.title('Distribution of Frequent Tags')
plt.xlabel('Count')
plt.ylabel('Tag')
plt.show()

## Data Preprocessing

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, every character that is neither a letter nor
    whitespace is removed, English stop words are dropped and the remaining
    words are lemmatised with ``lemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    # Keep all whitespace (not only ' ') so that tabs and newlines still act
    # as word separators instead of gluing neighbouring words together.
    text = ''.join(char for char in text if char.isalpha() or char.isspace())
    # str.split() with no argument splits on any run of whitespace.
    kept_words = (word for word in text.split() if word not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)


df['pattern_cleaned'] = df['pattern'].apply(preprocess_text)
# Rows without a usable response (None, empty, or any non-string value)
# become an empty string rather than being passed to preprocess_text.
df['response_cleaned'] = df['response'].apply(
    lambda x: preprocess_text(x) if isinstance(x, str) and x else ""
)

df.head()

## Feature Extraction

In [None]:
from sklearn.preprocessing import LabelEncoder

# Features: dense TF-IDF matrix of the cleaned patterns (at most 5000 terms).
# NOTE(review): the vectorizer is fitted on all rows before the split below,
# so test-set vocabulary/IDF statistics are visible during training.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['pattern_cleaned']).toarray()

# Targets: tag strings encoded as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['tag'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

## Model Training 

### Training of a Naive Bayes model

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split, then
# predict the held-out split.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)


### Training of an SVM model

In [None]:
# Fit a linear-kernel support vector classifier on the training split, then
# predict the held-out split.
svm_model = SVC(kernel='linear').fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)


### Training of a Random Forest model

In [None]:
# Random forests are randomised; the seed is fixed (same value as the
# train/test split) so that reruns of the notebook give the same scores.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)


### Grid Search for hyperparameter tuning

In [None]:
# Candidate SVC settings: three regularisation strengths for each of two
# kernels.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}

# Exhaustive search over the grid; refit=True retrains the best combination
# on the full training split so best_estimator_ is ready to use.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=2,
)
grid_search.fit(X_train, y_train)
svm_best = grid_search.best_estimator_


### Cross-validation

In [None]:
# 5-fold cross-validation scores on the full feature matrix for the Naive
# Bayes model, the grid-search-selected SVM and the random forest, in that
# order.
nb_cv_scores, svm_cv_scores, rf_cv_scores = (
    cross_val_score(estimator, X, y, cv=5)
    for estimator in (nb_model, svm_best, rf_model)
)


## Model Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, y_pred):
    """Print accuracy, confusion matrix and per-class report for predictions.

    Args:
        model: Not used by this function; kept for the call signature.
        X_test: Not used by this function; kept for the call signature.
        y_test: 1-D array of true encoded class ids.
        y_pred: 1-D array of predicted encoded class ids.

    Returns:
        None. All results are written to stdout.
    """
    # Sorted set of class ids occurring in either array; only these are
    # decoded into names for the report.
    present_ids = np.union1d(y_test, y_pred)
    target_names = label_encoder.inverse_transform(present_ids)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(
        y_test,
        y_pred,
        target_names=target_names,
        zero_division=0,
    )
    print("Classification Report:\n", report)



### Evaluation of a Naive Bayes model

In [None]:

# Hold-out metrics followed by the 5-fold cross-validation summary for the
# Naive Bayes classifier.
print("Naive Bayes Model")
evaluate_model(nb_model, X_test, y_test, nb_pred)
nb_cv_mean = np.mean(nb_cv_scores)
print("Cross-validation scores:", nb_cv_scores)
print("Mean CV score:", nb_cv_mean)


### Evaluation of an SVM model

In [None]:

print("\nSVM Model")
# Hold-out metrics for the untuned linear SVM.
evaluate_model(svm_model, X_test, y_test, svm_pred)
# svm_cv_scores were computed with svm_best (the grid-search winner), not
# with svm_model evaluated above, so the output says so explicitly.
print("Cross-validation scores (grid-search best SVM):", svm_cv_scores)
print("Mean CV score (grid-search best SVM):", np.mean(svm_cv_scores))


### Evaluation of a Random Forest model

In [None]:

# Hold-out metrics followed by the 5-fold cross-validation summary for the
# random forest classifier.
print("\nRandom Forest Model")
evaluate_model(rf_model, X_test, y_test, rf_pred)
rf_cv_mean = np.mean(rf_cv_scores)
print("Cross-validation scores:", rf_cv_scores)
print("Mean CV score:", rf_cv_mean)


## Best Prediction

In [None]:

# Predict the held-out split with the grid-search-selected SVM and report
# the same metrics as for the other models.
svm_best_pred = svm_best.predict(X_test)
print("\nBest SVM Model (Grid Search)")
evaluate_model(svm_best, X_test, y_test, y_pred=svm_best_pred)


## Visualization of confusion matrix for the best model

In [None]:
# The matrix must be labelled with every class id that occurs in either the
# true or the predicted labels. Using only the ids present in y_test would
# mislabel (or mis-size) the axes whenever the model predicts a class that
# has no sample in the test split.
cm_labels = np.union1d(y_test, svm_best_pred)
conf_matrix = confusion_matrix(y_test, svm_best_pred, labels=cm_labels)
cm_tick_names = label_encoder.inverse_transform(cm_labels)

plt.figure(figsize=(14, 10))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_tick_names,
    yticklabels=cm_tick_names,
)
plt.title('Confusion Matrix for Best SVM Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.show()



## WordCloud for most common words in patterns

In [None]:
# Concatenate every cleaned pattern into a single corpus string and render
# it as a word cloud.
all_patterns = ' '.join(df['pattern_cleaned'].tolist())
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_patterns)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most Common Words in Patterns')
plt.axis('off')
plt.show()

## Deep Learning Model with LSTM

In [None]:
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100

# Seed TensorFlow so weight initialisation and dropout are repeatable,
# using the same seed value as the data split.
tf.random.set_seed(42)

# Turn each cleaned pattern into a sequence of word indices (vocabulary
# capped at MAX_NB_WORDS) and pad/truncate it to MAX_SEQUENCE_LENGTH.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['pattern_cleaned'].values)
word_index = tokenizer.word_index
X_lstm = tokenizer.texts_to_sequences(df['pattern_cleaned'].values)
X_lstm = pad_sequences(X_lstm, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot targets are derived from label_encoder, so column i is guaranteed
# to be label_encoder.classes_[i]. evaluate_model() decodes the argmax
# indices with that same encoder, so the mapping no longer relies on
# pd.get_dummies happening to sort its columns identically. This also yields
# a numeric array rather than the bool columns newer pandas versions return.
num_classes = len(label_encoder.classes_)
y_lstm = to_categorical(label_encoder.transform(df['tag']), num_classes=num_classes)

X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)

# Embedding -> spatial dropout -> single LSTM layer -> softmax over the tags.
model_lstm = Sequential()
model_lstm.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(num_classes, activation='softmax'))
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the held-out test split doubles as the validation set here,
# so the validation curves are not independent of the final test metrics.
history = model_lstm.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=64, validation_data=(X_test_lstm, y_test_lstm), verbose=2)

# Training curves: accuracy on the left, loss on the right.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='val accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Convert softmax outputs and one-hot targets back to integer class ids so
# the shared evaluate_model() report can be reused.
y_test_lstm_pred = model_lstm.predict(X_test_lstm)
y_test_lstm_pred_class = np.argmax(y_test_lstm_pred, axis=1)
y_test_lstm_class = np.argmax(y_test_lstm, axis=1)

print("LSTM Model")
evaluate_model(model_lstm, X_test_lstm, y_test_lstm_class, y_test_lstm_pred_class)


## Conclusion 

The analysis provides insights into the distribution of tags, common patterns, and the performance of advanced classification models, including a deep learning model (LSTM).