In [22]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder


import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from transformers import BertTokenizer, BertModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
import torch

In [23]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [24]:
# Load the combined dataset and discard the leftover 'Unnamed: 0' column
# (it only repeats the row number, as the preview below shows).
df = pd.read_csv(
    "/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv"
).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [25]:
# Missing statements become empty strings so the string-based cleaning that
# follows (clean_text calls .lower()) never receives a NaN.
df.loc[df['statement'].isna(), 'statement'] = ''

In [26]:
def clean_text(text):
    """Normalise one statement for bag-of-words / embedding models.

    Steps, in order: lower-case, strip URLs, strip e-mail addresses, strip
    HTML tags, strip punctuation, strip digits, collapse whitespace.

    Args:
        text: The raw statement (a ``str``).

    Returns:
        The cleaned, lower-cased string with single spaces between words.
    """
    text = text.lower()

    # URLs: require a real scheme ("http://", "https://") or a "www." prefix at
    # a word boundary. The previous unanchored `www\S+` also matched inside
    # ordinary words (e.g. "awwww" was reduced to "a").
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)
    # E-mail addresses.
    text = re.sub(r'\S+@\S+', '', text)
    # HTML tags (non-greedy so each tag is removed separately).
    text = re.sub(r'<.*?>', '', text)
    # Anything that is neither a word character nor whitespace (punctuation).
    text = re.sub(r'[^\w\s]', '', text)
    # Digits.
    text = re.sub(r'\d+', '', text)
    # Collapse the gaps left by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Replace every raw statement with its cleaned form.
df['statement'] = df['statement'].map(clean_text)

df.sample(5)

Unnamed: 0,statement,status
36177,rt floweryhillside i donât want them to come f...,Suicidal
19558,i worked alone office of one for years it ate ...,Depression
1489,i love you mom,Normal
19679,okay so since i was young i love coffee i howe...,Depression
49603,starting to feel hypomanic i havent felt this ...,Bipolar


In [27]:
# English stop-word list, stored as a set because remove_stopwords tests
# membership once per token.
# NOTE(review): clean_text strips apostrophes before this list is used, so any
# contraction entries (e.g. "don't") presumably no longer match the cleaned
# tokens ("dont", "im", "ive" survive in the previews) - confirm whether
# apostrophe-free variants should be added.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every token found in `stop_words` removed.

    The text is tokenised with `word_tokenize`; the surviving tokens are
    joined back together with single spaces.
    """
    kept = filter(lambda token: token not in stop_words, word_tokenize(text))
    return ' '.join(kept)

In [28]:
# Strip stop words from every cleaned statement.
df['statement'] = df['statement'].apply(remove_stopwords)
df.sample(5)

Unnamed: 0,statement,status
38096,never really experience sadness depressed thought,Depression
32794,latest fire killed someone,Normal
26301,hi everyone sorry put much baggage shoulders n...,Suicidal
42066,struggling hard inventory,Normal
38356,exhausted stop sleeping staying awake struggle...,Depression


In [29]:
# Learn the class-name -> integer mapping from the status column, then replace
# the column with the integer codes. `label_encoder` is kept for later cells.
# NOTE: this overwrites df['status'], so re-running the cell would fit the
# encoder on the integer codes instead of the original class names.
label_encoder = LabelEncoder().fit(df['status'])
df['status'] = label_encoder.transform(df['status'])
df.head()

Unnamed: 0,statement,status
0,oh gosh,0
1,trouble sleeping confused mind restless heart ...,0
2,wrong back dear forward doubt stay restless re...,0
3,ive shifted focus something else im still worried,0
4,im restless restless month boy mean,0


In [30]:
# Shared NLTK helpers, created once and reused by stem_text / lemmatize_text
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Lemmatize a whitespace-separated text, word by word
def lemmatize_text(text):
    """Return `text` with each whitespace-separated word passed through `lemmatizer.lemmatize`."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Stem a whitespace-separated text, word by word
def stem_text(text):
    """Return `text` with each whitespace-separated word passed through `stemmer.stem`."""
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Function to generate datasets with different vectorization methods
def generate_datasets(text_data, vectorization_methods=("tfidf", "count", "word2vec", "doc2vec", "bert"),
                      max_features=5000, vector_size=100):
    """Build one feature matrix per (text variant, vectorization method) pair.

    Three variants of the corpus are produced (original, lemmatized, stemmed)
    and each requested method is applied to each variant.

    Args:
        text_data: Iterable of already-cleaned documents (strings).
        vectorization_methods: Method names to apply. Implemented here:
            "tfidf", "count", "word2vec", "doc2vec". Any other name (including
            "bert", which has no implementation in this function) is skipped
            with a warning. A single string is treated as a one-item list.
        max_features: Vocabulary cap passed to the TF-IDF and count vectorizers.
        vector_size: Embedding dimension for Word2Vec and Doc2Vec.

    Returns:
        dict mapping "<variant>_<method>" to a 2-D array with one row per document.

    Note:
        Every vectorizer / embedding model is fitted on all of `text_data`.
        If the caller splits into train and test afterwards, the test
        documents have already influenced the features.
    """
    import warnings

    implemented = ("tfidf", "count", "word2vec", "doc2vec")

    # A bare string would otherwise be matched by substring ("count" in "count").
    if isinstance(vectorization_methods, str):
        vectorization_methods = [vectorization_methods]
    vectorization_methods = list(vectorization_methods)

    ignored = [method for method in vectorization_methods if method not in implemented]
    if ignored:
        warnings.warn(
            f"generate_datasets: no implementation for {ignored}; these methods are skipped.",
            stacklevel=2,
        )

    datasets = {}

    # Prepare different versions of the text data
    original_data = text_data
    lemmatized_data = [lemmatize_text(text) for text in text_data]
    stemmed_data = [stem_text(text) for text in text_data]

    text_variations = {
        'original': original_data,
        'lemmatized': lemmatized_data,
        'stemmed': stemmed_data
    }

    # Iterate over each version of text data
    for version, data in text_variations.items():
        print(f"Generating datasets for: {version}...")  # Progress tracking

        # TF-IDF Vectorization
        if "tfidf" in vectorization_methods:
            print(" - Applying TF-IDF Vectorization...")
            tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
            datasets[f'{version}_tfidf'] = tfidf_vectorizer.fit_transform(data).toarray()

        # Count Vectorization
        if "count" in vectorization_methods:
            print(" - Applying Count Vectorization...")
            count_vectorizer = CountVectorizer(max_features=max_features)
            datasets[f'{version}_count'] = count_vectorizer.fit_transform(data).toarray()

        # Word2Vec Embeddings: a document vector is the mean of its word vectors
        if "word2vec" in vectorization_methods:
            print(" - Applying Word2Vec Embeddings...")
            tokenized_data = [text.split() for text in data]
            word2vec_model = Word2Vec(sentences=tokenized_data, vector_size=vector_size,
                                      window=5, min_count=1, workers=4)
            word_vectors = word2vec_model.wv
            # Fallback row for documents with no known words (e.g. empty strings);
            # sized from vector_size so it always matches the model dimension.
            empty_doc_vector = np.zeros(vector_size)
            datasets[f'{version}_word2vec'] = np.array([
                np.mean([word_vectors[word] for word in words if word in word_vectors] or [empty_doc_vector], axis=0)
                for words in tokenized_data
            ])

        # Doc2Vec Embeddings
        if "doc2vec" in vectorization_methods:
            print(" - Applying Doc2Vec Embeddings...")
            tagged_data = [TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(data)]
            doc2vec_model = Doc2Vec(vector_size=vector_size, min_count=1, epochs=20)
            doc2vec_model.build_vocab(tagged_data)
            doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
            # NOTE(review): vectors are re-inferred for the training documents rather
            # than read back from the trained model; presumably this is intentional,
            # but inference may not be exactly repeatable between runs - confirm.
            datasets[f'{version}_doc2vec'] = np.array(
                [doc2vec_model.infer_vector(text.split()) for text in data]
            )

        print(f"Completed datasets for: {version}.\n")  # Completion notification

    return datasets
# Function to split datasets into train and test sets
def split_datasets(datasets, labels, test_size=0.2, random_state=42, stratify=False):
    """Split every feature matrix in `datasets` into train and test parts.

    The same `labels`, `test_size` and `random_state` are used for every
    matrix, so each call to `train_test_split` receives identical settings.

    Args:
        datasets: dict mapping a dataset name to its feature matrix.
        labels: Target values, one per row of each feature matrix.
        test_size: Forwarded to `train_test_split`.
        random_state: Forwarded to `train_test_split`.
        stratify: When True, `labels` is passed as `stratify` so class
            proportions are preserved in both parts. The default (False)
            keeps the plain random split.

    Returns:
        dict mapping each dataset name to `(X_train, X_test, y_train, y_test)`.
    """
    stratify_on = labels if stratify else None

    # Local name deliberately differs from the function name to avoid shadowing it.
    splits = {}
    for name, X in datasets.items():
        X_train, X_test, y_train, y_test = train_test_split(
            X, labels, test_size=test_size, random_state=random_state, stratify=stratify_on
        )
        splits[name] = (X_train, X_test, y_train, y_test)

    return splits

In [31]:
text_data = df['statement']
labels = df['status']

# Generate datasets with specified vectorizations and versions
datasets = generate_datasets(text_data, vectorization_methods=["tfidf", "count", "word2vec", "doc2vec"])

# Split the datasets into train and test sets.
# Later cells read the result under the name `split_datasets`, which is also the
# name of the splitter function. Keep a handle on the function before the name
# is rebound so this cell can be re-run without
# "TypeError: 'dict' object is not callable".
if callable(split_datasets):
    _split_datasets_fn = split_datasets
split_datasets = _split_datasets_fn(datasets, labels)

# Print shapes of each generated train/test split dataset
for name, (X_train, X_test, y_train, y_test) in split_datasets.items():
    print(f"{name} - Train shape: {X_train.shape}, Test shape: {X_test.shape}, YTrain shape: {y_train.shape}, YTest shape: {y_test.shape}")

Generating datasets for: original...
 - Applying TF-IDF Vectorization...
 - Applying Count Vectorization...
 - Applying Word2Vec Embeddings...
 - Applying Doc2Vec Embeddings...
Completed datasets for: original.

Generating datasets for: lemmatized...
 - Applying TF-IDF Vectorization...
 - Applying Count Vectorization...
 - Applying Word2Vec Embeddings...
 - Applying Doc2Vec Embeddings...
Completed datasets for: lemmatized.

Generating datasets for: stemmed...
 - Applying TF-IDF Vectorization...
 - Applying Count Vectorization...
 - Applying Word2Vec Embeddings...
 - Applying Doc2Vec Embeddings...
Completed datasets for: stemmed.

original_tfidf - Train shape: (42434, 5000), Test shape: (10609, 5000), YTrain shape: (42434,), YTest shape: (10609,)
original_count - Train shape: (42434, 5000), Test shape: (10609, 5000), YTrain shape: (42434,), YTest shape: (10609,)
original_word2vec - Train shape: (42434, 100), Test shape: (10609, 100), YTrain shape: (42434,), YTest shape: (10609,)
origina

In [32]:
# Print shapes of each generated train/test split dataset.
# This repeats the loop at the end of the previous cell. By this point
# `split_datasets` is the dict of (X_train, X_test, y_train, y_test) tuples
# returned by the splitter, keyed by "<variant>_<method>".
for name, (X_train, X_test, y_train, y_test) in split_datasets.items():
    print(f"{name} - Train shape: {X_train.shape}, Test shape: {X_test.shape}, YTrain shape: {y_train.shape}, YTest shape: {y_test.shape}")

original_tfidf - Train shape: (42434, 5000), Test shape: (10609, 5000), YTrain shape: (42434,), YTest shape: (10609,)
original_count - Train shape: (42434, 5000), Test shape: (10609, 5000), YTrain shape: (42434,), YTest shape: (10609,)
original_word2vec - Train shape: (42434, 100), Test shape: (10609, 100), YTrain shape: (42434,), YTest shape: (10609,)
original_doc2vec - Train shape: (42434, 100), Test shape: (10609, 100), YTrain shape: (42434,), YTest shape: (10609,)
lemmatized_tfidf - Train shape: (42434, 5000), Test shape: (10609, 5000), YTrain shape: (42434,), YTest shape: (10609,)
lemmatized_count - Train shape: (42434, 5000), Test shape: (10609, 5000), YTrain shape: (42434,), YTest shape: (10609,)
lemmatized_word2vec - Train shape: (42434, 100), Test shape: (10609, 100), YTrain shape: (42434,), YTest shape: (10609,)
lemmatized_doc2vec - Train shape: (42434, 100), Test shape: (10609, 100), YTrain shape: (42434,), YTest shape: (10609,)
stemmed_tfidf - Train shape: (42434, 5000), Te

In [33]:
# Function to compare models (XGBoost and MLP)
def compare_models(X_train, X_test, y_train, y_test, label_encoder, random_state=42):
    """Fit an XGBoost and an MLP classifier on one split and report on both.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Integer-encoded labels matching the matrices.
        label_encoder: Fitted encoder; its `classes_` supply the report's
            class names and `inverse_transform` decodes the predictions.
        random_state: Seed passed to both models so repeated runs are comparable.

    Returns:
        dict with keys 'XGBoost' and 'MLP'; each value holds
        'predictions' (decoded class names for `X_test`) and
        'classification_report' (the text report).
    """
    # Pass the label ids explicitly so `target_names` always lines up with the
    # encoder's classes, even if a class is missing from this particular split.
    class_ids = list(range(len(label_encoder.classes_)))

    # NOTE(review): `use_label_encoder` is kept from the original call; it is
    # presumably deprecated in recent xgboost releases - confirm and drop if so.
    models = {
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
                                     random_state=random_state),
        'MLP': MLPClassifier(max_iter=500, random_state=random_state),
    }

    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        results[model_name] = {
            # Predictions are returned as the original class names.
            'predictions': label_encoder.inverse_transform(predicted),
            'classification_report': classification_report(
                y_test, predicted, labels=class_ids, target_names=label_encoder.classes_
            ),
        }

    return results

In [35]:
# Store results for each dataset comparison:
# dataset name -> the dict returned by compare_models.
comparison_results = {}

# Loop through each dataset: fit and evaluate both models on every split.
# `split_datasets` is the dict of (X_train, X_test, y_train, y_test) tuples here.
for name, (X_train, X_test, y_train, y_test) in split_datasets.items():
    print(f"Comparing models on dataset: {name}")
    results = compare_models(X_train, X_test, y_train, y_test, label_encoder)
    comparison_results[name] = results

# Print comparison results only after all fits are done, so the reports are
# grouped together instead of interleaved with the progress lines above.
for dataset_name, result in comparison_results.items():
    print(f"\nResults for {dataset_name}:")
    for model_name, metrics in result.items():
        print(f"{model_name} Classification Report:\n{metrics['classification_report']}")

Comparing models on dataset: original_tfidf
Comparing models on dataset: original_count
Comparing models on dataset: original_word2vec
Comparing models on dataset: original_doc2vec




Comparing models on dataset: lemmatized_tfidf
Comparing models on dataset: lemmatized_count
Comparing models on dataset: lemmatized_word2vec
Comparing models on dataset: lemmatized_doc2vec
Comparing models on dataset: stemmed_tfidf
Comparing models on dataset: stemmed_count
Comparing models on dataset: stemmed_word2vec
Comparing models on dataset: stemmed_doc2vec

Results for original_tfidf:
XGBoost Classification Report:
                      precision    recall  f1-score   support

             Anxiety       0.81      0.79      0.80       779
             Bipolar       0.88      0.75      0.81       580
          Depression       0.74      0.73      0.74      3100
              Normal       0.83      0.95      0.88      3327
Personality disorder       0.85      0.54      0.66       248
              Stress       0.68      0.54      0.60       557
            Suicidal       0.68      0.63      0.66      2018

            accuracy                           0.77     10609
           mac

Best Performing Dataset
XGBoost Classifier:

The original_tfidf and original_count datasets both achieved the highest accuracy of 77% and have strong F1-scores across most classes.

The lemmatized_tfidf and lemmatized_count datasets also achieved 77% accuracy, but with lower F1-scores for some classes.

MLP Classifier:

The original_tfidf and original_count datasets achieved the highest accuracy of 74%.

The lemmatized_tfidf and lemmatized_count datasets showed similar performance with 74% accuracy.