***Introduction & Dataset Overview***

1.   **Dataset:** Mental Health Sentiment analysis.
2.   **Goal:** Classify text data into mental-health sentiment categories.
3.   **Problem:** Multi-class classification.
4.   **Target labels:** Anxiety, Stress, Suicidal, Normal, Depression, Bipolar, Personality disorder.
5.   **Link for the dataset:** https://huggingface.co/datasets/AhmedSSoliman/sentiment-analysis-for-mental-health-Combined-Data


***Importing necessary packages.***

In [None]:
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
import re
import random
import nltk
import torch

from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Download the NLTK resources used further down: the 'punkt_tab' tokenizer data
# (presumably required by word_tokenize) and the 'stopwords' corpus
# (read via stopwords.words('english')).
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

***Data Loading & Cleaning***



***1. Reading Data from External Site***

In [None]:
# Load dataset (see the Hugging Face link in the introduction)
dataset = load_dataset("AhmedSSoliman/sentiment-analysis-for-mental-health-Combined-Data")

# Convert the "train" split to a DataFrame, drop the "Unnamed: 0" column and
# display 5 randomly sampled rows (sample() is called without a seed, so the
# rows shown change from run to run)
data = pd.DataFrame(dataset["train"]).drop(columns=["Unnamed: 0"])
print(data.sample(5))

(…)is-for-mental-health-Combined%20Data.csv:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/53043 [00:00<?, ? examples/s]

                                               statement      status
33794                     my doctor said i need surgery.      Normal
33053                              are your nails clean?      Normal
37039  sooo cool thanks so much, our stuff just shipp...      Normal
8731   Is anybody else here plagued by constant, stre...  Depression
15192  I have been prepping to kill myself for the la...    Suicidal


***2. Checking the dimensions of the DataFrame (number of rows and columns).***

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

(53043, 2)

***3. Checking whether null values are present in each column of the DataFrame and removing them***

In [None]:
# Number of missing values in each column
data.isnull().sum()

Unnamed: 0,0
statement,362
status,0


In [None]:
# Remove every row that contains a null value (this mutates `data` in place),
# then re-count the nulls per column to confirm that none remain.
data.dropna(inplace = True)
data.isna().sum()

Unnamed: 0,0
statement,0
status,0


***4. Preprocessing the text data by converting statements to lowercase.***

In [None]:
# Lowercase every statement (overwrites the 'statement' column), then preview
# 5 randomly sampled rows.
data['statement']=data['statement'].str.lower()
data.sample(5)

Unnamed: 0,statement,status
9696,i do not see any other option. i cannot achiev...,Suicidal
3843,you have to be extra careful in choosing a rel...,Normal
37875,i live at home and i love my parent but i m be...,Depression
46260,suicide hotline doesn't answer. i've exhausted...,Bipolar
43238,onlinemigration me too although im still going...,Normal


***5. Cleaning the 'statement' column using regex patterns.***

In [None]:
def remove_expression(text):
    """Strip links, @handles and punctuation from a single statement.

    Steps, applied in this order:
      1. markdown-style links ``[label](target)`` are removed entirely;
      2. bare ``http://`` / ``https://`` URLs are removed;
      3. ``@handle`` mentions are removed;
      4. every remaining character that is neither a word character nor
         whitespace is removed.

    Markdown links are handled before bare URLs on purpose: the URL pattern
    consumes everything up to the next whitespace, including a link's closing
    ``)``. Running it first leaves a dangling ``[label](`` that either
    survives as plain text or pairs up with a later, unrelated ``)`` and
    deletes the text in between.

    Args:
        text: The statement to clean (a ``str``).

    Returns:
        The cleaned statement with leading and trailing whitespace stripped.
        Inner whitespace is left as is, so removed spans may leave double
        spaces behind.
    """
    # Remove markdown-style links (must run before the bare-URL pattern)
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove handles (that start with '@')
    text = re.sub(r'@\w+', '', text)
    # Remove punctuation and other special characters
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

# Apply the function to the 'statement' column
# (overwrites the column with the cleaned text, then previews 5 random rows)
data['statement'] = data['statement'].apply(remove_expression)
data.sample(5)

Unnamed: 0,statement,status
52588,the girl i like also suffers from anxiety the ...,Anxiety
25533,after abt 34 months of feeling ok i relapsed a...,Depression
44567,i have to wake up in hour laameeee,Normal
37402,thousands of dead children as he put it sigh,Suicidal
44921,just a head up site s being wonky so will like...,Normal


***6. Tokenizing the statements, removing stopwords, and stemming the tokens with the Porter stemmer***

In [None]:
# Apply word_tokenize to each element in the 'statement' column (tokenization)
# and store the resulting token lists in a new 'tokens_list' column.
data['tokens_list'] = data['statement'].apply(word_tokenize)
data.sample(5)

Unnamed: 0,statement,status,tokens_list
2464,cant get it thrðÿ,Normal,"[cant, get, it, thrðÿ]"
43403,djginaturner no le gusta house of house min intro,Normal,"[djginaturner, no, le, gusta, house, of, house..."
28792,link to donate url below you can find the miss...,Normal,"[link, to, donate, url, below, you, can, find,..."
46507,current status manic sleep deprived tired but ...,Bipolar,"[current, status, manic, sleep, deprived, tire..."
46928,advice for terminal insomnia im recently out o...,Bipolar,"[advice, for, terminal, insomnia, im, recently..."


In [None]:
# Load the English stop-word list and initialize the Porter stemmer
stopwords_list = stopwords.words('english')
# Frozen-set copy used for the per-token membership test below; the list is
# kept under its original name for any code that still relies on it.
stopwords_set = frozenset(stopwords_list)
stemmer = PorterStemmer()

# NOTE(review): punctuation was stripped before this step, so contractions now
# look like "didnt" / "ive" and pass through the stop-word filter (see the
# sampled rows) - presumably because the stop-word list spells them with an
# apostrophe. Confirm whether that is intended.

# Function to stem tokens and convert them to strings
def stem_tokens_list(tokens_list):
    """Drop stop words, stem the remaining tokens and join them with spaces.

    Args:
        tokens_list: Iterable of tokens for one statement.

    Returns:
        A single string containing the stemmed, non-stop-word tokens in their
        original order, separated by single spaces.
    """
    return ' '.join(
        stemmer.stem(str(token))
        for token in tokens_list
        if token not in stopwords_set
    )

# Apply the function to the 'tokens_list' column
data['stemmed_tokens'] = data['tokens_list'].apply(stem_tokens_list)
data.sample(5)

Unnamed: 0,statement,status,tokens_list,stemmed_tokens
51011,the sound of my phone dinging makes me feel si...,Personality disorder,"[the, sound, of, my, phone, dinging, makes, me...",sound phone ding make feel sick someon call fe...
37052,i think ive run out of jack bauer jokes or pau...,Normal,"[i, think, ive, run, out, of, jack, bauer, jok...",think ive run jack bauer joke paul oconnel one
30682,no i didnt feel like it,Normal,"[no, i, didnt, feel, like, it]",didnt feel like
21122,i broke up with my girlfriend of almost eight ...,Suicidal,"[i, broke, up, with, my, girlfriend, of, almos...",broke girlfriend almost eight month today anni...
6195,â lets be mutual with gamoras father cung,Normal,"[â, lets, be, mutual, with, gamoras, father, c...",â let mutual gamora father cung


***7. Encoding the Status Categories to Label***

In [None]:
# Replace the string labels in 'status' with the integer codes produced by
# LabelEncoder; the fitted encoder stays available as `Lb`.
# NOTE(review): 'status' is overwritten in place, so re-running this cell would
# refit the encoder on the integer codes instead of the original label strings.
Lb = LabelEncoder()
data['status'] = Lb.fit_transform(data['status'])
print(data['status'].unique())

[0 3 2 6 5 1 4]


In [None]:
# Count the rows of every encoded status in one pass instead of filtering the
# whole frame once per class. The status codes are taken from the data itself,
# so nothing here depends on a hard-coded number of categories.
status_counts = data['status'].value_counts().sort_index()
row_counts = status_counts.tolist()  # counts ordered by ascending status code

for status_code, count in status_counts.items():
    print(f"Status {status_code}: {count} rows")

# Find the maximum value
max_rows = max(row_counts)
print(f"The maximum number of rows is: {max_rows}")

Status 0: 3841 rows
Status 1: 2777 rows
Status 2: 15404 rows
Status 3: 16343 rows
Status 4: 1077 rows
Status 5: 2587 rows
Status 6: 10652 rows
The maximum number of rows is: 16343


In [None]:
# Model input: the stemmed, stop-word-filtered text.
# Target: the integer-encoded status.
x = data["stemmed_tokens"]
y = data["status"]
y.unique()

array([0, 3, 2, 6, 5, 1, 4])

In [None]:
# Turn the integer labels into a one-hot matrix (one column per class)
y_encode = to_categorical(y)
y_encode

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

***9. Splitting the data into training, validation and test sets***

In [None]:
# First split: 70% train / 30% hold-out. Second split: the hold-out is cut in
# half, giving validation and test sets of roughly 15% each.
# random_state is fixed on both calls so the split is reproducible.
# NOTE(review): the per-status row counts printed earlier are heavily
# imbalanced and neither call passes `stratify=`, so class proportions may
# differ between the three subsets - consider a stratified split.
x_train, x_temp, y_train, y_temp = train_test_split(x, y_encode, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)
x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape

((36876,), (7902,), (7903,), (36876, 7), (7902, 7), (7903, 7))

***10. Initializing the tokenizer and fitting it to the training data.***

In [None]:
# Fit the tokenizer on the training texts only, so the vocabulary is not
# influenced by the validation or test sets.
# NOTE(review): no `oov_token` is configured - presumably words that never
# occur in the training texts are dropped when the other splits are encoded;
# confirm this is acceptable.
tk = Tokenizer()
tk.fit_on_texts(x_train)

***11. Convert the text data to sequences of integers.***

In [None]:
# Encode each split with the tokenizer fitted on the training texts; every
# text becomes a sequence of integers.
x_train_number, x_val_number, x_test_number = (
    tk.texts_to_sequences(split_texts)
    for split_texts in (x_train, x_val, x_test)
)

***12. Pad sequences to make them have the same length***

In [None]:
# Common length that every encoded sequence is brought to
max_len = 100

# Pad each split at the end ("post") so all sequences share the same length.
x_train_pad, x_val_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_len, padding="post")
    for sequences in (x_train_number, x_val_number, x_test_number)
)
x_train_pad.shape, x_val_pad.shape, x_test_pad.shape

((36876, 100), (7902, 100), (7903, 100))

***Convert Dataset into Tensor Format.***

In [None]:
# Convert to PyTorch tensors: the padded index sequences become long tensors,
# the label matrices become float32 tensors.
x_train_tensor, x_val_tensor, x_test_tensor = (
    torch.tensor(padded, dtype=torch.long)
    for padded in (x_train_pad, x_val_pad, x_test_pad)
)
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.float32)
    for labels in (y_train, y_val, y_test)
)

# Check the shapes of the tensors (one line per tensor, train -> val -> test)
print("\n".join(
    f"{tensor_name} shape: {tensor_value.shape}"
    for tensor_name, tensor_value in (
        ("x_train_tensor", x_train_tensor),
        ("y_train_tensor", y_train_tensor),
        ("x_val_tensor", x_val_tensor),
        ("y_val_tensor", y_val_tensor),
        ("x_test_tensor", x_test_tensor),
        ("y_test_tensor", y_test_tensor),
    )
))

x_train_tensor shape: torch.Size([36876, 100])
y_train_tensor shape: torch.Size([36876, 7])
x_val_tensor shape: torch.Size([7902, 100])
y_val_tensor shape: torch.Size([7902, 7])
x_test_tensor shape: torch.Size([7903, 100])
y_test_tensor shape: torch.Size([7903, 7])


***Storing the processed tensor data in Drive for future use.***

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
# force_remount=True presumably re-mounts even when the drive is already
# mounted - confirm against the Colab documentation.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Target folder inside the mounted Google Drive
save_path = "/content/drive/My Drive/Colab Notebooks/tensor_data/"

# Create the folder (and any missing parents) if it is not there yet
os.makedirs(save_path, exist_ok=True)

# Write each tensor to its own .pt file, in train -> val -> test order
tensors_to_save = {
    "x_train_tensor.pt": x_train_tensor,
    "y_train_tensor.pt": y_train_tensor,
    "x_val_tensor.pt": x_val_tensor,
    "y_val_tensor.pt": y_val_tensor,
    "x_test_tensor.pt": x_test_tensor,
    "y_test_tensor.pt": y_test_tensor,
}
for file_name, tensor_to_save in tensors_to_save.items():
    torch.save(tensor_to_save, save_path + file_name)

print("Tensors saved successfully in Google Drive!")

Tensors saved successfully in Google Drive!
