In [None]:
import pandas as pd
# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv("cleaned_mental_health_data.csv")

In [None]:
# Print how many rows fall into each value of the `status` column.
print(df["status"].value_counts())

status
Depression    8023
Suicidal      7700
Normal        6242
Anxiety        730
Name: count, dtype: int64


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
statement,0
status,0
cleaned_text,16
sentiment,1


In [None]:
# Summarize the dataframe. `info` is a method, so it has to be called;
# a bare `df.info` only displays the bound-method object.
df.info()

In [None]:
# Inspect the last rows of the dataframe.
df.tail()

Unnamed: 0,statement,status,cleaned_text,sentiment
22690,"TW: Selfharm, mention of suicide This is a thr...",Depression,tw selfharm mention suicide throwaway account ...,-0.8589
22691,"I just SIT, mindless, wasting away Honestly, w...",Depression,sit mindless wasting away honestly wtf wrong f...,-0.2764
22692,I am a 34 year old woman who is a virgin. I ha...,Depression,34 year old woman virgin kissed guy nine year ...,0.0204
22693,No matter what I will feel just so down. Even ...,Depression,matter feel even everything good day keep phon...,-0.0478
22694,My life has taken a huge turn for the better. ...,Depression,life taken huge turn better though high school...,


In [None]:
# Drop rows that are missing `cleaned_text` or `sentiment`, then renumber the index from 0.
df = df.dropna(subset=["cleaned_text","sentiment"]).reset_index(drop=True)

In [None]:
# Re-check the per-column missing-value counts.
df.isnull().sum()

Unnamed: 0,0
statement,0
status,0
cleaned_text,0
sentiment,0


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
import nltk
# Download the NLTK resources for stop words, the Punkt tokenizer and WordNet.
# NOTE(review): newer NLTK releases may additionally require the 'punkt_tab'
# resource for `word_tokenize` — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# English stop words held in a set for membership tests, plus a single
# WordNet lemmatizer instance shared by the preprocessing function.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a raw text string into a space-joined sequence of lemmas.

    The text is lowercased, URL-like substrings are removed, every character
    that is not a word character, whitespace or an apostrophe is stripped,
    and the result is tokenized. Tokens found in `stop_words` are discarded;
    the remaining tokens are lemmatized and joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as a single string.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered, flags=re.MULTILINE)
    stripped = re.sub(r"[^\w\s']", "", without_urls)

    kept = []
    for token in word_tokenize(stripped):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

In [None]:
# Map each class name to an integer id. The class names are sorted first so
# the ids do not depend on the order in which rows happen to appear in `df`;
# the same class always receives the same id across runs and data reorderings.
label_mapping = {label: i for i, label in enumerate(sorted(df["status"].unique()))}
df["label"] = df["status"].map(label_mapping)

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


In [None]:
# 70/30 split into training data and a temporary hold-out pool.
# The split is stratified on the label because the classes are heavily
# imbalanced; without it the smallest class can be under-represented in a split.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["statement"].tolist(),
    df["label"].tolist(),
    test_size=0.3,
    random_state=42,
    stratify=df["label"],
)

In [None]:
# Split the hold-out pool in half into validation and test sets, stratified
# on the labels so both halves keep the same class proportions.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

In [None]:
# Load the pretrained tokenizer for the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the train, validation and test texts with identical settings
# (truncation and padding enabled, at most 128 tokens per example).
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128)
    for split_texts in (train_texts, val_texts, test_texts)
)


In [None]:
# Build batched tf.data pipelines from the tokenizer output and the labels.
# Only the training pipeline is shuffled: Keras ignores `fit(shuffle=...)` for
# tf.data inputs, so without an explicit shuffle every epoch would see the
# exact same batches in the exact same order. The buffer spans the whole
# training set and is seeded for reproducibility.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), np.array(train_labels)))
    .shuffle(len(train_labels), seed=42)
    .batch(8)
)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), np.array(val_labels))).batch(8)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), np.array(test_labels))).batch(8)


In [None]:
# Load the pretrained `bert-base-uncased` checkpoint as a sequence classifier
# with one output per entry in `label_mapping`.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: Adam with a 2e-5 learning rate, sparse categorical
# cross-entropy computed on logits, and accuracy as the tracked metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

In [None]:
# Train for 5 epochs on the training dataset, using `val_dataset` for validation.
model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7e215240e190>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Record the class-name <-> id mapping in the model config before saving, so
# predictions from the reloaded model can be decoded back to class names.
# Without this the mapping built in this notebook is lost once the session ends.
model.config.label2id = dict(label_mapping)
model.config.id2label = {idx: name for name, idx in label_mapping.items()}

# Save the model and tokenizer side by side in one directory on the mounted drive.
save_dir = "/content/drive/MyDrive/mental_health_bert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('mental_health_bert/tokenizer_config.json',
 'mental_health_bert/special_tokens_map.json',
 'mental_health_bert/vocab.txt',
 'mental_health_bert/added_tokens.json')

In [None]:
# Evaluate on the test dataset; the result is unpacked into loss and accuracy,
# and the accuracy is printed as a percentage with two decimals.
test_loss, test_acc = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

Test Accuracy: 79.42%


In [None]:
import pickle

# NOTE(review): unpickling can execute arbitrary code; only load this file if
# its source is trusted.
# This rebinds `df`, replacing the dataframe that was loaded from the CSV earlier.
with open("dataset4.pkl", "rb") as f:
    # encoding="latin1" is presumably needed because of how the pickle was
    # written (e.g. legacy Python 2 / NumPy data) — TODO confirm.
    df = pickle.load(f, encoding="latin1")


  df = pickle.load(f, encoding="latin1")


In [None]:
# Show the (rows, columns) dimensions of the newly loaded dataframe.
df.shape

(58110, 4)

In [None]:
# Inspect the first rows of the dataframe.
df.head()

Unnamed: 0,statement,status,cleaned_text,sentiment
0,oh my gosh,Anxiety,oh gosh,0.0
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...,-0.7269
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...,-0.7351
3,I've shifted my focus to something else but I'...,Anxiety,'ve shifted focus something else 'm still worried,-0.296
4,"I'm restless and restless, it's been a month n...",Anxiety,'m restless restless 's month boy mean,-0.4939


In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments


In [None]:
# Encode the `status` class names as integer ids in a new `label` column.
# The fitted encoder is kept in `label_encoder` so ids can be mapped back to names.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['status'])

In [None]:
from sklearn.model_selection import train_test_split

# First split: 80% training, 20% temporary hold-out, stratified on the label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
# Second split: divide the hold-out evenly into validation and test sets
# (10% of the data each), again stratified on the label.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

In [None]:
# Convert every split from a pandas Series into a plain Python list.
train_texts = train_texts.tolist()
val_texts = val_texts.tolist()
test_texts = test_texts.tolist()

train_labels = train_labels.tolist()
val_labels = val_labels.tolist()
test_labels = test_labels.tolist()

In [None]:
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
# Load the pretrained `roberta-base` tokenizer and a sequence classifier with
# one output per distinct value in the `status` column.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = TFRobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=df['status'].nunique(),
)


def tokenize_function(texts):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Truncation is enabled and padding uses the "max_length" strategy with
    `max_length=128`.

    Args:
        texts: The texts to tokenize.

    Returns:
        Whatever the tokenizer call returns for these settings.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(texts, **encode_options)

# Tokenize the train, validation and test texts with the same settings
# by routing each split through `tokenize_function`.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings=tokenize_function(test_texts)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [None]:
import tensorflow as tf
# Build batched tf.data pipelines from the tokenizer output and the labels.
# Only the training pipeline is shuffled: Keras ignores `fit(shuffle=...)` for
# tf.data inputs, so without an explicit shuffle every epoch would see the
# exact same batches in the exact same order. The buffer spans the whole
# training set and is seeded for reproducibility.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), np.array(train_labels)))
    .shuffle(len(train_labels), seed=42)
    .batch(8)
)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), np.array(val_labels))).batch(8)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), np.array(test_labels))).batch(8)

In [None]:
# Training configuration: Adam with a 2e-5 learning rate, sparse categorical
# cross-entropy computed on logits, and accuracy as the tracked metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

In [None]:
# Train for 5 epochs on the training dataset, using `val_dataset` for validation.
model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x781ae4ea1fd0>