# Preprocessing

In [None]:
import pandas as pd

mental_subs = ['depression', 'healthanxiety', 'suicidewatch', 'ptsd', 'bipolarreddit', 'mentalhealth']
control_subs = ['parenting', 'relationships', 'jokes', 'fitness', 'teaching', 'conspiracy']

dataframes = []

for sub in mental_subs + control_subs:
  df = pd.read_csv(f"./{sub}_2019_features_tfidf_256.csv")

  df.rename(columns={'post': 'text'}, inplace=True)
  df = df[df['text'].str.strip().astype(bool)] #rename empty posts-just in case

  df['subreddit'] = sub
  df['label'] = 1 if sub in mental_subs else 0

  dataframes.append(df[['text', 'label', 'subreddit']])

df_all = pd.concat(dataframes, ignore_index=True)
df_all = df_all.sample(frac=1, random_state=42)
df_all.to_csv("bert_reddit_mental_health_2019.csv", index=False)

print(df_all['label'].value_counts())

In [None]:
import re

def clean_text(text):
    text = re.sub(r"http\S+|www.\S+", "", text)           #remove URLs
    text = re.sub(r"<.*?>", "", text)                     #remove HTML tags
    text = re.sub(r"[^A-Za-z0-9\s.,!?']", " ", text)      #keep basic punctuation
    text = re.sub(r"\s+", " ", text)                      #remove extra spaces
    return text.strip().lower()                           #strip and lowercase

df_all['text'] = df_all['text'].astype(str).apply(clean_text)

In [None]:
from sklearn.utils import resample

df_majority = df_all[df_all['label'] == 0]
df_minority = df_all[df_all['label'] == 1]

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

df_balanced.to_csv("bert_clean_balanced_reddit.csv", index=False)
print(df_balanced['label'].value_counts())

# BERT Training

In [5]:
!pip install transformers datasets scikit-learn
!pip install --upgrade transformers

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine =

In [1]:
import transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import Dataset

print(transformers.__version__)
df = pd.read_csv("/kaggle/input/redditmentalhealth-dataset/bert_clean_balanced_reddit.csv")

df_0 = df[df['label']==0].sample(n=60000,random_state=42)
df_1 = df[df['label']==1].sample(n=60000,random_state=42)
df_small = pd.concat([df_0,df_1]).sample(frac=1,random_state=42).reset_index(drop=True)
df_small.to_csv("bert_small_10.csv", index=False)

df_1 = pd.read_csv("bert_small_10.csv")

train_texts, test_texts, train_labels, test_labels = train_test_split(df_1['text'].tolist(), df_1['label'].tolist(), test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)

train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels})


4.51.3


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments
print(TrainingArguments.__module__)
help(TrainingArguments)

In [None]:
import torch
import os
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased", hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3, num_labels=2)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

for name, param in model.bert.named_parameters():
    if any(f"layer{i}." in name for i in range(8)):
        param.requires_grad = False

early_stopping = EarlyStoppingCallback(early_stopping_patience=1)
def compute_metrics(p: EvalPrediction):
  preds = torch.argmax(torch.tensor(p.predictions), axis=1)
  labels = torch.tensor(p.label_ids)
  precision, recall, f1, _ = precision_recall_fscore_support(labels,preds, average='binary')
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

os.environ["WANDB_DISABLED"] = "true"
training_args = TrainingArguments(
    output_dir="./bert_mh_results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    save_total_limit=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    learning_rate = 2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

trainer.train()

2025-07-07 00:11:35.191168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751847095.674142      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751847095.796065      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1197,0.087362,0.972375,0.973583,0.971066,0.972323
2,0.0839,0.089921,0.973083,0.961523,0.985575,0.9734




In [None]:
trainer.save_model("/kaggle/working/bert_mental_health_model")
tokenizer.save_pretrained("/kaggle/working/bert_mental_health_model")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids

print(classification_report(y_true,y_pred))

#7 points 
train_loss = [log['loss'] for log in trainer.state.log_history if 'loss' in log and 'epoch' in log]
train_epochs = [log['epoch'] for log in trainer.state.log_history if 'loss' in log and 'epoch' in log]
#3 points
eval_loss = [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log and 'epoch' in log]
eval_epochs = [log['epoch'] for log in trainer.state.log_history if 'eval_loss' in log and 'epoch' in log]

plt.plot(train_epochs, train_loss, label='Train Loss', marker='o')
plt.plot(eval_epochs, eval_loss, label='Eval Loss', marker='x')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("BERT Training vs. Evaluation Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from wordcloud import WordCloud

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Neutral", "Mental Health"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

mh_text = df_1[df_1['label'] == 1]['text'].str.cat(sep=' ')
neutral_text = df_1[df_1['label'] == 0]['text'].str.cat(sep=' ')

mh_wc = WordCloud(width=800, height=400, background_color='white').generate(mh_text)
neutral_wc = WordCloud(width=800, height=400, background_color='white').generate(neutral_text)

plt.figure(figsize=(16,8))
plt.subplot(1,2,1)
plt.imshow(mh_wc, interpolation='bilinear')
plt.axis("off")
plt.title("Mental Health Class")

plt.subplot(1,2,2)
plt.imshow(neutral_wc, interpolation='bilinear')
plt.axis("off")
plt.title("Neutral Class")
plt.show()