Model page: https://huggingface.co/google-bert/bert-base-uncased

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/google-bert/bert-base-uncased)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
# !pip install -U transformers

Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.52.3-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.52.3


In [None]:
import pandas as pd
import numpy as np
import torch
import warnings
warnings.filterwarnings('ignore')

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'GPU is ready? {device}')

GPU is ready? cuda


## Load Data

In [2]:
df = pd.read_csv("depression-dataset-combined.csv")
df

Unnamed: 0,text,is_depression
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1
...,...,...
35703,is that snow,0
35704,moulin rouge mad me cry once again,0
35705,trying to shout but can t find people on the list,0
35706,ughh can t find my red sox hat got ta wear thi...,0


In [4]:
df['is_depression'].value_counts()

is_depression
0    18039
1    17669
Name: count, dtype: int64

**Label Indicator**
- `0` indicating non-depression
- `1` indicating depression

In [5]:
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42
)

# reset index
train = train.reset_index()[['text', 'is_depression']]
test = test.reset_index()[['text', 'is_depression']]

# dataframes to datadict --> nyesuaiin format huggingface
train_ds = Dataset.from_pandas(train)
test_ds = Dataset.from_pandas(test)

dataset = DatasetDict()
dataset['train'] = train_ds
dataset['test'] = test_ds

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'is_depression'],
        num_rows: 28566
    })
    test: Dataset({
        features: ['text', 'is_depression'],
        num_rows: 7142
    })
})

## Word Tokenizing

In [10]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

# tokenize the whole dataset
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

# Rename 'is_depression' to 'labels' for the model
dataset_encoded = dataset_encoded.rename_column('is_depression', 'labels')
dataset_encoded

Map:   0%|          | 0/28566 [00:00<?, ? examples/s]

Map:   0%|          | 0/7142 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 28566
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7142
    })
})

## Using pretrained model

In [7]:
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)
model.to(device) # set device to GPU

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Training the model

In [8]:
def compute_metrics(preds):
    labels = preds.label_ids
    predictions = preds.predictions.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions)
    precision = precision_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    
    return {
        "accuracy-score": accuracy,
        "precision-score": precision,
        "recall-score": recall,
        "f1-score": f1
    }

In [13]:
batch_size = 64
logging_steps = len(dataset_encoded['train']) // batch_size

training_args = TrainingArguments(
    output_dir = 'bert-base-uncased-depression-detector',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # lower batch size if memory issues occur
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=logging_steps,
    logging_strategy="steps",
    logging_first_step=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model= model,
    args= training_args,
    train_dataset = dataset_encoded['train'],
    eval_dataset = dataset_encoded['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/10713 [00:00<?, ?it/s]

{'loss': 0.6933, 'grad_norm': 3.8943581581115723, 'learning_rate': 1.0000000000000001e-07, 'epoch': 0.0}
{'loss': 0.3948, 'grad_norm': 20.534740447998047, 'learning_rate': 4.46e-05, 'epoch': 0.12}
{'loss': 0.3948, 'grad_norm': 20.534740447998047, 'learning_rate': 4.46e-05, 'epoch': 0.12}
{'loss': 0.2895, 'grad_norm': 0.29641634225845337, 'learning_rate': 4.8080877313228236e-05, 'epoch': 0.25}
{'loss': 0.2895, 'grad_norm': 0.29641634225845337, 'learning_rate': 4.8080877313228236e-05, 'epoch': 0.25}
{'loss': 0.2305, 'grad_norm': 20.884241104125977, 'learning_rate': 4.589738568491139e-05, 'epoch': 0.37}
{'loss': 0.2305, 'grad_norm': 20.884241104125977, 'learning_rate': 4.589738568491139e-05, 'epoch': 0.37}
{'loss': 0.2356, 'grad_norm': 6.044149398803711, 'learning_rate': 4.371389405659454e-05, 'epoch': 0.5}
{'loss': 0.2356, 'grad_norm': 6.044149398803711, 'learning_rate': 4.371389405659454e-05, 'epoch': 0.5}
{'loss': 0.2369, 'grad_norm': 4.318938732147217, 'learning_rate': 4.1530402428277

TrainOutput(global_step=10713, training_loss=0.17014115288245804, metrics={'train_runtime': 7404.7332, 'train_samples_per_second': 11.573, 'train_steps_per_second': 1.447, 'total_flos': 2.254809122224128e+16, 'train_loss': 0.17014115288245804, 'epoch': 3.0})

In [14]:
model.save_pretrained('depression-bert-model')
tokenizer.save_pretrained('depression-bert-model')

('depression-bert-model\\tokenizer_config.json',
 'depression-bert-model\\special_tokens_map.json',
 'depression-bert-model\\vocab.txt',
 'depression-bert-model\\added_tokens.json',
 'depression-bert-model\\tokenizer.json')

## Model Evaluation

In [15]:
evaluation_results = trainer.evaluate()
evaluation_results

  0%|          | 0/893 [00:00<?, ?it/s]

{'eval_loss': 0.19086241722106934,
 'eval_accuracy-score': 0.9581349761971436,
 'eval_precision-score': 0.9557670772676372,
 'eval_recall-score': 0.960337552742616,
 'eval_f1-score': 0.9580468640381647,
 'eval_runtime': 239.2396,
 'eval_samples_per_second': 29.853,
 'eval_steps_per_second': 3.733,
 'epoch': 3.0}