<a href="https://colab.research.google.com/github/CombustingRats/mental_health_classifier/blob/main/mental_health_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Dependencies

In [15]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoConfig, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, ClassLabel, load_metric
import torch
import pandas as pd
import numpy as np

In [53]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [18]:
mental_health_ds = load_dataset('solomonk/reddit_mental_health_posts')

Using custom data configuration solomonk--reddit_mental_health_posts-954e1c5cc1be8399
Reusing dataset csv (/root/.cache/huggingface/datasets/solomonk___csv/solomonk--reddit_mental_health_posts-954e1c5cc1be8399/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
mental_health_ds

DatasetDict({
    train: Dataset({
        features: ['author', 'body', 'created_utc', 'id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', 'url'],
        num_rows: 151288
    })
})

## Basic cleaning

In [20]:
df = pd.DataFrame(mental_health_ds['train'])

In [21]:
df = df[df['body'] != '[removed]']

In [22]:
df.isnull().sum()

author             0
body            1609
created_utc        0
id                 0
num_comments       0
score              0
subreddit          0
title              0
upvote_ratio       0
url                0
dtype: int64

In [23]:
df['subreddit'].unique()

array(['ADHD', 'aspergers', 'depression', 'OCD', 'ptsd'], dtype=object)

In [24]:
df.dropna(inplace=True, axis=0)

In [25]:
mental_health_ds = Dataset.from_pandas(df)
mental_health_ds

Dataset({
    features: ['author', 'body', 'created_utc', 'id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', 'url', '__index_level_0__'],
    num_rows: 125653
})

In [26]:
mental_health_ds = mental_health_ds.train_test_split(test_size=0.3)
mental_health_ds

DatasetDict({
    train: Dataset({
        features: ['author', 'body', 'created_utc', 'id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', 'url', '__index_level_0__'],
        num_rows: 87957
    })
    test: Dataset({
        features: ['author', 'body', 'created_utc', 'id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', 'url', '__index_level_0__'],
        num_rows: 37696
    })
})

In [27]:
print(mental_health_ds['train'][1]['body'])

Hi all!!! I was diagnosed as bipolar 2 and put in the mood stabilizer Lamictal (100 mg) about 3 years now and it has been an absolute life changer. I’m stable with very little side effects. I just had a visit with my psychiatrist and was diagnosed with ADHD as well. Because I’m bipolar and recovering from an eating disorder she was hesitant to prescribe any stimulants. Instead she’s having me start on 25 mg of Strattera. Has anyone been on a combination of these two medications or just Strattera?? I’m excited that I’m finally getting help for something that makes living so god damn hard but I’m not looking forward to the battle of finding the right medication/dose. Tell me your experiences! :)


## Tokenize Text

In [28]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentence = tokenizer(mental_health_ds['train'][3]['body'])
print(tokenized_sentence)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

{'input_ids': [101, 4931, 4364, 2003, 5760, 1051, 19797, 2204, 9526, 3085, 2043, 2017, 2288, 2009, 3232, 3134, 3283, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [29]:
print(tokenizer.convert_ids_to_tokens(tokenized_sentence['input_ids']))

['[CLS]', 'hey', 'guys', 'is', 'pure', 'o', '##cd', 'good', 'cure', '##able', 'when', 'you', 'got', 'it', 'couple', 'weeks', 'ago', '?', '[SEP]']


In [30]:
str_to_int = {
    'ADHD' : 0,
    'aspergers': 1,
    'depression': 2,
    'OCD': 3,
    'ptsd': 4
}

int_to_str = {item: key for key,item in str_to_int.items()}

In [31]:
def tokenize_function(batch):
    tokenized_batch = tokenizer(batch["body"], truncation=True)
    tokenized_batch['label'] = [str_to_int[label] for label in batch['subreddit']]
    return tokenized_batch

In [32]:
tokenized_dataset = mental_health_ds.map(tokenize_function, batched=True)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/38 [00:00<?, ?ba/s]

In [33]:
# Casting the label column to Classlabel

tokenized_dataset = tokenized_dataset.cast_column('label', ClassLabel(num_classes=5, names=list(str_to_int.keys())))

Casting the dataset:   0%|          | 0/9 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [34]:
tokenized_dataset['train']['label'][0]

0

## Padding

In [35]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [36]:
tokenized_dataset['train'][3]['body'], tokenized_dataset['train'][3]['label']

('Hey guys\nIs Pure OCD good cureable when you got it couple weeks ago?', 3)

In [37]:
samples = tokenized_dataset['train'][:10]
samples = {k:v for k,v in samples.items() if k in ['input_ids', 'token_type_ids', 'attention_mask','label']}

In [38]:
[len(x) for x in samples['input_ids']]

[95, 168, 5, 19, 202, 197, 5, 280, 103, 329]

In [39]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [40]:
batch = data_collator(samples)

In [41]:
{k : v.shape for k,v in batch.items()}

{'attention_mask': torch.Size([10, 329]),
 'input_ids': torch.Size([10, 329]),
 'labels': torch.Size([10]),
 'token_type_ids': torch.Size([10, 329])}

In [42]:
batch['input_ids']

tensor([[  101,  1045,  2109,  ...,     0,     0,     0],
        [  101,  7632,  2035,  ...,     0,     0,     0],
        [  101,  1031, 17159,  ...,     0,     0,     0],
        ...,
        [  101,  2823,  1045,  ...,     0,     0,     0],
        [  101,  1045,  2428,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,  2870,  1012,   102]])

In [43]:
tokenized_dataset = tokenized_dataset.remove_columns(['author', 'body', 'created_utc', 'id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', 'url', '__index_level_0__'])

In [44]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 87957
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 37696
    })
})

## Model Building

In [45]:
train_sample = tokenized_dataset['train'][:10000]
test_sample = tokenized_dataset['test'][:100]

In [46]:
tokenized_dataset['train'].select(list(range(0,1000)))

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 1000
})

In [47]:
training_args = TrainingArguments('mental_health_trainer', 
                                  save_strategy='epoch', 
                                  push_to_hub=True)

In [48]:
config = AutoConfig.from_pretrained(checkpoint, label2id=str_to_int, id2label=int_to_str)

In [49]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                           config=config,)
                                                           #num_labels=5)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [50]:
tokenized_dataset['train']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 87957
})

In [51]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='micro')

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [54]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_dataset['train'].select(list(range(0,10000))),
    eval_dataset = tokenized_dataset['test'].select(list(range(0,100))),
    data_collator = data_collator,
    tokenizer = tokenizer
    # compute_metrics = compute_metrics
)

# model.config.pad_token_id = model.config.eos_token_id

Cloning https://huggingface.co/edmundhui/mental_health_trainer into local empty directory.


In [55]:
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3750


Step,Training Loss
500,1.1655
1000,0.8943
1500,0.8164
2000,0.7294
2500,0.7397
3000,0.6137
3500,0.569


Saving model checkpoint to mental_health_trainer/checkpoint-1250
Configuration saved in mental_health_trainer/checkpoint-1250/config.json
Model weights saved in mental_health_trainer/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in mental_health_trainer/checkpoint-1250/tokenizer_config.json
Special tokens file saved in mental_health_trainer/checkpoint-1250/special_tokens_map.json
tokenizer config file saved in mental_health_trainer/tokenizer_config.json
Special tokens file saved in mental_health_trainer/special_tokens_map.json
Saving model checkpoint to mental_health_trainer/checkpoint-2500
Configuration saved in mental_health_trainer/checkpoint-2500/config.json
Model weights saved in mental_health_trainer/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in mental_health_trainer/checkpoint-2500/tokenizer_config.json
Special tokens file saved in mental_health_trainer/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to mental_health_trainer/che

TrainOutput(global_step=3750, training_loss=0.7743068562825521, metrics={'train_runtime': 2309.9039, 'train_samples_per_second': 12.988, 'train_steps_per_second': 1.623, 'total_flos': 6091698900663600.0, 'train_loss': 0.7743068562825521, 'epoch': 3.0})

In [56]:
trainer.push_to_hub()

Saving model checkpoint to mental_health_trainer
Configuration saved in mental_health_trainer/config.json
Model weights saved in mental_health_trainer/pytorch_model.bin
tokenizer config file saved in mental_health_trainer/tokenizer_config.json
Special tokens file saved in mental_health_trainer/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/418M [00:00<?, ?B/s]

Upload file runs/Jun08_20-35-23_0f8f2f4d7ead/events.out.tfevents.1654720599.0f8f2f4d7ead.70.0:  66%|######5   …

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/edmundhui/mental_health_trainer
   b12c2b3..553db7a  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}}
remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/edmundhui/mental_health_trainer
   553db7a..916f822  main -> main



'https://huggingface.co/edmundhui/mental_health_trainer/commit/553db7a1662dc7bab92f887e1cffa24b542f019b'

## Testing

In [57]:
test_set = tokenized_dataset['test'].select(list(range(0,100)))

In [58]:
len(test_set)

100

In [59]:
test_set['label']

[1,
 3,
 3,
 1,
 4,
 4,
 2,
 0,
 3,
 4,
 2,
 4,
 4,
 0,
 0,
 0,
 2,
 0,
 1,
 2,
 4,
 4,
 2,
 0,
 0,
 4,
 0,
 1,
 0,
 1,
 1,
 0,
 3,
 0,
 3,
 3,
 3,
 4,
 3,
 3,
 2,
 0,
 1,
 3,
 1,
 1,
 4,
 0,
 3,
 3,
 0,
 4,
 0,
 1,
 3,
 3,
 0,
 4,
 3,
 3,
 3,
 4,
 3,
 4,
 3,
 3,
 1,
 1,
 1,
 0,
 3,
 1,
 3,
 3,
 1,
 3,
 2,
 4,
 4,
 4,
 0,
 0,
 0,
 3,
 1,
 2,
 0,
 3,
 4,
 1,
 3,
 0,
 3,
 2,
 0,
 3,
 3,
 4,
 0,
 3]

In [60]:
predictions = trainer.predict(test_set)

***** Running Prediction *****
  Num examples = 100
  Batch size = 8


In [61]:
preds = np.argmax(predictions.predictions, axis=-1)

In [62]:
sum(preds == test_set['label'])

74

In [63]:
test_set

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 100
})

In [64]:
ground_truth = [int_to_str[label] for label in test_set['label']]

In [65]:
test_sentences = []
for i in range(len(test_set)):
  test_sentences.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(test_set[i]['input_ids'])))

In [67]:
str_preds = [int_to_str[pred] for pred in preds]

In [68]:
test_dataframe = pd.DataFrame({"sentence":test_sentences, "prediction":str_preds, "ground_truth": ground_truth})

In [69]:
test_dataframe.head()

Unnamed: 0,sentence,prediction,ground_truth
0,[CLS] [ deleted ] [SEP],OCD,aspergers
1,[CLS] [ deleted ] [SEP],OCD,OCD
2,[CLS] [ deleted ] [SEP],OCD,OCD
3,"[CLS] hello all, my name is james. i am a 23 y...",aspergers,aspergers
4,[CLS] i assume this is because of all the abus...,ptsd,ptsd


In [None]:
test_dataframe.iloc[14]['sentence']

In [None]:
test_dataframe.loc[test_dataframe['prediction'] != test_dataframe['ground_truth']]

In [None]:
tokenizer.convert_ids_to_tokens(test_set[0]['input_ids'])

## Using Pipeline

In [None]:
classifier = pipeline(model="edmundhui/mental_health_trainer")

In [None]:
classifier("I have ADHD")