In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(context='notebook', style='whitegrid')

from tqdm.auto import tqdm
tqdm.pandas()
from time import sleep

In [None]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

## Data Loading and Preprocessing

In [None]:
# Load the raw dataset, preview its first rows, then print the column/dtype summary.
df = pd.read_csv(filepath_or_buffer='Mental Health Dataset.csv')
display(df.head(n=5))
df.info()

Unnamed: 0,posts,predicted,intensity
0,I know as parent of child with down syndrome t...,negative,-1
1,but in my heart I know this is the future prom...,neutral,0
2,I have mylefibrosis which turn to leukemia the...,negative,-1
3,from one of my health group subject wayne dyer...,neutral,0
4,gmos now link to leukemia http nsnbc I 2013 07...,neutral,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10392 entries, 0 to 10391
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   posts      10391 non-null  object
 1   predicted  10392 non-null  object
 2   intensity  10392 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 243.7+ KB


### Drop unused columns and use non-negative labels

In [None]:
# Keep only the post text and its intensity score, discard rows with missing
# values, and switch to the column names used in the rest of the notebook.
df = df.drop(columns=['predicted']).dropna()
df = df.rename(columns={'posts': 'text', 'intensity': 'sentiment'})

In [None]:
# Shift the intensity scores into non-negative class ids:
# -2 -> 0, -1 -> 1, 0 -> 2, 1 -> 3. Values outside the mapping are left as-is.
df['sentiment'] = df['sentiment'].replace({1: 3, 0: 2, -1: 1, -2: 0})

In [None]:
# Human-readable class name -> integer class id (ids follow the listed order).
label_dict = {
    name: class_id
    for class_id, name in enumerate(("very negative", "negative", "neutral", "positive"))
}

### Clean text (remove links, HTML tags, punctuation, numbers and extra whitespace; convert to lower case)

In [None]:
import re
import string

In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /home/cb4344/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cb4344/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Patterns and tables used by clean_text, built once instead of on every call.
_LINK_RE = re.compile(r'https?\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_NUMERIC_WORD_RE = re.compile(r'\w*\d\w*')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw post before tokenization.

    Steps, in order: strip links, strip HTML tags, delete ASCII punctuation,
    drop numbers and any word containing a digit, collapse whitespace runs
    (including newlines) to a single space, then lower-case and trim.

    Stop words are intentionally kept; they are not removed for BERT.

    Args:
        text: The raw post as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # Remove links ("http"/"https" followed by any non-whitespace characters).
    text = _LINK_RE.sub('', text)

    # Remove HTML tags.
    text = _HTML_TAG_RE.sub('', text)

    # Remove punctuation (characters are deleted, not replaced by a space).
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove numbers and words containing numbers.
    text = _NUMERIC_WORD_RE.sub('', text)

    # Collapse all whitespace, newlines included, into single spaces.
    # Newlines must NOT be deleted outright beforehand: doing so glues the last
    # word of one line to the first word of the next ("foo\nbar" -> "foobar").
    text = _WHITESPACE_RE.sub(' ', text)

    return text.lower().strip()

# Clean every post in place, one string at a time.
df['text'] = df['text'].map(clean_text)

In [None]:
import statistics
import numpy as np

# Character length of every cleaned post, computed once and reused below.
text_lengths = [len(text) for text in df['text']]

third_quartile = np.percentile(text_lengths, 75)
print("3rd Quartile:", third_quartile)
statistics.median(text_lengths)

3rd Quartile: 1327.5


777

In [None]:
# Not needed for BERT (stemming / stop-word removal below is kept for reference only)
# def preprocess_text(text):
#     tokens = word_tokenize(text)
#     stemmer = PorterStemmer()

#     additional_stop_words = [
#         'cancer',
#         'go',
#         'get',
#         'year',
#         'know',
#     ]

#     stop_words = set(stopwords.words('english') + additional_stop_words)

#     filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
#     return ' '.join(filtered_tokens)

# df['text'] = df['text'].apply(preprocess_text)

### Train, Val, Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the test set, preserving the class proportions.
X, y = df['text'], df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Re-attach the labels to their texts so the training portion can be split again.
train_df = pd.concat((X_train, y_train), axis='columns')
test_df = pd.concat((X_test, y_test), axis='columns')

In [None]:
# Carve a stratified 20% validation set out of the remaining training data.
X, y = train_df['text'], train_df['sentiment']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
def _as_split_frame(texts, labels, split_name):
    """Join texts with their labels, renumber rows from 0 and tag the split name."""
    return (pd.concat([texts, labels], axis=1)
              .reset_index(drop=True)
              .assign(data_type=split_name))


train_df = _as_split_frame(X_train, y_train, 'train')
val_df = _as_split_frame(X_val, y_val, 'val')
test_df = _as_split_frame(X_test, y_test, 'test')

# Stack the three subsets back into one frame and show the class counts per split.
df = pd.concat([train_df, val_df, test_df])
df.groupby(['sentiment', 'data_type']).size()

sentiment  data_type
0          test          231
           train         739
           val           185
1          test          823
           train        2631
           val           658
2          test          875
           train        2799
           val           700
3          test          150
           train         480
           val           120
dtype: int64

## Find CUDA device

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: Tesla V100-SXM2-32GB


## Define metrics

In [None]:
# Test 4
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for every class present in `labels`, correct predictions over class size.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of true class ids (keys of the inverted `label_dict`).
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        n_total = int(np.count_nonzero(in_class))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

## BERT Tokenization

In [None]:
# Sequence length every post is padded/truncated to (modify MAX_LEN here).
MAX_LEN = 512

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_split(split_name):
    """Tokenize all texts of one split and return (encodings, label tensor).

    Args:
        split_name: Value of `df.data_type` selecting the rows ('train', 'val', ...).

    Returns:
        A tuple of the tokenizer output (with 'input_ids' and 'attention_mask'
        as PyTorch tensors) and a tensor holding the split's sentiment labels.
    """
    split = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        split.text.tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length=True` is deprecated; this is its explicit replacement.
        padding='max_length',
        # Truncate explicitly instead of relying on the implicit fallback that
        # only kicks in (with a warning) because `max_length` is given.
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='pt',
    )
    labels = torch.tensor(split.sentiment.values)
    return encoded, labels


encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced class weights derived from the training labels, moved to the
# training device so they can be handed to the loss function.
train_label_array = labels_train.numpy()
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_label_array),
    y=train_label_array,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)

## BERT Model

In [None]:
%%time
# Test 5
import random

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
seed_val = 42
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

def evaluate(dataloader_val):
    """Run the global `model` over a dataloader without gradient tracking.

    Each batch is expected to provide input ids, attention mask and labels in
    its first three positions. The loss is the one the model itself returns
    when it is given labels.

    Args:
        dataloader_val: Iterable of batches to evaluate.

    Returns:
        Tuple of (mean loss per batch, stacked logits as a NumPy array,
        stacked true labels as a NumPy array).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in batch]
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # outputs[0] is the loss, outputs[1] the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

CPU times: user 0 ns, sys: 2.55 ms, total: 2.55 ms
Wall time: 57 ms


In [None]:
%%time
# Test 2
from collections import deque
import torch.nn as nn
from sklearn.metrics import classification_report
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Label bookkeeping shared by the classifier head and the final report.
# These must exist at cell scope: the inverse mapping built inside
# accuracy_per_class is local to that function.
num_labels = len(label_dict)
label_dict_inverse = {v: k for k, v in label_dict.items()}
target_names = [label_dict_inverse[i] for i in range(num_labels)]

# Use a class-weighted loss function to counter the class imbalance.
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Early stopping parameters
early_stopping_patience = 3
loss_queue = deque(maxlen=early_stopping_patience)  # to store the last few losses (currently unused)

# Use grid search to fine-tune hyperparameters.
epochs = 3
learning_rates = [3e-5]  # [3e-5, 5e-5, 1e-4]
epsilons = [1e-8, 1e-6]
batch_size_list = [32]

for batch_size in batch_size_list:
    dataloader_train = DataLoader(dataset_train,
                                  sampler=RandomSampler(dataset_train),
                                  batch_size=batch_size)

    dataloader_validation = DataLoader(dataset_val,
                                       sampler=SequentialSampler(dataset_val),
                                       batch_size=batch_size)
    for lr in learning_rates:
        for eps in epsilons:
            print(f"\n======================batch size = {batch_size}, learning_rate = {lr}, epsilon = {eps}======================")

            # Start every configuration from the pretrained checkpoint. Reusing
            # one model across the grid would make each run continue training
            # the previous run's weights, so the configurations could not be
            # compared. After the grid finishes, `model` is the last
            # configuration's model.
            model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased",
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False,
                problem_type="single_label_classification",
            )
            model.to(device)

            # Early-stopping state is per configuration.
            early_stopping_counter = 0
            best_val_loss = float('inf')

            optimizer = AdamW(model.parameters(),
                              lr=lr,
                              eps=eps)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=0,
                num_training_steps=len(dataloader_train) * epochs)

            for epoch in range(1, epochs + 1):
                model.train()

                loss_train_total = 0
                for batch in dataloader_train:
                    model.zero_grad()
                    input_ids, attention_mask, labels = (t.to(device) for t in batch)

                    # Labels are withheld from the model so the class-weighted
                    # loss can be applied to the raw logits instead.
                    logits = model(input_ids=input_ids,
                                   attention_mask=attention_mask).logits
                    loss = loss_fn(logits, labels)

                    loss_train_total += loss.item()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()

                tqdm.write(f'\nEpoch {epoch}')
                loss_train_avg = loss_train_total / len(dataloader_train)
                tqdm.write(f'Training loss: {loss_train_avg}')

                # NOTE: evaluate() reports the loss returned by the model itself,
                # not `loss_fn`, so it is not class-weighted like the training loss.
                val_loss, predictions, true_vals = evaluate(dataloader_validation)
                val_f1 = f1_score_func(predictions, true_vals)
                tqdm.write(f'Validation loss: {val_loss}')
                tqdm.write(f'F1 Score (Weighted): {val_f1}')
                accuracy_per_class(predictions, true_vals)

                # Early stopping on validation loss.
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # torch.save(model.state_dict(), 'model.pth')  # Save the best model
                    early_stopping_counter = 0
                else:
                    early_stopping_counter += 1

                if early_stopping_counter >= early_stopping_patience:
                    print("Stopping early due to no improvement in validation loss.")
                    break

            # Final per-class report for this configuration. `labels` pins the
            # row order to the class ids so it always lines up with target_names.
            val_loss, predictions, true_vals = evaluate(dataloader_validation)
            preds_class = np.argmax(predictions, axis=1)
            print(classification_report(true_vals,
                                        preds_class,
                                        labels=list(range(num_labels)),
                                        target_names=target_names))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch 1
Training loss: 0.7445425131859688
Validation loss: 0.581117797929507
F1 Score (Weighted): 0.7504887102749176
Class: very negative
Accuracy: 139/185

Class: negative -1
Accuracy: 422/658

Class: neutral
Accuracy: 650/700

Class: positive
Accuracy: 49/120


Epoch 2
Training loss: 0.4521609958834373
Validation loss: 0.5235066620203165
F1 Score (Weighted): 0.7927696950009196
Class: very negative
Accuracy: 142/185

Class: negative -1
Accuracy: 523/658

Class: neutral
Accuracy: 573/700

Class: positive
Accuracy: 75/120


Epoch 3
Training loss: 0.2994536400700991
Validation loss: 0.5582929826699771
F1 Score (Weighted): 0.7998293030192632
Class: very negative
Accuracy: 146/185

Class: negative -1
Accuracy: 553/658

Class: neutral
Accuracy: 572/700

Class: positive
Accuracy: 60/120



Epoch 1
Training loss: 0.3251801163602907
Validation loss: 0.604258197431381
F1 Score (Weighted): 0.7845720855134217
Class: very negative
Accuracy: 120/185

Class: negative -1
Accuracy: 553/658

Class: n