# Transformer Model
In this notebook, we will use a Transformer model trained on our labeled data to predict whether some test data contains suicidal ideology or not, then compare to the actual labels and measure our metrics.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [2]:
# file path to train data set
train_path = 'train.csv'

# Convert the labels to binary
df = pd.read_csv(train_path)
df['label'] = (df['label'] == 'self.SuicideWatch').astype(int)

print(df.head())
df.describe()

                                                text  label
0  wanting to skip or postpone my exam my exam is...      0
1  Do other bipolar folks have problems with subs...      0
2  Wanted to share some revelations I just had to...      0
3  I feel deader than dead. I find that I don't h...      1
4  I'm pretty sure my friends suicidal what do I ...      1


Unnamed: 0,label
count,45706.0
mean,0.187459
std,0.390284
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [3]:
df.isna().sum()

text     0
label    0
dtype: int64

In [4]:
# Get majority and minority classes
df_majority = df[df['label'] == 0]
df_minority = df[df['label'] == 1]

# upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

# downsample majority class
df_majority_downsampled = resample(df_majority,
                                    replace=False,
                                    n_samples=len(df_minority),
                                    random_state=42)

# recreate the data set
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1, random_state=42)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(frac=1, random_state=42)

In [5]:
class SuicideDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [6]:
X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(
    df_upsampled['text'].values,
    pd.factorize(df_upsampled['label'])[0],
    test_size=0.2,
    random_state=42,
    stratify=pd.factorize(df_upsampled['label'])[0]
)

In [7]:
X_train_downsampled, X_test_downsampled, y_train_downsampled, y_test_downsampled = train_test_split(
    df_downsampled['text'].values,
    pd.factorize(df_downsampled['label'])[0],
    test_size=0.2,
    random_state=42,
    stratify=pd.factorize(df_downsampled['label'])[0]
)

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
train_dataset_upsampled = SuicideDataset(X_train_upsampled, y_train_upsampled, tokenizer, max_length=128)
test_dataset_upsampled = SuicideDataset(X_test_upsampled, y_test_upsampled, tokenizer, max_length=128)

train_loader_upsampled = DataLoader(train_dataset_upsampled, batch_size=16, shuffle=True)
test_loader_upsampled = DataLoader(test_dataset_upsampled, batch_size=16, shuffle=False)

In [10]:
train_dataset_downsampled = SuicideDataset(X_train_downsampled, y_train_downsampled, tokenizer, max_length=128)
test_dataset_downsampled = SuicideDataset(X_test_downsampled, y_test_downsampled, tokenizer, max_length=128)

train_loader_downsampled = DataLoader(train_dataset_downsampled, batch_size=16, shuffle=True)
test_loader_downsampled = DataLoader(test_dataset_downsampled, batch_size=16, shuffle=False)

In [11]:
model_upsampled = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['label'].unique()))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_upsampled.to(device)
optimizer = optim.AdamW(model_upsampled.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
criterion = nn.CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    model_upsampled.train()
    total_loss = 0
    for batch in train_loader_upsampled:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model_upsampled(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_loader_upsampled)

    model_upsampled.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []
    for batch in test_loader_upsampled:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model_upsampled(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    avg_val_loss = total_val_loss / len(test_loader_upsampled)

    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')
    print(f'Accuracy: {accuracy:.4f}')

    print(f'Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

model_upsampled.save_pretrained("transformer_model_upsampled")

Epoch 1/3
Precision: 0.9088
Recall: 0.9054
F1-score: 0.9052
Accuracy: 0.9054
Loss: 0.3717, Val Loss: 0.2605
Epoch 2/3
Precision: 0.9303
Recall: 0.9293
F1-score: 0.9293
Accuracy: 0.9293
Loss: 0.1982, Val Loss: 0.2078
Epoch 3/3
Precision: 0.9433
Recall: 0.9432
F1-score: 0.9432
Accuracy: 0.9432
Loss: 0.1183, Val Loss: 0.2208


In [15]:
model_downsampled = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['label'].unique()))

model_downsampled.to(device)
optimizer = optim.AdamW(model_downsampled.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
criterion = nn.CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    model_downsampled.train()
    total_loss = 0
    for batch in train_loader_downsampled:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model_downsampled(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_loader_downsampled)

    model_downsampled.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []
    for batch in test_loader_downsampled:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model_downsampled(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    avg_val_loss = total_val_loss / len(test_loader_downsampled)

    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')
    print(f'Accuracy: {accuracy:.4f}')

    print(f'Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

model_downsampled.save_pretrained("transformer_model_downsampled")

Epoch 1/3
Precision: 0.8151
Recall: 0.8145
F1-score: 0.8144
Accuracy: 0.8145
Loss: 0.4777, Val Loss: 0.4205
Epoch 2/3
Precision: 0.8170
Recall: 0.8133
F1-score: 0.8128
Accuracy: 0.8133
Loss: 0.3506, Val Loss: 0.4211
Epoch 3/3
Precision: 0.7976
Recall: 0.7975
F1-score: 0.7975
Accuracy: 0.7975
Loss: 0.2397, Val Loss: 0.5178
