# Suicidal Prediction From Scratch


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import  AdamW
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pandas as pd
import numpy as np
import spacy
import re
import math
import gc
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from transformers import BertTokenizer
import ast
from torch.nn.utils.rnn import pad_sequence
import os
import matplotlib.pyplot as plt

## Model Classes
####  
---


In [2]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding module for Transformer models.

    A fixed sine/cosine table is built once and stored as the non-trainable
    buffer ``pe`` with shape ``(max_len, 1, d_model)``. The forward pass adds
    the first ``x.size(0)`` rows of that table to the input, so the first
    dimension of the input is the one treated as the position axis; the
    singleton middle dimension broadcasts over the input's second dimension.

    Args:
        d_model (int): The number of expected features in the input. Both even
            and odd values are supported.
        max_len (int, optional): The maximum length of the input sequence. Default is 23187.

    Attributes:
        dropout (nn.Dropout): Dropout layer (p=0.1) applied after the addition.
        pe (torch.Tensor): Positional encoding tensor of shape (max_len, 1, d_model).

    """

    def __init__(self, d_model, max_len=23187):
        super().__init__()
        self.dropout = nn.Dropout(p=0.1)

        pe = torch.zeros((max_len, d_model))
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine table to the number of odd slots available.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Shape (max_len, 1, d_model): same layout the original buffer had, so
        # state dicts saved from earlier runs still load.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Forward pass of the PositionalEncoding module.

        Args:
            x (torch.Tensor): Input tensor whose first dimension indexes
                positions and whose last dimension has size ``d_model``.

        Returns:
            torch.Tensor: Output tensor after adding positional encoding and applying dropout.

        Raises:
            ValueError: If ``x.size(0)`` exceeds the ``max_len`` the table was built for.

        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"Input length {seq_len} exceeds the positional encoding "
                f"table size {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)


class TransformerTextClassifier(nn.Module):
    """
    Transformer-based Text Classifier.

    Token ids are embedded, scaled by sqrt(ninp), given positional encodings,
    passed through a stack of Transformer encoder layers, averaged over
    dimension 1 and projected to ``num_classes - 1`` outputs (a single logit
    for binary classification).

    Args:
        ntoken (int): The number of unique tokens in the input.
        ninp (int): The number of expected features in the input.
        nhead (int): The number of heads in the multiheadattention models.
        nhid (int): The dimension of the feedforward network model.
        nlayers (int): The number of sub-encoder-layers in the encoder.
        num_classes (int): The number of classes for classification (at least 2).
        dropout (float, optional): The dropout value. Default is 0.5.
        norm_first (bool, optional): Passed through to ``nn.TransformerEncoderLayer``;
            if True, layer normalization is applied before the attention and
            feedforward operations of each layer rather than after. Default is True.

    Attributes:
        pos_encoder (PositionalEncoding): PositionalEncoding module.
        transformer_encoder (nn.TransformerEncoder): TransformerEncoder module.
        encoder (nn.Embedding): Embedding layer.
        ninp (int): The number of expected features in the input.
        fc (nn.Linear): Linear layer for classification.
        activation (nn.GELU): Activation function (constructed but not applied in ``forward``).

    Raises:
        ValueError: If ``num_classes`` is smaller than 2.

    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, num_classes, dropout=0.5, norm_first=True):
        super().__init__()
        if num_classes < 2:
            # The head has num_classes - 1 outputs, so fewer than two classes
            # would leave it with no outputs at all.
            raise ValueError(f"num_classes must be at least 2, got {num_classes}")
        self.pos_encoder = PositionalEncoding(ninp)
        encoder_layers = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout, norm_first=norm_first)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.fc = nn.Linear(ninp, num_classes-1)
        self.activation = nn.GELU()

        self.init_weights()

    def forward(self, src, src_mask):
        """
        Forward pass of the TransformerTextClassifier module.

        The encoder layers are built without ``batch_first``, so by the PyTorch
        default they treat dimension 0 of the embedded input as the sequence
        axis, and the positional encoding is also indexed along dimension 0,
        while the pooling step averages over dimension 1.
        NOTE(review): the evaluation cells in this notebook feed token ids
        padded with ``batch_first=True`` -- confirm this layout is intended.
        Changing it would alter the behaviour of already-trained checkpoints.

        Args:
            src (torch.Tensor): Integer tensor of token ids.
            src_mask (torch.Tensor): Mask tensor for the input (may be None).

        Returns:
            torch.Tensor: Raw (pre-sigmoid) outputs with ``num_classes - 1``
            values in the last dimension.

        """
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = output.mean(dim=1)  # Pooling layer (mean over dimension 1)
        output = self.fc(output)
        return output

    def init_weights(self):
        """
        Initialize the weights of the model.

        Every ``nn.Linear`` submodule (including those inside the encoder
        layers) gets Xavier-uniform weights with the ReLU gain and a zero bias.

        """
        gain = nn.init.calculate_gain('relu')
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=gain)
                # A Linear layer built with bias=False has no bias to reset.
                if m.bias is not None:
                    nn.init.zeros_(m.bias)


In [9]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

####  
---
### Load the saved data from the CSV file:

In [14]:
# Load the preprocessed dataset; later cells read its 'token_id' and
# 'label_prep' columns.
data = pd.read_csv('preprocessed_data.csv')

In [15]:
def convert_string_to_array(s):
    """Parse the string form of a Python literal (e.g. ``"[1, 2, 3]"``) into a NumPy array."""
    parsed = ast.literal_eval(s)
    return np.array(parsed)

# The CSV stores each token-id list as text; turn every entry back into an array.
data['token_id'] = data['token_id'].apply(convert_string_to_array)


In [17]:
# Split the data into training, validation, and test sets:
# first hold out 10% for testing, then take 20% of the remainder for validation.
# The same seed is used for both splits so they are reproducible.
X = data['token_id']
y = data['label_prep']
SEED = 1235
train_ids_0, test_ids, train_labels_0, test_labels = tts(
    X, y, test_size=0.1, random_state=SEED
)
train_ids, val_ids, train_labels, val_labels = tts(
    train_ids_0, train_labels_0, test_size=0.2, random_state=SEED
)

### Load saved trained model
####  
---
Make sure all model hyperparameters match the corresponding config file:

In [43]:
## Run on a pretrained checkpoint.
# These values must match the ones the checkpoint was trained with, otherwise
# the saved parameter shapes will not line up with the freshly built model.
num_attention_heads = 8
embedding_size = 160
nhidden = 70
nlayers = 4
dropout = 0.24840944810773966
batch_size = 16
ntoken = 37585
learning_rate = 0.0002

# Pass the `dropout` value defined above (the previous `Dropout` name was
# never defined in this notebook and raised a NameError on a fresh run).
model = TransformerTextClassifier(
    ntoken,
    embedding_size,
    num_attention_heads,
    nhidden,
    nlayers,
    2,
    dropout,
    norm_first=True,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
checkpoint = torch.load("model_checkpoint10.pth", map_location=device)
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [None]:
ntoken = 37585  # the size of vocabulary after tokenization

# Convert every label Series to a plain NumPy array first, so that tensor
# construction does not depend on the (shuffled) pandas index left by the split.
train_labels_array = train_labels.values.astype(float)
val_labels_array = val_labels.values.astype(float)
test_labels_array = test_labels.values.astype(float)

# Pad the sequences to create a tensor of shape (num_samples, max_seq_length)
padded_train = pad_sequence([torch.tensor(seq) for seq in train_ids], batch_first=True, padding_value=0)
padded_val = pad_sequence([torch.tensor(seq) for seq in val_ids], batch_first=True, padding_value=0)
padded_test = pad_sequence([torch.tensor(seq) for seq in test_ids], batch_first=True, padding_value=0)

# Create the datasets / DataLoader objects for the training, validation, and test sets.
# The padded ids are already tensors, so they are cast with .float() instead of
# being re-wrapped in torch.tensor(...), which emits a copy-construct warning.
# The ids are kept as float32 here, as before; the evaluation loops cast them
# back with .long(), and float32 is exact for integers below 2**24.
train_dataset = TensorDataset(padded_train.float(), torch.tensor(train_labels_array, dtype=torch.float))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(padded_val.float(), torch.tensor(val_labels_array, dtype=torch.float))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = TensorDataset(padded_test.float(), torch.tensor(test_labels_array, dtype=torch.float))


### Evaluate saved model
####  
---


In [18]:
# Final training results: accuracy of the loaded model on the training set.
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for texts, labels in train_loader:
        texts = texts.to(device).long()  # embedding lookup needs integer ids
        labels = labels.to(device)
        outputs = model(texts, None)
        outputs = torch.sigmoid(outputs)
        # Threshold the single sigmoid output at 0.5 to get a 0/1 prediction.
        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:, 0] == labels).sum().item()
        total_samples += labels.size(0)

# Computed once after the loop; NaN instead of a NameError if the loader is empty.
accuracy = total_correct / total_samples if total_samples else float('nan')
print(f'total_correct {total_correct}, total_samples : {total_samples}, Train Acc: {100. * accuracy:.2f}%')

total_correct 11282, total_samples : 14651, Val Acc: 77.00%


In [17]:
# Validation: accuracy of the loaded model on the validation set.
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for texts, labels in val_loader:
        texts = texts.to(device).long()  # embedding lookup needs integer ids
        labels = labels.to(device)
        outputs = model(texts, None)
        outputs = torch.sigmoid(outputs)
        # Threshold the single sigmoid output at 0.5 to get a 0/1 prediction.
        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:, 0] == labels).sum().item()
        total_samples += labels.size(0)

# Computed once after the loop; NaN instead of a NameError if the loader is empty.
accuracy = total_correct / total_samples if total_samples else float('nan')
print(f'total_correct {total_correct}, total_samples : {total_samples}, Val Acc: {100. * accuracy:.2f}%')

total_correct 2600, total_samples : 3663, Val Acc: 70.98%


In [16]:
# Test results: accuracy of the loaded model on the held-out test set.
model.eval()
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
total_correct = 0
total_samples = 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts = texts.to(device).long()  # embedding lookup needs integer ids
        labels = labels.to(device)
        outputs = model(texts, None)
        outputs = torch.sigmoid(outputs)
        # Threshold the single sigmoid output at 0.5 to get a 0/1 prediction.
        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:, 0] == labels).sum().item()
        total_samples += labels.size(0)

# Computed once after the loop; NaN instead of a NameError if the loader is empty.
accuracy = total_correct / total_samples if total_samples else float('nan')
print(f'total_correct {total_correct}, total_samples : {total_samples}, test Acc: {100. * accuracy:.2f}%')

total_correct 1456, total_samples : 2035, test Acc: 71.55%
