In [None]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset (Colab path; the file must provide "text" and "class" columns)
data = pd.read_csv("/content/Suicide_Detection.csv")

# Build the cleaned frame as a new object instead of mutating in place, so
# re-running this cell always starts again from the untouched `data`.
label_to_id = {"suicide": 1, "non-suicide": 0}
data_cleaned = (
    data
    # "Unnamed: 0" is presumably a leftover index column; tolerate its absence.
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .rename(columns={"class": "label"})
    # Rows without text or class cannot be tokenized or used for stratification.
    .dropna(subset=["text", "label"])
    .assign(label=lambda frame: frame["label"].map(label_to_id))
)

# `.map` turns any class outside `label_to_id` into NaN; fail here rather than
# letting NaN labels reach the stratified split or the training loop.
unmapped_rows = data_cleaned["label"].isna()
if unmapped_rows.any():
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have a class outside {sorted(label_to_id)}"
    )
data_cleaned["label"] = data_cleaned["label"].astype("int64")

# Split the data: 80% train, then the remaining 20% halved into validation and
# test. Both splits are stratified on the label and use a fixed seed.
train_data, temp_data = train_test_split(data_cleaned, test_size=0.2, random_state=42, stratify=data_cleaned["label"])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data["label"])


In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer for the "bert-base-uncased" checkpoint.
# `tokenize_data` below reads this module-level `tokenizer` name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_data(data, max_length=128):
    """Tokenize the ``"text"`` column of ``data`` with the module-level ``tokenizer``.

    Args:
        data: Object whose ``data["text"]`` exposes ``.tolist()`` (the notebook
            passes pandas DataFrames).
        max_length: Upper bound on tokens per example; longer inputs are
            truncated. Defaults to 128, the value previously hard-coded.

    Returns:
        The tokenizer's output for the whole column, requested as PyTorch
        tensors (``return_tensors="pt"``) with padding enabled.
    """
    texts = data["text"].tolist()
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Tokenize all splits up front. Each call encodes an entire split in a single
# tokenizer invocation and returns PyTorch tensors (see `tokenize_data`), so
# the full encoded split is held in memory.
train_tokens = tokenize_data(train_data)
val_tokens = tokenize_data(val_data)
test_tokens = tokenize_data(test_data)


In [None]:
import torch

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized encodings with integer labels.

    Args:
        encodings: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries that can be indexed by row. Any other keys are ignored.
        labels: Sequence of class ids, one per encoded row, in the same order.

    Raises:
        ValueError: If the encodings and labels do not have the same number
            of rows.
    """

    def __init__(self, encodings, labels):
        n_rows = len(encodings["input_ids"])
        n_mask_rows = len(encodings["attention_mask"])
        n_labels = len(labels)
        # A mismatch would otherwise surface only as an IndexError (or silent
        # truncation) part-way through an epoch.
        if n_rows != n_labels or n_mask_rows != n_rows:
            raise ValueError(
                "encodings and labels must have the same number of rows: "
                f"input_ids={n_rows}, attention_mask={n_mask_rows}, labels={n_labels}"
            )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict with the keys the model call expects.

        The label is converted to a ``torch.long`` tensor and stored under
        ``"labels"`` so the batch can be passed to the model as ``**batch``.
        """
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Wrap each split's encodings together with its labels. The labels come from
# the same DataFrame, in the same row order, that produced the tokens.
train_dataset = TextDataset(train_tokens, train_data["label"].tolist())
val_dataset = TextDataset(val_tokens, val_data["label"].tolist())
test_dataset = TextDataset(test_tokens, test_data["label"].tolist())


In [None]:
from transformers import BertForSequenceClassification

# Load pre-trained BERT model for sequence classification.
# num_labels=2 matches the two label ids (0 and 1) produced during data cleaning.
# The classification head is newly initialized (see the warning in this cell's
# output), so the model must be fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_scheduler

# Create data loaders (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the old transformers defaults (1e-6 and 0.0),
# because torch's own defaults differ and would change the update rule.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Scheduler
# The linear schedule decays the learning rate to zero over num_training_steps,
# so this must equal the number of optimizer steps actually taken. The training
# loop runs 2 epochs; sizing the schedule for 3 would stop the decay early.
epochs = 2
num_training_steps = len(train_loader) * epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Trailing semicolon suppresses the multi-page model repr in the cell output.
model.to(device);




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
from tqdm import tqdm

# Training loop
# `epochs` must match the epoch count used to size the LR scheduler's
# num_training_steps, otherwise the linear decay ends too early or too late.
epochs = 2
for epoch in range(epochs):
    # Training phase
    model.train()
    # The description is constant for the epoch, so set it once on the bar
    # instead of on every step.
    loop = tqdm(train_loader, desc=f"Epoch {epoch}", leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # batch contains "labels", so outputs.loss is populated
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the scheduler advances once per optimizer step, not per epoch
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_size = len(batch["labels"])
            # outputs.loss is a per-batch mean; weight it by batch size so the
            # final average is per example even when the last batch is short.
            val_loss += outputs.loss.item() * batch_size
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch["labels"]).sum().item()
            total += batch_size
    if total == 0:
        # Avoid ZeroDivisionError when the validation loader yields nothing.
        print("Validation set is empty; skipping validation metrics.")
    else:
        print(f"Validation Accuracy: {correct / total:.4f}")
        print(f"Validation Loss: {val_loss / total:.4f}")


Epoch 0: 100%|██████████| 11604/11604 [1:06:29<00:00,  2.91it/s, loss=0.227]


Validation Accuracy: 0.9677


Epoch 1: 100%|██████████| 11604/11604 [1:06:35<00:00,  2.90it/s, loss=0.0168]


Validation Accuracy: 0.9769


In [None]:
from sklearn.metrics import classification_report

# Test evaluation
test_loader = DataLoader(test_dataset, batch_size=16)
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        # Collect plain Python ints on the CPU so the report works on simple lists.
        predictions.extend(logits.argmax(dim=-1).cpu().tolist())
        true_labels.extend(batch["labels"].cpu().tolist())

# Pass `labels` explicitly so the two target names always line up with ids 0
# and 1, even if one class is missing from both the labels and the predictions
# (without it, classification_report raises on the name/label count mismatch).
print(classification_report(true_labels, predictions, labels=[0, 1], target_names=["non-suicide", "suicide"]))


              precision    recall  f1-score   support

 non-suicide       0.98      0.98      0.98     11604
     suicide       0.98      0.98      0.98     11604

    accuracy                           0.98     23208
   macro avg       0.98      0.98      0.98     23208
weighted avg       0.98      0.98      0.98     23208



In [None]:
# Persist the trained model and the tokenizer into the same directory so both
# can later be restored with from_pretrained from a single path.
model.save_pretrained("./suicide_detection_model")
tokenizer.save_pretrained("./suicide_detection_model")

('./suicide_detection_model/tokenizer_config.json',
 './suicide_detection_model/special_tokens_map.json',
 './suicide_detection_model/vocab.txt',
 './suicide_detection_model/added_tokens.json')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the saved model and tokenizer from the directory written by
# save_pretrained. This rebinds the module-level `model` and `tokenizer` names.
# The reloaded model is not moved to a device here; `predict_text` does that.
model = BertForSequenceClassification.from_pretrained("./suicide_detection_model")
tokenizer = BertTokenizer.from_pretrained("./suicide_detection_model")

# Set the model to evaluation mode.
# As the cell's last expression, this call also makes the notebook display the model repr.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def predict_text(text):
    """Classify one piece of text as ``"suicide"`` or ``"non-suicide"``.

    Uses the module-level ``tokenizer`` and ``model``. As a side effect the
    model is moved to CUDA when available (otherwise CPU) and switched to
    evaluation mode.

    Args:
        text: The text to classify. Input longer than 128 tokens is truncated.

    Returns:
        ``"suicide"`` if the highest-scoring class id is 1, ``"non-suicide"``
        if it is 0.
    """
    # Tokenize the input text. A single input needs no padding to a fixed
    # length; padding=True pads only to the longest item in the call, which
    # avoids running the model over up to 128 mostly-padding positions.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Move the model and the inputs to the same device
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    # Do not rely on an earlier cell having called model.eval(): in training
    # mode dropout stays active and predictions become non-deterministic.
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get model predictions
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction = torch.argmax(logits, dim=-1).item()

    # Map numerical prediction to labels (same ids as the training label mapping)
    label_map = {0: "non-suicide", 1: "suicide"}
    return label_map[prediction]


In [None]:
# Smoke-test `predict_text` with one example per class. The expected labels in
# the trailing comments are not asserted; compare them with the printed output.
sample_text_1 = "I feel hopeless and don't want to live anymore."
sample_text_2 = "I had a great day today and feel very happy!"

print("Prediction for Sample 1:", predict_text(sample_text_1))  # Expected: "suicide"
print("Prediction for Sample 2:", predict_text(sample_text_2))  # Expected: "non-suicide"

Prediction for Sample 1: suicide
Prediction for Sample 2: non-suicide
