In [13]:
!pip install transformers torch pyspellchecker



In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import pandas as pd
import numpy as np
import torch
import string
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import re
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from spellchecker import SpellChecker

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from tqdm import tqdm
import time

from sklearn.utils.class_weight import compute_class_weight

In [16]:
# Path to your CSV file on Google Drive
file_path = '/content/drive/MyDrive/SameSame/filtered_data.csv'

# Read the CSV file
data = pd.read_csv(file_path)

# Display the DataFrame to confirm it's loaded correctly
print(data.head())

                                                Text
0                             Ways to feel better ❤️
1  How can I know to separate the real love and f...
2                              Questions + answers 🤔
3                             Ways to feel better ❤️
4  Oky sometimes I feel like my girl friend is st...


In [18]:
# Initialize spellchecker
# spell = SpellChecker()

# Function for text preprocessing without spell correction
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the dataset and create a new column for cleaned messages
tqdm.pandas(desc="Preprocessing")
data['cleaned_message'] = data['Text'].progress_apply(preprocess_text)

Preprocessing: 100%|██████████| 117488/117488 [00:01<00:00, 67436.60it/s]


In [19]:
keywords = ['hopeless', 'hard', 'cant go on', 'point of living', 'suicide', 'kill myself', 'end my life', 'worthless', 'it hurts', 'cant stop using',
            'no way out', 'kill', 'hurt', 'end it all', 'ending my life', 'harmed', 'kms', 'sh', 'thinspiration', 'hitting me', 'abuses me']

def label_message(message, keywords):
    # Check if any keyword is in the message
    for keyword in keywords:
        if keyword in message.lower():
            return 1
    return 0

# Apply heuristic labeling
data['label'] = data['Text'].apply(label_message, keywords=keywords)

In [20]:
count_label_1 = (data['label'] == 1).sum()

# Print the count
print("Number of rows with label = 1:", count_label_1)

Number of rows with label = 1: 13774


In [23]:
# Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['cleaned_message'], data['label'], test_size=0.2, random_state=42, stratify=data['label'])

# Calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=[0, 1], y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).cuda()

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the text data
class MessageDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
# Create DataLoader
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    ds = MessageDataset(
        texts=texts.to_numpy(),
        labels=labels.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=4)

MAX_LEN = 128
BATCH_SIZE = 16

train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_texts, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)

# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.cuda()

# Optimizer and loss function with class weights
optimizer = optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(weight=class_weights).to('cuda')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Training function
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader, desc="Training", leave=False):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

# Evaluation function
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
# Training loop with progress bar and timing
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    start_time = time.time()

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        'cuda',
        len(train_texts)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        'cuda',
        len(test_texts)
    )

    print(f'Validation loss {val_loss} accuracy {val_acc}')

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'Epoch time: {elapsed_time // 60:.0f}m {elapsed_time % 60:.0f}s')

Epoch 1/3
----------




Train loss 0.12108359391129814 accuracy 0.9656346419831896




Validation loss 0.05118910677962758 accuracy 0.9935313643714359
Epoch time: 8m 36s
Epoch 2/3
----------




Train loss 0.03918990218485557 accuracy 0.9936801787424193




Validation loss 0.05730218553771468 accuracy 0.9920418759043322
Epoch time: 8m 35s
Epoch 3/3
----------




Train loss 0.025530936102999592 accuracy 0.9955739972337482


Evaluating:  49%|████▊     | 713/1469 [00:20<00:21, 35.79it/s]

In [40]:
# Function to preprocess and predict a new message
def predict_message(model, tokenizer, message, max_len=128):
    # Preprocess the message
    cleaned_message = preprocess_text(message)

    # Tokenize the message
    encoding = tokenizer.encode_plus(
        cleaned_message,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )

    input_ids = encoding['input_ids'].to('cuda')
    attention_mask = encoding['attention_mask'].to('cuda')

    # Put the model in evaluation mode
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).item()

    return predicted_class, probs

# Example usage
message = "I want to kill myself"
predicted_class, probs = predict_message(model, tokenizer, message)

print(f"Predicted class: {'At risk' if predicted_class == 1 else 'Not at risk'}")
print(f"Probabilities: {probs}")

Predicted class: At risk
Probabilities: tensor([[8.9686e-05, 9.9991e-01]], device='cuda:0')


In [43]:
# Function to classify messages using the trained model
def classify_messages(model, tokenizer, messages, max_len=128):
    model.eval()
    results = []

    with torch.no_grad():
        for message in tqdm(messages, desc="Classifying"):
            # Preprocess and tokenize the message
            cleaned_message = preprocess_text(message)
            encoding = tokenizer.encode_plus(
                cleaned_message,
                add_special_tokens=True,
                max_length=max_len,
                return_token_type_ids=False,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True,
            )
            input_ids = encoding['input_ids'].to('cuda')
            attention_mask = encoding['attention_mask'].to('cuda')

            # Get model predictions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)
            predicted_class = torch.argmax(probs, dim=1).item()

            results.append(predicted_class)

    return results

# Classify all messages in the dataset
classified_labels = classify_messages(model, tokenizer, data['Text'])

# Count the number of messages classified as 'At risk' by the model
at_risk_count = sum(classified_labels)
total_count = len(classified_labels)
not_at_risk_count = total_count - at_risk_count

print(f"Total messages: {total_count}")
print(f"Messages classified as 'At risk': {at_risk_count}")
print(f"Messages classified as 'Not at risk': {not_at_risk_count}")

Classifying: 100%|██████████| 117488/117488 [17:19<00:00, 113.05it/s]

Total messages: 117488
Messages classified as 'At risk': 13919
Messages classified as 'Not at risk': 103569





In [45]:
# Save the trained model and tokenizer
model.save_pretrained('/content/drive/MyDrive/saved_model')
tokenizer.save_pretrained('/content/drive/MyDrive/saved_model')

('/content/drive/MyDrive/saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/saved_model/vocab.txt',
 '/content/drive/MyDrive/saved_model/added_tokens.json')